[Qemu-devel] [PATCH 1/5] qemu-fd-exchange: provide common methods for exchange fd

2014-01-23 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 include/qemu/fd-exchange.h |   25 +++
 util/Makefile.objs |1 +
 util/qemu-fd-exchange.c|   97 
 3 files changed, 123 insertions(+), 0 deletions(-)
 create mode 100644 include/qemu/fd-exchange.h
 create mode 100644 util/qemu-fd-exchange.c

diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h
new file mode 100644
index 000..8502960
--- /dev/null
+++ b/include/qemu/fd-exchange.h
@@ -0,0 +1,25 @@
+/*
+ * Internal common methods for exchange of FD
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef FD_EXCHANGE_H
+#define FD_EXCHANGE_H
+
+#include sys/socket.h
+
+union MsgControl {
+struct cmsghdr cmsg;
+char control[CMSG_SPACE(sizeof(int))];
+};
+
+ssize_t qemu_send_with_fd(int sockfd, int passed_fd,
+  const void *buf, size_t len);
+
+ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd,
+  void *buf, size_t len);
+
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index af3e5cb..2fb42bf 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -13,3 +13,4 @@ util-obj-y += hexdump.o
 util-obj-y += crc32c.o
 util-obj-y += throttle.o
 util-obj-y += getauxval.o
+util-obj-y += qemu-fd-exchange.o
diff --git a/util/qemu-fd-exchange.c b/util/qemu-fd-exchange.c
new file mode 100644
index 000..bee3fc1
--- /dev/null
+++ b/util/qemu-fd-exchange.c
@@ -0,0 +1,97 @@
+/*
+ * Internal common methods for exchange of FD
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include qemu/fd-exchange.h
+#include qemu-common.h
+
+
+ssize_t qemu_send_with_fd(int sockfd, int passed_fd,
+  const void *buf, size_t len)
+{
+struct msghdr msg;
+struct iovec iov;
+struct cmsghdr *cmsg;
+union MsgControl msg_control;
+int retval;
+
+iov.iov_base = (char *)buf;
+iov.iov_len = len;
+
+memset(msg, 0, sizeof(msg));
+msg.msg_iov = iov;
+msg.msg_iovlen = len;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+if (passed_fd  0) {
+*(char *)buf = passed_fd;
+} else {
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+cmsg = msg_control.cmsg;
+cmsg-cmsg_len = CMSG_LEN(sizeof(passed_fd));
+cmsg-cmsg_level = SOL_SOCKET;
+cmsg-cmsg_type = SCM_RIGHTS;
+memcpy(CMSG_DATA(cmsg), passed_fd, sizeof(passed_fd));
+
+}
+
+do {
+retval = sendmsg(sockfd, msg, 0);
+} while (retval  0  errno == EINTR);
+
+return retval;
+}
+
+ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd,
+  void *buf, size_t len)
+{
+struct iovec iov;
+struct msghdr msg;
+struct cmsghdr *cmsg;
+union MsgControl msg_control;
+int retval;
+char data;
+
+iov.iov_base = data;
+iov.iov_len = len;
+
+memset(msg, 0, sizeof(msg));
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+do {
+retval = recvmsg(sockfd, msg, 0);
+} while (retval  0  errno == EINTR);
+
+if (retval = 0) {
+return retval;
+}
+
+if (data != *(char *)buf) {
+*passed_fd = data;
+return 0;
+}
+
+for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
+cmsg-cmsg_level != SOL_SOCKET ||
+cmsg-cmsg_type != SCM_RIGHTS) {
+continue;
+}
+
+memcpy(passed_fd, CMSG_DATA(cmsg), sizeof(*passed_fd));
+return 0;
+}
+
+*passed_fd = -ENFILE;
+return retval;
+}
-- 
1.7.7.6




[Qemu-devel] [PATCH 5/5] virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd

2014-01-23 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 hw/9pfs/virtio-9p-proxy.c |   60 ++--
 1 files changed, 3 insertions(+), 57 deletions(-)

diff --git a/hw/9pfs/virtio-9p-proxy.c b/hw/9pfs/virtio-9p-proxy.c
index 5f44bb7..f34b845 100644
--- a/hw/9pfs/virtio-9p-proxy.c
+++ b/hw/9pfs/virtio-9p-proxy.c
@@ -14,6 +14,7 @@
 #include hw/virtio/virtio.h
 #include virtio-9p.h
 #include qemu/error-report.h
+#include qemu/fd-exchange.h
 #include fsdev/qemu-fsdev.h
 #include virtio-9p-proxy.h
 
@@ -24,62 +25,6 @@ typedef struct V9fsProxy {
 struct iovec out_iovec;
 } V9fsProxy;
 
-/*
- * Return received file descriptor on success in *status.
- * errno is also returned on *status (which will be  0)
- * return  0 on transport error.
- */
-static int v9fs_receivefd(int sockfd, int *status)
-{
-struct iovec iov;
-struct msghdr msg;
-struct cmsghdr *cmsg;
-int retval, data, fd;
-union MsgControl msg_control;
-
-iov.iov_base = data;
-iov.iov_len = sizeof(data);
-
-memset(msg, 0, sizeof(msg));
-msg.msg_iov = iov;
-msg.msg_iovlen = 1;
-msg.msg_control = msg_control;
-msg.msg_controllen = sizeof(msg_control);
-
-do {
-retval = recvmsg(sockfd, msg, 0);
-} while (retval  0  errno == EINTR);
-if (retval = 0) {
-return retval;
-}
-/*
- * data is set to V9FS_FD_VALID, if ancillary data is sent.  If this
- * request doesn't need ancillary data (fd) or an error occurred,
- * data is set to negative errno value.
- */
-if (data != V9FS_FD_VALID) {
-*status = data;
-return 0;
-}
-/*
- * File descriptor (fd) is sent in the ancillary data. Check if we
- * indeed received it. One of the reasons to fail to receive it is if
- * we exceeded the maximum number of file descriptors!
- */
-for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
-if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
-cmsg-cmsg_level != SOL_SOCKET ||
-cmsg-cmsg_type != SCM_RIGHTS) {
-continue;
-}
-fd = *((int *)CMSG_DATA(cmsg));
-*status = fd;
-return 0;
-}
-*status = -ENFILE;  /* Ancillary data sent but not received */
-return 0;
-}
-
 static ssize_t socket_read(int sockfd, void *buff, size_t size)
 {
 ssize_t retval, total = 0;
@@ -307,6 +252,7 @@ static int v9fs_request(V9fsProxy *proxy, int type,
 V9fsString *name, *value;
 V9fsString *path, *oldpath;
 struct iovec *iovec = NULL, *reply = NULL;
+int data = V9FS_FD_VALID;
 
 qemu_mutex_lock(proxy-mutex);
 
@@ -548,7 +494,7 @@ static int v9fs_request(V9fsProxy *proxy, int type,
  * A file descriptor is returned as response for
  * T_OPEN,T_CREATE on success
  */
-if (v9fs_receivefd(proxy-sockfd, retval)  0) {
+if (qemu_recv_with_fd(proxy-sockfd, retval, data, sizeof(data))  
0) {
 goto close_error;
 }
 break;
-- 
1.7.7.6




[Qemu-devel] [PATCH 0/5 v2] Provide common methods for exchange FD

2014-01-23 Thread Lei Li
This patch series refactors the functions used for exchanging
file descriptors (FDs) in the current code and provides common
methods for doing so.

I have tested it with page-flipping migration and, briefly, with
tap/bridge-helper, but I have an environment problem with the
proxy fs driver. I would appreciate it if someone could help
verify whether the series has any impact on it. :)

Please let me know if there is anything that needs to be improved.

Thanks


Changes since V1:
  -- Copyright and typo fixes pointed out by Eric.
  -- Don't cast 'char *' to 'int *', as pointed out by Daniel.
  -- Drop the local migration part.

Lei Li (5):
  fd-exchange: provide common methods for exchange of fd
  qemu-bridge-helper: replace send_fd with qemu_send_with_fd
  net/tap: replace recv_fd with qemu_recv_with_fd
  virtfs-proxy-helper: replace send_fd with qemu_send_with_fd
  virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd

 Makefile|2 +-
 fsdev/virtfs-proxy-helper.c |   51 ---
 hw/9pfs/virtio-9p-proxy.c   |   60 +-
 hw/9pfs/virtio-9p-proxy.h   |5 --
 include/qemu/fd-exchange.h  |   25 +++
 net/tap.c   |   40 +
 qemu-bridge-helper.c|   31 +
 util/Makefile.objs  |1 +
 util/qemu-fd-exchange.c |   97 +++
 10 files changed, 144 insertions(+), 220 deletions(-)
 create mode 100644 include/qemu/fd-exchange.h
 create mode 100644 util/qemu-fd-exchange.c




[Qemu-devel] [PATCH 4/5] virtfs-proxy-helper: replace send_fd with qemu_send_with_fd

2014-01-23 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 fsdev/virtfs-proxy-helper.c |   51 ++
 hw/9pfs/virtio-9p-proxy.h   |5 
 2 files changed, 8 insertions(+), 48 deletions(-)

diff --git a/fsdev/virtfs-proxy-helper.c b/fsdev/virtfs-proxy-helper.c
index 713a7b2..44c6e61 100644
--- a/fsdev/virtfs-proxy-helper.c
+++ b/fsdev/virtfs-proxy-helper.c
@@ -23,6 +23,7 @@
 #include qemu-common.h
 #include qemu/sockets.h
 #include qemu/xattr.h
+#include qemu/fd-exchange.h
 #include virtio-9p-marshal.h
 #include hw/9pfs/virtio-9p-proxy.h
 #include fsdev/virtio-9p-marshal.h
@@ -203,48 +204,6 @@ static int read_request(int sockfd, struct iovec *iovec, 
ProxyHeader *header)
 return 0;
 }
 
-static int send_fd(int sockfd, int fd)
-{
-struct msghdr msg;
-struct iovec iov;
-int retval, data;
-struct cmsghdr *cmsg;
-union MsgControl msg_control;
-
-iov.iov_base = data;
-iov.iov_len = sizeof(data);
-
-memset(msg, 0, sizeof(msg));
-msg.msg_iov = iov;
-msg.msg_iovlen = 1;
-/* No ancillary data on error */
-if (fd  0) {
-/* fd is really negative errno if the request failed  */
-data = fd;
-} else {
-data = V9FS_FD_VALID;
-msg.msg_control = msg_control;
-msg.msg_controllen = sizeof(msg_control);
-
-cmsg = msg_control.cmsg;
-cmsg-cmsg_len = CMSG_LEN(sizeof(fd));
-cmsg-cmsg_level = SOL_SOCKET;
-cmsg-cmsg_type = SCM_RIGHTS;
-memcpy(CMSG_DATA(cmsg), fd, sizeof(fd));
-}
-
-do {
-retval = sendmsg(sockfd, msg, 0);
-} while (retval  0  errno == EINTR);
-if (fd = 0) {
-close(fd);
-}
-if (retval  0) {
-return retval;
-}
-return 0;
-}
-
 static int send_status(int sockfd, struct iovec *iovec, int status)
 {
 ProxyHeader header;
@@ -784,11 +743,17 @@ static void usage(char *prog)
 static int process_reply(int sock, int type,
  struct iovec *out_iovec, int retval)
 {
+int data = V9FS_FD_VALID;
+
 switch (type) {
 case T_OPEN:
 case T_CREATE:
-if (send_fd(sock, retval)  0) {
+if (qemu_send_with_fd(sock, retval, data, sizeof(data))  0) {
 return -1;
+} else {
+if (retval = 0) {
+close(retval);
+}
 }
 break;
 case T_MKNOD:
diff --git a/hw/9pfs/virtio-9p-proxy.h b/hw/9pfs/virtio-9p-proxy.h
index 005c1ad..e359ac5 100644
--- a/hw/9pfs/virtio-9p-proxy.h
+++ b/hw/9pfs/virtio-9p-proxy.h
@@ -24,11 +24,6 @@
 #define proxy_marshal(out_sg, offset, fmt, args...) \
 v9fs_marshal(out_sg, 1, offset, 0, fmt, ##args)
 
-union MsgControl {
-struct cmsghdr cmsg;
-char control[CMSG_SPACE(sizeof(int))];
-};
-
 typedef struct {
 uint32_t type;
 uint32_t size;
-- 
1.7.7.6




[Qemu-devel] [PATCH 2/5] qemu-bridge-helper: replace send_fd with qemu_send_with_fd

2014-01-23 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 Makefile |2 +-
 qemu-bridge-helper.c |   31 +++
 2 files changed, 4 insertions(+), 29 deletions(-)

diff --git a/Makefile b/Makefile
index bdff4e4..6850f35 100644
--- a/Makefile
+++ b/Makefile
@@ -195,7 +195,7 @@ qemu-img$(EXESUF): qemu-img.o $(block-obj-y) libqemuutil.a 
libqemustub.a
 qemu-nbd$(EXESUF): qemu-nbd.o $(block-obj-y) libqemuutil.a libqemustub.a
 qemu-io$(EXESUF): qemu-io.o $(block-obj-y) libqemuutil.a libqemustub.a
 
-qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o
+qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o libqemuutil.a
 
 fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o 
fsdev/virtio-9p-marshal.o libqemuutil.a libqemustub.a
 fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap
diff --git a/qemu-bridge-helper.c b/qemu-bridge-helper.c
index 6a0974e..8303b6b 100644
--- a/qemu-bridge-helper.c
+++ b/qemu-bridge-helper.c
@@ -40,6 +40,7 @@
 #endif
 
 #include qemu/queue.h
+#include qemu/fd-exchange.h
 
 #include net/tap-linux.h
 
@@ -174,33 +175,6 @@ static void prep_ifreq(struct ifreq *ifr, const char 
*ifname)
 snprintf(ifr-ifr_name, IFNAMSIZ, %s, ifname);
 }
 
-static int send_fd(int c, int fd)
-{
-char msgbuf[CMSG_SPACE(sizeof(fd))];
-struct msghdr msg = {
-.msg_control = msgbuf,
-.msg_controllen = sizeof(msgbuf),
-};
-struct cmsghdr *cmsg;
-struct iovec iov;
-char req[1] = { 0x00 };
-
-cmsg = CMSG_FIRSTHDR(msg);
-cmsg-cmsg_level = SOL_SOCKET;
-cmsg-cmsg_type = SCM_RIGHTS;
-cmsg-cmsg_len = CMSG_LEN(sizeof(fd));
-msg.msg_controllen = cmsg-cmsg_len;
-
-iov.iov_base = req;
-iov.iov_len = sizeof(req);
-
-msg.msg_iov = iov;
-msg.msg_iovlen = 1;
-memcpy(CMSG_DATA(cmsg), fd, sizeof(fd));
-
-return sendmsg(c, msg, 0);
-}
-
 #ifdef CONFIG_LIBCAP
 static int drop_privileges(void)
 {
@@ -239,6 +213,7 @@ int main(int argc, char **argv)
 ACLList acl_list;
 int access_allowed, access_denied;
 int ret = EXIT_SUCCESS;
+char req[1] = { 0x00 };
 
 #ifdef CONFIG_LIBCAP
 /* if we're run from an suid binary, immediately drop privileges preserving
@@ -424,7 +399,7 @@ int main(int argc, char **argv)
 }
 
 /* write fd to the domain socket */
-if (send_fd(unixfd, fd) == -1) {
+if (qemu_send_with_fd(unixfd, fd, req, sizeof(req)) == -1) {
 fprintf(stderr, failed to write fd to unix socket: %s\n,
 strerror(errno));
 ret = EXIT_FAILURE;
-- 
1.7.7.6




[Qemu-devel] [PATCH 3/5] net/tap: replace recv_fd with qemu_recv_with_fd

2014-01-23 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 net/tap.c |   40 +++-
 1 files changed, 3 insertions(+), 37 deletions(-)

diff --git a/net/tap.c b/net/tap.c
index 39c1cda..97ee2e8 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -39,6 +39,7 @@
 #include sysemu/sysemu.h
 #include qemu-common.h
 #include qemu/error-report.h
+#include qemu/fd-exchange.h
 
 #include net/tap.h
 
@@ -385,40 +386,6 @@ static int launch_script(const char *setup_script, const 
char *ifname, int fd)
 return -1;
 }
 
-static int recv_fd(int c)
-{
-int fd;
-uint8_t msgbuf[CMSG_SPACE(sizeof(fd))];
-struct msghdr msg = {
-.msg_control = msgbuf,
-.msg_controllen = sizeof(msgbuf),
-};
-struct cmsghdr *cmsg;
-struct iovec iov;
-uint8_t req[1];
-ssize_t len;
-
-cmsg = CMSG_FIRSTHDR(msg);
-cmsg-cmsg_level = SOL_SOCKET;
-cmsg-cmsg_type = SCM_RIGHTS;
-cmsg-cmsg_len = CMSG_LEN(sizeof(fd));
-msg.msg_controllen = cmsg-cmsg_len;
-
-iov.iov_base = req;
-iov.iov_len = sizeof(req);
-
-msg.msg_iov = iov;
-msg.msg_iovlen = 1;
-
-len = recvmsg(c, msg, 0);
-if (len  0) {
-memcpy(fd, CMSG_DATA(cmsg), sizeof(fd));
-return fd;
-}
-
-return len;
-}
-
 static int net_bridge_run_helper(const char *helper, const char *bridge)
 {
 sigset_t oldmask, mask;
@@ -489,12 +456,11 @@ static int net_bridge_run_helper(const char *helper, 
const char *bridge)
 
 } else if (pid  0) {
 int fd;
+char req[1] = { 0x00 };
 
 close(sv[1]);
 
-do {
-fd = recv_fd(sv[0]);
-} while (fd == -1  errno == EINTR);
+qemu_recv_with_fd(sv[0], fd, req, sizeof(req));
 
 close(sv[0]);
 
-- 
1.7.7.6




Re: [Qemu-devel] [PATCH 1/6] qemu-fd-exchange: provide common methods for exchange fd

2014-01-19 Thread Lei Li

On 01/17/2014 06:02 PM, Daniel P. Berrange wrote:

On Wed, Jan 08, 2014 at 05:12:51PM +0800, Lei Li wrote:

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  include/qemu/fd-exchange.h |   25 +++
  util/Makefile.objs |1 +
  util/qemu-fd-exchange.c|   97 
  3 files changed, 123 insertions(+), 0 deletions(-)
  create mode 100644 include/qemu/fd-exchange.h
  create mode 100644 util/qemu-fd-exchange.c

diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h
new file mode 100644
index 000..6929026
--- /dev/null
+++ b/include/qemu/fd-exchange.h
@@ -0,0 +1,25 @@
+/*
+ * Internel common methods for exchange of FD
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef FD_EXCHANGE_H
+#define FD_EXCHANGE_H
+
+#include sys/socket.h
+
+union MsgControl {
+struct cmsghdr cmsg;
+char control[CMSG_SPACE(sizeof(int))];
+};
+
+ssize_t qemu_send_with_fd(int sockfd, int passed_fd,
+  const void *buf, size_t len);
+
+ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd,
+  void *buf, size_t len);
+
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index af3e5cb..2fb42bf 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -13,3 +13,4 @@ util-obj-y += hexdump.o
  util-obj-y += crc32c.o
  util-obj-y += throttle.o
  util-obj-y += getauxval.o
+util-obj-y += qemu-fd-exchange.o
diff --git a/util/qemu-fd-exchange.c b/util/qemu-fd-exchange.c
new file mode 100644
index 000..70a3206
--- /dev/null
+++ b/util/qemu-fd-exchange.c
@@ -0,0 +1,97 @@
+/*
+ * Internel common methods for exchange of FD
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include qemu/fd-exchange.h
+#include qemu-common.h
+
+
+ssize_t qemu_send_with_fd(int sockfd, int passed_fd,
+  const void *buf, size_t len)
+{
+struct msghdr msg;
+struct iovec iov;
+struct cmsghdr *cmsg;
+union MsgControl msg_control;
+int retval;
+
+iov.iov_base = (int *)buf;
+iov.iov_len = len;
+
+memset(msg, 0, sizeof(msg));
+msg.msg_iov = iov;
+msg.msg_iovlen = len;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+if (passed_fd  0) {
+*(int *)buf = passed_fd;

You are casting 'char *buf' to an 'int *' but many of the
callers only pass in a pointer to a 'char buf[1]'. So you
are overflowing the array and also likely causing alignment
violations on ARM platforms.


You are right, will fix it.

Thanks.




+} else {
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+cmsg = msg_control.cmsg;
+cmsg-cmsg_len = CMSG_LEN(sizeof(passed_fd));
+cmsg-cmsg_level = SOL_SOCKET;
+cmsg-cmsg_type = SCM_RIGHTS;
+memcpy(CMSG_DATA(cmsg), passed_fd, sizeof(passed_fd));
+
+}
+
+do {
+retval = sendmsg(sockfd, msg, 0);
+} while (retval  0  errno == EINTR);
+
+return retval;
+}
+
+ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd,
+  void *buf, size_t len)
+{
+struct iovec iov;
+struct msghdr msg;
+struct cmsghdr *cmsg;
+union MsgControl msg_control;
+int retval;
+int data = *(int *)buf;
+
+iov.iov_base = buf;
+iov.iov_len = len;
+
+memset(msg, 0, sizeof(msg));
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+do {
+retval = recvmsg(sockfd, msg, 0);
+} while (retval  0  errno == EINTR);
+
+if (retval = 0) {
+return retval;
+}
+
+if (data != *(int *)buf) {
+*passed_fd = data;
+return 0;
+}

Again cast issues


+
+for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
+cmsg-cmsg_level != SOL_SOCKET ||
+cmsg-cmsg_type != SCM_RIGHTS) {
+continue;
+}
+
+memcpy(passed_fd, CMSG_DATA(cmsg), sizeof(*passed_fd));
+return 0;
+}
+
+*passed_fd = -ENFILE;
+return retval;
+}
--

Regards,
Daniel



--
Lei




Re: [Qemu-devel] [PATCH resend 0/6 RFC] Provide common methods for exchange FD

2014-01-16 Thread Lei Li

Any comments?

On 01/08/2014 05:12 PM, Lei Li wrote:

This patch series refactors the functions used for exchanging
file descriptors (FDs) in the current code and provides common
methods for doing so.

The series is based on the "localhost migration with side channel
for ram" series, as that was already in good shape. But if you want
to merge this first, I'll drop the migration part.

I have tested page-flipping migration and, briefly, tap/bridge-helper,
but I have an environment problem with the proxy fs driver. I would
appreciate it if someone could help verify whether the series has
any impact on it. :)

Please let me know if there is anything that needs to be improved.

Thanks.


Lei Li (6):
   fd-exchange: provide common methods for exchange of fd
   qemu-bridge-helper: replace send_fd with qemu_send_with_fd
   net/tap: replace recv_fd with qemu_recv_with_fd
   virtfs-proxy-helper: replace send_fd with qemu_send_with_fd
   virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd
   migration-local: replace send_pipefd with qemu_send_with_fd

  Makefile|2 +-
  fsdev/virtfs-proxy-helper.c |   51 ---
  hw/9pfs/virtio-9p-proxy.c   |   60 +-
  hw/9pfs/virtio-9p-proxy.h   |5 --
  include/qemu/fd-exchange.h  |   25 +++
  migration-local.c   |   52 +--
  net/tap.c   |   40 +
  qemu-bridge-helper.c|   31 +
  util/Makefile.objs  |1 +
  util/qemu-fd-exchange.c |   97 +++
  10 files changed, 144 insertions(+), 220 deletions(-)
  create mode 100644 include/qemu/fd-exchange.h
  create mode 100644 util/qemu-fd-exchange.c





--
Lei




Re: [Qemu-devel] [PATCH 5/6] virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd

2014-01-16 Thread Lei Li

On 01/16/2014 06:16 PM, Daniel P. Berrange wrote:

On Wed, Jan 08, 2014 at 05:12:55PM +0800, Lei Li wrote:

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  hw/9pfs/virtio-9p-proxy.c |   60 ++--
  1 files changed, 3 insertions(+), 57 deletions(-)

diff --git a/hw/9pfs/virtio-9p-proxy.c b/hw/9pfs/virtio-9p-proxy.c
index 5f44bb7..f34b845 100644
--- a/hw/9pfs/virtio-9p-proxy.c
+++ b/hw/9pfs/virtio-9p-proxy.c
-do {
-retval = recvmsg(sockfd, msg, 0);
-} while (retval  0  errno == EINTR);
-if (retval = 0) {
-return retval;
-}
-/*
- * data is set to V9FS_FD_VALID, if ancillary data is sent.  If this
- * request doesn't need ancillary data (fd) or an error occurred,
- * data is set to negative errno value.
- */
-if (data != V9FS_FD_VALID) {
-*status = data;
-return 0;
-}

This code is handling the 'data' value...



@@ -307,6 +252,7 @@ static int v9fs_request(V9fsProxy *proxy, int type,
  V9fsString *name, *value;
  V9fsString *path, *oldpath;
  struct iovec *iovec = NULL, *reply = NULL;
+int data = V9FS_FD_VALID;
  
  qemu_mutex_lock(proxy-mutex);
  
@@ -548,7 +494,7 @@ static int v9fs_request(V9fsProxy *proxy, int type,

   * A file descriptor is returned as response for
   * T_OPEN,T_CREATE on success
   */
-if (v9fs_receivefd(proxy-sockfd, retval)  0) {
+if (qemu_recv_with_fd(proxy-sockfd, retval, data, sizeof(data))  
0) {
  goto close_error;
  }

...but this code is ignoring the return value in 'data'.


It is not ignored. The above logic has been moved into the common
method, like:

if (data != *(int *)buf) {
*passed_fd = data;
return 0;
}




Daniel



--
Lei




Re: [Qemu-devel] [PATCH 1/6] qemu-fd-exchange: provide common methods for exchange fd

2014-01-16 Thread Lei Li

On 01/16/2014 11:16 PM, Eric Blake wrote:

On 01/08/2014 02:12 AM, Lei Li wrote:

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  include/qemu/fd-exchange.h |   25 +++
  util/Makefile.objs |1 +
  util/qemu-fd-exchange.c|   97 
  3 files changed, 123 insertions(+), 0 deletions(-)
  create mode 100644 include/qemu/fd-exchange.h
  create mode 100644 util/qemu-fd-exchange.c

diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h
new file mode 100644
index 000..6929026
--- /dev/null
+++ b/include/qemu/fd-exchange.h
@@ -0,0 +1,25 @@
+/*
+ * Internel common methods for exchange of FD
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.

Any reason you can't use GPLv2+?  Limiting to exactly version 2 means
your file cannot be copied into programs that want a wider array of
licensing options.


Er... that was a copy-paste mistake on my part; apologies for this. :(






--
Lei




Re: [Qemu-devel] [PATCH 4/6] virtfs-proxy-helper: replace send_fd with qemu_send_with_fd

2014-01-16 Thread Lei Li

On 01/16/2014 06:15 PM, Daniel P. Berrange wrote:

On Wed, Jan 08, 2014 at 05:12:54PM +0800, Lei Li wrote:

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  fsdev/virtfs-proxy-helper.c |   51 ++
  hw/9pfs/virtio-9p-proxy.h   |5 
  2 files changed, 8 insertions(+), 48 deletions(-)

diff --git a/fsdev/virtfs-proxy-helper.c b/fsdev/virtfs-proxy-helper.c
index 713a7b2..44c6e61 100644
--- a/fsdev/virtfs-proxy-helper.c
+++ b/fsdev/virtfs-proxy-helper.c
-static int send_fd(int sockfd, int fd)
-{

...

-/* No ancillary data on error */
-if (fd  0) {
-/* fd is really negative errno if the request failed  */
-data = fd;
-} else {
-data = V9FS_FD_VALID;

The way data is initialized here...


@@ -784,11 +743,17 @@ static void usage(char *prog)
  static int process_reply(int sock, int type,
   struct iovec *out_iovec, int retval)
  {
+int data = V9FS_FD_VALID;


Doesn't match what you do here.


Well, it looks like it does not match the original order,
because 'data' first has to be passed to the common methods
through the *buf parameter, since the callers set different
data values for the check. But the logic
is the same:

if the passed_fd is negative, 'data' will be set to the
negative fd; otherwise it'll be the check value.




+
  switch (type) {
  case T_OPEN:
  case T_CREATE:
-if (send_fd(sock, retval)  0) {
+if (qemu_send_with_fd(sock, retval, data, sizeof(data))  0) {
  return -1;
+} else {
+if (retval = 0) {
+close(retval);
+}
  }
  break;
  case T_MKNOD:

Regards,
Daniel



--
Lei




Re: [Qemu-devel] [PATCH 1/6] qemu-fd-exchange: provide common methods for exchange fd

2014-01-16 Thread Lei Li

On 01/16/2014 11:26 PM, Eric Blake wrote:

On 01/08/2014 02:12 AM, Lei Li wrote:

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  include/qemu/fd-exchange.h |   25 +++
  util/Makefile.objs |1 +
  util/qemu-fd-exchange.c|   97 
  3 files changed, 123 insertions(+), 0 deletions(-)
  create mode 100644 include/qemu/fd-exchange.h
  create mode 100644 util/qemu-fd-exchange.c

diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h
new file mode 100644
index 000..6929026
--- /dev/null
+++ b/include/qemu/fd-exchange.h
@@ -0,0 +1,25 @@
+/*
+ * Internel common methods for exchange of FD

s/Internel/Internal/



+++ b/util/qemu-fd-exchange.c
@@ -0,0 +1,97 @@
+/*
+ * Internel common methods for exchange of FD

and again.


Good catch! Thanks.




+ssize_t qemu_send_with_fd(int sockfd, int passed_fd,
+  const void *buf, size_t len)
+{
+struct msghdr msg;
+struct iovec iov;
+struct cmsghdr *cmsg;
+union MsgControl msg_control;
+int retval;
+
+iov.iov_base = (int *)buf;
+iov.iov_len = len;
+
+memset(msg, 0, sizeof(msg));
+msg.msg_iov = iov;
+msg.msg_iovlen = len;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+if (passed_fd  0) {
+*(int *)buf = passed_fd;

Is it safe to assume that buf is aligned well enough to be casting it to
int* then dereferencing it?  Why not just type the parameter correctly


That's because callers would pass different types for this parameter.


to begin with?  And why are you even writing into the caller's buffer
when they pass a negative fd, but leaving it alone when they pass a
non-negative fd?


That's just the original logic for exchanging the fd in the proxy fs driver:

if (fd  0) {
data = fd;
} else {
data = V9FS_FD_VALID;

}

This common method doesn't leave it alone when a non-negative fd is passed;
it will be the same as the check value passed in by the caller.


+ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd,
+  void *buf, size_t len)
+{
+struct iovec iov;
+struct msghdr msg;
+struct cmsghdr *cmsg;
+union MsgControl msg_control;
+int retval;
+int data = *(int *)buf;

Again, why not type buf correctly, since otherwise you risk a user
passing in a buffer that is unsuitably aligned for dereferencing as an
int pointer.


+
+iov.iov_base = buf;
+iov.iov_len = len;
+
+memset(msg, 0, sizeof(msg));
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+

Should you take advantage of Linux' ability to use MSG_CMSG_CLOEXEC to
guarantee the received fd is atomically marked cloexec when possible?


Whether to close the fd in the common method depends on how the
current users process it (they are not all the same). It would be
better to let the users handle closing the fd themselves.






+do {
+retval = recvmsg(sockfd, msg, 0);
+} while (retval  0  errno == EINTR);
+
+if (retval = 0) {
+return retval;
+}
+
+if (data != *(int *)buf) {
+*passed_fd = data;
+return 0;
+}
+
+for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
+cmsg-cmsg_level != SOL_SOCKET ||
+cmsg-cmsg_type != SCM_RIGHTS) {
+continue;
+}
+
+memcpy(passed_fd, CMSG_DATA(cmsg), sizeof(*passed_fd));
+return 0;
+}

And even when MSG_CMSG_CLOEXEC is not available, shouldn't you ensure
that cloexec is set after the fact?


That's a good suggestion, thanks.



--
Lei




[Qemu-devel] [PATCH 2/6] qemu-bridge-helper: replace send_fd with qemu_send_with_fd

2014-01-08 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 Makefile |2 +-
 qemu-bridge-helper.c |   31 +++
 2 files changed, 4 insertions(+), 29 deletions(-)

diff --git a/Makefile b/Makefile
index bdff4e4..6850f35 100644
--- a/Makefile
+++ b/Makefile
@@ -195,7 +195,7 @@ qemu-img$(EXESUF): qemu-img.o $(block-obj-y) libqemuutil.a 
libqemustub.a
 qemu-nbd$(EXESUF): qemu-nbd.o $(block-obj-y) libqemuutil.a libqemustub.a
 qemu-io$(EXESUF): qemu-io.o $(block-obj-y) libqemuutil.a libqemustub.a
 
-qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o
+qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o libqemuutil.a
 
 fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o 
fsdev/virtio-9p-marshal.o libqemuutil.a libqemustub.a
 fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap
diff --git a/qemu-bridge-helper.c b/qemu-bridge-helper.c
index 6a0974e..8303b6b 100644
--- a/qemu-bridge-helper.c
+++ b/qemu-bridge-helper.c
@@ -40,6 +40,7 @@
 #endif
 
 #include qemu/queue.h
+#include qemu/fd-exchange.h
 
 #include net/tap-linux.h
 
@@ -174,33 +175,6 @@ static void prep_ifreq(struct ifreq *ifr, const char 
*ifname)
 snprintf(ifr-ifr_name, IFNAMSIZ, %s, ifname);
 }
 
-static int send_fd(int c, int fd)
-{
-char msgbuf[CMSG_SPACE(sizeof(fd))];
-struct msghdr msg = {
-.msg_control = msgbuf,
-.msg_controllen = sizeof(msgbuf),
-};
-struct cmsghdr *cmsg;
-struct iovec iov;
-char req[1] = { 0x00 };
-
-cmsg = CMSG_FIRSTHDR(msg);
-cmsg-cmsg_level = SOL_SOCKET;
-cmsg-cmsg_type = SCM_RIGHTS;
-cmsg-cmsg_len = CMSG_LEN(sizeof(fd));
-msg.msg_controllen = cmsg-cmsg_len;
-
-iov.iov_base = req;
-iov.iov_len = sizeof(req);
-
-msg.msg_iov = iov;
-msg.msg_iovlen = 1;
-memcpy(CMSG_DATA(cmsg), fd, sizeof(fd));
-
-return sendmsg(c, msg, 0);
-}
-
 #ifdef CONFIG_LIBCAP
 static int drop_privileges(void)
 {
@@ -239,6 +213,7 @@ int main(int argc, char **argv)
 ACLList acl_list;
 int access_allowed, access_denied;
 int ret = EXIT_SUCCESS;
+char req[1] = { 0x00 };
 
 #ifdef CONFIG_LIBCAP
 /* if we're run from an suid binary, immediately drop privileges preserving
@@ -424,7 +399,7 @@ int main(int argc, char **argv)
 }
 
 /* write fd to the domain socket */
-if (send_fd(unixfd, fd) == -1) {
+if (qemu_send_with_fd(unixfd, fd, req, sizeof(req)) == -1) {
 fprintf(stderr, failed to write fd to unix socket: %s\n,
 strerror(errno));
 ret = EXIT_FAILURE;
-- 
1.7.7.6




[Qemu-devel] [PATCH resend 0/6 RFC] Provide common methods for exchange FD

2014-01-08 Thread Lei Li
This patch series refactors the functions used for exchanging
file descriptors (FDs) in the current code and provides common
methods for doing so.

The series is based on the "localhost migration with side channel
for ram" series, as that was already in good shape. But if you want
to merge this first, I'll drop the migration part.

I have tested page-flipping migration and, briefly, tap/bridge-helper,
but I have an environment problem with the proxy fs driver. I would
appreciate it if someone could help verify whether the series has
any impact on it. :)

Please let me know if there is anything that needs to be improved.

Thanks.


Lei Li (6):
  fd-exchange: provide common methods for exchange of fd
  qemu-bridge-helper: replace send_fd with qemu_send_with_fd
  net/tap: replace recv_fd with qemu_recv_with_fd
  virtfs-proxy-helper: replace send_fd with qemu_send_with_fd
  virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd
  migration-local: replace send_pipefd with qemu_send_with_fd

 Makefile|2 +-
 fsdev/virtfs-proxy-helper.c |   51 ---
 hw/9pfs/virtio-9p-proxy.c   |   60 +-
 hw/9pfs/virtio-9p-proxy.h   |5 --
 include/qemu/fd-exchange.h  |   25 +++
 migration-local.c   |   52 +--
 net/tap.c   |   40 +
 qemu-bridge-helper.c|   31 +
 util/Makefile.objs  |1 +
 util/qemu-fd-exchange.c |   97 +++
 10 files changed, 144 insertions(+), 220 deletions(-)
 create mode 100644 include/qemu/fd-exchange.h
 create mode 100644 util/qemu-fd-exchange.c




[Qemu-devel] [PATCH 4/6] virtfs-proxy-helper: replace send_fd with qemu_send_with_fd

2014-01-08 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 fsdev/virtfs-proxy-helper.c |   51 ++
 hw/9pfs/virtio-9p-proxy.h   |5 
 2 files changed, 8 insertions(+), 48 deletions(-)

diff --git a/fsdev/virtfs-proxy-helper.c b/fsdev/virtfs-proxy-helper.c
index 713a7b2..44c6e61 100644
--- a/fsdev/virtfs-proxy-helper.c
+++ b/fsdev/virtfs-proxy-helper.c
@@ -23,6 +23,7 @@
 #include qemu-common.h
 #include qemu/sockets.h
 #include qemu/xattr.h
+#include qemu/fd-exchange.h
 #include virtio-9p-marshal.h
 #include hw/9pfs/virtio-9p-proxy.h
 #include fsdev/virtio-9p-marshal.h
@@ -203,48 +204,6 @@ static int read_request(int sockfd, struct iovec *iovec, 
ProxyHeader *header)
 return 0;
 }
 
-static int send_fd(int sockfd, int fd)
-{
-struct msghdr msg;
-struct iovec iov;
-int retval, data;
-struct cmsghdr *cmsg;
-union MsgControl msg_control;
-
-iov.iov_base = data;
-iov.iov_len = sizeof(data);
-
-memset(msg, 0, sizeof(msg));
-msg.msg_iov = iov;
-msg.msg_iovlen = 1;
-/* No ancillary data on error */
-if (fd  0) {
-/* fd is really negative errno if the request failed  */
-data = fd;
-} else {
-data = V9FS_FD_VALID;
-msg.msg_control = msg_control;
-msg.msg_controllen = sizeof(msg_control);
-
-cmsg = msg_control.cmsg;
-cmsg-cmsg_len = CMSG_LEN(sizeof(fd));
-cmsg-cmsg_level = SOL_SOCKET;
-cmsg-cmsg_type = SCM_RIGHTS;
-memcpy(CMSG_DATA(cmsg), fd, sizeof(fd));
-}
-
-do {
-retval = sendmsg(sockfd, msg, 0);
-} while (retval  0  errno == EINTR);
-if (fd = 0) {
-close(fd);
-}
-if (retval  0) {
-return retval;
-}
-return 0;
-}
-
 static int send_status(int sockfd, struct iovec *iovec, int status)
 {
 ProxyHeader header;
@@ -784,11 +743,17 @@ static void usage(char *prog)
 static int process_reply(int sock, int type,
  struct iovec *out_iovec, int retval)
 {
+int data = V9FS_FD_VALID;
+
 switch (type) {
 case T_OPEN:
 case T_CREATE:
-if (send_fd(sock, retval)  0) {
+if (qemu_send_with_fd(sock, retval, data, sizeof(data))  0) {
 return -1;
+} else {
+if (retval = 0) {
+close(retval);
+}
 }
 break;
 case T_MKNOD:
diff --git a/hw/9pfs/virtio-9p-proxy.h b/hw/9pfs/virtio-9p-proxy.h
index 005c1ad..e359ac5 100644
--- a/hw/9pfs/virtio-9p-proxy.h
+++ b/hw/9pfs/virtio-9p-proxy.h
@@ -24,11 +24,6 @@
 #define proxy_marshal(out_sg, offset, fmt, args...) \
 v9fs_marshal(out_sg, 1, offset, 0, fmt, ##args)
 
-union MsgControl {
-struct cmsghdr cmsg;
-char control[CMSG_SPACE(sizeof(int))];
-};
-
 typedef struct {
 uint32_t type;
 uint32_t size;
-- 
1.7.7.6




[Qemu-devel] [PATCH 5/6] virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd

2014-01-08 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 hw/9pfs/virtio-9p-proxy.c |   60 ++--
 1 files changed, 3 insertions(+), 57 deletions(-)

diff --git a/hw/9pfs/virtio-9p-proxy.c b/hw/9pfs/virtio-9p-proxy.c
index 5f44bb7..f34b845 100644
--- a/hw/9pfs/virtio-9p-proxy.c
+++ b/hw/9pfs/virtio-9p-proxy.c
@@ -14,6 +14,7 @@
 #include hw/virtio/virtio.h
 #include virtio-9p.h
 #include qemu/error-report.h
+#include qemu/fd-exchange.h
 #include fsdev/qemu-fsdev.h
 #include virtio-9p-proxy.h
 
@@ -24,62 +25,6 @@ typedef struct V9fsProxy {
 struct iovec out_iovec;
 } V9fsProxy;
 
-/*
- * Return received file descriptor on success in *status.
- * errno is also returned on *status (which will be < 0)
- * return < 0 on transport error.
- */
-static int v9fs_receivefd(int sockfd, int *status)
-{
-struct iovec iov;
-struct msghdr msg;
-struct cmsghdr *cmsg;
-int retval, data, fd;
-union MsgControl msg_control;
-
-iov.iov_base = data;
-iov.iov_len = sizeof(data);
-
-memset(msg, 0, sizeof(msg));
-msg.msg_iov = iov;
-msg.msg_iovlen = 1;
-msg.msg_control = msg_control;
-msg.msg_controllen = sizeof(msg_control);
-
-do {
-retval = recvmsg(sockfd, msg, 0);
-} while (retval  0  errno == EINTR);
-if (retval = 0) {
-return retval;
-}
-/*
- * data is set to V9FS_FD_VALID, if ancillary data is sent.  If this
- * request doesn't need ancillary data (fd) or an error occurred,
- * data is set to negative errno value.
- */
-if (data != V9FS_FD_VALID) {
-*status = data;
-return 0;
-}
-/*
- * File descriptor (fd) is sent in the ancillary data. Check if we
- * indeed received it. One of the reasons to fail to receive it is if
- * we exceeded the maximum number of file descriptors!
- */
-for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
-if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
-cmsg-cmsg_level != SOL_SOCKET ||
-cmsg-cmsg_type != SCM_RIGHTS) {
-continue;
-}
-fd = *((int *)CMSG_DATA(cmsg));
-*status = fd;
-return 0;
-}
-*status = -ENFILE;  /* Ancillary data sent but not received */
-return 0;
-}
-
 static ssize_t socket_read(int sockfd, void *buff, size_t size)
 {
 ssize_t retval, total = 0;
@@ -307,6 +252,7 @@ static int v9fs_request(V9fsProxy *proxy, int type,
 V9fsString *name, *value;
 V9fsString *path, *oldpath;
 struct iovec *iovec = NULL, *reply = NULL;
+int data = V9FS_FD_VALID;
 
 qemu_mutex_lock(proxy-mutex);
 
@@ -548,7 +494,7 @@ static int v9fs_request(V9fsProxy *proxy, int type,
  * A file descriptor is returned as response for
  * T_OPEN,T_CREATE on success
  */
-if (v9fs_receivefd(proxy-sockfd, retval)  0) {
+if (qemu_recv_with_fd(proxy-sockfd, retval, data, sizeof(data))  
0) {
 goto close_error;
 }
 break;
-- 
1.7.7.6




[Qemu-devel] [PATCH 6/6] migration-local: replace send_pipefd with qemu_send_with_fd

2014-01-08 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   52 +++-
 1 files changed, 3 insertions(+), 49 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index ce4c070..c01ba06 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -26,6 +26,7 @@
 #include sysemu/sysemu.h
 #include block/block.h
 #include qemu/sockets.h
+#include qemu/fd-exchange.h
 #include migration/block.h
 #include qemu/thread.h
 #include qmp-commands.h
@@ -169,8 +170,6 @@ static int qemu_local_close(void *opaque)
 return 0;
 }
 
-static int send_pipefd(int sockfd, int pipefd);
-
 static size_t qemu_local_save_ram(QEMUFile *f, void *opaque,
   MemoryRegion *mr, ram_addr_t offset,
   size_t size, int *bytes_sent)
@@ -179,13 +178,14 @@ static size_t qemu_local_save_ram(QEMUFile *f, void 
*opaque,
 ram_addr_t current_addr = mr-ram_addr + offset;
 void *ram_addr;
 ssize_t ret;
+char req[1] = { 0x01 };
 
 if (s-unix_page_flipping) {
 qemu_put_be64(s-file, current_addr | RAM_SAVE_FLAG_HOOK);
 qemu_fflush(s-file);
 
 if (!s-pipefd_passed) {
-ret = send_pipefd(s-sockfd, s-pipefd[0]);
+ret = qemu_send_with_fd(s-sockfd, s-pipefd[0], req, 
sizeof(req));
 if (ret  0) {
 fprintf(stderr, failed to pass PIPE\n);
 return ret;
@@ -342,49 +342,3 @@ fail:
 g_free(s);
 return NULL;
 }
-
-
-/*
- * Pass a pipe file descriptor to another process.
- *
- * Return negative value If pipefd < 0. Return 0 on
- * success.
- *
- */
-static int send_pipefd(int sockfd, int pipefd)
-{
-struct msghdr msg;
-struct iovec iov[1];
-ssize_t ret;
-char req[1] = { 0x01 };
-
-union {
-  struct cmsghdr cm;
-  char control[CMSG_SPACE(sizeof(int))];
-} control_un;
-struct cmsghdr *cmptr;
-
-msg.msg_control = control_un.control;
-msg.msg_controllen = sizeof(control_un.control);
-
-cmptr = CMSG_FIRSTHDR(msg);
-cmptr-cmsg_len = CMSG_LEN(sizeof(int));
-cmptr-cmsg_level = SOL_SOCKET;
-cmptr-cmsg_type = SCM_RIGHTS;
-*((int *) CMSG_DATA(cmptr)) = pipefd;
-
-msg.msg_name = NULL;
-msg.msg_namelen = 0;
-
-iov[0].iov_base = req;
-iov[0].iov_len = sizeof(req);
-msg.msg_iov = iov;
-msg.msg_iovlen = 1;
-
-ret = sendmsg(sockfd, msg, 0);
-if (ret = 0) {
-DPRINTF(sendmsg error: %s\n, strerror(errno));
-}
-
-return ret;
-}
-- 
1.7.7.6




[Qemu-devel] [PATCH 1/6] qemu-fd-exchange: provide common methods for exchange fd

2014-01-08 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 include/qemu/fd-exchange.h |   25 +++
 util/Makefile.objs |1 +
 util/qemu-fd-exchange.c|   97 
 3 files changed, 123 insertions(+), 0 deletions(-)
 create mode 100644 include/qemu/fd-exchange.h
 create mode 100644 util/qemu-fd-exchange.c

diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h
new file mode 100644
index 000..6929026
--- /dev/null
+++ b/include/qemu/fd-exchange.h
@@ -0,0 +1,25 @@
+/*
+ * Internal common methods for exchange of FD
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef FD_EXCHANGE_H
+#define FD_EXCHANGE_H
+
+#include sys/socket.h
+
+union MsgControl {
+struct cmsghdr cmsg;
+char control[CMSG_SPACE(sizeof(int))];
+};
+
+ssize_t qemu_send_with_fd(int sockfd, int passed_fd,
+  const void *buf, size_t len);
+
+ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd,
+  void *buf, size_t len);
+
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index af3e5cb..2fb42bf 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -13,3 +13,4 @@ util-obj-y += hexdump.o
 util-obj-y += crc32c.o
 util-obj-y += throttle.o
 util-obj-y += getauxval.o
+util-obj-y += qemu-fd-exchange.o
diff --git a/util/qemu-fd-exchange.c b/util/qemu-fd-exchange.c
new file mode 100644
index 000..70a3206
--- /dev/null
+++ b/util/qemu-fd-exchange.c
@@ -0,0 +1,97 @@
+/*
+ * Internal common methods for exchange of FD
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include qemu/fd-exchange.h
+#include qemu-common.h
+
+
+ssize_t qemu_send_with_fd(int sockfd, int passed_fd,
+  const void *buf, size_t len)
+{
+struct msghdr msg;
+struct iovec iov;
+struct cmsghdr *cmsg;
+union MsgControl msg_control;
+int retval;
+
+iov.iov_base = (int *)buf;
+iov.iov_len = len;
+
+memset(msg, 0, sizeof(msg));
+msg.msg_iov = iov;
+msg.msg_iovlen = len;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+if (passed_fd  0) {
+*(int *)buf = passed_fd;
+} else {
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+cmsg = msg_control.cmsg;
+cmsg-cmsg_len = CMSG_LEN(sizeof(passed_fd));
+cmsg-cmsg_level = SOL_SOCKET;
+cmsg-cmsg_type = SCM_RIGHTS;
+memcpy(CMSG_DATA(cmsg), passed_fd, sizeof(passed_fd));
+
+}
+
+do {
+retval = sendmsg(sockfd, msg, 0);
+} while (retval  0  errno == EINTR);
+
+return retval;
+}
+
+ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd,
+  void *buf, size_t len)
+{
+struct iovec iov;
+struct msghdr msg;
+struct cmsghdr *cmsg;
+union MsgControl msg_control;
+int retval;
+int data = *(int *)buf;
+
+iov.iov_base = buf;
+iov.iov_len = len;
+
+memset(msg, 0, sizeof(msg));
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+do {
+retval = recvmsg(sockfd, msg, 0);
+} while (retval  0  errno == EINTR);
+
+if (retval = 0) {
+return retval;
+}
+
+if (data != *(int *)buf) {
+*passed_fd = data;
+return 0;
+}
+
+for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
+cmsg-cmsg_level != SOL_SOCKET ||
+cmsg-cmsg_type != SCM_RIGHTS) {
+continue;
+}
+
+memcpy(passed_fd, CMSG_DATA(cmsg), sizeof(*passed_fd));
+return 0;
+}
+
+*passed_fd = -ENFILE;
+return retval;
+}
-- 
1.7.7.6




[Qemu-devel] [PATCH 3/6] net/tap: replace recv_fd with qemu_recv_with_fd

2014-01-08 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 net/tap.c |   40 +++-
 1 files changed, 3 insertions(+), 37 deletions(-)

diff --git a/net/tap.c b/net/tap.c
index 39c1cda..97ee2e8 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -39,6 +39,7 @@
 #include sysemu/sysemu.h
 #include qemu-common.h
 #include qemu/error-report.h
+#include qemu/fd-exchange.h
 
 #include net/tap.h
 
@@ -385,40 +386,6 @@ static int launch_script(const char *setup_script, const 
char *ifname, int fd)
 return -1;
 }
 
-static int recv_fd(int c)
-{
-int fd;
-uint8_t msgbuf[CMSG_SPACE(sizeof(fd))];
-struct msghdr msg = {
-.msg_control = msgbuf,
-.msg_controllen = sizeof(msgbuf),
-};
-struct cmsghdr *cmsg;
-struct iovec iov;
-uint8_t req[1];
-ssize_t len;
-
-cmsg = CMSG_FIRSTHDR(msg);
-cmsg-cmsg_level = SOL_SOCKET;
-cmsg-cmsg_type = SCM_RIGHTS;
-cmsg-cmsg_len = CMSG_LEN(sizeof(fd));
-msg.msg_controllen = cmsg-cmsg_len;
-
-iov.iov_base = req;
-iov.iov_len = sizeof(req);
-
-msg.msg_iov = iov;
-msg.msg_iovlen = 1;
-
-len = recvmsg(c, msg, 0);
-if (len  0) {
-memcpy(fd, CMSG_DATA(cmsg), sizeof(fd));
-return fd;
-}
-
-return len;
-}
-
 static int net_bridge_run_helper(const char *helper, const char *bridge)
 {
 sigset_t oldmask, mask;
@@ -489,12 +456,11 @@ static int net_bridge_run_helper(const char *helper, 
const char *bridge)
 
 } else if (pid  0) {
 int fd;
+char req[1] = { 0x00 };
 
 close(sv[1]);
 
-do {
-fd = recv_fd(sv[0]);
-} while (fd == -1  errno == EINTR);
+qemu_recv_with_fd(sv[0], fd, req, sizeof(req));
 
 close(sv[0]);
 
-- 
1.7.7.6




Re: [Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()

2013-12-03 Thread Lei Li

On 12/02/2013 05:33 PM, Daniel P. Berrange wrote:

On Mon, Dec 02, 2013 at 05:19:06PM +0800, Lei Li wrote:

This patch adds send_pipefd() to pass the pipe file descriptor
to destination process.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  migration-local.c |   46 ++
  1 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 929ed60..f479530 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -167,3 +167,49 @@ fail:
  g_free(s);
  return NULL;
  }
+
+
+/*
+ * Pass a pipe file descriptor to another process.
+ *
+ * Return negative value If pipefd < 0. Return 0 on
+ * success.
+ *
+ */
+static int send_pipefd(int sockfd, int pipefd)
+{
+struct msghdr msg;
+struct iovec iov[1];
+ssize_t ret;
+char req[1] = { 0x01 };
+
+union {
+  struct cmsghdr cm;
+  char control[CMSG_SPACE(sizeof(int))];
+} control_un;
+struct cmsghdr *cmptr;
+
+msg.msg_control = control_un.control;
+msg.msg_controllen = sizeof(control_un.control);
+
+cmptr = CMSG_FIRSTHDR(msg);
+cmptr-cmsg_len = CMSG_LEN(sizeof(int));
+cmptr-cmsg_level = SOL_SOCKET;
+cmptr-cmsg_type = SCM_RIGHTS;
+*((int *) CMSG_DATA(cmptr)) = pipefd;
+
+msg.msg_name = NULL;
+msg.msg_namelen = 0;
+
+iov[0].iov_base = req;
+iov[0].iov_len = sizeof(req);
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+
+ret = sendmsg(sockfd, msg, 0);
+if (ret = 0) {
+DPRINTF(sendmsg error: %s\n, strerror(errno));
+}
+
+return ret;
+}

Just a reminder about my comments from previous posting. This is
introducing a 3rd private function for sending FDs. The existing
code should be refactored into qemu-socket.{c,h} and shared.


Hi Daniel,

Yes, I remember your suggestion. As I replied in the previous version,
I'll do this refactoring in a separate thread. There are some differences
between these private functions (like the data type and the number of bytes
transmitted), so it may take a little time to settle on a common method,
and it would be better to do some testing to make sure there is no impact on them.
This is now a complete series as an experimental version; do you mind if
the refactoring is posted after this series?



Daniel



--
Lei




Re: [Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()

2013-12-03 Thread Lei Li

On 12/03/2013 07:35 PM, Daniel P. Berrange wrote:

On Tue, Dec 03, 2013 at 07:19:40PM +0800, Lei Li wrote:

On 12/02/2013 05:33 PM, Daniel P. Berrange wrote:

On Mon, Dec 02, 2013 at 05:19:06PM +0800, Lei Li wrote:

This patch adds send_pipefd() to pass the pipe file descriptor
to destination process.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  migration-local.c |   46 ++
  1 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 929ed60..f479530 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -167,3 +167,49 @@ fail:
  g_free(s);
  return NULL;
  }
+
+
+/*
+ * Pass a pipe file descriptor to another process.
+ *
+ * Return negative value If pipefd < 0. Return 0 on
+ * success.
+ *
+ */
+static int send_pipefd(int sockfd, int pipefd)
+{
+struct msghdr msg;
+struct iovec iov[1];
+ssize_t ret;
+char req[1] = { 0x01 };
+
+union {
+  struct cmsghdr cm;
+  char control[CMSG_SPACE(sizeof(int))];
+} control_un;
+struct cmsghdr *cmptr;
+
+msg.msg_control = control_un.control;
+msg.msg_controllen = sizeof(control_un.control);
+
+cmptr = CMSG_FIRSTHDR(msg);
+cmptr-cmsg_len = CMSG_LEN(sizeof(int));
+cmptr-cmsg_level = SOL_SOCKET;
+cmptr-cmsg_type = SCM_RIGHTS;
+*((int *) CMSG_DATA(cmptr)) = pipefd;
+
+msg.msg_name = NULL;
+msg.msg_namelen = 0;
+
+iov[0].iov_base = req;
+iov[0].iov_len = sizeof(req);
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+
+ret = sendmsg(sockfd, msg, 0);
+if (ret = 0) {
+DPRINTF(sendmsg error: %s\n, strerror(errno));
+}
+
+return ret;
+}

Just a reminder about my comments from previous posting. This is
introducing a 3rd private function for sending FDs. The existing
code should be refactored into qemu-socket.{c,h} and shared.

Hi Daniel,

Yes, I remember your suggestion. As I replied in the previous version,
I'll do this refactoring in a separate thread. There are some differences
between these private functions (like the data type and the number of bytes
transmitted), so it may take a little time to settle on a common method,
and it would be better to do some testing to make sure there is no impact on them.
This is now a complete series as an experimental version; do you mind if
the refactoring is posted after this series?

IMHO the refactoring should be a pre-requisite of this series. I've seen
too many times where future refactoring was promised but never arrived
because the motivation to fix it is gone once the main series is committed.
It is up to QEMU maintainers though - this is just my personal opinion.


It's just that this series is already in good shape, and the refactoring
may need a little more time, since some details need to be considered
and are better discussed in a separate thread.

I am happy to take any chance to contribute to community, as I can
learn a lot from you guys and it's really good experience that my
work could be useful to lots of people. And I believe this is not
my last patch for it. :)



Regards,
Daniel



--
Lei




Re: [Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()

2013-12-03 Thread Lei Li

On 12/03/2013 07:52 PM, Paolo Bonzini wrote:

Il 03/12/2013 12:19, Lei Li ha scritto:

On 12/02/2013 05:33 PM, Daniel P. Berrange wrote:

On Mon, Dec 02, 2013 at 05:19:06PM +0800, Lei Li wrote:

This patch adds send_pipefd() to pass the pipe file descriptor
to destination process.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
   migration-local.c |   46
++
   1 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 929ed60..f479530 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -167,3 +167,49 @@ fail:
   g_free(s);
   return NULL;
   }
+
+
+/*
+ * Pass a pipe file descriptor to another process.
+ *
+ * Return negative value If pipefd < 0. Return 0 on
+ * success.
+ *
+ */
+static int send_pipefd(int sockfd, int pipefd)
+{
+struct msghdr msg;
+struct iovec iov[1];
+ssize_t ret;
+char req[1] = { 0x01 };
+
+union {
+  struct cmsghdr cm;
+  char control[CMSG_SPACE(sizeof(int))];
+} control_un;
+struct cmsghdr *cmptr;
+
+msg.msg_control = control_un.control;
+msg.msg_controllen = sizeof(control_un.control);
+
+cmptr = CMSG_FIRSTHDR(msg);
+cmptr-cmsg_len = CMSG_LEN(sizeof(int));
+cmptr-cmsg_level = SOL_SOCKET;
+cmptr-cmsg_type = SCM_RIGHTS;
+*((int *) CMSG_DATA(cmptr)) = pipefd;
+
+msg.msg_name = NULL;
+msg.msg_namelen = 0;
+
+iov[0].iov_base = req;
+iov[0].iov_len = sizeof(req);
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+
+ret = sendmsg(sockfd, msg, 0);
+if (ret = 0) {
+DPRINTF(sendmsg error: %s\n, strerror(errno));
+}
+
+return ret;
+}

Just a reminder about my comments from previous posting. This is
introducing a 3rd private function for sending FDs. The existing
code should be refactored into qemu-socket.{c,h} and shared.

Hi Daniel,

Yes, I remember your suggestion. As I replied in the previous version,
I'll do this refactoring in a separate thread. There are some differences
between these private functions (like the data type and the number of bytes
transmitted), so it may take a little time to settle on a common method,
and it would be better to do some testing to make sure there is no impact on
them.

You would have to implement it in such a way that the buffer is
specified in the function, for example:

ssize_t qemu_send_with_fd(int sockfd, int passed_fd, const void *buf,
   size_t len);
ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd, void *buf,
   size_t len);

The functions can go in util/ (I think not in qemu-socket.c, a new file
is preferable).

I don't think it's particularly important, but it's definitely welcome.


Hi Paolo,

Thanks for your specific suggestion! As I need to test the related
code (tap/bridge & Proxy FS & flipping migration), I will work on it
after I'm back from my vacation next week.  :-)



Paolo




--
Lei




Re: [Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()

2013-12-02 Thread Lei Li

On 11/29/2013 07:14 PM, Daniel P. Berrange wrote:

On Fri, Nov 29, 2013 at 06:06:13PM +0800, Lei Li wrote:

This patch adds send_pipefd() to pass the pipe file descriptor
to destination process.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  migration-local.c |   46 ++
  1 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 929ed60..f479530 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -167,3 +167,49 @@ fail:
  g_free(s);
  return NULL;
  }
+
+
+/*
+ * Pass a pipe file descriptor to another process.
+ *
+ * Return negative value If pipefd < 0. Return 0 on
+ * success.
+ *
+ */
+static int send_pipefd(int sockfd, int pipefd)
+{
+struct msghdr msg;
+struct iovec iov[1];
+ssize_t ret;
+char req[1] = { 0x01 };
+
+union {
+  struct cmsghdr cm;
+  char control[CMSG_SPACE(sizeof(int))];
+} control_un;
+struct cmsghdr *cmptr;
+
+msg.msg_control = control_un.control;
+msg.msg_controllen = sizeof(control_un.control);
+
+cmptr = CMSG_FIRSTHDR(msg);
+cmptr-cmsg_len = CMSG_LEN(sizeof(int));
+cmptr-cmsg_level = SOL_SOCKET;
+cmptr-cmsg_type = SCM_RIGHTS;
+*((int *) CMSG_DATA(cmptr)) = pipefd;
+
+msg.msg_name = NULL;
+msg.msg_namelen = 0;
+
+iov[0].iov_base = req;
+iov[0].iov_len = sizeof(req);
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+
+ret = sendmsg(sockfd, msg, 0);
+if (ret = 0) {
+DPRINTF(sendmsg error: %s\n, strerror(errno));
+}
+
+return ret;
+}

There are already two copies of this function in QEMU, not to mention
several copies of code for receiving FDs.

Rather than adding yet more copies of this functionality it would be
much better to add 2 methods to util/qemu-sockets.{c,h} for sending
and receiving file descriptors and update all existing code to use
them.


Hi Daniel,

Makes sense, sounds like a good plan to me.

I just took a quick look, and there seem to be some differences between
them. I will give it a try in a separate thread after I'm back from my
vacation next week.

Thanks for your suggestion.



Daniel



--
Lei




Re: [Qemu-devel] [PATCH 0/17 v4] Localhost migration with side channel for ram

2013-12-02 Thread Lei Li

On 11/29/2013 06:26 PM, Paolo Bonzini wrote:

Il 29/11/2013 11:06, Lei Li ha scritto:

This patch series tries to introduce a mechanism using side
channel pipe for RAM via SCM_RIGHTS with unix domain socket
protocol migration.

This side channel is used for the page flipping by vmsplice,
which is the internal mechanism for localhost migration that
we are trying to add to QEMU. The background info and previous
patch series for reference,

Localhost migration
http://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg02916.html

migration: Introduce side channel for RAM
http://lists.gnu.org/archive/html/qemu-devel/2013-09/msg04043.html

I have picked patches from the localhost migration series and rebased
it on the series of side channel, now it is a complete series that
passed the basic test.

One change: please rename the capability to x-unix-page-flipping for
now.  No need to rename the function migrate_unix_page_flipping(), only
the capability name in qapi-schema.json (and references to the enum
MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING).

With that change, v5 will get Reviewed-by: Paolo Bonzini
pbonz...@redhat.com.


Hi Paolo,

Will update soon, thanks!



Thanks!

Paolo


Please let me know if there is anything that needs to be fixed or improved.
Your suggestions and comments are very welcome, and thanks to Paolo
for his continued review and useful suggestions.

Changes since V3:
   Address comments from Paolo including:

 - Get rid of useless check in send_pipefd() and the override
   of before_ram_iterate, send pipefd in the first save_page
   call, qemu_get_byte() in the first ram_load correspondingly.
 - Add new argument ram_addr_t to hook_ram_load to cut half of
   the data transferred on the socket.
 - Add transition from 'debug' to 'memory-stale'.
 - Other minor fixes.

Changes since V2:
   Address comments from Paolo including:

 - Doc improvement for QAPI.
 - Use callback get_buffer as the only one receiver.
 - Rename the new RunState flipping-migrate to memory-stale, and
   add transition from 'prelaunch' to 'memory-stale'.
 - Other minor fixes.

Changes since V1:
   Address suggestions from Paolo Bonzini including:

 - Use Unix socket QEMUFile as basis of code and adjust the way
   of overriding RDMA hooks.
 - Involve the vmsplice for page flipping.
 - Add new RunState RUN_STATE_FLIPPING_MIGRATE and add it to
   runstate_needs_reset() for the adjustment of the current
   migration process with page flipping.



Lei Li (17):
   QAPI: introduce migration capability unix_page_flipping
   migration: add migrate_unix_page_flipping()
   qmp-command.hx: add missing docs for migration capabilities
   migration-local: add QEMUFileLocal with socket based QEMUFile
   migration-local: introduce qemu_fopen_socket_local()
   migration-local: add send_pipefd()
   save_page: replace block_offset with a MemoryRegion
   migration-local: override save_page for page transmit
   savevm: adjust ram_control_save_page with page flipping
   add unix_msgfd_lookup() to callback get_buffer
   add argument ram_addr_t to hook_ram_load
   migration-local: override hook_ram_load
   migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local
   add new RunState RUN_STATE_MEMORY_STALE
   migration-unix: page flipping support on unix outgoing
   migration: adjust migration_thread() process for unix_page_flipping
   hmp: better format for info migrate_capabilities

  Makefile.target   |   1 +
  arch_init.c   |   4 +-
  migration-local.c | 512 ++
  hmp.c |   5 +-
  include/migration/migration.h |   3 +
  include/migration/qemu-file.h |   2 +
  migration-unix.c  |  27 ++-
  migration-rdma.c  |   4 +-
  migration.c   |  18 +-
  qapi-schema.json  |  18 +-
  qmp-commands.hx   |   8 +
  savevm.c  |  21 +-
  vl.c  |  12 +-
  13 files changed, 617 insertions(+), 27 deletions(-)
  create mode 100644 migration-local.c






--
Lei




[Qemu-devel] [PATCH 02/17] migration: add migrate_unix_page_flipping()

2013-12-02 Thread Lei Li
Add migrate_unix_page_flipping() to check if
MIGRATION_CAPABILITY_X_UNIX_PAGE_FLIPPING is enabled.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 include/migration/migration.h |3 +++
 migration.c   |9 +
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index 140e6b4..7e5d01a 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -131,10 +131,13 @@ void migrate_add_blocker(Error *reason);
 void migrate_del_blocker(Error *reason);
 
 bool migrate_rdma_pin_all(void);
+
 bool migrate_zero_blocks(void);
 
 bool migrate_auto_converge(void);
 
+bool migrate_unix_page_flipping(void);
+
 int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
  uint8_t *dst, int dlen);
 int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
diff --git a/migration.c b/migration.c
index 2b1ab20..e012cd4 100644
--- a/migration.c
+++ b/migration.c
@@ -541,6 +541,15 @@ int64_t migrate_xbzrle_cache_size(void)
 return s-xbzrle_cache_size;
 }
 
+bool migrate_unix_page_flipping(void)
+{
+MigrationState *s;
+
+s = migrate_get_current();
+
+return s-enabled_capabilities[MIGRATION_CAPABILITY_X_UNIX_PAGE_FLIPPING];
+}
+
 /* migration thread support */
 
 static void *migration_thread(void *opaque)
-- 
1.7.7.6




[Qemu-devel] [PATCH 01/17] QAPI: introduce migration capability x_unix_page_flipping

2013-12-02 Thread Lei Li
Introduce x_unix_page_flipping to MigrationCapability for
localhost migration.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qapi-schema.json |   12 +++-
 1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index 83fa485..ea910ef 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -685,10 +685,20 @@
 # @auto-converge: If enabled, QEMU will automatically throttle down the guest
 #  to speed up convergence of RAM migration. (since 1.6)
 #
+# @x-unix-page-flipping: If enabled, QEMU can optimize migration when the
+#  destination is a QEMU process that runs on the same host as
+#  the source (as is the case for live upgrade).  If the migration
+#  transport is a Unix socket, QEMU will flip RAM pages directly to
+#  the destination, so that memory is only allocated twice for the
+#  source and destination processes. Disabled by default.
+#  Experimental: will get rid of the x tag after further testing with
+#  the new vmsplice. (since 2.0)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] }
+  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks',
+   'x-unix-page-flipping'] }
 
 ##
 # @MigrationCapabilityStatus
-- 
1.7.7.6




[Qemu-devel] [PATCH 04/17] migration-local: add QEMUFileLocal with socket based QEMUFile

2013-12-02 Thread Lei Li
This patch adds QEMUFileLocal as a copy of the socket-based QEMUFile. It will
be used as the base code for Unix socket protocol migration and page
flipping migration.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 Makefile.target   |1 +
 migration-local.c |  123 +
 2 files changed, 124 insertions(+), 0 deletions(-)
 create mode 100644 migration-local.c

diff --git a/Makefile.target b/Makefile.target
index af6ac7e..aa09960 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -117,6 +117,7 @@ obj-$(CONFIG_KVM) += kvm-all.o
 obj-y += memory.o savevm.o cputlb.o
 obj-y += memory_mapping.o
 obj-y += dump.o
+obj-y += migration-local.o
 LIBS+=$(libs_softmmu)
 
 # xen support
diff --git a/migration-local.c b/migration-local.c
new file mode 100644
index 000..ca01a20
--- /dev/null
+++ b/migration-local.c
@@ -0,0 +1,123 @@
+/*
+ * QEMU localhost migration with page flipping
+ *
+ * Copyright IBM, Corp. 2013
+ *
+ * Authors:
+ *   Lei Li   li...@linux.vnet.ibm.com
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include config-host.h
+#include qemu-common.h
+#include migration/migration.h
+#include exec/cpu-common.h
+#include config.h
+#include exec/cpu-all.h
+#include exec/memory.h
+#include exec/memory-internal.h
+#include monitor/monitor.h
+#include migration/qemu-file.h
+#include qemu/iov.h
+#include sysemu/arch_init.h
+#include sysemu/sysemu.h
+#include block/block.h
+#include qemu/sockets.h
+#include migration/block.h
+#include qemu/thread.h
+#include qmp-commands.h
+#include trace.h
+#include qemu/osdep.h
+
+//#define DEBUG_MIGRATION_LOCAL
+
+#ifdef DEBUG_MIGRATION_LOCAL
+#define DPRINTF(fmt, ...) \
+do { printf(migration-local:  fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+
+typedef struct QEMUFileLocal {
+QEMUFile *file;
+int sockfd;
+int pipefd[2];
+int pipefd_passed;
+int pipefd_received;
+bool unix_page_flipping;
+} QEMUFileLocal;
+
+static int qemu_local_get_sockfd(void *opaque)
+{
+QEMUFileLocal *s = opaque;
+
+return s-sockfd;
+}
+
+static int qemu_local_get_buffer(void *opaque, uint8_t *buf,
+ int64_t pos, int size)
+{
+QEMUFileLocal *s = opaque;
+ssize_t len;
+
+for (;;) {
+len = qemu_recv(s-sockfd, buf, size, 0);
+if (len != -1) {
+break;
+}
+
+if (socket_error() == EAGAIN) {
+yield_until_fd_readable(s-sockfd);
+} else if (socket_error() != EINTR) {
+break;
+}
+}
+
+if (len == -1) {
+len = -socket_error();
+}
+
+return len;
+}
+
+static ssize_t qemu_local_writev_buffer(void *opaque, struct iovec *iov,
+int iovcnt, int64_t pos)
+{
+QEMUFileLocal *s = opaque;
+ssize_t len;
+ssize_t size = iov_size(iov, iovcnt);
+
+len = iov_send(s-sockfd, iov, iovcnt, 0, size);
+if (len  size) {
+len = -socket_error();
+}
+
+return len;
+}
+
+static int qemu_local_close(void *opaque)
+{
+QEMUFileLocal *s = opaque;
+
+closesocket(s-sockfd);
+g_free(s);
+
+return 0;
+}
+
+static const QEMUFileOps pipe_read_ops = {
+.get_fd= qemu_local_get_sockfd,
+.get_buffer= qemu_local_get_buffer,
+.close = qemu_local_close,
+};
+
+static const QEMUFileOps pipe_write_ops = {
+.get_fd = qemu_local_get_sockfd,
+.writev_buffer  = qemu_local_writev_buffer,
+.close  = qemu_local_close,
+};
-- 
1.7.7.6




[Qemu-devel] [PATCH 0/17 v5] Localhost migration with side channel for ram

2013-12-02 Thread Lei Li
This patch series introduces a mechanism that uses a side-channel
pipe for RAM, passed via SCM_RIGHTS, for migration over the Unix
domain socket protocol.

This side channel is used for page flipping with vmsplice,
which is the internal mechanism for localhost migration that
we are trying to add to QEMU. The background info and previous
patch series, for reference:

Localhost migration
http://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg02916.html

migration: Introduce side channel for RAM
http://lists.gnu.org/archive/html/qemu-devel/2013-09/msg04043.html

I have picked patches from the localhost migration series and rebased
them on the side channel series; it is now a complete series that
has passed basic testing.

Please let me know if there is anything that needs to be fixed or improved.
Your suggestions and comments are very welcome, and thanks to Paolo
for his continued review and useful suggestions.


Changes since V4:
  Rename the capability to x-unix-page-flipping for now. (Paolo)

Changes since V3:
  Address comments from Paolo including:

- Get rid of useless check in send_pipefd() and the override
  of before_ram_iterate, send pipefd in the first save_page
  call, qemu_get_byte() in the first ram_load correspondingly.
- Add new argument ram_addr_t to hook_ram_load to cut half of
  the data transferred on the socket.
- Add transition from 'debug' to 'memory-stale'.
- Other minor fixes.

Changes since V2:
  Address comments from Paolo including:

- Doc improvement for QAPI.
- Use callback get_buffer as the only one receiver.
- Rename the new RunState flipping-migrate to memory-stale, and
  add transition from 'prelaunch' to 'memory-stale'.
- Other minor fixes.

Changes since V1:
  Address suggestions from Paolo Bonzini including:

- Use Unix socket QEMUFile as basis of code and adjust the way
  of overriding RDMA hooks.
- Involve the vmsplice for page flipping.
- Add new RunState RUN_STATE_FLIPPING_MIGRATE and add it to
  runstate_needs_reset() for the adjustment of the current
  migration process with page flipping.



Lei Li (17):
  QAPI: introduce migration capability unix_page_flipping
  migration: add migrate_unix_page_flipping()
  qmp-command.hx: add missing docs for migration capabilities
  migration-local: add QEMUFileLocal with socket based QEMUFile
  migration-local: introduce qemu_fopen_socket_local()
  migration-local: add send_pipefd()
  save_page: replace block_offset with a MemoryRegion
  migration-local: override save_page for page transmit
  savevm: adjust ram_control_save_page with page flipping
  add unix_msgfd_lookup() to callback get_buffer
  add argument ram_addr_t to hook_ram_load
  migration-local: override hook_ram_load 
  migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local
  add new RunState RUN_STATE_MEMORY_STALE
  migration-unix: page flipping support on unix outgoing
  migration: adjust migration_thread() process for unix_page_flipping
  hmp: better format for info migrate_capabilities

 Makefile.target   |   1 +
 arch_init.c   |   4 +-
 migration-local.c | 512 ++
 hmp.c |   5 +-
 include/migration/migration.h |   3 +
 include/migration/qemu-file.h |   2 +
 migration-unix.c  |  27 ++-
 migration-rdma.c  |   4 +-
 migration.c   |  18 +-
 qapi-schema.json  |  18 +-
 qmp-commands.hx   |   8 +
 savevm.c  |  21 +-
 vl.c  |  12 +-
 13 files changed, 617 insertions(+), 27 deletions(-)
 create mode 100644 migration-local.c




[Qemu-devel] [PATCH 05/17] migration-local: introduce qemu_fopen_socket_local()

2013-12-02 Thread Lei Li
Add qemu_fopen_socket_local() to open the QEMUFileLocal introduced
earlier. It creates a pipe in write mode if unix_page_flipping
is enabled. Also adjust qemu_local_close() to close the pipe.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 include/migration/qemu-file.h |2 +
 migration-local.c |   46 +
 2 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 0f757fb..f9b104a 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -99,6 +99,8 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode);
 QEMUFile *qemu_fdopen(int fd, const char *mode);
 QEMUFile *qemu_fopen_socket(int fd, const char *mode);
 QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
+QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode);
+
 int qemu_get_fd(QEMUFile *f);
 int qemu_fclose(QEMUFile *f);
 int64_t qemu_ftell(QEMUFile *f);
diff --git a/migration-local.c b/migration-local.c
index ca01a20..929ed60 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -105,6 +105,12 @@ static int qemu_local_close(void *opaque)
 QEMUFileLocal *s = opaque;
 
 closesocket(s-sockfd);
+
+if (s-unix_page_flipping) {
+close(s-pipefd[0]);
+close(s-pipefd[1]);
+}
+
 g_free(s);
 
 return 0;
@@ -121,3 +127,43 @@ static const QEMUFileOps pipe_write_ops = {
 .writev_buffer  = qemu_local_writev_buffer,
 .close  = qemu_local_close,
 };
+
+QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode)
+{
+QEMUFileLocal *s;
+int pipefd[2];
+
+if (qemu_file_mode_is_not_valid(mode)) {
+return NULL;
+}
+
+s = g_malloc0(sizeof(QEMUFileLocal));
+s-sockfd = sockfd;
+
+if (migrate_unix_page_flipping()) {
+s-unix_page_flipping = 1;
+}
+
+if (mode[0] == 'w') {
+if (s-unix_page_flipping) {
+if (pipe(pipefd)  0) {
+fprintf(stderr, failed to create PIPE\n);
+goto fail;
+}
+
+s-pipefd[0] = pipefd[0];
+s-pipefd[1] = pipefd[1];
+}
+
+qemu_set_block(s-sockfd);
+s-file = qemu_fopen_ops(s, pipe_write_ops);
+} else {
+s-file = qemu_fopen_ops(s, pipe_read_ops);
+}
+
+return s-file;
+
+fail:
+g_free(s);
+return NULL;
+}
-- 
1.7.7.6




[Qemu-devel] [PATCH 03/17] qmp-command.hx: add missing docs for migration capabilities

2013-12-02 Thread Lei Li
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qmp-commands.hx |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/qmp-commands.hx b/qmp-commands.hx
index fba15cd..0df08c0 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -2898,6 +2898,10 @@ migrate-set-capabilities
 Enable/Disable migration capabilities
 
 - xbzrle: XBZRLE support
+- x-rdma-pin-all: Pin all pages during RDMA support
+- zero-blocks: Compress zero blocks during block migration
+- auto-converge: Block VCPU to help convergence of migration
+- x-unix-page-flipping: Page flipping for live QEMU upgrade
 
 Arguments:
 
@@ -2922,6 +2926,10 @@ Query current migration capabilities
 
 - capabilities: migration capabilities state
  - xbzrle : XBZRLE state (json-bool)
+ - x-rdma-pin-all: RDMA state (json-bool)
+ - zero-blocks: zero-blocks state (json-bool)
+ - auto-converge: Auto converge state (json-bool)
+ - x-unix-page-flipping: Page flipping state (json-bool)
 
 Arguments:
 
-- 
1.7.7.6




[Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()

2013-12-02 Thread Lei Li
This patch adds send_pipefd() to pass the pipe file descriptor
to destination process.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   46 ++
 1 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 929ed60..f479530 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -167,3 +167,49 @@ fail:
 g_free(s);
 return NULL;
 }
+
+
+/*
+ * Pass a pipe file descriptor to another process.
+ *
+ * Return negative value If pipefd  0. Return 0 on
+ * success.
+ *
+ */
+static int send_pipefd(int sockfd, int pipefd)
+{
+struct msghdr msg;
+struct iovec iov[1];
+ssize_t ret;
+char req[1] = { 0x01 };
+
+union {
+  struct cmsghdr cm;
+  char control[CMSG_SPACE(sizeof(int))];
+} control_un;
+struct cmsghdr *cmptr;
+
+msg.msg_control = control_un.control;
+msg.msg_controllen = sizeof(control_un.control);
+
+cmptr = CMSG_FIRSTHDR(msg);
+cmptr-cmsg_len = CMSG_LEN(sizeof(int));
+cmptr-cmsg_level = SOL_SOCKET;
+cmptr-cmsg_type = SCM_RIGHTS;
+*((int *) CMSG_DATA(cmptr)) = pipefd;
+
+msg.msg_name = NULL;
+msg.msg_namelen = 0;
+
+iov[0].iov_base = req;
+iov[0].iov_len = sizeof(req);
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+
+ret = sendmsg(sockfd, msg, 0);
+if (ret = 0) {
+DPRINTF(sendmsg error: %s\n, strerror(errno));
+}
+
+return ret;
+}
-- 
1.7.7.6




[Qemu-devel] [PATCH 08/17] migration-local: override save_page for page transmit

2013-12-02 Thread Lei Li
This patch implements the save_page callback for the outgoing side
of page flipping. It writes the address of the page
to the Unix socket and flips the page data into the pipe with
vmsplice(). Every page address carries the header flag
RAM_SAVE_FLAG_HOOK, which triggers the load hook to
receive it on the incoming side as well.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   63 +
 1 files changed, 63 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index f479530..9453ec8 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -116,6 +116,68 @@ static int qemu_local_close(void *opaque)
 return 0;
 }
 
+static int send_pipefd(int sockfd, int pipefd);
+
+static size_t qemu_local_save_ram(QEMUFile *f, void *opaque,
+  MemoryRegion *mr, ram_addr_t offset,
+  size_t size, int *bytes_sent)
+{
+QEMUFileLocal *s = opaque;
+ram_addr_t current_addr = mr-ram_addr + offset;
+void *ram_addr;
+ssize_t ret;
+
+if (s-unix_page_flipping) {
+qemu_put_be64(s-file, current_addr | RAM_SAVE_FLAG_HOOK);
+qemu_fflush(s-file);
+
+if (!s-pipefd_passed) {
+ret = send_pipefd(s-sockfd, s-pipefd[0]);
+if (ret  0) {
+fprintf(stderr, failed to pass PIPE\n);
+return ret;
+}
+s-pipefd_passed = true;
+}
+
+ram_addr = memory_region_get_ram_ptr(mr) + offset;
+
+/* vmsplice page data to pipe */
+struct iovec iov = {
+.iov_base = ram_addr,
+.iov_len  = size,
+};
+
+/*
+ * The flag SPLICE_F_MOVE is introduced in kernel for the page
+ * flipping feature in QEMU, which will move pages rather than
+ * copying, previously unused.
+ *
+ * If a move is not possible the kernel will transparently fall
+ * back to copying data.
+ *
+ * For older kernels the SPLICE_F_MOVE would be ignored and a copy
+ * would occur.
+ */
+
+ret = vmsplice(s-pipefd[1], iov, 1, SPLICE_F_GIFT | SPLICE_F_MOVE);
+if (ret == -1) {
+if (errno != EAGAIN  errno != EINTR) {
+fprintf(stderr, vmsplice save error: %s\n, strerror(errno));
+return ret;
+}
+} else {
+if (bytes_sent) {
+*bytes_sent = size;
+}
+DPRINTF(block_offset: %lu, offset: %lu\n, mr-ram_addr, offset);
+return 0;
+}
+}
+
+return RAM_SAVE_CONTROL_NOT_SUPP;
+}
+
 static const QEMUFileOps pipe_read_ops = {
 .get_fd= qemu_local_get_sockfd,
 .get_buffer= qemu_local_get_buffer,
@@ -126,6 +188,7 @@ static const QEMUFileOps pipe_write_ops = {
 .get_fd = qemu_local_get_sockfd,
 .writev_buffer  = qemu_local_writev_buffer,
 .close  = qemu_local_close,
+.save_page  = qemu_local_save_ram
 };
 
 QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode)
-- 
1.7.7.6




[Qemu-devel] [PATCH 09/17] savevm: adjust ram_control_save_page for page flipping

2013-12-02 Thread Lei Li
Because the save_page callback is always installed on files opened by
qemu_fopen_socket_local(), and it returns RAM_SAVE_CONTROL_NOT_SUPP
when unix_page_flipping is disabled, the current logic wrongly
calls qemu_file_set_error() in that case.
So this patch adds RAM_SAVE_CONTROL_NOT_SUPP to the check.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 savevm.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/savevm.c b/savevm.c
index 06c1f29..137e74f 100644
--- a/savevm.c
+++ b/savevm.c
@@ -668,7 +668,8 @@ size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, 
ram_addr_t offset,
 int ret = f-ops-save_page(f, f-opaque, mr, offset,
 size, bytes_sent);
 
-if (ret != RAM_SAVE_CONTROL_DELAYED) {
+if (ret != RAM_SAVE_CONTROL_DELAYED 
+ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (bytes_sent  *bytes_sent  0) {
 qemu_update_position(f, *bytes_sent);
 } else if (ret  0) {
-- 
1.7.7.6




[Qemu-devel] [PATCH 12/17] migration-local: override hook_ram_load

2013-12-02 Thread Lei Li
Override hook_ram_load to receive the pipe file descriptor
passed by the source process, and to use the page address
to vmsplice the page data from the pipe.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   59 +
 1 files changed, 59 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 5f98a01..ce4c070 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -231,10 +231,69 @@ static size_t qemu_local_save_ram(QEMUFile *f, void 
*opaque,
 return RAM_SAVE_CONTROL_NOT_SUPP;
 }
 
+static int qemu_local_ram_load(QEMUFile *f, void *opaque,
+   ram_addr_t addr, uint64_t flags)
+{
+QEMUFileLocal *s = opaque;
+struct iovec iov;
+ssize_t ret = -EINVAL;
+
+if (!s-pipefd_received) {
+/*
+ * send_pipefd was called at this point, and it wrote one
+ * byte to the stream.
+ */
+qemu_get_byte(s-file);
+s-pipefd_received = true;
+}
+
+if (s-pipefd_passed) {
+void *host;
+/*
+ * Extract the page address from the 8-byte record and
+ * read the page data from the pipe.
+ */
+host = qemu_get_ram_ptr(addr);
+
+iov.iov_base = host;
+iov.iov_len = TARGET_PAGE_SIZE;
+
+/*
+ * The flag SPLICE_F_MOVE is introduced in kernel for the page
+ * flipping feature in QEMU, which will move pages rather than
+ * copying, previously unused.
+ *
+ * If a move is not possible the kernel will transparently fall
+ * back to copying data.
+ *
+ * For older kernels the SPLICE_F_MOVE would be ignored and a copy
+ * would occur.
+ */
+
+ret = vmsplice(s-pipefd[0], iov, 1, SPLICE_F_MOVE);
+if (ret == -1) {
+if (errno != EAGAIN  errno != EINTR) {
+fprintf(stderr, vmsplice() load error: %s, strerror(errno));
+return ret;
+}
+DPRINTF(vmsplice load error\n);
+} else if (ret == 0) {
+DPRINTF(stderr, load_page: zero read\n);
+}
+
+DPRINTF(vmsplice (read): %zu\n, ret);
+return ret;
+}
+
+return -EINVAL;
+}
+
+
 static const QEMUFileOps pipe_read_ops = {
 .get_fd= qemu_local_get_sockfd,
 .get_buffer= qemu_local_get_buffer,
 .close = qemu_local_close,
+.hook_ram_load = qemu_local_ram_load
 };
 
 static const QEMUFileOps pipe_write_ops = {
-- 
1.7.7.6




[Qemu-devel] [PATCH 07/17] save_page: replace block_offset with a MemoryRegion

2013-12-02 Thread Lei Li
This patch exposes the MemoryRegion to the save_page hook, replacing
the ram_addr_t block_offset argument with a MemoryRegion, as suggested
by Paolo Bonzini.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 arch_init.c   |4 ++--
 include/migration/migration.h |2 +-
 include/migration/qemu-file.h |8 
 migration-rdma.c  |4 ++--
 savevm.c  |8 
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index e0acbc5..daaa519 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -485,8 +485,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 
 /* In doubt sent page as normal */
 bytes_sent = -1;
-ret = ram_control_save_page(f, block-offset,
-   offset, TARGET_PAGE_SIZE, bytes_sent);
+ret = ram_control_save_page(f, mr, offset, TARGET_PAGE_SIZE,
+bytes_sent);
 
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 7e5d01a..ca852a8 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -161,7 +161,7 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags);
 #define RAM_SAVE_CONTROL_NOT_SUPP -1000
 #define RAM_SAVE_CONTROL_DELAYED  -2000
 
-size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
+size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr,
  ram_addr_t offset, size_t size,
  int *bytes_sent);
 
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index f9b104a..6646e89 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -77,10 +77,10 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, 
uint64_t flags);
  * is saved (such as RDMA, for example.)
  */
 typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
-   ram_addr_t block_offset,
-   ram_addr_t offset,
-   size_t size,
-   int *bytes_sent);
+ MemoryRegion *mr,
+ ram_addr_t offset,
+ size_t size,
+ int *bytes_sent);
 
 typedef struct QEMUFileOps {
 QEMUFilePutBufferFunc *put_buffer;
diff --git a/migration-rdma.c b/migration-rdma.c
index f94f3b4..ae04de4 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -2699,7 +2699,7 @@ static int qemu_rdma_close(void *opaque)
  *  the protocol because most transfers are sent 
asynchronously.
  */
 static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
-  ram_addr_t block_offset, ram_addr_t offset,
+  MemoryRegion *mr, ram_addr_t offset,
   size_t size, int *bytes_sent)
 {
 QEMUFileRDMA *rfile = opaque;
@@ -2716,7 +2716,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void 
*opaque,
  * is full, or the page doen't belong to the current chunk,
  * an actual RDMA write will occur and a new chunk will be formed.
  */
-ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
+ret = qemu_rdma_write(f, rdma, mr-ram_addr, offset, size);
 if (ret  0) {
 fprintf(stderr, rdma migration: write error! %d\n, ret);
 goto err;
diff --git a/savevm.c b/savevm.c
index 3f912dd..06c1f29 100644
--- a/savevm.c
+++ b/savevm.c
@@ -661,12 +661,12 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags)
 }
 }
 
-size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
- ram_addr_t offset, size_t size, int *bytes_sent)
+size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset,
+ size_t size, int *bytes_sent)
 {
 if (f-ops-save_page) {
-int ret = f-ops-save_page(f, f-opaque, block_offset,
-offset, size, bytes_sent);
+int ret = f-ops-save_page(f, f-opaque, mr, offset,
+size, bytes_sent);
 
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
 if (bytes_sent  *bytes_sent  0) {
-- 
1.7.7.6




[Qemu-devel] [PATCH 14/17] add new RunState RUN_STATE_MEMORY_STALE

2013-12-02 Thread Lei Li
Introduce new RunState RUN_STATE_MEMORY_STALE and
add it to runstate_needs_reset().

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qapi-schema.json |7 +--
 vl.c |   13 -
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index ea910ef..6ff46ff 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -176,12 +176,15 @@
 # @watchdog: the watchdog action is configured to pause and has been triggered
 #
 # @guest-panicked: guest has been panicked as a result of guest OS panic
+#
+# @memory-stale: guest is paused to start unix_page_flipping migration
+# process, the destination QEMU will have the newer contents of the memory
 ##
 { 'enum': 'RunState',
   'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
-'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
-'guest-panicked' ] }
+'running', 'save-vm', 'shutdown', 'suspended', 'memory-stale',
+'watchdog', 'guest-panicked' ] }
 
 ##
 # @SnapshotInfo
diff --git a/vl.c b/vl.c
index 8d5d874..3ea96b2 100644
--- a/vl.c
+++ b/vl.c
@@ -601,6 +601,7 @@ static const RunStateTransition runstate_transitions_def[] 
= {
 
 { RUN_STATE_PAUSED, RUN_STATE_RUNNING },
 { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_PAUSED, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING },
 { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE },
@@ -608,6 +609,7 @@ static const RunStateTransition runstate_transitions_def[] 
= {
 { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING },
 { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE },
 { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE },
+{ RUN_STATE_PRELAUNCH, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
 { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
@@ -624,23 +626,31 @@ static const RunStateTransition 
runstate_transitions_def[] = {
 { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN },
 { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG },
 { RUN_STATE_RUNNING, RUN_STATE_GUEST_PANICKED },
+{ RUN_STATE_RUNNING, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING },
 
 { RUN_STATE_SHUTDOWN, RUN_STATE_PAUSED },
 { RUN_STATE_SHUTDOWN, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_SHUTDOWN, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_DEBUG, RUN_STATE_SUSPENDED },
+{ RUN_STATE_DEBUG, RUN_STATE_MEMORY_STALE },
 { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED },
 { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING },
 { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_SUSPENDED, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING },
 { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_WATCHDOG, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING },
 { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_GUEST_PANICKED, RUN_STATE_MEMORY_STALE },
 
+{ RUN_STATE_MEMORY_STALE, RUN_STATE_RUNNING },
+{ RUN_STATE_MEMORY_STALE, RUN_STATE_POSTMIGRATE },
 { RUN_STATE_MAX, RUN_STATE_MAX },
 };
 
@@ -685,7 +695,8 @@ int runstate_is_running(void)
 bool runstate_needs_reset(void)
 {
 return runstate_check(RUN_STATE_INTERNAL_ERROR) ||
-runstate_check(RUN_STATE_SHUTDOWN);
+runstate_check(RUN_STATE_SHUTDOWN) ||
+runstate_check(RUN_STATE_MEMORY_STALE);
 }
 
 StatusInfo *qmp_query_status(Error **errp)
-- 
1.7.7.6




[Qemu-devel] [PATCH 17/17] hmp: better format for info migrate_capabilities

2013-12-02 Thread Lei Li
As more capabilities might be introduced, it is better to display
them one per line.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 hmp.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/hmp.c b/hmp.c
index 32ee285..dcfa2f9 100644
--- a/hmp.c
+++ b/hmp.c
@@ -226,13 +226,12 @@ void hmp_info_migrate_capabilities(Monitor *mon, const 
QDict *qdict)
 caps = qmp_query_migrate_capabilities(NULL);
 
 if (caps) {
-monitor_printf(mon, capabilities: );
+monitor_printf(mon, Capabilities:\n);
 for (cap = caps; cap; cap = cap-next) {
-monitor_printf(mon, %s: %s ,
+monitor_printf(mon, %s: %s\n,
MigrationCapability_lookup[cap-value-capability],
cap-value-state ? on : off);
 }
-monitor_printf(mon, \n);
 }
 
 qapi_free_MigrationCapabilityStatusList(caps);
-- 
1.7.7.6




[Qemu-devel] [PATCH 15/17] migration-unix: page flipping support on unix outgoing

2013-12-02 Thread Lei Li
Add page flipping support to the unix outgoing path by stopping
the VM with the new RunState RUN_STATE_MEMORY_STALE before
invoking migration if unix_page_flipping is enabled.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-unix.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/migration-unix.c b/migration-unix.c
index 9beeafe..cbf2087 100644
--- a/migration-unix.c
+++ b/migration-unix.c
@@ -19,6 +19,7 @@
 #include migration/migration.h
 #include migration/qemu-file.h
 #include block/block.h
+#include sysemu/sysemu.h
 
 //#define DEBUG_MIGRATION_UNIX
 
@@ -33,6 +34,7 @@
 static void unix_wait_for_connect(int fd, void *opaque)
 {
 MigrationState *s = opaque;
+int ret;
 
 if (fd  0) {
 DPRINTF(migrate connect error\n);
@@ -47,6 +49,15 @@ static void unix_wait_for_connect(int fd, void *opaque)
 goto fail;
 }
 
+/* Stop VM before invoking migration if unix_page_flipping enabled */
+if (migrate_unix_page_flipping()) {
+ret = vm_stop_force_state(RUN_STATE_MEMORY_STALE);
+if (ret  0) {
+DPRINTF(failed to stop VM\n);
+goto fail;
+}
+}
+
 migrate_fd_connect(s);
 return;
 }
-- 
1.7.7.6




[Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping

2013-12-02 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration.c |7 +--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/migration.c b/migration.c
index e012cd4..7e0ec33 100644
--- a/migration.c
+++ b/migration.c
@@ -582,7 +582,7 @@ static void *migration_thread(void *opaque)
 if (pending_size  pending_size = max_size) {
 qemu_savevm_state_iterate(s-file);
 } else {
-int ret;
+int ret = 0;
 
 DPRINTF(done iterating\n);
 qemu_mutex_lock_iothread();
@@ -590,7 +590,10 @@ static void *migration_thread(void *opaque)
 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
 old_vm_running = runstate_is_running();
 
-ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+if (!runstate_needs_reset()) {
+ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+}
+
 if (ret = 0) {
 qemu_file_set_rate_limit(s-file, INT_MAX);
 qemu_savevm_state_complete(s-file);
-- 
1.7.7.6




[Qemu-devel] [PATCH 10/17] add unix_msgfd_lookup() to callback get_buffer

2013-12-02 Thread Lei Li
The control message that carries the pipe file descriptor must
be received with recvmsg(); if two callbacks receive from the socket, it
might be consumed as stream data by qemu_recv(). So this patch adds
unix_msgfd_lookup() to the get_buffer callback, making it the only receiver,
where the pipe file descriptor is caught.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   59 ++--
 1 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 9453ec8..5f98a01 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -59,16 +59,69 @@ static int qemu_local_get_sockfd(void *opaque)
 return s-sockfd;
 }
 
+static int unix_msgfd_lookup(void *opaque, struct msghdr *msg)
+{
+QEMUFileLocal *s = opaque;
+struct cmsghdr *cmsg;
+bool found = false;
+
+for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
+cmsg-cmsg_level != SOL_SOCKET ||
+cmsg-cmsg_type != SCM_RIGHTS)
+continue;
+
+/* PIPE file descriptor to be received */
+s-pipefd[0] = *((int *)CMSG_DATA(cmsg));
+}
+
+if (s-pipefd[0]  0) {
+fprintf(stderr, no pipe fd can be received\n);
+return found;
+}
+
+DPRINTF(pipefd successfully received\n);
+return s-pipefd[0];
+}
+
 static int qemu_local_get_buffer(void *opaque, uint8_t *buf,
  int64_t pos, int size)
 {
 QEMUFileLocal *s = opaque;
 ssize_t len;
+struct msghdr msg = { NULL, };
+struct iovec iov[1];
+union {
+struct cmsghdr cmsg;
+char control[CMSG_SPACE(sizeof(int))];
+} msg_control;
+
+iov[0].iov_base = buf;
+iov[0].iov_len = size;
+
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
 
 for (;;) {
-len = qemu_recv(s-sockfd, buf, size, 0);
-if (len != -1) {
-break;
+if (!s-pipefd_passed) {
+/*
+ * recvmsg is called here to catch the control message for
+ * the exchange of PIPE file descriptor until it is received.
+ */
+len = recvmsg(s-sockfd, msg, 0);
+if (len != -1) {
+if (unix_msgfd_lookup(s, msg)  0) {
+s-pipefd_passed = 1;
+}
+break;
+}
+} else {
+len = qemu_recv(s-sockfd, buf, size, 0);
+if (len != -1) {
+break;
+}
 }
 
 if (socket_error() == EAGAIN) {
-- 
1.7.7.6




[Qemu-devel] [PATCH 11/17] add argument ram_addr_t to hook_ram_load

2013-12-02 Thread Lei Li
Add a ram_addr_t argument to hook_ram_load, and replace
QEMURamHookFunc with QEMURamLoadHookFunc for it. This
new argument allows cutting almost half of the data
transferred on the Unix socket used by page flipping
migration.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 arch_init.c   |2 +-
 include/migration/migration.h |2 +-
 include/migration/qemu-file.h |   11 ++-
 migration-rdma.c  |2 +-
 savevm.c  |4 ++--
 5 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index daaa519..0621893 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -945,7 +945,7 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
 goto done;
 }
 } else if (flags  RAM_SAVE_FLAG_HOOK) {
-ram_control_load_hook(f, flags);
+ram_control_load_hook(f, addr, flags);
 }
 error = qemu_file_get_error(f);
 if (error) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index ca852a8..300e52c 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -149,7 +149,7 @@ int64_t xbzrle_cache_resize(int64_t new_size);
 
 void ram_control_before_iterate(QEMUFile *f, uint64_t flags);
 void ram_control_after_iterate(QEMUFile *f, uint64_t flags);
-void ram_control_load_hook(QEMUFile *f, uint64_t flags);
+void ram_control_load_hook(QEMUFile *f, ram_addr_t addr, uint64_t flags);
 
 /* Whenever this is found in the data stream, the flags
  * will be passed to ram_control_load_hook in the incoming-migration
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 6646e89..176c2d9 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -65,6 +65,15 @@ typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, 
struct iovec *iov,
 typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags);
 
 /*
+ * This function provides load hook for RAM migration, allows
+ * override of where the RAM page is loaded (such as page
+ * flipping for example).
+ */
+typedef int (QEMURamLoadHookFunc)(QEMUFile *f, void *opaque,
+  ram_addr_t addr,
+  uint64_t flags);
+
+/*
  * Constants used by ram_control_* hooks
  */
 #define RAM_CONTROL_SETUP0
@@ -90,7 +99,7 @@ typedef struct QEMUFileOps {
 QEMUFileWritevBufferFunc *writev_buffer;
 QEMURamHookFunc *before_ram_iterate;
 QEMURamHookFunc *after_ram_iterate;
-QEMURamHookFunc *hook_ram_load;
+QEMURamLoadHookFunc *hook_ram_load;
 QEMURamSaveFunc *save_page;
 } QEMUFileOps;
 
diff --git a/migration-rdma.c b/migration-rdma.c
index ae04de4..732ec1a 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -2938,7 +2938,7 @@ err_rdma_dest_wait:
  * Keep doing this until the source tells us to stop.
  */
 static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
- uint64_t flags)
+ ram_addr_t offset, uint64_t flags)
 {
 RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
.type = RDMA_CONTROL_REGISTER_RESULT,
diff --git a/savevm.c b/savevm.c
index 137e74f..75e397c 100644
--- a/savevm.c
+++ b/savevm.c
@@ -647,12 +647,12 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t 
flags)
 }
 }
 
-void ram_control_load_hook(QEMUFile *f, uint64_t flags)
+void ram_control_load_hook(QEMUFile *f, ram_addr_t offset, uint64_t flags)
 {
 int ret = -EINVAL;
 
 if (f-ops-hook_ram_load) {
-ret = f-ops-hook_ram_load(f, f-opaque, flags);
+ret = f-ops-hook_ram_load(f, f-opaque, offset, flags);
 if (ret  0) {
 qemu_file_set_error(f, ret);
 }
-- 
1.7.7.6




[Qemu-devel] [PATCH 05/17] migration-local: introduce qemu_fopen_socket_local()

2013-11-29 Thread Lei Li
Add qemu_fopen_socket_local() to open the QEMUFileLocal introduced
earlier. In write mode it creates a pipe if unix_page_flipping is
enabled. Also adjust qemu_local_close() to close the pipe as well.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 include/migration/qemu-file.h |2 +
 migration-local.c |   46 +
 2 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 0f757fb..f9b104a 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -99,6 +99,8 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode);
 QEMUFile *qemu_fdopen(int fd, const char *mode);
 QEMUFile *qemu_fopen_socket(int fd, const char *mode);
 QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
+QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode);
+
 int qemu_get_fd(QEMUFile *f);
 int qemu_fclose(QEMUFile *f);
 int64_t qemu_ftell(QEMUFile *f);
diff --git a/migration-local.c b/migration-local.c
index ca01a20..929ed60 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -105,6 +105,12 @@ static int qemu_local_close(void *opaque)
 QEMUFileLocal *s = opaque;
 
 closesocket(s-sockfd);
+
+if (s-unix_page_flipping) {
+close(s-pipefd[0]);
+close(s-pipefd[1]);
+}
+
 g_free(s);
 
 return 0;
@@ -121,3 +127,43 @@ static const QEMUFileOps pipe_write_ops = {
 .writev_buffer  = qemu_local_writev_buffer,
 .close  = qemu_local_close,
 };
+
+QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode)
+{
+QEMUFileLocal *s;
+int pipefd[2];
+
+if (qemu_file_mode_is_not_valid(mode)) {
+return NULL;
+}
+
+s = g_malloc0(sizeof(QEMUFileLocal));
+s-sockfd = sockfd;
+
+if (migrate_unix_page_flipping()) {
+s-unix_page_flipping = 1;
+}
+
+if (mode[0] == 'w') {
+if (s-unix_page_flipping) {
+if (pipe(pipefd)  0) {
+fprintf(stderr, failed to create PIPE\n);
+goto fail;
+}
+
+s-pipefd[0] = pipefd[0];
+s-pipefd[1] = pipefd[1];
+}
+
+qemu_set_block(s-sockfd);
+s-file = qemu_fopen_ops(s, pipe_write_ops);
+} else {
+s-file = qemu_fopen_ops(s, pipe_read_ops);
+}
+
+return s-file;
+
+fail:
+g_free(s);
+return NULL;
+}
-- 
1.7.7.6




[Qemu-devel] [PATCH 01/17] QAPI: introduce migration capability unix_page_flipping

2013-11-29 Thread Lei Li
Introduce unix_page_flipping to MigrationCapability for
localhost migration.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qapi-schema.json |   10 +-
 1 files changed, 9 insertions(+), 1 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index 83fa485..b290a0f 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -685,10 +685,18 @@
 # @auto-converge: If enabled, QEMU will automatically throttle down the guest
 #  to speed up convergence of RAM migration. (since 1.6)
 #
+# @unix-page-flipping: If enabled, QEMU can optimize migration when the
+#  destination is a QEMU process that runs on the same host as
+#  the source (as is the case for live upgrade).  If the migration
+#  transport is a Unix socket, QEMU will flip RAM pages directly to
+#  the destination, so that memory is only allocated twice for the
+#  source and destination processes. Disabled by default. (since 1.8)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] }
+  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks',
+   'unix-page-flipping'] }
 
 ##
 # @MigrationCapabilityStatus
-- 
1.7.7.6




[Qemu-devel] [PATCH 0/17 v4] Localhost migration with side channel for ram

2013-11-29 Thread Lei Li
This patch series introduces a mechanism that uses a side-channel
pipe for RAM, passed via SCM_RIGHTS, for migration over the Unix
domain socket protocol.

This side channel is used for page flipping with vmsplice,
which is the internal mechanism for the localhost migration that
we are trying to add to QEMU. The background info and previous
patch series are listed here for reference:

Localhost migration
http://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg02916.html

migration: Introduce side channel for RAM
http://lists.gnu.org/archive/html/qemu-devel/2013-09/msg04043.html

I have picked patches from the localhost migration series and rebased
them on the side channel series; it is now a complete series that
has passed basic testing.

Please let me know if there is anything that needs to be fixed or improved.
Your suggestions and comments are very welcome, and thanks to Paolo
for his continued review and useful suggestions.

Changes since V3:
  Address comments from Paolo including:

- Get rid of useless check in send_pipefd() and the override
  of before_ram_iterate, send pipefd in the first save_page
  call, qemu_get_byte() in the first ram_load correspondingly.
- Add new argument ram_addr_t to hook_ram_load to cut half of
  the data transferred on the socket.
- Add transition from 'debug' to 'memory-stale'.
- Other minor fixes.

Changes since V2:
  Address comments from Paolo including:

- Doc improvement for QAPI.
- Use callback get_buffer as the only one receiver.
- Rename the new RunState flipping-migrate to memory-stale, and
  add transition from 'prelaunch' to 'memory-stale'.
- Other minor fixes.

Changes since V1:
  Address suggestions from Paolo Bonzini including:

- Use Unix socket QEMUFile as basis of code and adjust the way
  of overriding RDMA hooks.
- Involve the vmsplice for page flipping.
- Add new RunState RUN_STATE_FLIPPING_MIGRATE and add it to
  runstate_needs_reset() for the adjustment of the current
  migration process with page flipping.



Lei Li (17):
  QAPI: introduce migration capability unix_page_flipping
  migration: add migrate_unix_page_flipping()
  qmp-command.hx: add missing docs for migration capabilities
  migration-local: add QEMUFileLocal with socket based QEMUFile
  migration-local: introduce qemu_fopen_socket_local()
  migration-local: add send_pipefd()
  save_page: replace block_offset with a MemoryRegion
  migration-local: override save_page for page transmit
  savevm: adjust ram_control_save_page with page flipping
  add unix_msgfd_lookup() to callback get_buffer
  add argument ram_addr_t to hook_ram_load
  migration-local: override hook_ram_load 
  migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local
  add new RunState RUN_STATE_MEMORY_STALE
  migration-unix: page flipping support on unix outgoing
  migration: adjust migration_thread() process for unix_page_flipping
  hmp: better format for info migrate_capabilities

 Makefile.target   |   1 +
 arch_init.c   |   4 +-
 migration-local.c | 512 ++
 hmp.c |   5 +-
 include/migration/migration.h |   3 +
 include/migration/qemu-file.h |   2 +
 migration-unix.c  |  27 ++-
 migration-rdma.c  |   4 +-
 migration.c   |  18 +-
 qapi-schema.json  |  18 +-
 qmp-commands.hx   |   8 +
 savevm.c  |  21 +-
 vl.c  |  12 +-
 13 files changed, 617 insertions(+), 27 deletions(-)
 create mode 100644 migration-local.c




[Qemu-devel] [PATCH 03/17] qmp-command.hx: add missing docs for migration capabilities

2013-11-29 Thread Lei Li
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qmp-commands.hx |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/qmp-commands.hx b/qmp-commands.hx
index fba15cd..dcec433 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -2898,6 +2898,10 @@ migrate-set-capabilities
 Enable/Disable migration capabilities
 
 - xbzrle: XBZRLE support
+- x-rdma-pin-all: Pin all pages during RDMA support
+- zero-blocks: Compress zero blocks during block migration
+- auto-converge: Block VCPU to help convergence of migration
+- unix-page-flipping: Page flipping for live QEMU upgrade
 
 Arguments:
 
@@ -2922,6 +2926,10 @@ Query current migration capabilities
 
 - capabilities: migration capabilities state
  - xbzrle : XBZRLE state (json-bool)
+ - x-rdma-pin-all: RDMA state (json-bool)
+ - zero-blocks: zero-blocks state (json-bool)
+ - auto-converge: Auto converge state (json-bool)
+ - unix-page-flipping: Page flipping state (json-bool)
 
 Arguments:
 
-- 
1.7.7.6




[Qemu-devel] [PATCH 02/17] migration: add migrate_unix_page_flipping()

2013-11-29 Thread Lei Li
Add migrate_unix_page_flipping() to check if
MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING is enabled.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 include/migration/migration.h |3 +++
 migration.c   |9 +
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index 140e6b4..7e5d01a 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -131,10 +131,13 @@ void migrate_add_blocker(Error *reason);
 void migrate_del_blocker(Error *reason);
 
 bool migrate_rdma_pin_all(void);
+
 bool migrate_zero_blocks(void);
 
 bool migrate_auto_converge(void);
 
+bool migrate_unix_page_flipping(void);
+
 int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
  uint8_t *dst, int dlen);
 int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
diff --git a/migration.c b/migration.c
index 2b1ab20..4ac466b 100644
--- a/migration.c
+++ b/migration.c
@@ -541,6 +541,15 @@ int64_t migrate_xbzrle_cache_size(void)
 return s-xbzrle_cache_size;
 }
 
+bool migrate_unix_page_flipping(void)
+{
+MigrationState *s;
+
+s = migrate_get_current();
+
+return s-enabled_capabilities[MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING];
+}
+
 /* migration thread support */
 
 static void *migration_thread(void *opaque)
-- 
1.7.7.6




[Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()

2013-11-29 Thread Lei Li
This patch adds send_pipefd() to pass the pipe file descriptor
to the destination process.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   46 ++
 1 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 929ed60..f479530 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -167,3 +167,49 @@ fail:
 g_free(s);
 return NULL;
 }
+
+
+/*
+ * Pass a pipe file descriptor to another process.
+ *
+ * Return negative value If pipefd  0. Return 0 on
+ * success.
+ *
+ */
+static int send_pipefd(int sockfd, int pipefd)
+{
+struct msghdr msg;
+struct iovec iov[1];
+ssize_t ret;
+char req[1] = { 0x01 };
+
+union {
+  struct cmsghdr cm;
+  char control[CMSG_SPACE(sizeof(int))];
+} control_un;
+struct cmsghdr *cmptr;
+
+msg.msg_control = control_un.control;
+msg.msg_controllen = sizeof(control_un.control);
+
+cmptr = CMSG_FIRSTHDR(msg);
+cmptr-cmsg_len = CMSG_LEN(sizeof(int));
+cmptr-cmsg_level = SOL_SOCKET;
+cmptr-cmsg_type = SCM_RIGHTS;
+*((int *) CMSG_DATA(cmptr)) = pipefd;
+
+msg.msg_name = NULL;
+msg.msg_namelen = 0;
+
+iov[0].iov_base = req;
+iov[0].iov_len = sizeof(req);
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+
+ret = sendmsg(sockfd, msg, 0);
+if (ret = 0) {
+DPRINTF(sendmsg error: %s\n, strerror(errno));
+}
+
+return ret;
+}
-- 
1.7.7.6




[Qemu-devel] [PATCH 10/17] add unix_msgfd_lookup() to callback get_buffer

2013-11-29 Thread Lei Li
The control message used to exchange the pipe file descriptor has to
be received with recvmsg(); if two callbacks were receiving from the
socket, it might be consumed as stream data by qemu_recv(). So this
patch adds unix_msgfd_lookup() to the get_buffer callback, making it
the only receiver, which is where the pipe file descriptor is caught.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   59 ++--
 1 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 0a41c69..76ec306 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -59,16 +59,69 @@ static int qemu_local_get_sockfd(void *opaque)
 return s-sockfd;
 }
 
+static int unix_msgfd_lookup(void *opaque, struct msghdr *msg)
+{
+QEMUFileLocal *s = opaque;
+struct cmsghdr *cmsg;
+bool found = false;
+
+for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
+cmsg-cmsg_level != SOL_SOCKET ||
+cmsg-cmsg_type != SCM_RIGHTS)
+continue;
+
+/* PIPE file descriptor to be received */
+s-pipefd[0] = *((int *)CMSG_DATA(cmsg));
+}
+
+if (s-pipefd[0]  0) {
+fprintf(stderr, no pipe fd can be received\n);
+return found;
+}
+
+DPRINTF(pipefd successfully received\n);
+return s-pipefd[0];
+}
+
 static int qemu_local_get_buffer(void *opaque, uint8_t *buf,
  int64_t pos, int size)
 {
 QEMUFileLocal *s = opaque;
 ssize_t len;
+struct msghdr msg = { NULL, };
+struct iovec iov[1];
+union {
+struct cmsghdr cmsg;
+char control[CMSG_SPACE(sizeof(int))];
+} msg_control;
+
+iov[0].iov_base = buf;
+iov[0].iov_len = size;
+
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
 
 for (;;) {
-len = qemu_recv(s-sockfd, buf, size, 0);
-if (len != -1) {
-break;
+if (!s-pipefd_passed) {
+/*
+ * recvmsg is called here to catch the control message for
+ * the exchange of PIPE file descriptor until it is received.
+ */
+len = recvmsg(s-sockfd, msg, 0);
+if (len != -1) {
+if (unix_msgfd_lookup(s, msg)  0) {
+s-pipefd_passed = 1;
+}
+break;
+}
+} else {
+len = qemu_recv(s-sockfd, buf, size, 0);
+if (len != -1) {
+break;
+}
 }
 
 if (socket_error() == EAGAIN) {
-- 
1.7.7.6




[Qemu-devel] [PATCH 14/17] add new RunState RUN_STATE_MEMORY_STALE

2013-11-29 Thread Lei Li
Introduce new RunState RUN_STATE_MEMORY_STALE and
add it to runstate_needs_reset().

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qapi-schema.json |7 +--
 vl.c |   13 -
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index b290a0f..4d9e712 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -176,12 +176,15 @@
 # @watchdog: the watchdog action is configured to pause and has been triggered
 #
 # @guest-panicked: guest has been panicked as a result of guest OS panic
+#
+# @memory-stale: guest is paused to start unix_page_flipping migration
+# process, the destination QEMU will has the newer contents of the memory
 ##
 { 'enum': 'RunState',
   'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
-'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
-'guest-panicked' ] }
+'running', 'save-vm', 'shutdown', 'suspended', 'memory-stale',
+'watchdog', 'guest-panicked' ] }
 
 ##
 # @SnapshotInfo
diff --git a/vl.c b/vl.c
index 8d5d874..3ea96b2 100644
--- a/vl.c
+++ b/vl.c
@@ -601,6 +601,7 @@ static const RunStateTransition runstate_transitions_def[] 
= {
 
 { RUN_STATE_PAUSED, RUN_STATE_RUNNING },
 { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_PAUSED, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING },
 { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE },
@@ -608,6 +609,7 @@ static const RunStateTransition runstate_transitions_def[] 
= {
 { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING },
 { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE },
 { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE },
+{ RUN_STATE_PRELAUNCH, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
 { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
@@ -624,23 +626,31 @@ static const RunStateTransition 
runstate_transitions_def[] = {
 { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN },
 { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG },
 { RUN_STATE_RUNNING, RUN_STATE_GUEST_PANICKED },
+{ RUN_STATE_RUNNING, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING },
 
 { RUN_STATE_SHUTDOWN, RUN_STATE_PAUSED },
 { RUN_STATE_SHUTDOWN, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_SHUTDOWN, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_DEBUG, RUN_STATE_SUSPENDED },
+{ RUN_STATE_DEBUG, RUN_STATE_MEMORY_STALE },
 { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED },
 { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING },
 { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_SUSPENDED, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING },
 { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_WATCHDOG, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING },
 { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_GUEST_PANICKED, RUN_STATE_MEMORY_STALE },
 
+{ RUN_STATE_MEMORY_STALE, RUN_STATE_RUNNING },
+{ RUN_STATE_MEMORY_STALE, RUN_STATE_POSTMIGRATE },
 { RUN_STATE_MAX, RUN_STATE_MAX },
 };
 
@@ -685,7 +695,8 @@ int runstate_is_running(void)
 bool runstate_needs_reset(void)
 {
 return runstate_check(RUN_STATE_INTERNAL_ERROR) ||
-runstate_check(RUN_STATE_SHUTDOWN);
+runstate_check(RUN_STATE_SHUTDOWN) ||
+runstate_check(RUN_STATE_MEMORY_STALE);
 }
 
 StatusInfo *qmp_query_status(Error **errp)
-- 
1.7.7.6




[Qemu-devel] [PATCH 07/17] save_page: replace block_offset with a MemoryRegion

2013-11-29 Thread Lei Li
This patch exports MemoryRegion to the save_page hook, replacing
the argument ram_addr_t block_offset with a MemoryRegion, as
suggested by Paolo Bonzini.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 arch_init.c   |4 ++--
 include/migration/migration.h |2 +-
 include/migration/qemu-file.h |8 
 migration-rdma.c  |4 ++--
 savevm.c  |8 
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index e0acbc5..daaa519 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -485,8 +485,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 
 /* In doubt sent page as normal */
 bytes_sent = -1;
-ret = ram_control_save_page(f, block-offset,
-   offset, TARGET_PAGE_SIZE, bytes_sent);
+ret = ram_control_save_page(f, mr, offset, TARGET_PAGE_SIZE,
+bytes_sent);
 
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 7e5d01a..ca852a8 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -161,7 +161,7 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags);
 #define RAM_SAVE_CONTROL_NOT_SUPP -1000
 #define RAM_SAVE_CONTROL_DELAYED  -2000
 
-size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
+size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr,
  ram_addr_t offset, size_t size,
  int *bytes_sent);
 
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index f9b104a..6646e89 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -77,10 +77,10 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, 
uint64_t flags);
  * is saved (such as RDMA, for example.)
  */
 typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
-   ram_addr_t block_offset,
-   ram_addr_t offset,
-   size_t size,
-   int *bytes_sent);
+ MemoryRegion *mr,
+ ram_addr_t offset,
+ size_t size,
+ int *bytes_sent);
 
 typedef struct QEMUFileOps {
 QEMUFilePutBufferFunc *put_buffer;
diff --git a/migration-rdma.c b/migration-rdma.c
index f94f3b4..ae04de4 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -2699,7 +2699,7 @@ static int qemu_rdma_close(void *opaque)
  *  the protocol because most transfers are sent 
asynchronously.
  */
 static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
-  ram_addr_t block_offset, ram_addr_t offset,
+  MemoryRegion *mr, ram_addr_t offset,
   size_t size, int *bytes_sent)
 {
 QEMUFileRDMA *rfile = opaque;
@@ -2716,7 +2716,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void 
*opaque,
  * is full, or the page doen't belong to the current chunk,
  * an actual RDMA write will occur and a new chunk will be formed.
  */
-ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
+ret = qemu_rdma_write(f, rdma, mr-ram_addr, offset, size);
 if (ret  0) {
 fprintf(stderr, rdma migration: write error! %d\n, ret);
 goto err;
diff --git a/savevm.c b/savevm.c
index 3f912dd..06c1f29 100644
--- a/savevm.c
+++ b/savevm.c
@@ -661,12 +661,12 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags)
 }
 }
 
-size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
- ram_addr_t offset, size_t size, int *bytes_sent)
+size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset,
+ size_t size, int *bytes_sent)
 {
 if (f-ops-save_page) {
-int ret = f-ops-save_page(f, f-opaque, block_offset,
-offset, size, bytes_sent);
+int ret = f-ops-save_page(f, f-opaque, mr, offset,
+size, bytes_sent);
 
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
 if (bytes_sent  *bytes_sent  0) {
-- 
1.7.7.6




[Qemu-devel] [PATCH 11/17] add argument ram_addr_t to hook_ram_load

2013-11-29 Thread Lei Li
Add a ram_addr_t argument to hook_ram_load, and replace
QEMURamHookFunc with QEMURamLoadHookFunc for it. This
new argument allows cutting almost half of the data
transferred on the Unix socket used by page flipping
migration.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 arch_init.c   |2 +-
 include/migration/migration.h |2 +-
 include/migration/qemu-file.h |   11 ++-
 migration-rdma.c  |2 +-
 savevm.c  |4 ++--
 5 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index daaa519..0621893 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -945,7 +945,7 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
 goto done;
 }
 } else if (flags  RAM_SAVE_FLAG_HOOK) {
-ram_control_load_hook(f, flags);
+ram_control_load_hook(f, addr, flags);
 }
 error = qemu_file_get_error(f);
 if (error) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index ca852a8..300e52c 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -149,7 +149,7 @@ int64_t xbzrle_cache_resize(int64_t new_size);
 
 void ram_control_before_iterate(QEMUFile *f, uint64_t flags);
 void ram_control_after_iterate(QEMUFile *f, uint64_t flags);
-void ram_control_load_hook(QEMUFile *f, uint64_t flags);
+void ram_control_load_hook(QEMUFile *f, ram_addr_t addr, uint64_t flags);
 
 /* Whenever this is found in the data stream, the flags
  * will be passed to ram_control_load_hook in the incoming-migration
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 6646e89..176c2d9 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -65,6 +65,15 @@ typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, 
struct iovec *iov,
 typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags);
 
 /*
+ * This function provides load hook for RAM migration, allows
+ * override of where the RAM page is loaded (such as page
+ * flipping for example).
+ */
+typedef int (QEMURamLoadHookFunc)(QEMUFile *f, void *opaque,
+  ram_addr_t addr,
+  uint64_t flags);
+
+/*
  * Constants used by ram_control_* hooks
  */
 #define RAM_CONTROL_SETUP0
@@ -90,7 +99,7 @@ typedef struct QEMUFileOps {
 QEMUFileWritevBufferFunc *writev_buffer;
 QEMURamHookFunc *before_ram_iterate;
 QEMURamHookFunc *after_ram_iterate;
-QEMURamHookFunc *hook_ram_load;
+QEMURamLoadHookFunc *hook_ram_load;
 QEMURamSaveFunc *save_page;
 } QEMUFileOps;
 
diff --git a/migration-rdma.c b/migration-rdma.c
index ae04de4..732ec1a 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -2938,7 +2938,7 @@ err_rdma_dest_wait:
  * Keep doing this until the source tells us to stop.
  */
 static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
- uint64_t flags)
+ ram_addr_t offset, uint64_t flags)
 {
 RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
.type = RDMA_CONTROL_REGISTER_RESULT,
diff --git a/savevm.c b/savevm.c
index 137e74f..75e397c 100644
--- a/savevm.c
+++ b/savevm.c
@@ -647,12 +647,12 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t 
flags)
 }
 }
 
-void ram_control_load_hook(QEMUFile *f, uint64_t flags)
+void ram_control_load_hook(QEMUFile *f, ram_addr_t offset, uint64_t flags)
 {
 int ret = -EINVAL;
 
 if (f-ops-hook_ram_load) {
-ret = f-ops-hook_ram_load(f, f-opaque, flags);
+ret = f-ops-hook_ram_load(f, f-opaque, offset, flags);
 if (ret  0) {
 qemu_file_set_error(f, ret);
 }
-- 
1.7.7.6




[Qemu-devel] [PATCH 08/17] migration-local: override save_page for page transmit

2013-11-29 Thread Lei Li
This patch implements the save_page callback for the outgoing
side of page flipping. It writes the address of the page
on the Unix socket and flips the page data onto the pipe with
vmsplice(). Every page address carries the header flag
RAM_SAVE_FLAG_HOOK, which triggers the load hook to
receive it on the incoming side as well.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   63 +
 1 files changed, 63 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index f479530..0a41c69 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -116,6 +116,68 @@ static int qemu_local_close(void *opaque)
 return 0;
 }
 
+static int send_pipefd(int sockfd, int pipefd);
+
+static size_t qemu_local_save_ram(QEMUFile *f, void *opaque,
+  MemoryRegion *mr, ram_addr_t offset,
+  size_t size, int *bytes_sent)
+{
+QEMUFileLocal *s = opaque;
+ram_addr_t current_addr = mr-ram_addr + offset;
+void *ram_addr;
+ssize_t ret;
+
+if (s-unix_page_flipping) {
+qemu_put_be64(s-file, current_addr | RAM_SAVE_FLAG_HOOK);
+qemu_fflush(s-file);
+
+if (!s-pipefd_passed) {
+ret = send_pipefd(s-sockfd, s-pipefd[0]);
+if (ret  0) {
+fprintf(stderr, failed to pass PIPE\n);
+return ret;
+}
+s-pipefd_passed = true;
+}
+
+ram_addr = memory_region_get_ram_ptr(mr) + offset;
+
+/* vmsplice page data to pipe */
+struct iovec iov = {
+.iov_base = ram_addr,
+.iov_len  = size,
+};
+
+/*
+ * The flag SPLICE_F_MOVE is introduced in kernel for the page
+ * flipping feature in QEMU, which will movie pages rather than
+ * copying, previously unused.
+ *
+ * If a move is not possible the kernel will transparently falls
+ * back to copying data.
+ *
+ * For older kernels the SPLICE_F_MOVE would be ignored and a copy
+ * would occur.
+ */
+
+ret = vmsplice(s-pipefd[1], iov, 1, SPLICE_F_GIFT | SPLICE_F_MOVE);
+if (ret == -1) {
+if (errno != EAGAIN  errno != EINTR) {
+fprintf(stderr, vmsplice save error: %s\n, strerror(errno));
+return ret;
+}
+} else {
+if (bytes_sent) {
+*bytes_sent = 1;
+}
+DPRINTF(block_offset: %lu, offset: %lu\n, mr-ram_addr, offset);
+return 0;
+}
+}
+
+return RAM_SAVE_CONTROL_NOT_SUPP;
+}
+
 static const QEMUFileOps pipe_read_ops = {
 .get_fd= qemu_local_get_sockfd,
 .get_buffer= qemu_local_get_buffer,
@@ -126,6 +188,7 @@ static const QEMUFileOps pipe_write_ops = {
 .get_fd = qemu_local_get_sockfd,
 .writev_buffer  = qemu_local_writev_buffer,
 .close  = qemu_local_close,
+.save_page  = qemu_local_save_ram
 };
 
 QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode)
-- 
1.7.7.6




[Qemu-devel] [PATCH 17/17] hmp: better format for info migrate_capabilities

2013-11-29 Thread Lei Li
As more capabilities might be introduced, it is better to display
them one per line.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 hmp.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/hmp.c b/hmp.c
index 32ee285..dcfa2f9 100644
--- a/hmp.c
+++ b/hmp.c
@@ -226,13 +226,12 @@ void hmp_info_migrate_capabilities(Monitor *mon, const 
QDict *qdict)
 caps = qmp_query_migrate_capabilities(NULL);
 
 if (caps) {
-monitor_printf(mon, capabilities: );
+monitor_printf(mon, Capabilities:\n);
 for (cap = caps; cap; cap = cap-next) {
-monitor_printf(mon, %s: %s ,
+monitor_printf(mon, %s: %s\n,
MigrationCapability_lookup[cap-value-capability],
cap-value-state ? on : off);
 }
-monitor_printf(mon, \n);
 }
 
 qapi_free_MigrationCapabilityStatusList(caps);
-- 
1.7.7.6




[Qemu-devel] [PATCH 09/17] savevm: adjust ram_control_save_page for page flipping

2013-11-29 Thread Lei Li
Since the save_page callback is always installed by
qemu_fopen_socket_local(), and it returns RAM_SAVE_CONTROL_NOT_SUPP
when unix_page_flipping is not enabled, the current logic leads
to a wrong qemu_file_set_error() call.
So this patch adds RAM_SAVE_CONTROL_NOT_SUPP to the check.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 savevm.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/savevm.c b/savevm.c
index 06c1f29..137e74f 100644
--- a/savevm.c
+++ b/savevm.c
@@ -668,7 +668,8 @@ size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, 
ram_addr_t offset,
 int ret = f-ops-save_page(f, f-opaque, mr, offset,
 size, bytes_sent);
 
-if (ret != RAM_SAVE_CONTROL_DELAYED) {
+if (ret != RAM_SAVE_CONTROL_DELAYED 
+ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (bytes_sent  *bytes_sent  0) {
 qemu_update_position(f, *bytes_sent);
 } else if (ret  0) {
-- 
1.7.7.6




[Qemu-devel] [PATCH 13/17] migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local

2013-11-29 Thread Lei Li
Replace qemu_fopen_socket with qemu_fopen_socket_local in Unix
protocol migration.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-unix.c |   18 ++
 1 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/migration-unix.c b/migration-unix.c
index 651fc5b..9beeafe 100644
--- a/migration-unix.c
+++ b/migration-unix.c
@@ -37,12 +37,22 @@ static void unix_wait_for_connect(int fd, void *opaque)
 if (fd  0) {
 DPRINTF(migrate connect error\n);
 s-file = NULL;
-migrate_fd_error(s);
+goto fail;
 } else {
 DPRINTF(migrate connect success\n);
-s-file = qemu_fopen_socket(fd, wb);
+
+s-file = qemu_fopen_socket_local(fd, wb);
+if (s-file == NULL) {
+fprintf(stderr, failed to open Unix socket\n);
+goto fail;
+}
+
 migrate_fd_connect(s);
+return;
 }
+
+fail:
+migrate_fd_error(s);
 }
 
 void unix_start_outgoing_migration(MigrationState *s, const char *path, Error 
**errp)
@@ -71,9 +81,9 @@ static void unix_accept_incoming_migration(void *opaque)
 goto out;
 }
 
-f = qemu_fopen_socket(c, rb);
+f = qemu_fopen_socket_local(c, rb);
 if (f == NULL) {
-fprintf(stderr, could not qemu_fopen socket\n);
+fprintf(stderr, failed to open Unix socket\n);
 goto out;
 }
 
-- 
1.7.7.6




[Qemu-devel] [PATCH 15/17] migration-unix: page flipping support on unix outgoing

2013-11-29 Thread Lei Li
Add page flipping support to the Unix outgoing path by stopping
the VM with the new RunState RUN_STATE_MEMORY_STALE before
invoking migration if unix_page_flipping is enabled.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-unix.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/migration-unix.c b/migration-unix.c
index 9beeafe..cbf2087 100644
--- a/migration-unix.c
+++ b/migration-unix.c
@@ -19,6 +19,7 @@
 #include migration/migration.h
 #include migration/qemu-file.h
 #include block/block.h
+#include sysemu/sysemu.h
 
 //#define DEBUG_MIGRATION_UNIX
 
@@ -33,6 +34,7 @@
 static void unix_wait_for_connect(int fd, void *opaque)
 {
 MigrationState *s = opaque;
+int ret;
 
 if (fd  0) {
 DPRINTF(migrate connect error\n);
@@ -47,6 +49,15 @@ static void unix_wait_for_connect(int fd, void *opaque)
 goto fail;
 }
 
+/* Stop VM before invoking migration if unix_page_flipping enabled */
+if (migrate_unix_page_flipping()) {
+ret = vm_stop_force_state(RUN_STATE_MEMORY_STALE);
+if (ret  0) {
+DPRINTF(failed to stop VM\n);
+goto fail;
+}
+}
+
 migrate_fd_connect(s);
 return;
 }
-- 
1.7.7.6




[Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping

2013-11-29 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration.c |7 +--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/migration.c b/migration.c
index 4ac466b..68b5b02 100644
--- a/migration.c
+++ b/migration.c
@@ -582,7 +582,7 @@ static void *migration_thread(void *opaque)
 if (pending_size  pending_size = max_size) {
 qemu_savevm_state_iterate(s-file);
 } else {
-int ret;
+int ret = 0;
 
 DPRINTF(done iterating\n);
 qemu_mutex_lock_iothread();
@@ -590,7 +590,10 @@ static void *migration_thread(void *opaque)
 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
 old_vm_running = runstate_is_running();
 
-ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+if (!runstate_needs_reset()) {
+ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+}
+
 if (ret = 0) {
 qemu_file_set_rate_limit(s-file, INT_MAX);
 qemu_savevm_state_complete(s-file);
-- 
1.7.7.6




[Qemu-devel] [PATCH 04/17] migration-local: add QEMUFileLocal with socket based QEMUFile

2013-11-29 Thread Lei Li
This patch adds QEMUFileLocal, a copy of the socket-based QEMUFile, which
will be used as the base code for Unix socket protocol migration and page
flipping migration.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 Makefile.target   |1 +
 migration-local.c |  123 +
 2 files changed, 124 insertions(+), 0 deletions(-)
 create mode 100644 migration-local.c

diff --git a/Makefile.target b/Makefile.target
index af6ac7e..aa09960 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -117,6 +117,7 @@ obj-$(CONFIG_KVM) += kvm-all.o
 obj-y += memory.o savevm.o cputlb.o
 obj-y += memory_mapping.o
 obj-y += dump.o
+obj-y += migration-local.o
 LIBS+=$(libs_softmmu)
 
 # xen support
diff --git a/migration-local.c b/migration-local.c
new file mode 100644
index 000..ca01a20
--- /dev/null
+++ b/migration-local.c
@@ -0,0 +1,123 @@
+/*
+ * QEMU localhost migration with page flipping
+ *
+ * Copyright IBM, Corp. 2013
+ *
+ * Authors:
+ *   Lei Li   li...@linux.vnet.ibm.com
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include config-host.h
+#include qemu-common.h
+#include migration/migration.h
+#include exec/cpu-common.h
+#include config.h
+#include exec/cpu-all.h
+#include exec/memory.h
+#include exec/memory-internal.h
+#include monitor/monitor.h
+#include migration/qemu-file.h
+#include qemu/iov.h
+#include sysemu/arch_init.h
+#include sysemu/sysemu.h
+#include block/block.h
+#include qemu/sockets.h
+#include migration/block.h
+#include qemu/thread.h
+#include qmp-commands.h
+#include trace.h
+#include qemu/osdep.h
+
+//#define DEBUG_MIGRATION_LOCAL
+
+#ifdef DEBUG_MIGRATION_LOCAL
+#define DPRINTF(fmt, ...) \
+do { printf(migration-local:  fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+
+typedef struct QEMUFileLocal {
+QEMUFile *file;
+int sockfd;
+int pipefd[2];
+int pipefd_passed;
+int pipefd_received;
+bool unix_page_flipping;
+} QEMUFileLocal;
+
+static int qemu_local_get_sockfd(void *opaque)
+{
+QEMUFileLocal *s = opaque;
+
+return s-sockfd;
+}
+
+static int qemu_local_get_buffer(void *opaque, uint8_t *buf,
+ int64_t pos, int size)
+{
+QEMUFileLocal *s = opaque;
+ssize_t len;
+
+for (;;) {
+len = qemu_recv(s-sockfd, buf, size, 0);
+if (len != -1) {
+break;
+}
+
+if (socket_error() == EAGAIN) {
+yield_until_fd_readable(s-sockfd);
+} else if (socket_error() != EINTR) {
+break;
+}
+}
+
+if (len == -1) {
+len = -socket_error();
+}
+
+return len;
+}
+
+static ssize_t qemu_local_writev_buffer(void *opaque, struct iovec *iov,
+int iovcnt, int64_t pos)
+{
+QEMUFileLocal *s = opaque;
+ssize_t len;
+ssize_t size = iov_size(iov, iovcnt);
+
+len = iov_send(s-sockfd, iov, iovcnt, 0, size);
+if (len  size) {
+len = -socket_error();
+}
+
+return len;
+}
+
+static int qemu_local_close(void *opaque)
+{
+QEMUFileLocal *s = opaque;
+
+closesocket(s-sockfd);
+g_free(s);
+
+return 0;
+}
+
+static const QEMUFileOps pipe_read_ops = {
+.get_fd= qemu_local_get_sockfd,
+.get_buffer= qemu_local_get_buffer,
+.close = qemu_local_close,
+};
+
+static const QEMUFileOps pipe_write_ops = {
+.get_fd = qemu_local_get_sockfd,
+.writev_buffer  = qemu_local_writev_buffer,
+.close  = qemu_local_close,
+};
-- 
1.7.7.6




[Qemu-devel] [PATCH 12/17] migration-local: override hook_ram_load

2013-11-29 Thread Lei Li
Override hook_ram_load to receive the pipe file descriptor
passed by source process and page address which will be
extracted to vmsplice the page data from pipe.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   59 +
 1 files changed, 59 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 76ec306..b086f38 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -231,10 +231,69 @@ static size_t qemu_local_save_ram(QEMUFile *f, void 
*opaque,
 return RAM_SAVE_CONTROL_NOT_SUPP;
 }
 
+static int qemu_local_ram_load(QEMUFile *f, void *opaque,
+   ram_addr_t addr, uint64_t flags)
+{
+QEMUFileLocal *s = opaque;
+struct iovec iov;
+ssize_t ret = -EINVAL;
+
+if (!s-pipefd_received) {
+/*
+ * send_pipefd was called at this point, and it wrote one
+ * byte to the stream.
+ */
+qemu_get_byte(s-file);
+s-pipefd_received = true;
+}
+
+if (s-pipefd_passed) {
+void *host;
+/*
+ * Extract the page address from the 8-byte record and
+ * read the page data from the pipe.
+ */
+host = qemu_get_ram_ptr(addr);
+
+iov.iov_base = host;
+iov.iov_len = TARGET_PAGE_SIZE;
+
+/*
+ * The flag SPLICE_F_MOVE is introduced in kernel for the page
+ * flipping feature in QEMU, which will movie pages rather than
+ * copying, previously unused.
+ *
+ * If a move is not possible the kernel will transparently falls
+ * back to copying data.
+ *
+ * For older kernels the SPLICE_F_MOVE would be ignored and a copy
+ * would occur.
+ */
+
+ret = vmsplice(s-pipefd[0], iov, 1, SPLICE_F_MOVE);
+if (ret == -1) {
+if (errno != EAGAIN  errno != EINTR) {
+fprintf(stderr, vmsplice() load error: %s, strerror(errno));
+return ret;
+}
+DPRINTF(vmsplice load error\n);
+} else if (ret == 0) {
+DPRINTF(stderr, load_page: zero read\n);
+}
+
+DPRINTF(vmsplice (read): %zu\n, ret);
+return ret;
+}
+
+return -EINVAL;
+}
+
+
 static const QEMUFileOps pipe_read_ops = {
 .get_fd= qemu_local_get_sockfd,
 .get_buffer= qemu_local_get_buffer,
 .close = qemu_local_close,
+.hook_ram_load = qemu_local_ram_load
 };
 
 static const QEMUFileOps pipe_write_ops = {
-- 
1.7.7.6




Re: [Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping

2013-11-28 Thread Lei Li

On 11/26/2013 10:11 PM, Paolo Bonzini wrote:

Il 26/11/2013 14:53, Lei Li ha scritto:

1) ram_save_setup stage, it will send all the bytes in this stages
to destination, and send_pipefd by ram_control_before_iterate
at the end of it.

ram_save_setup doesn't send anything from guest RAM.  It sends the
lengths of the various blocks.  As you said, at the end of
ram_save_setup you send the pipefd.

ram_save_iterate runs before ram_save_complete.  ram_save_iterate and
ram_save_complete write data with exactly the same format.  Both of them
can use ram_save_page

It should not matter if some pages are sent as part of ram_save_iterate
and others as part of ram_save_complete.

One possibility is that you are hitting a bug due to the way you ignore
the 0x01 byte that send_pipefd places on the socket.


Oops.  I might have said this before thinking about postcopy and/or
before seeing the benchmark results from Juan's patches.  If this part
of the patch is just an optimization, I'd rather leave it out for now.

I am afraid that page flipping can not proceed correctly without this..

I really would like to understand why, because it really shouldn't (this
shouldn't be a place where you need a hook).


Hi Paolo,

Sorry for the late reply.

Yes, you are right!!  I just tried it with this adjustment removed, and it
works well...

I remember that, when debugging a previous version, it could not proceed
correctly without this, although in theory it should have, as in your
explanation above. I guess the only answer is that there was a bug regarding
the one-byte fd control message, just like the possibility you listed!
 



Paolo





--
Lei




Re: [Qemu-devel] [PATCH 0/17 v3] Localhost migration with side channel for ram

2013-11-26 Thread Lei Li

On 11/25/2013 05:48 PM, Paolo Bonzini wrote:

Il 25/11/2013 08:29, Lei Li ha scritto:


In this case, if the migration fails just because of a misconfiguration
of device state on the destination, while the outgoing migration is
not aware of this failure, I think such handling (like synchronizing
the device state list on the incoming side?) should be added to the current
migration protocol, as it is kind of missing... It cannot just rely on resuming
the source guest for such a failure... or maybe it should be handled in the
management app to force the configuration to be right?

It is already handled by libvirt, indeed.

Basically, -incoming without -S is a broken option because of the
missing handshake at the end of migration.  With -S something else
(either a human or a program) can check that everything went well and
choose whether to restart the source or the destination.


I see, thanks for your explanation.  :-)

BTW, do you think we should add such handling to the current migration
protocol?




Postcopy would fix this (assuming the postcopy phase is reliable) by
migrating device data before any page flipping occurs.

Are you suggesting that page flipping should be coupled with the postcopy
migration for live upgrade of QEMU as your comments in the previous
version?

In order to make live upgrade reliable, it should.


The whole procedure for page flipping migration is straightforward, and
the failure cases I listed are theoretical; they have never happened in the
many times I have tested (except the case you raised above). But I
agree with you on coupling with postcopy migration to make it more reliable,
especially for undetected problems.

For this, I am not quite sure I understand it correctly, seems the latest
update of post copy migration was sent on last Oct, would you please give
some insights on what else could I do for the coupling with postcopy migration?

If no, now page flipping is implemented as a migration capability, and it's
a good shape already as your comments in the previous version. Although it
still needs a little more time to get the numbers of the new vmsplice, I'd to
ask your opinion that do you consider it could be merged as an experimental
version for now?




Paolo




--
Lei




Re: [Qemu-devel] [PATCH 08/17] add unix_msgfd_lookup() to callback get_buffer

2013-11-26 Thread Lei Li

On 11/21/2013 05:11 PM, Lei Li wrote:

The control message for the exchange of the pipe file descriptor should
be received by recvmsg, and it might be consumed as stream data by
qemu_recv() when receiving in two callbacks. So this patch adds
unix_msgfd_lookup() to the get_buffer callback as the only receiver,
where the pipe file descriptor will be caught.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  migration-local.c |   68 ++--
  1 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index e028beb..0f0896b 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -50,6 +50,8 @@ typedef struct QEMUFileLocal {
  bool unix_page_flipping;
  } QEMUFileLocal;

+static bool pipefd_passed;
+
  static int qemu_local_get_sockfd(void *opaque)
  {
  QEMUFileLocal *s = opaque;
@@ -57,16 +59,76 @@ static int qemu_local_get_sockfd(void *opaque)
  return s-sockfd;
  }

+static int unix_msgfd_lookup(void *opaque, struct msghdr *msg)
+{
+QEMUFileLocal *s = opaque;
+struct cmsghdr *cmsg;
+bool found = false;
+
+for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
+cmsg-cmsg_level != SOL_SOCKET ||
+cmsg-cmsg_type != SCM_RIGHTS)
+continue;
+
+/* PIPE file descriptor to be received */
+s-pipefd[0] = *((int *)CMSG_DATA(cmsg));
+}
+
+if (s-pipefd[0] = 0) {


And this should be if (s->pipefd[0] < 0)..


+fprintf(stderr, no pipe fd can be received\n);
+return found;
+}
+
+DPRINTF(pipefd successfully received\n);
+return s-pipefd[0];
+}
+
  static int qemu_local_get_buffer(void *opaque, uint8_t *buf,
   int64_t pos, int size)
  {
  QEMUFileLocal *s = opaque;
  ssize_t len;
+struct msghdr msg = { NULL, };
+struct iovec iov[1];
+union {
+struct cmsghdr cmsg;
+char control[CMSG_SPACE(sizeof(int))];
+} msg_control;
+
+iov[0].iov_base = buf;
+iov[0].iov_len = size;
+
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);

  for (;;) {
-len = qemu_recv(s-sockfd, buf, size, 0);
-if (len != -1) {
-break;
+if (!pipefd_passed) {
+/*
+ * recvmsg is called here to catch the control message for
+ * the exchange of PIPE file descriptor until it is received.
+ */
+len = recvmsg(s-sockfd, msg, 0);
+if (len != -1) {
+if (unix_msgfd_lookup(s, msg)  0) {
+pipefd_passed = 1;
+/*
+ * Do not count one byte taken by the PIPE file
+ * descriptor.
+ */
+len--;
+} else {
+len = -1;
+}


Just found that this 'else' should go away, as it will break the normal
Unix migration since pipefd_passed will always be 0 for it. I have
fixed this in my code; it seems I sent the wrong version for some reason,
sorry for this...:-[  


+break;
+}
+} else {
+len = qemu_recv(s-sockfd, buf, size, 0);
+if (len != -1) {
+break;
+}
  }

  if (socket_error() == EAGAIN) {



--
Lei




Re: [Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping

2013-11-26 Thread Lei Li

On 11/26/2013 07:32 PM, Paolo Bonzini wrote:

Il 21/11/2013 10:11, Lei Li ha scritto:

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  migration.c |   10 +++---
  1 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/migration.c b/migration.c
index 4ac466b..0f98ac1 100644
--- a/migration.c
+++ b/migration.c
@@ -579,10 +579,11 @@ static void *migration_thread(void *opaque)
  pending_size = qemu_savevm_state_pending(s-file, max_size);
  DPRINTF(pending size % PRIu64  max % PRIu64 \n,
  pending_size, max_size);
-if (pending_size  pending_size = max_size) {
+if (pending_size  pending_size = max_size 
+!runstate_needs_reset()) {
  qemu_savevm_state_iterate(s-file);

I'm not sure why you need this.


The adjustment here is to avoid the iteration stage for page flipping,
because pending_size = ram_save_remaining() * TARGET_PAGE_SIZE, which is
not 0, and pending_size > max_size (0) at start.

In the previous version it was like this:

if (pending_size  pending_size = max_size 
!migrate_unix_page_flipping()) {

And you said 'This is a bit ugly but I understand the need. Perhaps 
!runstate_needs_reset() like below?' :)




  } else {
-int ret;
+int ret = 0;
  
  DPRINTF(done iterating\n);

  qemu_mutex_lock_iothread();
@@ -590,7 +591,10 @@ static void *migration_thread(void *opaque)
  qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
  old_vm_running = runstate_is_running();
  
-ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);

+if (!runstate_needs_reset()) {
+ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+}

This however is okay.

Paolo


  if (ret = 0) {
  qemu_file_set_rate_limit(s-file, INT_MAX);
  qemu_savevm_state_complete(s-file);






--
Lei




Re: [Qemu-devel] [PATCH 10/17] migration-local: override save_page for page transmit

2013-11-26 Thread Lei Li

On 11/26/2013 07:22 PM, Paolo Bonzini wrote:

Il 21/11/2013 10:11, Lei Li ha scritto:

This patch implements save_page callback for the outside
of page flipping. It will write the address of the page
on the Unix socket and flip the page data on pipe by
vmsplice(). Every page address would have a header flag
RAM_SAVE_FLAG_HOOK, which will trigger the load hook to
receive it in incoming side as well.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  migration-local.c |   54 +
  1 files changed, 54 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 0f0896b..14207e9 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -200,6 +200,59 @@ static int qemu_local_send_pipefd(QEMUFile *f, void 
*opaque,
  return 0;
  }
  
+static size_t qemu_local_save_ram(QEMUFile *f, void *opaque,

+  MemoryRegion *mr, ram_addr_t offset,
+  size_t size, int *bytes_sent)
+{
+QEMUFileLocal *s = opaque;
+ram_addr_t current_addr = mr-ram_addr + offset;
+void *ram_addr;
+ssize_t ret;
+
+if (s-unix_page_flipping) {
+qemu_fflush(s-file);
+qemu_put_be64(s-file, RAM_SAVE_FLAG_HOOK);
+
+/* Write page address to unix socket */
+qemu_put_be64(s-file, current_addr);
+

You can write current_addr | RAM_SAVE_FLAG_HOOK.  The value will be in
the flags argument of the hook_ram_load, you can extract it with flags
& ~RAM_SAVE_FLAG_HOOK.  This cuts by half the data written to the Unix
socket.


OK, thanks.


Paolo


+ram_addr = memory_region_get_ram_ptr(mr) + offset;
+
+/* vmsplice page data to pipe */
+struct iovec iov = {
+.iov_base = ram_addr,
+.iov_len  = size,
+};
+
+/*
+ * The flag SPLICE_F_MOVE is introduced in kernel for the page
+ * flipping feature in QEMU, which will movie pages rather than
+ * copying, previously unused.
+ *
+ * If a move is not possible the kernel will transparently falls
+ * back to copying data.
+ *
+ * For older kernels the SPLICE_F_MOVE would be ignored and a copy
+ * would occur.
+ */
+ret = vmsplice(s-pipefd[1], iov, 1, SPLICE_F_GIFT | SPLICE_F_MOVE);
+if (ret == -1) {
+if (errno != EAGAIN  errno != EINTR) {
+fprintf(stderr, vmsplice save error: %s\n, strerror(errno));
+return ret;
+}
+} else {
+if (bytes_sent) {
+*bytes_sent = 1;
+}
+DPRINTF(block_offset: %lu, offset: %lu\n, block_offset, offset);
+return 0;
+}
+}
+
+return RAM_SAVE_CONTROL_NOT_SUPP;
+}
+
  static const QEMUFileOps pipe_read_ops = {
  .get_fd= qemu_local_get_sockfd,
  .get_buffer= qemu_local_get_buffer,
@@ -211,6 +264,7 @@ static const QEMUFileOps pipe_write_ops = {
  .writev_buffer  = qemu_local_writev_buffer,
  .close  = qemu_local_close,
  .before_ram_iterate = qemu_local_send_pipefd,
+.save_page  = qemu_local_save_ram
  };
  
  QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode)







--
Lei




Re: [Qemu-devel] [PATCH 12/17] migration-local: override hook_ram_load

2013-11-26 Thread Lei Li

On 11/26/2013 07:25 PM, Paolo Bonzini wrote:

Il 21/11/2013 10:11, Lei Li ha scritto:

+static int qemu_local_ram_load(QEMUFile *f, void *opaque,
+   uint64_t flags)
+{
+QEMUFileLocal *s = opaque;
+ram_addr_t addr;
+struct iovec iov;
+ssize_t ret = -EINVAL;
+
+/*
+ * PIPE file descriptor will be received by another callback
+ * get_buffer.
+ */
+if (pipefd_passed) {
+void *host;
+/*
+ * Extract the page address from the 8-byte record and
+ * read the page data from the pipe.
+ */
+addr = qemu_get_be64(s-file);
+host = qemu_get_ram_ptr(addr);
+
+iov.iov_base = host;
+iov.iov_len = TARGET_PAGE_SIZE;
+
+/* The flag SPLICE_F_MOVE is introduced in kernel for the page
+ * flipping feature in QEMU, which will movie pages rather than
+ * copying, previously unused.
+ *
+ * If a move is not possible the kernel will transparently falls
+ * back to copying data.
+ *
+ * For older kernels the SPLICE_F_MOVE would be ignored and a copy
+ * would occur.
+ */
+ret = vmsplice(s-pipefd[0], iov, 1, SPLICE_F_MOVE);
+if (ret == -1) {
+if (errno != EAGAIN  errno != EINTR) {
+fprintf(stderr, vmsplice() load error: %s, strerror(errno));
+return ret;
+}
+DPRINTF(vmsplice load error\n);
+} else if (ret == 0) {
+DPRINTF(stderr, load_page: zero read\n);
+}
+
+DPRINTF(vmsplice (read): %zu\n, ret);
+return ret;
+}
+
+return 0;
+}

I think you need to return -EINVAL if there is no pipe.


Yes, you are right..



Paolo




--
Lei




Re: [Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping

2013-11-26 Thread Lei Li

On 11/26/2013 08:54 PM, Paolo Bonzini wrote:

Il 26/11/2013 13:03, Lei Li ha scritto:

+if (pending_size  pending_size = max_size 
+!runstate_needs_reset()) {
   qemu_savevm_state_iterate(s-file);

I'm not sure why you need this.

The adjustment here is to avoid the iteration stage for page flipping,
because pending_size = ram_save_remaining() * TARGET_PAGE_SIZE, which is
not 0, and pending_size > max_size (0) at start.

It's still not clear to me that avoiding the iteration stage is


The purpose of it is not just for optimization, but to avoid the
iteration for better alignment.

The current flow of page flipping basically has two stages:

1) ram_save_setup stage, it will send all the bytes in this stages
   to destination, and send_pipefd by ram_control_before_iterate
   at the end of it.
2) ram_save_complete, it will start to transmit the ram page
   in ram_save_block, and send the device state after that.

So it needs to adjust the current migration process to avoid
the iteration stage.


necessary.  I think it's just an optimization to avoid scanning the
bitmap, but:

(1) Juan's bitmap optimization will make this mostly unnecessary

(2) getting good downtime from page flipping will require postcopy anyway.


And you said 'This is a bit ugly but I understand the need. Perhaps 
!runstate_needs_reset() like below?' :)

Oops.  I might have said this before thinking about postcopy and/or
before seeing the benchmark results from Juan's patches.  If this part
of the patch is just an optimization, I'd rather leave it out for now.


I am afraid that page flipping can not proceed correctly without this..



Thanks for putting up with me. :)

Paolo




--
Lei




Re: [Qemu-devel] [PATCH 08/17] add unix_msgfd_lookup() to callback get_buffer

2013-11-26 Thread Lei Li

On 11/26/2013 07:31 PM, Paolo Bonzini wrote:

Il 21/11/2013 10:11, Lei Li ha scritto:

+/*
+ * recvmsg is called here to catch the control message for
+ * the exchange of PIPE file descriptor until it is received.
+ */
+len = recvmsg(s-sockfd, msg, 0);
+if (len != -1) {
+if (unix_msgfd_lookup(s, msg)  0) {
+pipefd_passed = 1;
+/*
+ * Do not count one byte taken by the PIPE file
+ * descriptor.
+ */
+len--;

I think adding a byte in the middle of the stream is not reliable.

Rather, you should transmit the socket always at the same place, for
example in the first call of qemu_local_save_ram, after it has written
the 64-bit field.


I guess by 'transmit the socket' you mean transmit the fd?

Sorry, I don't quite understand your suggestion here.. Do you
mean calling send_pipefd in the first call of qemu_local_save_ram,
after it has written the 64-bit field, and in this way getting rid of
qemu_local_send_pipefd?

Currently, the fd control message is sent at the end of the stream
in ram_save_setup stage, followed by the ram page. The control
message of fd is always at the same place.



The matching code in qemu_local_ram_load will be like this:

static int qemu_local_ram_load(QEMUFile *f, void *opaque,
uint64_t flags)
{
 QEMUFileLocal *s = opaque;
 ram_addr_t addr;
 struct iovec iov;
 ssize_t ret = -EINVAL;

 if (!s-pipefd_received) {
 /*
  * send_pipefd was called at this point, and it wrote one byte
  * to the stream.
  */
 qemu_get_byte(s);
 s-pipefd_received = true;
 }

 if (pipefd_passed) {
 ...
 }
 return -EINVAL;
}

Also, please move pipefd_passed within QEMUFileLocal.

Thanks,

Paolo




--
Lei




Re: [Qemu-devel] [PATCH 14/17] add new RanState RAN_STATE_MEMORY_STALE

2013-11-26 Thread Lei Li

On 11/26/2013 08:28 PM, Paolo Bonzini wrote:

Il 21/11/2013 10:11, Lei Li ha scritto:
  
  { RUN_STATE_DEBUG, RUN_STATE_SUSPENDED },

DEBUG -> MEMORY_STALE is missing.


Good catch, I will add it, thanks. :)



Paolo


  { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED },
  { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING },
  { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_SUSPENDED, RUN_STATE_MEMORY_STALE },
  





--
Lei




Re: [Qemu-devel] [PATCH 0/17 v3] Localhost migration with side channel for ram

2013-11-24 Thread Lei Li

On 11/22/2013 07:36 PM, Paolo Bonzini wrote:

Il 22/11/2013 12:29, Lei Li ha scritto:

During the page flipping migration, ram page of source guest would
be flipped to the destination, that's why the source guest can not
be resumed. AFAICT, the page flipping migration may fail at the
connection stage (including the exchange of pipe fd) and migration
register stage (say any blocker like unsupported migration device),

Unfortunately, some migration problems (e.g. misconfiguration of the
destination QEMU) cannot be detected until the device data is migrated.
  This happens after RAM migration, so there is indeed a reliability problem.


Hi Paolo,

'Some migration problems cannot be detected until the device data is migrated',
do you mean that the outgoing migration has no idea the failure of incoming
side caused by the misconfiguration of the destination QEMU?

In this case, if the migration fails just because of a misconfiguration
of device state on the destination, while the outgoing migration is
not aware of this failure, I think such handling (like synchronizing
the device state list on the incoming side?) should be added to the current
migration protocol, as it is kind of missing... It cannot just rely on resuming
the source guest for such a failure... or maybe it should be handled in the
management app to force the configuration to be right?



Postcopy would fix this (assuming the postcopy phase is reliable) by
migrating device data before any page flipping occurs.


Are you suggesting that page flipping should be coupled with the postcopy
migration for live upgrade of QEMU as your comments in the previous version?



Paolo


but it could be resumed for such situation since the memory has not
been flipped to another content. Once the connection is successfully
setup, it would proceed the transmission of ram page which hardly
fails. And for the failure handling in Libvirt, ZhengSheng has proposed
that restarts the old QEMU instead of resume. I know 'hardly' is not
an good answer to your concern, but it is the cost of the limited
memory IMO.

So if downtime is the key to the user, or if it's *zero toleration of
the restarting of QEMU, page flipping migration might not be a good
choice. From the perspective of management app like Libvirt, as the
'live upgrade' of QEMU will be done through localhost migration, and
there are other migration solutions which have lower downtime, like
the real live migration and the postcopy migration that Paolo mentioned
in the previous version [3]. Why not have more than one choice for it?





--
Lei




Re: [Qemu-devel] [PATCH 0/17 v3] Localhost migration with side channel for ram

2013-11-22 Thread Lei Li

On 11/21/2013 06:19 PM, Daniel P. Berrange wrote:

On Thu, Nov 21, 2013 at 05:11:23PM +0800, Lei Li wrote:

This patch series tries to introduce a mechanism using side
channel pipe for RAM via SCM_RIGHTS with unix domain socket
protocol migration.

This side channel is used for the page flipping by vmsplice,
which is the internal mechanism for localhost migration that
we are trying to add to QEMU. The background info and previous
patch series for reference,

Localhost migration
http://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg02916.html

migration: Introduce side channel for RAM
http://lists.gnu.org/archive/html/qemu-devel/2013-09/msg04043.html

I have picked patches from the localhost migration series and rebased
it on the series of side channel, now it is a complete series that
passed the basic test.

Please let me know if there is anything needs to be fixed or improved.
Your suggestions and comments are very welcome, and thanks to Paolo
for his continued review and useful suggestions.

In discussions about supporting this for libvirt, we were told that
when this localhost migration fails, you cannot re-start the guest
on the original source QEMU.

If this is true, this implementation is not satisfactory IMHO. One
of the main motivations of this feature is to allow for in-place
live upgrades of QEMU binaries, for people who can't tolerate the
downtime of restarting their guests, and whom don't have a spare
host to migrate them to.

If people are using this because they can't tolerate any downtime
of the guest, then we need to be able to fully deal with failure to
complete migration by switching back to the original QEMU process,
as we can do with normal non-localhost migration.


Hi Daniel,

Page flipping is introduced here not primarily for low downtime, but
more to avoid requiring that there is enough free memory to fit an
additional copy of the largest guest which is the requirement today
with current localhost migration as the additional explanation from
Anthony in first proposal version [1].

Of course low downtime is also important to the page flipping
migration as the use case of it is to allow 'live' upgrade of a
running QEMU instance, so we expect page flipping through vmsplice
is fast enough to meet it. As an initial implementation of this
feature right now, the downtime is not good, but we are working on
it as there has been some work on kernel side [2].

During the page flipping migration, ram page of source guest would
be flipped to the destination, that's why the source guest can not
be resumed. AFAICT, the page flipping migration may fail at the
connection stage (including the exchange of pipe fd) and migration
register stage (say any blocker like unsupported migration device),
but it could be resumed for such situation since the memory has not
been flipped to another content. Once the connection is successfully
set up, it would proceed to the transmission of ram pages, which hardly
fails. And for the failure handling in Libvirt, ZhengSheng has proposed
restarting the old QEMU instead of resuming it. I know 'hardly' is not
a good answer to your concern, but it is the cost of the limited
memory IMO.

So if downtime is the key to the user, or if there is *zero* tolerance for
the restarting of QEMU, page flipping migration might not be a good
choice. From the perspective of management app like Libvirt, as the
'live upgrade' of QEMU will be done through localhost migration, and
there are other migration solutions which have lower downtime, like
the real live migration and the postcopy migration that Paolo mentioned
in the previous version [3]. Why not have more than one choice for it?


[1]http://lists.gnu.org/archive/html/qemu-devel/2013-06/msg02577.html
[2]http://article.gmane.org/gmane.linux.kernel/1574277
[3]http://lists.gnu.org/archive/html/qemu-devel/2013-10/msg03212.html


Regards,
Daniel



--
Lei




Re: [Qemu-devel] [PATCH 0/17 v2] Localhost migration with side channel for ram

2013-11-21 Thread Lei Li

On 10/25/2013 08:24 PM, Lei Li wrote:

On 10/25/2013 03:30 PM, Paolo Bonzini wrote:

Il 25/10/2013 06:58, Lei Li ha scritto:

Right now I just have inaccurate numbers without the new vmsplice, which are
based on the result from info migrate. As the guest ram size increases,
although the 'total time' is several times less compared with the current
live migration, the 'downtime' performs badly.

Of course.

For a 1GB ram guest,

total time: 702 milliseconds
downtime: 692 milliseconds

And when the ram size of the guest increases exponentially, those numbers
are proportional to it.
I will make a list of the performance with the new vmsplice later; I am
sure it'd be much better than this at least.

Yes, please.  Is the memory usage is still 2x without vmsplice?

I think you have a nice proof of concept, but on the other hand this
probably needs to be coupled with some kind of postcopy live migration,
that is:

* the source starts sending data

* but the destination starts running immediately

* if the machine needs a page that is missing, the destination asks the
source to send it

* as soon as it arrives, the destination can restart

Using postcopy is problematic for reliability: if the destination fails,
the virtual machine is lost because the source doesn't have the latest
content of memory.  However, this is a much, much smaller problem for
live QEMU upgrade where the network cannot fail.

If you do this, you can achieve pretty much instantaneous live upgrade,
well within your original 200 ms goals.  But the flipping code with
vmsplice should be needed anyway to avoid doubling memory usage, and


Yes, I have read the postcopy migration patches, it does perform very
good on downtime, as just send the vmstates then switch the execution
to destination host. And as you pointed out, it can not avoid
doubling memory usage.

The numbers listed above are based on the old vmsplice, as I have not yet
worked on the performance benchmark; it actually copies data rather
than moving it. As the feedback for this version is positive, I am now
trying to get a real result out with the new vmsplice.

BTW, kernel side is looking for huge page solution for the improvement of
performance.

The recently patches from kernel as link,

http://article.gmane.org/gmane.linux.kernel/1574277


Hi Paolo,

I have been working on the performance benchmark; I am afraid that it
may take a bit more time, as there are some problems with the new vmsplice
which the kernel side is working on right now.

I will post a v3 of the series soon, with your comments on the previous
version addressed.




it's looking pretty good in this version already!  I'm relieved that the
RDMA code was designed right!


I am happy with it too. :)
Those RDMA hooks really make things more flexible!



Paolo







--
Lei




[Qemu-devel] [PATCH 02/17] migration: add migrate_unix_page_flipping()

2013-11-21 Thread Lei Li
Add migrate_unix_page_flipping() to check if
MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING is enabled.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 include/migration/migration.h |3 +++
 migration.c   |9 +
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index 140e6b4..7e5d01a 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -131,10 +131,13 @@ void migrate_add_blocker(Error *reason);
 void migrate_del_blocker(Error *reason);
 
 bool migrate_rdma_pin_all(void);
+
 bool migrate_zero_blocks(void);
 
 bool migrate_auto_converge(void);
 
+bool migrate_unix_page_flipping(void);
+
 int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
  uint8_t *dst, int dlen);
 int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
diff --git a/migration.c b/migration.c
index 2b1ab20..4ac466b 100644
--- a/migration.c
+++ b/migration.c
@@ -541,6 +541,15 @@ int64_t migrate_xbzrle_cache_size(void)
 return s-xbzrle_cache_size;
 }
 
+bool migrate_unix_page_flipping(void)
+{
+MigrationState *s;
+
+s = migrate_get_current();
+
+return s-enabled_capabilities[MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING];
+}
+
 /* migration thread support */
 
 static void *migration_thread(void *opaque)
-- 
1.7.7.6




[Qemu-devel] [PATCH 08/17] add unix_msgfd_lookup() to callback get_buffer

2013-11-21 Thread Lei Li
The control message for the exchange of the pipe file descriptor should
be received by recvmsg, and it might be swallowed into the stream file by
qemu_recv() when there are two receiving callbacks. So this patch adds
unix_msgfd_lookup() to callback get_buffer as the only receiver,
where the pipe file descriptor will be caught.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   68 ++--
 1 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index e028beb..0f0896b 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -50,6 +50,8 @@ typedef struct QEMUFileLocal {
 bool unix_page_flipping;
 } QEMUFileLocal;
 
+static bool pipefd_passed;
+
 static int qemu_local_get_sockfd(void *opaque)
 {
 QEMUFileLocal *s = opaque;
@@ -57,16 +59,76 @@ static int qemu_local_get_sockfd(void *opaque)
 return s-sockfd;
 }
 
+static int unix_msgfd_lookup(void *opaque, struct msghdr *msg)
+{
+QEMUFileLocal *s = opaque;
+struct cmsghdr *cmsg;
+bool found = false;
+
+for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
+cmsg-cmsg_level != SOL_SOCKET ||
+cmsg-cmsg_type != SCM_RIGHTS)
+continue;
+
+/* PIPE file descriptor to be received */
+s-pipefd[0] = *((int *)CMSG_DATA(cmsg));
+}
+
+if (s-pipefd[0] = 0) {
+fprintf(stderr, no pipe fd can be received\n);
+return found;
+}
+
+DPRINTF(pipefd successfully received\n);
+return s-pipefd[0];
+}
+
 static int qemu_local_get_buffer(void *opaque, uint8_t *buf,
  int64_t pos, int size)
 {
 QEMUFileLocal *s = opaque;
 ssize_t len;
+struct msghdr msg = { NULL, };
+struct iovec iov[1];
+union {
+struct cmsghdr cmsg;
+char control[CMSG_SPACE(sizeof(int))];
+} msg_control;
+
+iov[0].iov_base = buf;
+iov[0].iov_len = size;
+
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
 
 for (;;) {
-len = qemu_recv(s-sockfd, buf, size, 0);
-if (len != -1) {
-break;
+if (!pipefd_passed) {
+/*
+ * recvmsg is called here to catch the control message for
+ * the exchange of PIPE file descriptor until it is received.
+ */
+len = recvmsg(s-sockfd, msg, 0);
+if (len != -1) {
+if (unix_msgfd_lookup(s, msg)  0) {
+pipefd_passed = 1;
+/*
+ * Do not count one byte taken by the PIPE file
+ * descriptor.
+ */
+len--;
+} else {
+len = -1;
+}
+break;
+}
+} else {
+len = qemu_recv(s-sockfd, buf, size, 0);
+if (len != -1) {
+break;
+}
 }
 
 if (socket_error() == EAGAIN) {
-- 
1.7.7.6




[Qemu-devel] [PATCH 01/17] QAPI: introduce migration capability unix_page_flipping

2013-11-21 Thread Lei Li
Introduce unix_page_flipping to MigrationCapability for
localhost migration.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qapi-schema.json |   10 +-
 1 files changed, 9 insertions(+), 1 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index 83fa485..b290a0f 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -685,10 +685,18 @@
 # @auto-converge: If enabled, QEMU will automatically throttle down the guest
 #  to speed up convergence of RAM migration. (since 1.6)
 #
+# @unix-page-flipping: If enabled, QEMU can optimize migration when the
+#  destination is a QEMU process that runs on the same host as
+#  the source (as is the case for live upgrade).  If the migration
+#  transport is a Unix socket, QEMU will flip RAM pages directly to
+#  the destination, so that memory is only allocated twice for the
+#  source and destination processes. Disabled by default. (since 1.8)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] }
+  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks',
+   'unix-page-flipping'] }
 
 ##
 # @MigrationCapabilityStatus
-- 
1.7.7.6




[Qemu-devel] [PATCH 05/17] migration-local: introduce qemu_fopen_socket_local()

2013-11-21 Thread Lei Li
Add qemu_fopen_socket_local() to open the QEMUFileLocal introduced
earlier. It will create a pipe in write mode if unix_page_flipping
is enabled. Also adjust qemu_local_close() to close the pipe as well.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 include/migration/qemu-file.h |2 +
 migration-local.c |   46 +
 2 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 0f757fb..f9b104a 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -99,6 +99,8 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode);
 QEMUFile *qemu_fdopen(int fd, const char *mode);
 QEMUFile *qemu_fopen_socket(int fd, const char *mode);
 QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
+QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode);
+
 int qemu_get_fd(QEMUFile *f);
 int qemu_fclose(QEMUFile *f);
 int64_t qemu_ftell(QEMUFile *f);
diff --git a/migration-local.c b/migration-local.c
index 8b9e10e..28da05b 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -103,6 +103,12 @@ static int qemu_local_close(void *opaque)
 QEMUFileLocal *s = opaque;
 
 closesocket(s-sockfd);
+
+if (s-unix_page_flipping) {
+close(s-pipefd[0]);
+close(s-pipefd[1]);
+}
+
 g_free(s);
 
 return 0;
@@ -119,3 +125,43 @@ static const QEMUFileOps pipe_write_ops = {
 .writev_buffer  = qemu_local_writev_buffer,
 .close  = qemu_local_close,
 };
+
+QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode)
+{
+QEMUFileLocal *s;
+int pipefd[2];
+
+if (qemu_file_mode_is_not_valid(mode)) {
+return NULL;
+}
+
+s = g_malloc0(sizeof(QEMUFileLocal));
+s-sockfd = sockfd;
+
+if (migrate_unix_page_flipping()) {
+s-unix_page_flipping = 1;
+}
+
+if (mode[0] == 'w') {
+if (s-unix_page_flipping) {
+if (pipe(pipefd)  0) {
+fprintf(stderr, failed to create PIPE\n);
+goto fail;
+}
+
+s-pipefd[0] = pipefd[0];
+s-pipefd[1] = pipefd[1];
+}
+
+qemu_set_block(s-sockfd);
+s-file = qemu_fopen_ops(s, pipe_write_ops);
+} else {
+s-file = qemu_fopen_ops(s, pipe_read_ops);
+}
+
+return s-file;
+
+fail:
+g_free(s);
+return NULL;
+}
-- 
1.7.7.6




[Qemu-devel] [PATCH 03/17] qmp-command.hx: add missing docs for migration capabilities

2013-11-21 Thread Lei Li
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qmp-commands.hx |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/qmp-commands.hx b/qmp-commands.hx
index fba15cd..dcec433 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -2898,6 +2898,10 @@ migrate-set-capabilities
 Enable/Disable migration capabilities
 
 - xbzrle: XBZRLE support
+- x-rdma-pin-all: Pin all pages during RDMA support
+- zero-blocks: Compress zero blocks during block migration
+- auto-converge: Block VCPU to help convergence of migration
+- unix-page-flipping: Page flipping for live QEMU upgrade
 
 Arguments:
 
@@ -2922,6 +2926,10 @@ Query current migration capabilities
 
 - capabilities: migration capabilities state
  - xbzrle : XBZRLE state (json-bool)
+ - x-rdma-pin-all: RDMA state (json-bool)
+ - zero-blocks: zero-blocks state (json-bool)
+ - auto-converge: Auto converge state (json-bool)
+ - unix-page-flipping: Page flipping state (json-bool)
 
 Arguments:
 
-- 
1.7.7.6




[Qemu-devel] [PATCH 07/17] migration-local: override before_ram_iterate to send pipefd

2013-11-21 Thread Lei Li
Override before_ram_iterate to send pipefd. It will qemu_fflush
the stream QEMUFile and send it in RAM_CONTROL_SETUP stage.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   25 +
 1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index f4265a1..e028beb 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -114,6 +114,30 @@ static int qemu_local_close(void *opaque)
 return 0;
 }
 
+static int send_pipefd(int sockfd, int pipefd);
+
+static int qemu_local_send_pipefd(QEMUFile *f, void *opaque,
+  uint64_t flags)
+{
+QEMUFileLocal *s = opaque;
+int ret;
+
+if (s-unix_page_flipping) {
+/* Avoid sending pipe fd again in ram_save_complete() stage */
+if (flags == RAM_CONTROL_SETUP) {
+qemu_fflush(f);
+ret = send_pipefd(s-sockfd, s-pipefd[0]);
+if (ret  0) {
+fprintf(stderr, failed to pass PIPE\n);
+return ret;
+}
+DPRINTF(PIPE fd was sent\n);
+}
+}
+
+return 0;
+}
+
 static const QEMUFileOps pipe_read_ops = {
 .get_fd= qemu_local_get_sockfd,
 .get_buffer= qemu_local_get_buffer,
@@ -124,6 +148,7 @@ static const QEMUFileOps pipe_write_ops = {
 .get_fd = qemu_local_get_sockfd,
 .writev_buffer  = qemu_local_writev_buffer,
 .close  = qemu_local_close,
+.before_ram_iterate = qemu_local_send_pipefd,
 };
 
 QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode)
-- 
1.7.7.6




[Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()

2013-11-21 Thread Lei Li
This patch adds send_pipefd() to pass the pipe file descriptor
to destination process.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   53 +
 1 files changed, 53 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 28da05b..f4265a1 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -165,3 +165,56 @@ fail:
 g_free(s);
 return NULL;
 }
+
+
+/*
+ * Pass a pipe file descriptor to another process.
+ *
+ * Return negative value If pipefd  0. Return 0 on
+ * success.
+ *
+ */
+static int send_pipefd(int sockfd, int pipefd)
+{
+struct msghdr msg;
+struct iovec iov[1];
+ssize_t ret;
+
+union {
+  struct cmsghdr cm;
+  char control[CMSG_SPACE(sizeof(int))];
+} control_un;
+struct cmsghdr *cmptr;
+char req[1] = { 0x01 };
+
+if (pipefd  0) {
+msg.msg_control = NULL;
+msg.msg_controllen = 0;
+/* Negative status means error */
+req[0] = pipefd;
+} else {
+msg.msg_control = control_un.control;
+msg.msg_controllen = sizeof(control_un.control);
+
+cmptr = CMSG_FIRSTHDR(msg);
+cmptr-cmsg_len = CMSG_LEN(sizeof(int));
+cmptr-cmsg_level = SOL_SOCKET;
+cmptr-cmsg_type = SCM_RIGHTS;
+*((int *) CMSG_DATA(cmptr)) = pipefd;
+
+msg.msg_name = NULL;
+msg.msg_namelen = 0;
+
+iov[0].iov_base = req;
+iov[0].iov_len = sizeof(req);
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+}
+
+ret = sendmsg(sockfd, msg, 0);
+if (ret = 0) {
+DPRINTF(sendmsg error: %s\n, strerror(errno));
+}
+
+return ret;
+}
-- 
1.7.7.6




[Qemu-devel] [PATCH 13/17] migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local

2013-11-21 Thread Lei Li
Replace qemu_fopen_socket with qemu_fopen_socket_local in Unix
protocol migration.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-unix.c |   18 ++
 1 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/migration-unix.c b/migration-unix.c
index 651fc5b..9beeafe 100644
--- a/migration-unix.c
+++ b/migration-unix.c
@@ -37,12 +37,22 @@ static void unix_wait_for_connect(int fd, void *opaque)
 if (fd  0) {
 DPRINTF(migrate connect error\n);
 s-file = NULL;
-migrate_fd_error(s);
+goto fail;
 } else {
 DPRINTF(migrate connect success\n);
-s-file = qemu_fopen_socket(fd, wb);
+
+s-file = qemu_fopen_socket_local(fd, wb);
+if (s-file == NULL) {
+fprintf(stderr, failed to open Unix socket\n);
+goto fail;
+}
+
 migrate_fd_connect(s);
+return;
 }
+
+fail:
+migrate_fd_error(s);
 }
 
 void unix_start_outgoing_migration(MigrationState *s, const char *path, Error 
**errp)
@@ -71,9 +81,9 @@ static void unix_accept_incoming_migration(void *opaque)
 goto out;
 }
 
-f = qemu_fopen_socket(c, rb);
+f = qemu_fopen_socket_local(c, rb);
 if (f == NULL) {
-fprintf(stderr, could not qemu_fopen socket\n);
+fprintf(stderr, failed to open Unix socket\n);
 goto out;
 }
 
-- 
1.7.7.6




[Qemu-devel] [PATCH 11/17] savevm: adjust ram_control_save_page for page flipping

2013-11-21 Thread Lei Li
As callback save_page will always be opened by
qemu_fopen_socket_local(), and without unix_page_flipping
it will return RAM_SAVE_CONTROL_NOT_SUPP, it leads to a
wrong qemu_file_set_error() based on the current logic.
So this patch adds RAM_SAVE_CONTROL_NOT_SUPP to the check.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 savevm.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/savevm.c b/savevm.c
index 3ee256e..4576145 100644
--- a/savevm.c
+++ b/savevm.c
@@ -668,7 +668,8 @@ size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, 
ram_addr_t offset,
 int ret = f-ops-save_page(f, f-opaque, mr, offset,
 size, bytes_sent);
 
-if (ret != RAM_SAVE_CONTROL_DELAYED) {
+if (ret != RAM_SAVE_CONTROL_DELAYED 
+ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (bytes_sent  *bytes_sent  0) {
 qemu_update_position(f, *bytes_sent);
 } else if (ret  0) {
-- 
1.7.7.6




[Qemu-devel] [PATCH 15/17] migration-unix: page flipping support on unix outgoing

2013-11-21 Thread Lei Li
Add page flipping support on unix outgoing part by stopping
VM with the new RunState RUN_STATE_MEMORY_STALE before
invoking migration if unix_page_flipping enabled.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-unix.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/migration-unix.c b/migration-unix.c
index 9beeafe..cbf2087 100644
--- a/migration-unix.c
+++ b/migration-unix.c
@@ -19,6 +19,7 @@
 #include migration/migration.h
 #include migration/qemu-file.h
 #include block/block.h
+#include sysemu/sysemu.h
 
 //#define DEBUG_MIGRATION_UNIX
 
@@ -33,6 +34,7 @@
 static void unix_wait_for_connect(int fd, void *opaque)
 {
 MigrationState *s = opaque;
+int ret;
 
 if (fd  0) {
 DPRINTF(migrate connect error\n);
@@ -47,6 +49,15 @@ static void unix_wait_for_connect(int fd, void *opaque)
 goto fail;
 }
 
+/* Stop VM before invoking migration if unix_page_flipping enabled */
+if (migrate_unix_page_flipping()) {
+ret = vm_stop_force_state(RUN_STATE_MEMORY_STALE);
+if (ret  0) {
+DPRINTF(failed to stop VM\n);
+goto fail;
+}
+}
+
 migrate_fd_connect(s);
 return;
 }
-- 
1.7.7.6




[Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping

2013-11-21 Thread Lei Li
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration.c |   10 +++---
 1 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/migration.c b/migration.c
index 4ac466b..0f98ac1 100644
--- a/migration.c
+++ b/migration.c
@@ -579,10 +579,11 @@ static void *migration_thread(void *opaque)
 pending_size = qemu_savevm_state_pending(s-file, max_size);
 DPRINTF(pending size % PRIu64  max % PRIu64 \n,
 pending_size, max_size);
-if (pending_size  pending_size = max_size) {
+if (pending_size  pending_size = max_size 
+!runstate_needs_reset()) {
 qemu_savevm_state_iterate(s-file);
 } else {
-int ret;
+int ret = 0;
 
 DPRINTF(done iterating\n);
 qemu_mutex_lock_iothread();
@@ -590,7 +591,10 @@ static void *migration_thread(void *opaque)
 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
 old_vm_running = runstate_is_running();
 
-ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+if (!runstate_needs_reset()) {
+ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+}
+
 if (ret = 0) {
 qemu_file_set_rate_limit(s-file, INT_MAX);
 qemu_savevm_state_complete(s-file);
-- 
1.7.7.6




[Qemu-devel] [PATCH 09/17] save_page: replace block_offset with a MemoryRegion

2013-11-21 Thread Lei Li
This patch exports MemoryRegion to save_page hook, replacing
argument ram_addr_t block_offset with a MemoryRegion suggested
by Paolo Bonzini.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 arch_init.c   |4 ++--
 include/migration/migration.h |2 +-
 include/migration/qemu-file.h |8 
 migration-rdma.c  |4 ++--
 savevm.c  |8 
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index e0acbc5..daaa519 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -485,8 +485,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 
 /* In doubt sent page as normal */
 bytes_sent = -1;
-ret = ram_control_save_page(f, block-offset,
-   offset, TARGET_PAGE_SIZE, bytes_sent);
+ret = ram_control_save_page(f, mr, offset, TARGET_PAGE_SIZE,
+bytes_sent);
 
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 7e5d01a..ca852a8 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -161,7 +161,7 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags);
 #define RAM_SAVE_CONTROL_NOT_SUPP -1000
 #define RAM_SAVE_CONTROL_DELAYED  -2000
 
-size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
+size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr,
  ram_addr_t offset, size_t size,
  int *bytes_sent);
 
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index f9b104a..6646e89 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -77,10 +77,10 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, 
uint64_t flags);
  * is saved (such as RDMA, for example.)
  */
 typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
-   ram_addr_t block_offset,
-   ram_addr_t offset,
-   size_t size,
-   int *bytes_sent);
+ MemoryRegion *mr,
+ ram_addr_t offset,
+ size_t size,
+ int *bytes_sent);
 
 typedef struct QEMUFileOps {
 QEMUFilePutBufferFunc *put_buffer;
diff --git a/migration-rdma.c b/migration-rdma.c
index f94f3b4..ae04de4 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -2699,7 +2699,7 @@ static int qemu_rdma_close(void *opaque)
  *  the protocol because most transfers are sent 
asynchronously.
  */
 static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
-  ram_addr_t block_offset, ram_addr_t offset,
+  MemoryRegion *mr, ram_addr_t offset,
   size_t size, int *bytes_sent)
 {
 QEMUFileRDMA *rfile = opaque;
@@ -2716,7 +2716,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void 
*opaque,
  * is full, or the page doen't belong to the current chunk,
  * an actual RDMA write will occur and a new chunk will be formed.
  */
-ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
+ret = qemu_rdma_write(f, rdma, mr-ram_addr, offset, size);
 if (ret  0) {
 fprintf(stderr, rdma migration: write error! %d\n, ret);
 goto err;
diff --git a/savevm.c b/savevm.c
index 2f631d4..3ee256e 100644
--- a/savevm.c
+++ b/savevm.c
@@ -661,12 +661,12 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags)
 }
 }
 
-size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
- ram_addr_t offset, size_t size, int *bytes_sent)
+size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset,
+ size_t size, int *bytes_sent)
 {
 if (f-ops-save_page) {
-int ret = f-ops-save_page(f, f-opaque, block_offset,
-offset, size, bytes_sent);
+int ret = f-ops-save_page(f, f-opaque, mr, offset,
+size, bytes_sent);
 
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
 if (bytes_sent  *bytes_sent  0) {
-- 
1.7.7.6




[Qemu-devel] [PATCH 17/17] hmp: better format for info migrate_capabilities

2013-11-21 Thread Lei Li
As there might be more capabilities introduced, better to display
it in lines.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 hmp.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/hmp.c b/hmp.c
index 32ee285..dcfa2f9 100644
--- a/hmp.c
+++ b/hmp.c
@@ -226,13 +226,12 @@ void hmp_info_migrate_capabilities(Monitor *mon, const 
QDict *qdict)
 caps = qmp_query_migrate_capabilities(NULL);
 
 if (caps) {
-monitor_printf(mon, capabilities: );
+monitor_printf(mon, Capabilities:\n);
 for (cap = caps; cap; cap = cap-next) {
-monitor_printf(mon, %s: %s ,
+monitor_printf(mon, %s: %s\n,
MigrationCapability_lookup[cap-value-capability],
cap-value-state ? on : off);
 }
-monitor_printf(mon, \n);
 }
 
 qapi_free_MigrationCapabilityStatusList(caps);
-- 
1.7.7.6




[Qemu-devel] [PATCH 04/17] migration-local: add QEMUFileLocal with socket based QEMUFile

2013-11-21 Thread Lei Li
This patch adds QEMUFileLocal with copy of socket based QEMUFile, will
be used as the basis code for Unix socket protocol migration and page
flipping migration.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 Makefile.target   |1 +
 migration-local.c |  121 +
 2 files changed, 122 insertions(+), 0 deletions(-)
 create mode 100644 migration-local.c

diff --git a/Makefile.target b/Makefile.target
index af6ac7e..aa09960 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -117,6 +117,7 @@ obj-$(CONFIG_KVM) += kvm-all.o
 obj-y += memory.o savevm.o cputlb.o
 obj-y += memory_mapping.o
 obj-y += dump.o
+obj-y += migration-local.o
 LIBS+=$(libs_softmmu)
 
 # xen support
diff --git a/migration-local.c b/migration-local.c
new file mode 100644
index 000..8b9e10e
--- /dev/null
+++ b/migration-local.c
@@ -0,0 +1,121 @@
+/*
+ * QEMU localhost migration with page flipping
+ *
+ * Copyright IBM, Corp. 2013
+ *
+ * Authors:
+ *   Lei Li   li...@linux.vnet.ibm.com
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include config-host.h
+#include qemu-common.h
+#include migration/migration.h
+#include exec/cpu-common.h
+#include config.h
+#include exec/cpu-all.h
+#include exec/memory.h
+#include exec/memory-internal.h
+#include monitor/monitor.h
+#include migration/qemu-file.h
+#include qemu/iov.h
+#include sysemu/arch_init.h
+#include sysemu/sysemu.h
+#include block/block.h
+#include qemu/sockets.h
+#include migration/block.h
+#include qemu/thread.h
+#include qmp-commands.h
+#include trace.h
+#include qemu/osdep.h
+
+//#define DEBUG_MIGRATION_LOCAL
+
+#ifdef DEBUG_MIGRATION_LOCAL
+#define DPRINTF(fmt, ...) \
+do { printf(migration-local:  fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+
+typedef struct QEMUFileLocal {
+QEMUFile *file;
+int sockfd;
+int pipefd[2];
+bool unix_page_flipping;
+} QEMUFileLocal;
+
+static int qemu_local_get_sockfd(void *opaque)
+{
+QEMUFileLocal *s = opaque;
+
+return s-sockfd;
+}
+
+static int qemu_local_get_buffer(void *opaque, uint8_t *buf,
+ int64_t pos, int size)
+{
+QEMUFileLocal *s = opaque;
+ssize_t len;
+
+for (;;) {
+len = qemu_recv(s-sockfd, buf, size, 0);
+if (len != -1) {
+break;
+}
+
+if (socket_error() == EAGAIN) {
+yield_until_fd_readable(s-sockfd);
+} else if (socket_error() != EINTR) {
+break;
+}
+}
+
+if (len == -1) {
+len = -socket_error();
+}
+
+return len;
+}
+
+static ssize_t qemu_local_writev_buffer(void *opaque, struct iovec *iov,
+int iovcnt, int64_t pos)
+{
+QEMUFileLocal *s = opaque;
+ssize_t len;
+ssize_t size = iov_size(iov, iovcnt);
+
+len = iov_send(s-sockfd, iov, iovcnt, 0, size);
+if (len  size) {
+len = -socket_error();
+}
+
+return len;
+}
+
+static int qemu_local_close(void *opaque)
+{
+QEMUFileLocal *s = opaque;
+
+closesocket(s-sockfd);
+g_free(s);
+
+return 0;
+}
+
+static const QEMUFileOps pipe_read_ops = {
+.get_fd= qemu_local_get_sockfd,
+.get_buffer= qemu_local_get_buffer,
+.close = qemu_local_close,
+};
+
+static const QEMUFileOps pipe_write_ops = {
+.get_fd = qemu_local_get_sockfd,
+.writev_buffer  = qemu_local_writev_buffer,
+.close  = qemu_local_close,
+};
-- 
1.7.7.6




[Qemu-devel] [PATCH 0/17 v3] Localhost migration with side channel for ram

2013-11-21 Thread Lei Li
This patch series tries to introduce a mechanism using side
channel pipe for RAM via SCM_RIGHTS with unix domain socket
protocol migration.

This side channel is used for the page flipping by vmsplice,
which is the internal mechanism for localhost migration that
we are trying to add to QEMU. The background info and previous
patch series for reference,

Localhost migration
http://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg02916.html

migration: Introduce side channel for RAM
http://lists.gnu.org/archive/html/qemu-devel/2013-09/msg04043.html

I have picked patches from the localhost migration series and rebased
them on the side channel series; it is now a complete series that
has passed basic testing.

Please let me know if there is anything that needs to be fixed or improved.
Your suggestions and comments are very welcome, and thanks to Paolo
for his continued review and useful suggestions.


Changes since V2:
  Address comments from Paolo including:

- Doc improvement for QAPI.
- Use callback get_buffer as the only one receiver.
- Rename the new RunState flipping-migrate to memory-stale, and
  add transition from 'prelaunch' to 'memory-stale'.
- Other minor fixes.

Changes since V1:
  Address suggestions from Paolo Bonzini including:

- Use Unix socket QEMUFile as basis of code and adjust the way
  of overriding RDMA hooks.
- Involve the vmsplice for page flipping.
- Add new RunState RUN_STATE_FLIPPING_MIGRATE and add it to
  runstate_needs_reset() for the adjustment of the current
  migration process with page flipping.



Lei Li (17):
  QAPI: introduce migration capability unix_page_flipping
  migration: add migrate_unix_page_flipping()
  qmp-command.hx: add missing docs for migration capabilities
  migration-local: add QEMUFileLocal with socket based QEMUFile
  migration-local: introduce qemu_fopen_socket_local()
  migration-local: add send_pipefd()
  migration-local: override before_ram_iterate to send pipefd
  add unix_msgfd_lookup() to callback get_buffer
  save_page: replace block_offset with a MemoryRegion
  migration-local: override save_page for page transmit
  savevm: adjust ram_control_save_page with page flipping
  migration-local: override hook_ram_load 
  migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local
  add new RunState RUN_STATE_MEMORY_STALE
  migration-unix: page flipping support on unix outgoing
  migration: adjust migration_thread() process for unix_page_flipping
  hmp: better format for info migrate_capabilities

 Makefile.target   |   1 +
 arch_init.c   |   4 +-
 migration-local.c | 512 ++
 hmp.c |   5 +-
 include/migration/migration.h |   3 +
 include/migration/qemu-file.h |   2 +
 migration-unix.c  |  27 ++-
 migration-rdma.c  |   4 +-
 migration.c   |  18 +-
 qapi-schema.json  |  18 +-
 qmp-commands.hx   |   8 +
 savevm.c  |  21 +-
 vl.c  |  12 +-
 13 files changed, 617 insertions(+), 27 deletions(-)
 create mode 100644 migration-local.c




[Qemu-devel] [PATCH 14/17] add new RunState RUN_STATE_MEMORY_STALE

2013-11-21 Thread Lei Li
Introduce new RunState RUN_STATE_MEMORY_STALE and
add it to runstate_needs_reset().

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qapi-schema.json |7 +--
 vl.c |   12 +++-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index b290a0f..640a380 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -176,12 +176,15 @@
 # @watchdog: the watchdog action is configured to pause and has been triggered
 #
 # @guest-panicked: guest has been panicked as a result of guest OS panic
+#
+# @memory-stale: guest is paused to transmit memory, the destination guest
+# will has the newer contents of it.
 ##
 { 'enum': 'RunState',
   'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
-'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
-'guest-panicked' ] }
+'running', 'save-vm', 'shutdown', 'suspended', 'memory-stale',
+'watchdog', 'guest-panicked' ] }
 
 ##
 # @SnapshotInfo
diff --git a/vl.c b/vl.c
index 8d5d874..0f38405 100644
--- a/vl.c
+++ b/vl.c
@@ -601,6 +601,7 @@ static const RunStateTransition runstate_transitions_def[] 
= {
 
 { RUN_STATE_PAUSED, RUN_STATE_RUNNING },
 { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_PAUSED, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING },
 { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE },
@@ -608,6 +609,7 @@ static const RunStateTransition runstate_transitions_def[] 
= {
 { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING },
 { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE },
 { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE },
+{ RUN_STATE_PRELAUNCH, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
 { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
@@ -624,23 +626,30 @@ static const RunStateTransition 
runstate_transitions_def[] = {
 { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN },
 { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG },
 { RUN_STATE_RUNNING, RUN_STATE_GUEST_PANICKED },
+{ RUN_STATE_RUNNING, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING },
 
 { RUN_STATE_SHUTDOWN, RUN_STATE_PAUSED },
 { RUN_STATE_SHUTDOWN, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_SHUTDOWN, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_DEBUG, RUN_STATE_SUSPENDED },
 { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED },
 { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING },
 { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_SUSPENDED, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING },
 { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_WATCHDOG, RUN_STATE_MEMORY_STALE },
 
 { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING },
 { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_GUEST_PANICKED, RUN_STATE_MEMORY_STALE },
 
+{ RUN_STATE_MEMORY_STALE, RUN_STATE_RUNNING },
+{ RUN_STATE_MEMORY_STALE, RUN_STATE_POSTMIGRATE },
 { RUN_STATE_MAX, RUN_STATE_MAX },
 };
 
@@ -685,7 +694,8 @@ int runstate_is_running(void)
 bool runstate_needs_reset(void)
 {
 return runstate_check(RUN_STATE_INTERNAL_ERROR) ||
-runstate_check(RUN_STATE_SHUTDOWN);
+runstate_check(RUN_STATE_SHUTDOWN) ||
+runstate_check(RUN_STATE_MEMORY_STALE);
 }
 
 StatusInfo *qmp_query_status(Error **errp)
-- 
1.7.7.6




[Qemu-devel] [PATCH 10/17] migration-local: override save_page for page transmit

2013-11-21 Thread Lei Li
This patch implements the save_page callback for the outgoing side
of page flipping. It will write the address of the page
on the Unix socket and flip the page data on the pipe by
vmsplice(). Every page address will have a header flag
RAM_SAVE_FLAG_HOOK, which will trigger the load hook to
receive it on the incoming side as well.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   54 +
 1 files changed, 54 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 0f0896b..14207e9 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -200,6 +200,59 @@ static int qemu_local_send_pipefd(QEMUFile *f, void 
*opaque,
 return 0;
 }
 
+static size_t qemu_local_save_ram(QEMUFile *f, void *opaque,
+  MemoryRegion *mr, ram_addr_t offset,
+  size_t size, int *bytes_sent)
+{
+QEMUFileLocal *s = opaque;
+ram_addr_t current_addr = mr-ram_addr + offset;
+void *ram_addr;
+ssize_t ret;
+
+if (s-unix_page_flipping) {
+qemu_fflush(s-file);
+qemu_put_be64(s-file, RAM_SAVE_FLAG_HOOK);
+
+/* Write page address to unix socket */
+qemu_put_be64(s-file, current_addr);
+
+ram_addr = memory_region_get_ram_ptr(mr) + offset;
+
+/* vmsplice page data to pipe */
+struct iovec iov = {
+.iov_base = ram_addr,
+.iov_len  = size,
+};
+
+/*
+ * The flag SPLICE_F_MOVE is introduced in kernel for the page
+ * flipping feature in QEMU, which will movie pages rather than
+ * copying, previously unused.
+ *
+ * If a move is not possible the kernel will transparently falls
+ * back to copying data.
+ *
+ * For older kernels the SPLICE_F_MOVE would be ignored and a copy
+ * would occur.
+ */
+ret = vmsplice(s-pipefd[1], iov, 1, SPLICE_F_GIFT | SPLICE_F_MOVE);
+if (ret == -1) {
+if (errno != EAGAIN  errno != EINTR) {
+fprintf(stderr, vmsplice save error: %s\n, strerror(errno));
+return ret;
+}
+} else {
+if (bytes_sent) {
+*bytes_sent = 1;
+}
+DPRINTF(block_offset: %lu, offset: %lu\n, block_offset, offset);
+return 0;
+}
+}
+
+return RAM_SAVE_CONTROL_NOT_SUPP;
+}
+
 static const QEMUFileOps pipe_read_ops = {
 .get_fd= qemu_local_get_sockfd,
 .get_buffer= qemu_local_get_buffer,
@@ -211,6 +264,7 @@ static const QEMUFileOps pipe_write_ops = {
 .writev_buffer  = qemu_local_writev_buffer,
 .close  = qemu_local_close,
 .before_ram_iterate = qemu_local_send_pipefd,
+.save_page  = qemu_local_save_ram
 };
 
 QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode)
-- 
1.7.7.6




[Qemu-devel] [PATCH 12/17] migration-local: override hook_ram_load

2013-11-21 Thread Lei Li
Override hook_ram_load to receive the pipe file descriptor
passed by the source process and the page address, which is
extracted and used to vmsplice the page data from the pipe.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 migration-local.c |   55 +
 1 files changed, 55 insertions(+), 0 deletions(-)

diff --git a/migration-local.c b/migration-local.c
index 14207e9..8ac0af5 100644
--- a/migration-local.c
+++ b/migration-local.c
@@ -253,10 +253,65 @@ static size_t qemu_local_save_ram(QEMUFile *f, void 
*opaque,
 return RAM_SAVE_CONTROL_NOT_SUPP;
 }
 
+static int qemu_local_ram_load(QEMUFile *f, void *opaque,
+   uint64_t flags)
+{
+QEMUFileLocal *s = opaque;
+ram_addr_t addr;
+struct iovec iov;
+ssize_t ret = -EINVAL;
+
+/*
+ * PIPE file descriptor will be received by another callback
+ * get_buffer.
+ */
+if (pipefd_passed) {
+void *host;
+/*
+ * Extract the page address from the 8-byte record and
+ * read the page data from the pipe.
+ */
+addr = qemu_get_be64(s-file);
+host = qemu_get_ram_ptr(addr);
+
+iov.iov_base = host;
+iov.iov_len = TARGET_PAGE_SIZE;
+
+/* The flag SPLICE_F_MOVE, previously unused, is implemented in the
+ * kernel for the page flipping feature in QEMU; it moves pages
+ * rather than copying them.
+ *
+ * If a move is not possible the kernel will transparently fall
+ * back to copying data.
+ *
+ * For older kernels the SPLICE_F_MOVE would be ignored and a copy
+ * would occur.
+ */
+ret = vmsplice(s-pipefd[0], iov, 1, SPLICE_F_MOVE);
+if (ret == -1) {
+if (errno != EAGAIN  errno != EINTR) {
+fprintf(stderr, vmsplice() load error: %s, strerror(errno));
+return ret;
+}
+DPRINTF(vmsplice load error\n);
+} else if (ret == 0) {
+DPRINTF(stderr, load_page: zero read\n);
+}
+
+DPRINTF(vmsplice (read): %zu\n, ret);
+return ret;
+}
+
+return 0;
+}
+
+
+
 static const QEMUFileOps pipe_read_ops = {
 .get_fd= qemu_local_get_sockfd,
 .get_buffer= qemu_local_get_buffer,
 .close = qemu_local_close,
+.hook_ram_load = qemu_local_ram_load
 };
 
 static const QEMUFileOps pipe_write_ops = {
-- 
1.7.7.6




[Qemu-devel] [PATCH resend] save_page: replace block_offset with a MemoryRegion

2013-11-06 Thread Lei Li
This patch exports the MemoryRegion to the save_page hook, replacing
the ram_addr_t block_offset argument with a MemoryRegion, as suggested
by Paolo Bonzini.

Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 arch_init.c   |4 ++--
 include/migration/migration.h |2 +-
 include/migration/qemu-file.h |8 
 migration-rdma.c  |4 ++--
 savevm.c  |8 
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 7545d96..a9b97be 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -485,8 +485,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 
 /* In doubt sent page as normal */
 bytes_sent = -1;
-ret = ram_control_save_page(f, block-offset,
-   offset, TARGET_PAGE_SIZE, bytes_sent);
+ret = ram_control_save_page(f, mr, offset, TARGET_PAGE_SIZE,
+bytes_sent);
 
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 7e5d01a..ca852a8 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -161,7 +161,7 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags);
 #define RAM_SAVE_CONTROL_NOT_SUPP -1000
 #define RAM_SAVE_CONTROL_DELAYED  -2000
 
-size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
+size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr,
  ram_addr_t offset, size_t size,
  int *bytes_sent);
 
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 0f757fb..d73dc4b 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -77,10 +77,10 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, 
uint64_t flags);
  * is saved (such as RDMA, for example.)
  */
 typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
-   ram_addr_t block_offset,
-   ram_addr_t offset,
-   size_t size,
-   int *bytes_sent);
+ MemoryRegion *mr,
+ ram_addr_t offset,
+ size_t size,
+ int *bytes_sent);
 
 typedef struct QEMUFileOps {
 QEMUFilePutBufferFunc *put_buffer;
diff --git a/migration-rdma.c b/migration-rdma.c
index f94f3b4..ae04de4 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -2699,7 +2699,7 @@ static int qemu_rdma_close(void *opaque)
  *  the protocol because most transfers are sent 
asynchronously.
  */
 static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
-  ram_addr_t block_offset, ram_addr_t offset,
+  MemoryRegion *mr, ram_addr_t offset,
   size_t size, int *bytes_sent)
 {
 QEMUFileRDMA *rfile = opaque;
@@ -2716,7 +2716,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void 
*opaque,
  * is full, or the page doen't belong to the current chunk,
  * an actual RDMA write will occur and a new chunk will be formed.
  */
-ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
+ret = qemu_rdma_write(f, rdma, mr-ram_addr, offset, size);
 if (ret  0) {
 fprintf(stderr, rdma migration: write error! %d\n, ret);
 goto err;
diff --git a/savevm.c b/savevm.c
index 2f631d4..3ee256e 100644
--- a/savevm.c
+++ b/savevm.c
@@ -661,12 +661,12 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags)
 }
 }
 
-size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
- ram_addr_t offset, size_t size, int *bytes_sent)
+size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset,
+ size_t size, int *bytes_sent)
 {
 if (f-ops-save_page) {
-int ret = f-ops-save_page(f, f-opaque, block_offset,
-offset, size, bytes_sent);
+int ret = f-ops-save_page(f, f-opaque, mr, offset,
+size, bytes_sent);
 
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
 if (bytes_sent  *bytes_sent  0) {
-- 
1.7.7.6




Re: [Qemu-devel] [PATCH resend] sdl: Reverse support for video mode setting

2013-11-06 Thread Lei Li

Ping^2

On 10/24/2013 08:21 PM, Lei Li wrote:

This patch has been confirmed by the reporter himself at the
link below:

https://bugs.launchpad.net/qemu/+bug/1216368

It has been on the mailing list for a while, could it be merged? PING...

On 09/04/2013 05:07 PM, Lei Li wrote:

Currently, if setting the video mode fails, qemu will exit. It
should go back to the previous setting if the new screen resolution
fails. This patch fixes LP#1216368 by adding support for reverting to
the existing surface when setting the video mode fails.

Reported-by: Sascha Krissler sas...@srlabs.de
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  ui/sdl.c |   23 +++
  1 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/ui/sdl.c b/ui/sdl.c
index 39a42d6..9d8583c 100644
--- a/ui/sdl.c
+++ b/ui/sdl.c
@@ -86,6 +86,7 @@ static void sdl_update(DisplayChangeListener *dcl,
  static void do_sdl_resize(int width, int height, int bpp)
  {
  int flags;
+SDL_Surface *tmp_screen;

  //printf(resizing to %d %d\n, w, h);

@@ -98,12 +99,26 @@ static void do_sdl_resize(int width, int height, 
int bpp)

  if (gui_noframe)
  flags |= SDL_NOFRAME;

-real_screen = SDL_SetVideoMode(width, height, bpp, flags);
+tmp_screen = SDL_SetVideoMode(width, height, bpp, flags);
  if (!real_screen) {
-fprintf(stderr, Could not open SDL display (%dx%dx%d): %s\n, 
width,

-height, bpp, SDL_GetError());
-exit(1);
+if (!tmp_screen) {
+fprintf(stderr, Could not open SDL display (%dx%dx%d): 
%s\n,

+width, height, bpp, SDL_GetError());
+exit(1);
+}
+} else {
+/*
+ * Revert to the previous video mode if the change of 
resizing or

+ * resolution failed.
+ */
+if (!tmp_screen) {
+fprintf(stderr, Failed to set SDL display (%dx%dx%d): 
%s\n,

+width, height, bpp, SDL_GetError());
+return;
+}
  }
+
+real_screen = tmp_screen;
  }

  static void sdl_switch(DisplayChangeListener *dcl,






--
Lei




Re: [Qemu-devel] [PATCH 09/17] migration-local: override before_ram_iterate to send pipefd

2013-10-25 Thread Lei Li

On 10/25/2013 03:23 PM, Paolo Bonzini wrote:

Il 25/10/2013 05:38, Lei Li ha scritto:

Just to confirm: normally, should I take these 'otherwise looks good/ok'
remarks as a 'Reviewed-by' from you if the other comments are fixed in
the updated version?

Depends on how much the patch changes... right now I'm still expecting
some changes so I didn't really look much at the patch and didn't test
it.  I prefer to take a more complete look at v3 before giving a
formal Reviewed-by.


I see, thanks for your explanation.



Paolo




--
Lei




Re: [Qemu-devel] [PATCH 14/17] add new RanState RAN_STATE_FLIPPING_MIGRATE

2013-10-25 Thread Lei Li

On 10/25/2013 03:31 PM, Paolo Bonzini wrote:

Il 25/10/2013 05:30, Lei Li ha scritto:

I am not sure about the name; for one thing, the new state would apply
also to postcopy migration.

About the name, how about 'live-upgrade'?

OK, I'll add the transition between postcopy and this new state.

Note I didn't mean postmigrate.

For a description of postcopy, see my answer to the cover letter (patch


Yes, I've realized that I misunderstood it...


0).  The new state means somebody else has newer contents of the
memory.  Perhaps stale?

And should it also apply from 'prelaunch' to 'flipping-migrate' too?

Yes, it should.  Good catch!

Paolo




--
Lei




Re: [Qemu-devel] [PATCH 0/17 v2] Localhost migration with side channel for ram

2013-10-25 Thread Lei Li

On 10/25/2013 03:30 PM, Paolo Bonzini wrote:

Il 25/10/2013 06:58, Lei Li ha scritto:

Right now I only have rough numbers without the new vmsplice. Based on
the output of 'info migrate', as the guest RAM size increases, the
'total time' is several times lower than with the current live
migration, but the 'downtime' is poor.

Of course.

For a 1GB ram guest,

total time: 702 milliseconds
downtime: 692 milliseconds

And when the RAM size of the guest increases exponentially, those
numbers are proportional to it.
  
I will put together a list of performance numbers with the new vmsplice
later; I am sure it will be much better than this at least.

Yes, please.  Is the memory usage still 2x without vmsplice?

I think you have a nice proof of concept, but on the other hand this
probably needs to be coupled with some kind of postcopy live migration,
that is:

* the source starts sending data

* but the destination starts running immediately

* if the machine needs a page that is missing, the destination asks the
source to send it

* as soon as it arrives, the destination can restart

Using postcopy is problematic for reliability: if the destination fails,
the virtual machine is lost because the source doesn't have the latest
content of memory.  However, this is a much, much smaller problem for
live QEMU upgrade where the network cannot fail.

If you do this, you can achieve pretty much instantaneous live upgrade,
well within your original 200 ms goals.  But the flipping code with
vmsplice should be needed anyway to avoid doubling memory usage, and


Yes, I have read the postcopy migration patches. It does perform very
well on downtime, as it just sends the vmstates and then switches
execution to the destination host. And as you pointed out, it cannot
avoid doubling memory usage.

The numbers listed above are based on the old vmsplice, as I have not yet
worked on the performance benchmark; it actually copies data rather
than moving it. As the feedback for this version is positive, I am now
trying to get real results with the new vmsplice.

BTW, the kernel side is looking at a huge page solution to improve
performance.

The recent kernel patches are at this link:

http://article.gmane.org/gmane.linux.kernel/1574277


it's looking pretty good in this version already!  I'm relieved that the
RDMA code was designed right!


I am happy with it too. :)
Those RDMA hooks really make things more flexible!



Paolo




--
Lei




[Qemu-devel] [PATCH 0/3 for 1.7] migration: introduce page flipping capability

2013-10-25 Thread Lei Li
This series is extracted from the latest "localhost migration
with side channel for ram" patch set, with the comments from Paolo
addressed. It is sent separately at his suggestion.

Localhost migration with side channel for ram:
http://lists.gnu.org/archive/html/qemu-devel/2013-10/msg02787.html

Lei Li (3):
  QAPI: introduce magration capability unix_page_flipping
  migration: add migrate_unix_page_flipping()
  qmp-command.hx: add missing docs for migration capabilites




[Qemu-devel] [PATCH 3/3] qmp-command.hx: add missing docs for migration capabilites

2013-10-25 Thread Lei Li
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qmp-commands.hx |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/qmp-commands.hx b/qmp-commands.hx
index fba15cd..dcec433 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -2898,6 +2898,10 @@ migrate-set-capabilities
 Enable/Disable migration capabilities
 
 - xbzrle: XBZRLE support
+- x-rdma-pin-all: Pin all pages during RDMA support
+- zero-blocks: Compress zero blocks during block migration
+- auto-converge: Block VCPU to help convergence of migration
+- unix-page-flipping: Page flipping for live QEMU upgrade
 
 Arguments:
 
@@ -2922,6 +2926,10 @@ Query current migration capabilities
 
 - capabilities: migration capabilities state
  - xbzrle : XBZRLE state (json-bool)
+ - x-rdma-pin-all: RDMA state (json-bool)
+ - zero-blocks: zero-blocks state (json-bool)
+ - auto-converge: Auto converge state (json-bool)
+ - unix-page-flipping: Page flipping state (json-bool)
 
 Arguments:
 
-- 
1.7.7.6




[Qemu-devel] [PATCH 1/3] QAPI: introduce magration capability unix_page_flipping

2013-10-25 Thread Lei Li
Introduce unix_page_flipping to MigrationCapability for localhost
migration.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 qapi-schema.json |   10 +-
 1 files changed, 9 insertions(+), 1 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index 60f3fd1..7cb88af 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -661,10 +661,18 @@
 # @auto-converge: If enabled, QEMU will automatically throttle down the guest
 #  to speed up convergence of RAM migration. (since 1.6)
 #
+# @unix-page-flipping: If enabled, QEMU can optimize migration when the
+#  destination is a QEMU process that runs on the same host as
+#  the source (as is the case for live upgrade).  If the migration
+#  transport is a Unix socket, QEMU will flip RAM pages directly to
+#  the destination, so that memory is only allocated twice for the
+#  source and destination processes. Disabled by default. (since 1.8)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] }
+  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks',
+   'unix-page-flipping'] }
 
 ##
 # @MigrationCapabilityStatus
-- 
1.7.7.6




[Qemu-devel] [PATCH 2/3] migration: add migrate_unix_page_flipping()

2013-10-25 Thread Lei Li
Add migrate_unix_page_flipping() to check if
MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING is enabled.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
 include/migration/migration.h |3 +++
 migration.c   |9 +
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index 140e6b4..7e5d01a 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -131,10 +131,13 @@ void migrate_add_blocker(Error *reason);
 void migrate_del_blocker(Error *reason);
 
 bool migrate_rdma_pin_all(void);
+
 bool migrate_zero_blocks(void);
 
 bool migrate_auto_converge(void);
 
+bool migrate_unix_page_flipping(void);
+
 int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
  uint8_t *dst, int dlen);
 int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
diff --git a/migration.c b/migration.c
index 2b1ab20..4ac466b 100644
--- a/migration.c
+++ b/migration.c
@@ -541,6 +541,15 @@ int64_t migrate_xbzrle_cache_size(void)
 return s-xbzrle_cache_size;
 }
 
+bool migrate_unix_page_flipping(void)
+{
+MigrationState *s;
+
+s = migrate_get_current();
+
+return s-enabled_capabilities[MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING];
+}
+
 /* migration thread support */
 
 static void *migration_thread(void *opaque)
-- 
1.7.7.6




Re: [Qemu-devel] [PATCH resend] sdl: Reverse support for video mode setting

2013-10-24 Thread Lei Li

This patch has been confirmed by the reporter himself at the
link below:

https://bugs.launchpad.net/qemu/+bug/1216368

It has been on the mailing list for a while, could it be merged? PING...

On 09/04/2013 05:07 PM, Lei Li wrote:

Currently, if setting the video mode fails, qemu will exit. It
should go back to the previous setting if the new screen resolution
fails. This patch fixes LP#1216368 by adding support for reverting to
the existing surface when setting the video mode fails.

Reported-by: Sascha Krissler sas...@srlabs.de
Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  ui/sdl.c |   23 +++
  1 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/ui/sdl.c b/ui/sdl.c
index 39a42d6..9d8583c 100644
--- a/ui/sdl.c
+++ b/ui/sdl.c
@@ -86,6 +86,7 @@ static void sdl_update(DisplayChangeListener *dcl,
  static void do_sdl_resize(int width, int height, int bpp)
  {
  int flags;
+SDL_Surface *tmp_screen;

  //printf(resizing to %d %d\n, w, h);

@@ -98,12 +99,26 @@ static void do_sdl_resize(int width, int height, int bpp)
  if (gui_noframe)
  flags |= SDL_NOFRAME;

-real_screen = SDL_SetVideoMode(width, height, bpp, flags);
+tmp_screen = SDL_SetVideoMode(width, height, bpp, flags);
  if (!real_screen) {
-   fprintf(stderr, Could not open SDL display (%dx%dx%d): %s\n, width,
-   height, bpp, SDL_GetError());
-exit(1);
+if (!tmp_screen) {
+fprintf(stderr, Could not open SDL display (%dx%dx%d): %s\n,
+width, height, bpp, SDL_GetError());
+exit(1);
+}
+} else {
+/*
+ * Revert to the previous video mode if the change of resizing or
+ * resolution failed.
+ */
+if (!tmp_screen) {
+fprintf(stderr, Failed to set SDL display (%dx%dx%d): %s\n,
+width, height, bpp, SDL_GetError());
+return;
+}
  }
+
+real_screen = tmp_screen;
  }

  static void sdl_switch(DisplayChangeListener *dcl,



--
Lei




Re: [Qemu-devel] [PATCH 01/17] rename is_active to is_block_active

2013-10-24 Thread Lei Li

On 10/24/2013 09:46 PM, Paolo Bonzini wrote:

Il 22/10/2013 04:25, Lei Li ha scritto:

is_active is used to identify block migration, rename to
is_block_active to make it more clear.

No, is_active is used to identify whether a set of SaveVMHandlers is
active.  The default is true, so only block migration is using it.  But
we could use it in the future for other features (probably using
migration capabilities instead of a flag as is the case for block).


That corrects my understanding.
Thanks for clarifying!



Paolo


Signed-off-by: Lei Li li...@linux.vnet.ibm.com
---
  block-migration.c   |2 +-
  include/migration/vmstate.h |2 +-
  savevm.c|   16 
  3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index daf9ec1..b637695 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -834,7 +834,7 @@ SaveVMHandlers savevm_block_handlers = {
  .save_live_pending = block_save_pending,
  .load_state = block_load,
  .cancel = block_migration_cancel,
-.is_active = block_is_active,
+.is_block_active = block_is_active,
  };
  
  void blk_mig_init(void)

diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 9d09e60..c634d65 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -42,7 +42,7 @@ typedef struct SaveVMHandlers {
  int (*save_live_complete)(QEMUFile *f, void *opaque);
  
  /* This runs both outside and inside the iothread lock.  */

-bool (*is_active)(void *opaque);
+bool (*is_block_active)(void *opaque);
  
  /* This runs outside the iothread lock in the migration case, and

   * within the lock in the savevm case.  The callback had better only
diff --git a/savevm.c b/savevm.c
index 2f631d4..56b8643 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1867,8 +1867,8 @@ void qemu_savevm_state_begin(QEMUFile *f,
  if (!se-ops || !se-ops-save_live_setup) {
  continue;
  }
-if (se-ops  se-ops-is_active) {
-if (!se-ops-is_active(se-opaque)) {
+if (se-ops  se-ops-is_block_active) {
+if (!se-ops-is_block_active(se-opaque)) {
  continue;
  }
  }
@@ -1907,8 +1907,8 @@ int qemu_savevm_state_iterate(QEMUFile *f)
  if (!se-ops || !se-ops-save_live_iterate) {
  continue;
  }
-if (se-ops  se-ops-is_active) {
-if (!se-ops-is_active(se-opaque)) {
+if (se-ops  se-ops-is_block_active) {
+if (!se-ops-is_block_active(se-opaque)) {
  continue;
  }
  }
@@ -1948,8 +1948,8 @@ void qemu_savevm_state_complete(QEMUFile *f)
  if (!se-ops || !se-ops-save_live_complete) {
  continue;
  }
-if (se-ops  se-ops-is_active) {
-if (!se-ops-is_active(se-opaque)) {
+if (se-ops  se-ops-is_block_active) {
+if (!se-ops-is_block_active(se-opaque)) {
  continue;
  }
  }
@@ -2002,8 +2002,8 @@ uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t 
max_size)
  if (!se-ops || !se-ops-save_live_pending) {
  continue;
  }
-if (se-ops  se-ops-is_active) {
-if (!se-ops-is_active(se-opaque)) {
+if (se-ops  se-ops-is_block_active) {
+if (!se-ops-is_block_active(se-opaque)) {
  continue;
  }
  }






--
Lei




  1   2   3   4   5   >