[Qemu-devel] [PATCH 1/5] qemu-fd-exchange: provide common methods for exchange fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/qemu/fd-exchange.h | 25 +++ util/Makefile.objs |1 + util/qemu-fd-exchange.c| 97 3 files changed, 123 insertions(+), 0 deletions(-) create mode 100644 include/qemu/fd-exchange.h create mode 100644 util/qemu-fd-exchange.c diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h new file mode 100644 index 000..8502960 --- /dev/null +++ b/include/qemu/fd-exchange.h @@ -0,0 +1,25 @@ +/* + * Internal common methods for exchange of FD + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef FD_EXCHANGE_H +#define FD_EXCHANGE_H + +#include sys/socket.h + +union MsgControl { +struct cmsghdr cmsg; +char control[CMSG_SPACE(sizeof(int))]; +}; + +ssize_t qemu_send_with_fd(int sockfd, int passed_fd, + const void *buf, size_t len); + +ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd, + void *buf, size_t len); + +#endif diff --git a/util/Makefile.objs b/util/Makefile.objs index af3e5cb..2fb42bf 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -13,3 +13,4 @@ util-obj-y += hexdump.o util-obj-y += crc32c.o util-obj-y += throttle.o util-obj-y += getauxval.o +util-obj-y += qemu-fd-exchange.o diff --git a/util/qemu-fd-exchange.c b/util/qemu-fd-exchange.c new file mode 100644 index 000..bee3fc1 --- /dev/null +++ b/util/qemu-fd-exchange.c @@ -0,0 +1,97 @@ +/* + * Internal common methods for exchange of FD + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include qemu/fd-exchange.h +#include qemu-common.h + + +ssize_t qemu_send_with_fd(int sockfd, int passed_fd, + const void *buf, size_t len) +{ +struct msghdr msg; +struct iovec iov; +struct cmsghdr *cmsg; +union MsgControl msg_control; +int retval; + +iov.iov_base = (char *)buf; +iov.iov_len = len; + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = len; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +if (passed_fd 0) { +*(char *)buf = passed_fd; +} else { +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +cmsg = msg_control.cmsg; +cmsg-cmsg_len = CMSG_LEN(sizeof(passed_fd)); +cmsg-cmsg_level = SOL_SOCKET; +cmsg-cmsg_type = SCM_RIGHTS; +memcpy(CMSG_DATA(cmsg), passed_fd, sizeof(passed_fd)); + +} + +do { +retval = sendmsg(sockfd, msg, 0); +} while (retval 0 errno == EINTR); + +return retval; +} + +ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd, + void *buf, size_t len) +{ +struct iovec iov; +struct msghdr msg; +struct cmsghdr *cmsg; +union MsgControl msg_control; +int retval; +char data; + +iov.iov_base = data; +iov.iov_len = len; + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +do { +retval = recvmsg(sockfd, msg, 0); +} while (retval 0 errno == EINTR); + +if (retval = 0) { +return retval; +} + +if (data != *(char *)buf) { +*passed_fd = data; +return 0; +} + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) { +continue; +} + +memcpy(passed_fd, CMSG_DATA(cmsg), sizeof(*passed_fd)); +return 0; +} + +*passed_fd = -ENFILE; +return retval; +} -- 1.7.7.6
[Qemu-devel] [PATCH 5/5] virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- hw/9pfs/virtio-9p-proxy.c | 60 ++-- 1 files changed, 3 insertions(+), 57 deletions(-) diff --git a/hw/9pfs/virtio-9p-proxy.c b/hw/9pfs/virtio-9p-proxy.c index 5f44bb7..f34b845 100644 --- a/hw/9pfs/virtio-9p-proxy.c +++ b/hw/9pfs/virtio-9p-proxy.c @@ -14,6 +14,7 @@ #include hw/virtio/virtio.h #include virtio-9p.h #include qemu/error-report.h +#include qemu/fd-exchange.h #include fsdev/qemu-fsdev.h #include virtio-9p-proxy.h @@ -24,62 +25,6 @@ typedef struct V9fsProxy { struct iovec out_iovec; } V9fsProxy; -/* - * Return received file descriptor on success in *status. - * errno is also returned on *status (which will be 0) - * return 0 on transport error. - */ -static int v9fs_receivefd(int sockfd, int *status) -{ -struct iovec iov; -struct msghdr msg; -struct cmsghdr *cmsg; -int retval, data, fd; -union MsgControl msg_control; - -iov.iov_base = data; -iov.iov_len = sizeof(data); - -memset(msg, 0, sizeof(msg)); -msg.msg_iov = iov; -msg.msg_iovlen = 1; -msg.msg_control = msg_control; -msg.msg_controllen = sizeof(msg_control); - -do { -retval = recvmsg(sockfd, msg, 0); -} while (retval 0 errno == EINTR); -if (retval = 0) { -return retval; -} -/* - * data is set to V9FS_FD_VALID, if ancillary data is sent. If this - * request doesn't need ancillary data (fd) or an error occurred, - * data is set to negative errno value. - */ -if (data != V9FS_FD_VALID) { -*status = data; -return 0; -} -/* - * File descriptor (fd) is sent in the ancillary data. Check if we - * indeed received it. One of the reasons to fail to receive it is if - * we exceeded the maximum number of file descriptors! 
- */ -for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { -if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || -cmsg-cmsg_level != SOL_SOCKET || -cmsg-cmsg_type != SCM_RIGHTS) { -continue; -} -fd = *((int *)CMSG_DATA(cmsg)); -*status = fd; -return 0; -} -*status = -ENFILE; /* Ancillary data sent but not received */ -return 0; -} - static ssize_t socket_read(int sockfd, void *buff, size_t size) { ssize_t retval, total = 0; @@ -307,6 +252,7 @@ static int v9fs_request(V9fsProxy *proxy, int type, V9fsString *name, *value; V9fsString *path, *oldpath; struct iovec *iovec = NULL, *reply = NULL; +int data = V9FS_FD_VALID; qemu_mutex_lock(proxy-mutex); @@ -548,7 +494,7 @@ static int v9fs_request(V9fsProxy *proxy, int type, * A file descriptor is returned as response for * T_OPEN,T_CREATE on success */ -if (v9fs_receivefd(proxy-sockfd, retval) 0) { +if (qemu_recv_with_fd(proxy-sockfd, retval, data, sizeof(data)) 0) { goto close_error; } break; -- 1.7.7.6
[Qemu-devel] [PATCH 0/5 v2] Provide common methods for exchange FD
This patch series refactors the functions used to exchange file descriptors (FDs) in the current code and provides common methods for that purpose. I tested it with page-flipping migration and, briefly, with tap/bridge-helper, but I have an environment problem with the proxy fs driver, so I would appreciate it if someone could help verify whether the series has any impact on it. :) Please let me know if anything needs to be improved. Thanks. Changes since V1: -- Copyright (license) and typo fixes pointed out by Eric. -- Don't cast 'char *' to 'int *' (pointed out by Daniel). -- Get rid of the local migration part. Lei Li (5): fd-exchange: provide common methods for exchange of fd qemu-bridge-helper: replace send_fd with qemu_send_with_fd net/tap: replace recv_fd with qemu_recv_with_fd virtfs-proxy-helper: replace send_fd with qemu_send_with_fd virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd Makefile|2 +- fsdev/virtfs-proxy-helper.c | 51 --- hw/9pfs/virtio-9p-proxy.c | 60 +- hw/9pfs/virtio-9p-proxy.h |5 -- include/qemu/fd-exchange.h | 25 +++ net/tap.c | 40 + qemu-bridge-helper.c| 31 + util/Makefile.objs |1 + util/qemu-fd-exchange.c | 97 +++ 10 files changed, 144 insertions(+), 220 deletions(-) create mode 100644 include/qemu/fd-exchange.h create mode 100644 util/qemu-fd-exchange.c
[Qemu-devel] [PATCH 4/5] virtfs-proxy-helper: replace send_fd with qemu_send_with_fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- fsdev/virtfs-proxy-helper.c | 51 ++ hw/9pfs/virtio-9p-proxy.h |5 2 files changed, 8 insertions(+), 48 deletions(-) diff --git a/fsdev/virtfs-proxy-helper.c b/fsdev/virtfs-proxy-helper.c index 713a7b2..44c6e61 100644 --- a/fsdev/virtfs-proxy-helper.c +++ b/fsdev/virtfs-proxy-helper.c @@ -23,6 +23,7 @@ #include qemu-common.h #include qemu/sockets.h #include qemu/xattr.h +#include qemu/fd-exchange.h #include virtio-9p-marshal.h #include hw/9pfs/virtio-9p-proxy.h #include fsdev/virtio-9p-marshal.h @@ -203,48 +204,6 @@ static int read_request(int sockfd, struct iovec *iovec, ProxyHeader *header) return 0; } -static int send_fd(int sockfd, int fd) -{ -struct msghdr msg; -struct iovec iov; -int retval, data; -struct cmsghdr *cmsg; -union MsgControl msg_control; - -iov.iov_base = data; -iov.iov_len = sizeof(data); - -memset(msg, 0, sizeof(msg)); -msg.msg_iov = iov; -msg.msg_iovlen = 1; -/* No ancillary data on error */ -if (fd 0) { -/* fd is really negative errno if the request failed */ -data = fd; -} else { -data = V9FS_FD_VALID; -msg.msg_control = msg_control; -msg.msg_controllen = sizeof(msg_control); - -cmsg = msg_control.cmsg; -cmsg-cmsg_len = CMSG_LEN(sizeof(fd)); -cmsg-cmsg_level = SOL_SOCKET; -cmsg-cmsg_type = SCM_RIGHTS; -memcpy(CMSG_DATA(cmsg), fd, sizeof(fd)); -} - -do { -retval = sendmsg(sockfd, msg, 0); -} while (retval 0 errno == EINTR); -if (fd = 0) { -close(fd); -} -if (retval 0) { -return retval; -} -return 0; -} - static int send_status(int sockfd, struct iovec *iovec, int status) { ProxyHeader header; @@ -784,11 +743,17 @@ static void usage(char *prog) static int process_reply(int sock, int type, struct iovec *out_iovec, int retval) { +int data = V9FS_FD_VALID; + switch (type) { case T_OPEN: case T_CREATE: -if (send_fd(sock, retval) 0) { +if (qemu_send_with_fd(sock, retval, data, sizeof(data)) 0) { return -1; +} else { +if (retval = 0) { +close(retval); +} } break; case T_MKNOD: diff --git 
a/hw/9pfs/virtio-9p-proxy.h b/hw/9pfs/virtio-9p-proxy.h index 005c1ad..e359ac5 100644 --- a/hw/9pfs/virtio-9p-proxy.h +++ b/hw/9pfs/virtio-9p-proxy.h @@ -24,11 +24,6 @@ #define proxy_marshal(out_sg, offset, fmt, args...) \ v9fs_marshal(out_sg, 1, offset, 0, fmt, ##args) -union MsgControl { -struct cmsghdr cmsg; -char control[CMSG_SPACE(sizeof(int))]; -}; - typedef struct { uint32_t type; uint32_t size; -- 1.7.7.6
[Qemu-devel] [PATCH 2/5] qemu-bridge-helper: replace send_fd with qemu_send_with_fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- Makefile |2 +- qemu-bridge-helper.c | 31 +++ 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index bdff4e4..6850f35 100644 --- a/Makefile +++ b/Makefile @@ -195,7 +195,7 @@ qemu-img$(EXESUF): qemu-img.o $(block-obj-y) libqemuutil.a libqemustub.a qemu-nbd$(EXESUF): qemu-nbd.o $(block-obj-y) libqemuutil.a libqemustub.a qemu-io$(EXESUF): qemu-io.o $(block-obj-y) libqemuutil.a libqemustub.a -qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o +qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o libqemuutil.a fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/virtio-9p-marshal.o libqemuutil.a libqemustub.a fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap diff --git a/qemu-bridge-helper.c b/qemu-bridge-helper.c index 6a0974e..8303b6b 100644 --- a/qemu-bridge-helper.c +++ b/qemu-bridge-helper.c @@ -40,6 +40,7 @@ #endif #include qemu/queue.h +#include qemu/fd-exchange.h #include net/tap-linux.h @@ -174,33 +175,6 @@ static void prep_ifreq(struct ifreq *ifr, const char *ifname) snprintf(ifr-ifr_name, IFNAMSIZ, %s, ifname); } -static int send_fd(int c, int fd) -{ -char msgbuf[CMSG_SPACE(sizeof(fd))]; -struct msghdr msg = { -.msg_control = msgbuf, -.msg_controllen = sizeof(msgbuf), -}; -struct cmsghdr *cmsg; -struct iovec iov; -char req[1] = { 0x00 }; - -cmsg = CMSG_FIRSTHDR(msg); -cmsg-cmsg_level = SOL_SOCKET; -cmsg-cmsg_type = SCM_RIGHTS; -cmsg-cmsg_len = CMSG_LEN(sizeof(fd)); -msg.msg_controllen = cmsg-cmsg_len; - -iov.iov_base = req; -iov.iov_len = sizeof(req); - -msg.msg_iov = iov; -msg.msg_iovlen = 1; -memcpy(CMSG_DATA(cmsg), fd, sizeof(fd)); - -return sendmsg(c, msg, 0); -} - #ifdef CONFIG_LIBCAP static int drop_privileges(void) { @@ -239,6 +213,7 @@ int main(int argc, char **argv) ACLList acl_list; int access_allowed, access_denied; int ret = EXIT_SUCCESS; +char req[1] = { 0x00 }; #ifdef CONFIG_LIBCAP /* if we're run from an suid binary, immediately drop privileges 
preserving @@ -424,7 +399,7 @@ int main(int argc, char **argv) } /* write fd to the domain socket */ -if (send_fd(unixfd, fd) == -1) { +if (qemu_send_with_fd(unixfd, fd, req, sizeof(req)) == -1) { fprintf(stderr, failed to write fd to unix socket: %s\n, strerror(errno)); ret = EXIT_FAILURE; -- 1.7.7.6
[Qemu-devel] [PATCH 3/5] net/tap: replace recv_fd with qemu_recv_with_fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- net/tap.c | 40 +++- 1 files changed, 3 insertions(+), 37 deletions(-) diff --git a/net/tap.c b/net/tap.c index 39c1cda..97ee2e8 100644 --- a/net/tap.c +++ b/net/tap.c @@ -39,6 +39,7 @@ #include sysemu/sysemu.h #include qemu-common.h #include qemu/error-report.h +#include qemu/fd-exchange.h #include net/tap.h @@ -385,40 +386,6 @@ static int launch_script(const char *setup_script, const char *ifname, int fd) return -1; } -static int recv_fd(int c) -{ -int fd; -uint8_t msgbuf[CMSG_SPACE(sizeof(fd))]; -struct msghdr msg = { -.msg_control = msgbuf, -.msg_controllen = sizeof(msgbuf), -}; -struct cmsghdr *cmsg; -struct iovec iov; -uint8_t req[1]; -ssize_t len; - -cmsg = CMSG_FIRSTHDR(msg); -cmsg-cmsg_level = SOL_SOCKET; -cmsg-cmsg_type = SCM_RIGHTS; -cmsg-cmsg_len = CMSG_LEN(sizeof(fd)); -msg.msg_controllen = cmsg-cmsg_len; - -iov.iov_base = req; -iov.iov_len = sizeof(req); - -msg.msg_iov = iov; -msg.msg_iovlen = 1; - -len = recvmsg(c, msg, 0); -if (len 0) { -memcpy(fd, CMSG_DATA(cmsg), sizeof(fd)); -return fd; -} - -return len; -} - static int net_bridge_run_helper(const char *helper, const char *bridge) { sigset_t oldmask, mask; @@ -489,12 +456,11 @@ static int net_bridge_run_helper(const char *helper, const char *bridge) } else if (pid 0) { int fd; +char req[1] = { 0x00 }; close(sv[1]); -do { -fd = recv_fd(sv[0]); -} while (fd == -1 errno == EINTR); +qemu_recv_with_fd(sv[0], fd, req, sizeof(req)); close(sv[0]); -- 1.7.7.6
Re: [Qemu-devel] [PATCH 1/6] qemu-fd-exchange: provide common methods for exchange fd
On 01/17/2014 06:02 PM, Daniel P. Berrange wrote: On Wed, Jan 08, 2014 at 05:12:51PM +0800, Lei Li wrote: Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/qemu/fd-exchange.h | 25 +++ util/Makefile.objs |1 + util/qemu-fd-exchange.c| 97 3 files changed, 123 insertions(+), 0 deletions(-) create mode 100644 include/qemu/fd-exchange.h create mode 100644 util/qemu-fd-exchange.c diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h new file mode 100644 index 000..6929026 --- /dev/null +++ b/include/qemu/fd-exchange.h @@ -0,0 +1,25 @@ +/* + * Internel common methods for exchange of FD + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef FD_EXCHANGE_H +#define FD_EXCHANGE_H + +#include sys/socket.h + +union MsgControl { +struct cmsghdr cmsg; +char control[CMSG_SPACE(sizeof(int))]; +}; + +ssize_t qemu_send_with_fd(int sockfd, int passed_fd, + const void *buf, size_t len); + +ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd, + void *buf, size_t len); + +#endif diff --git a/util/Makefile.objs b/util/Makefile.objs index af3e5cb..2fb42bf 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -13,3 +13,4 @@ util-obj-y += hexdump.o util-obj-y += crc32c.o util-obj-y += throttle.o util-obj-y += getauxval.o +util-obj-y += qemu-fd-exchange.o diff --git a/util/qemu-fd-exchange.c b/util/qemu-fd-exchange.c new file mode 100644 index 000..70a3206 --- /dev/null +++ b/util/qemu-fd-exchange.c @@ -0,0 +1,97 @@ +/* + * Internel common methods for exchange of FD + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include qemu/fd-exchange.h +#include qemu-common.h + + +ssize_t qemu_send_with_fd(int sockfd, int passed_fd, + const void *buf, size_t len) +{ +struct msghdr msg; +struct iovec iov; +struct cmsghdr *cmsg; +union MsgControl msg_control; +int retval; + +iov.iov_base = (int *)buf; +iov.iov_len = len; + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = len; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +if (passed_fd 0) { +*(int *)buf = passed_fd; You are casting 'char *buf' to an 'int *' but many of the callers only pass in a pointer to a 'char buf[1]'. So you are overflowing the array and also likely causing alignment violations on ARM platforms. You are right, will fix it. Thanks. +} else { +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +cmsg = msg_control.cmsg; +cmsg-cmsg_len = CMSG_LEN(sizeof(passed_fd)); +cmsg-cmsg_level = SOL_SOCKET; +cmsg-cmsg_type = SCM_RIGHTS; +memcpy(CMSG_DATA(cmsg), passed_fd, sizeof(passed_fd)); + +} + +do { +retval = sendmsg(sockfd, msg, 0); +} while (retval 0 errno == EINTR); + +return retval; +} + +ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd, + void *buf, size_t len) +{ +struct iovec iov; +struct msghdr msg; +struct cmsghdr *cmsg; +union MsgControl msg_control; +int retval; +int data = *(int *)buf; + +iov.iov_base = buf; +iov.iov_len = len; + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +do { +retval = recvmsg(sockfd, msg, 0); +} while (retval 0 errno == EINTR); + +if (retval = 0) { +return retval; +} + +if (data != *(int *)buf) { +*passed_fd = data; +return 0; +} Again cast issues + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) { +continue; +} + +memcpy(passed_fd, CMSG_DATA(cmsg), 
sizeof(*passed_fd)); +return 0; +} + +*passed_fd = -ENFILE; +return retval; +} -- Regards, Daniel -- Lei
Re: [Qemu-devel] [PATCH resend 0/6 RFC] Provide common methods for exchange FD
Any comments? On 01/08/2014 05:12 PM, Lei Li wrote: This patch series refactors the functions used to exchange file descriptors (FDs) in the current code and provides common methods for that purpose. The series is based on the "localhost migration with side channel for ram" series, as that one was already in good shape. But if you want to merge this first, I'll get rid of the migration part. I tested page-flipping migration and, briefly, tap/bridge-helper, but I have an environment problem with the proxy fs driver, so I would appreciate it if someone could help verify whether the series has any impact on it. :) Please let me know if anything needs to be improved. Thanks. Lei Li (6): fd-exchange: provide common methods for exchange of fd qemu-bridge-helper: replace send_fd with qemu_send_with_fd net/tap: replace recv_fd with qemu_recv_with_fd virtfs-proxy-helper: replace send_fd with qemu_send_with_fd virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd migration-local: replace send_pipefd with qemu_send_with_fd Makefile|2 +- fsdev/virtfs-proxy-helper.c | 51 --- hw/9pfs/virtio-9p-proxy.c | 60 +- hw/9pfs/virtio-9p-proxy.h |5 -- include/qemu/fd-exchange.h | 25 +++ migration-local.c | 52 +-- net/tap.c | 40 + qemu-bridge-helper.c| 31 + util/Makefile.objs |1 + util/qemu-fd-exchange.c | 97 +++ 10 files changed, 144 insertions(+), 220 deletions(-) create mode 100644 include/qemu/fd-exchange.h create mode 100644 util/qemu-fd-exchange.c -- Lei
Re: [Qemu-devel] [PATCH 5/6] virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd
On 01/16/2014 06:16 PM, Daniel P. Berrange wrote: On Wed, Jan 08, 2014 at 05:12:55PM +0800, Lei Li wrote: Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- hw/9pfs/virtio-9p-proxy.c | 60 ++-- 1 files changed, 3 insertions(+), 57 deletions(-) diff --git a/hw/9pfs/virtio-9p-proxy.c b/hw/9pfs/virtio-9p-proxy.c index 5f44bb7..f34b845 100644 --- a/hw/9pfs/virtio-9p-proxy.c +++ b/hw/9pfs/virtio-9p-proxy.c -do { -retval = recvmsg(sockfd, msg, 0); -} while (retval 0 errno == EINTR); -if (retval = 0) { -return retval; -} -/* - * data is set to V9FS_FD_VALID, if ancillary data is sent. If this - * request doesn't need ancillary data (fd) or an error occurred, - * data is set to negative errno value. - */ -if (data != V9FS_FD_VALID) { -*status = data; -return 0; -} This code is handling the 'data' value... @@ -307,6 +252,7 @@ static int v9fs_request(V9fsProxy *proxy, int type, V9fsString *name, *value; V9fsString *path, *oldpath; struct iovec *iovec = NULL, *reply = NULL; +int data = V9FS_FD_VALID; qemu_mutex_lock(proxy-mutex); @@ -548,7 +494,7 @@ static int v9fs_request(V9fsProxy *proxy, int type, * A file descriptor is returned as response for * T_OPEN,T_CREATE on success */ -if (v9fs_receivefd(proxy-sockfd, retval) 0) { +if (qemu_recv_with_fd(proxy-sockfd, retval, data, sizeof(data)) 0) { goto close_error; } ...but this code is ignoring the return value in 'data'. It is not ignored. The above logic has been moved into the common method, like this: if (data != *(int *)buf) { *passed_fd = data; return 0; } Daniel -- Lei
Re: [Qemu-devel] [PATCH 1/6] qemu-fd-exchange: provide common methods for exchange fd
On 01/16/2014 11:16 PM, Eric Blake wrote: On 01/08/2014 02:12 AM, Lei Li wrote: Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/qemu/fd-exchange.h | 25 +++ util/Makefile.objs |1 + util/qemu-fd-exchange.c| 97 3 files changed, 123 insertions(+), 0 deletions(-) create mode 100644 include/qemu/fd-exchange.h create mode 100644 util/qemu-fd-exchange.c diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h new file mode 100644 index 000..6929026 --- /dev/null +++ b/include/qemu/fd-exchange.h @@ -0,0 +1,25 @@ +/* + * Internel common methods for exchange of FD + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. Any reason you can't use GPLv2+? Limiting to exactly version 2 means your file cannot be copied into programs that want a wider array of licensing options. Er... that was a copy-paste mistake on my part; my apologies. :( -- Lei
Re: [Qemu-devel] [PATCH 4/6] virtfs-proxy-helper: replace send_fd with qemu_send_with_fd
On 01/16/2014 06:15 PM, Daniel P. Berrange wrote: On Wed, Jan 08, 2014 at 05:12:54PM +0800, Lei Li wrote: Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- fsdev/virtfs-proxy-helper.c | 51 ++ hw/9pfs/virtio-9p-proxy.h |5 2 files changed, 8 insertions(+), 48 deletions(-) diff --git a/fsdev/virtfs-proxy-helper.c b/fsdev/virtfs-proxy-helper.c index 713a7b2..44c6e61 100644 --- a/fsdev/virtfs-proxy-helper.c +++ b/fsdev/virtfs-proxy-helper.c -static int send_fd(int sockfd, int fd) -{ ... -/* No ancillary data on error */ -if (fd 0) { -/* fd is really negative errno if the request failed */ -data = fd; -} else { -data = V9FS_FD_VALID; The way data is initialized here... @@ -784,11 +743,17 @@ static void usage(char *prog) static int process_reply(int sock, int type, struct iovec *out_iovec, int retval) { +int data = V9FS_FD_VALID; Doesn't match what you do here. Well, it looks like it does not match the original order because 'data' first has to be passed to the common method through the *buf parameter, since different callers use different check values. But the logic is the same: if passed_fd is negative, 'data' is set to the negative fd; otherwise it keeps the check value. + switch (type) { case T_OPEN: case T_CREATE: -if (send_fd(sock, retval) 0) { +if (qemu_send_with_fd(sock, retval, data, sizeof(data)) 0) { return -1; +} else { +if (retval = 0) { +close(retval); +} } break; case T_MKNOD: Regards, Daniel -- Lei
Re: [Qemu-devel] [PATCH 1/6] qemu-fd-exchange: provide common methods for exchange fd
On 01/16/2014 11:26 PM, Eric Blake wrote: On 01/08/2014 02:12 AM, Lei Li wrote: Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/qemu/fd-exchange.h | 25 +++ util/Makefile.objs |1 + util/qemu-fd-exchange.c| 97 3 files changed, 123 insertions(+), 0 deletions(-) create mode 100644 include/qemu/fd-exchange.h create mode 100644 util/qemu-fd-exchange.c diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h new file mode 100644 index 000..6929026 --- /dev/null +++ b/include/qemu/fd-exchange.h @@ -0,0 +1,25 @@ +/* + * Internel common methods for exchange of FD s/Internel/Internal/ +++ b/util/qemu-fd-exchange.c @@ -0,0 +1,97 @@ +/* + * Internel common methods for exchange of FD and again. Good catch! Thanks. +ssize_t qemu_send_with_fd(int sockfd, int passed_fd, + const void *buf, size_t len) +{ +struct msghdr msg; +struct iovec iov; +struct cmsghdr *cmsg; +union MsgControl msg_control; +int retval; + +iov.iov_base = (int *)buf; +iov.iov_len = len; + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = len; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +if (passed_fd 0) { +*(int *)buf = passed_fd; Is it safe to assume that buf is aligned well enough to be casting it to int* then dereferencing it? Why not just type the parameter correctly That's because there would be different type for this parameter. to begin with? And why are you even writing into the caller's buffer when they pass a negative fd, but leaving it alone when they pass a non-negative fd? That's just the original logical of exchange fd for proxy fs driver, if (fd 0) { data = fd; } else { data = V9FS_FD_VALID; } This common method don't leave it alone when a non-negative fd passed, it'll be the same as the check value passed from the caller. 
+ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd, + void *buf, size_t len) +{ +struct iovec iov; +struct msghdr msg; +struct cmsghdr *cmsg; +union MsgControl msg_control; +int retval; +int data = *(int *)buf; Again, why not type buf correctly, since otherwise you risk a user passing in a buffer that is unsuitably aligned for dereferencing as an int pointer. + +iov.iov_base = buf; +iov.iov_len = len; + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + Should you take advantage of Linux' ability to use MSG_CMSG_CLOEXEC to guarantee the received fd is atomically marked cloexec when possible? Whether close the fd in the common method depends on the process of these current users (they are not the same). It'd be better to let the users handling the close of fd to fit it. +do { +retval = recvmsg(sockfd, msg, 0); +} while (retval 0 errno == EINTR); + +if (retval = 0) { +return retval; +} + +if (data != *(int *)buf) { +*passed_fd = data; +return 0; +} + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) { +continue; +} + +memcpy(passed_fd, CMSG_DATA(cmsg), sizeof(*passed_fd)); +return 0; +} And even when MSG_CMSG_CLOEXEC is not available, shouldn't you ensure that cloexec is set after the fact? That's a good suggestion, thanks. -- Lei
[Qemu-devel] [PATCH 2/6] qemu-bridge-helper: replace send_fd with qemu_send_with_fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- Makefile |2 +- qemu-bridge-helper.c | 31 +++ 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index bdff4e4..6850f35 100644 --- a/Makefile +++ b/Makefile @@ -195,7 +195,7 @@ qemu-img$(EXESUF): qemu-img.o $(block-obj-y) libqemuutil.a libqemustub.a qemu-nbd$(EXESUF): qemu-nbd.o $(block-obj-y) libqemuutil.a libqemustub.a qemu-io$(EXESUF): qemu-io.o $(block-obj-y) libqemuutil.a libqemustub.a -qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o +qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o libqemuutil.a fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/virtio-9p-marshal.o libqemuutil.a libqemustub.a fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap diff --git a/qemu-bridge-helper.c b/qemu-bridge-helper.c index 6a0974e..8303b6b 100644 --- a/qemu-bridge-helper.c +++ b/qemu-bridge-helper.c @@ -40,6 +40,7 @@ #endif #include qemu/queue.h +#include qemu/fd-exchange.h #include net/tap-linux.h @@ -174,33 +175,6 @@ static void prep_ifreq(struct ifreq *ifr, const char *ifname) snprintf(ifr-ifr_name, IFNAMSIZ, %s, ifname); } -static int send_fd(int c, int fd) -{ -char msgbuf[CMSG_SPACE(sizeof(fd))]; -struct msghdr msg = { -.msg_control = msgbuf, -.msg_controllen = sizeof(msgbuf), -}; -struct cmsghdr *cmsg; -struct iovec iov; -char req[1] = { 0x00 }; - -cmsg = CMSG_FIRSTHDR(msg); -cmsg-cmsg_level = SOL_SOCKET; -cmsg-cmsg_type = SCM_RIGHTS; -cmsg-cmsg_len = CMSG_LEN(sizeof(fd)); -msg.msg_controllen = cmsg-cmsg_len; - -iov.iov_base = req; -iov.iov_len = sizeof(req); - -msg.msg_iov = iov; -msg.msg_iovlen = 1; -memcpy(CMSG_DATA(cmsg), fd, sizeof(fd)); - -return sendmsg(c, msg, 0); -} - #ifdef CONFIG_LIBCAP static int drop_privileges(void) { @@ -239,6 +213,7 @@ int main(int argc, char **argv) ACLList acl_list; int access_allowed, access_denied; int ret = EXIT_SUCCESS; +char req[1] = { 0x00 }; #ifdef CONFIG_LIBCAP /* if we're run from an suid binary, immediately drop privileges 
preserving @@ -424,7 +399,7 @@ int main(int argc, char **argv) } /* write fd to the domain socket */ -if (send_fd(unixfd, fd) == -1) { +if (qemu_send_with_fd(unixfd, fd, req, sizeof(req)) == -1) { fprintf(stderr, failed to write fd to unix socket: %s\n, strerror(errno)); ret = EXIT_FAILURE; -- 1.7.7.6
[Qemu-devel] [PATCH resend 0/6 RFC] Provide common methods for exchange FD
This patch series refactors the functions used to exchange file descriptors (FDs) in the current code and provides common methods for that purpose. The series is based on the "localhost migration with side channel for ram" series, as that one was already in good shape. But if you want to merge this first, I'll get rid of the migration part. I tested page-flipping migration and, briefly, tap/bridge-helper, but I have an environment problem with the proxy fs driver, so I would appreciate it if someone could help verify whether the series has any impact on it. :) Please let me know if anything needs to be improved. Thanks. Lei Li (6): fd-exchange: provide common methods for exchange of fd qemu-bridge-helper: replace send_fd with qemu_send_with_fd net/tap: replace recv_fd with qemu_recv_with_fd virtfs-proxy-helper: replace send_fd with qemu_send_with_fd virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd migration-local: replace send_pipefd with qemu_send_with_fd Makefile|2 +- fsdev/virtfs-proxy-helper.c | 51 --- hw/9pfs/virtio-9p-proxy.c | 60 +- hw/9pfs/virtio-9p-proxy.h |5 -- include/qemu/fd-exchange.h | 25 +++ migration-local.c | 52 +-- net/tap.c | 40 + qemu-bridge-helper.c| 31 + util/Makefile.objs |1 + util/qemu-fd-exchange.c | 97 +++ 10 files changed, 144 insertions(+), 220 deletions(-) create mode 100644 include/qemu/fd-exchange.h create mode 100644 util/qemu-fd-exchange.c
[Qemu-devel] [PATCH 4/6] virtfs-proxy-helper: replace send_fd with qemu_send_with_fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- fsdev/virtfs-proxy-helper.c | 51 ++ hw/9pfs/virtio-9p-proxy.h |5 2 files changed, 8 insertions(+), 48 deletions(-) diff --git a/fsdev/virtfs-proxy-helper.c b/fsdev/virtfs-proxy-helper.c index 713a7b2..44c6e61 100644 --- a/fsdev/virtfs-proxy-helper.c +++ b/fsdev/virtfs-proxy-helper.c @@ -23,6 +23,7 @@ #include qemu-common.h #include qemu/sockets.h #include qemu/xattr.h +#include qemu/fd-exchange.h #include virtio-9p-marshal.h #include hw/9pfs/virtio-9p-proxy.h #include fsdev/virtio-9p-marshal.h @@ -203,48 +204,6 @@ static int read_request(int sockfd, struct iovec *iovec, ProxyHeader *header) return 0; } -static int send_fd(int sockfd, int fd) -{ -struct msghdr msg; -struct iovec iov; -int retval, data; -struct cmsghdr *cmsg; -union MsgControl msg_control; - -iov.iov_base = data; -iov.iov_len = sizeof(data); - -memset(msg, 0, sizeof(msg)); -msg.msg_iov = iov; -msg.msg_iovlen = 1; -/* No ancillary data on error */ -if (fd 0) { -/* fd is really negative errno if the request failed */ -data = fd; -} else { -data = V9FS_FD_VALID; -msg.msg_control = msg_control; -msg.msg_controllen = sizeof(msg_control); - -cmsg = msg_control.cmsg; -cmsg-cmsg_len = CMSG_LEN(sizeof(fd)); -cmsg-cmsg_level = SOL_SOCKET; -cmsg-cmsg_type = SCM_RIGHTS; -memcpy(CMSG_DATA(cmsg), fd, sizeof(fd)); -} - -do { -retval = sendmsg(sockfd, msg, 0); -} while (retval 0 errno == EINTR); -if (fd = 0) { -close(fd); -} -if (retval 0) { -return retval; -} -return 0; -} - static int send_status(int sockfd, struct iovec *iovec, int status) { ProxyHeader header; @@ -784,11 +743,17 @@ static void usage(char *prog) static int process_reply(int sock, int type, struct iovec *out_iovec, int retval) { +int data = V9FS_FD_VALID; + switch (type) { case T_OPEN: case T_CREATE: -if (send_fd(sock, retval) 0) { +if (qemu_send_with_fd(sock, retval, data, sizeof(data)) 0) { return -1; +} else { +if (retval = 0) { +close(retval); +} } break; case T_MKNOD: diff --git 
a/hw/9pfs/virtio-9p-proxy.h b/hw/9pfs/virtio-9p-proxy.h index 005c1ad..e359ac5 100644 --- a/hw/9pfs/virtio-9p-proxy.h +++ b/hw/9pfs/virtio-9p-proxy.h @@ -24,11 +24,6 @@ #define proxy_marshal(out_sg, offset, fmt, args...) \ v9fs_marshal(out_sg, 1, offset, 0, fmt, ##args) -union MsgControl { -struct cmsghdr cmsg; -char control[CMSG_SPACE(sizeof(int))]; -}; - typedef struct { uint32_t type; uint32_t size; -- 1.7.7.6
[Qemu-devel] [PATCH 5/6] virtio-9p-proxy: replace v9fs_receivefd with qemu_recv_with_fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- hw/9pfs/virtio-9p-proxy.c | 60 ++-- 1 files changed, 3 insertions(+), 57 deletions(-) diff --git a/hw/9pfs/virtio-9p-proxy.c b/hw/9pfs/virtio-9p-proxy.c index 5f44bb7..f34b845 100644 --- a/hw/9pfs/virtio-9p-proxy.c +++ b/hw/9pfs/virtio-9p-proxy.c @@ -14,6 +14,7 @@ #include hw/virtio/virtio.h #include virtio-9p.h #include qemu/error-report.h +#include qemu/fd-exchange.h #include fsdev/qemu-fsdev.h #include virtio-9p-proxy.h @@ -24,62 +25,6 @@ typedef struct V9fsProxy { struct iovec out_iovec; } V9fsProxy; -/* - * Return received file descriptor on success in *status. - * errno is also returned on *status (which will be 0) - * return 0 on transport error. - */ -static int v9fs_receivefd(int sockfd, int *status) -{ -struct iovec iov; -struct msghdr msg; -struct cmsghdr *cmsg; -int retval, data, fd; -union MsgControl msg_control; - -iov.iov_base = data; -iov.iov_len = sizeof(data); - -memset(msg, 0, sizeof(msg)); -msg.msg_iov = iov; -msg.msg_iovlen = 1; -msg.msg_control = msg_control; -msg.msg_controllen = sizeof(msg_control); - -do { -retval = recvmsg(sockfd, msg, 0); -} while (retval 0 errno == EINTR); -if (retval = 0) { -return retval; -} -/* - * data is set to V9FS_FD_VALID, if ancillary data is sent. If this - * request doesn't need ancillary data (fd) or an error occurred, - * data is set to negative errno value. - */ -if (data != V9FS_FD_VALID) { -*status = data; -return 0; -} -/* - * File descriptor (fd) is sent in the ancillary data. Check if we - * indeed received it. One of the reasons to fail to receive it is if - * we exceeded the maximum number of file descriptors! 
- */ -for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { -if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || -cmsg-cmsg_level != SOL_SOCKET || -cmsg-cmsg_type != SCM_RIGHTS) { -continue; -} -fd = *((int *)CMSG_DATA(cmsg)); -*status = fd; -return 0; -} -*status = -ENFILE; /* Ancillary data sent but not received */ -return 0; -} - static ssize_t socket_read(int sockfd, void *buff, size_t size) { ssize_t retval, total = 0; @@ -307,6 +252,7 @@ static int v9fs_request(V9fsProxy *proxy, int type, V9fsString *name, *value; V9fsString *path, *oldpath; struct iovec *iovec = NULL, *reply = NULL; +int data = V9FS_FD_VALID; qemu_mutex_lock(proxy-mutex); @@ -548,7 +494,7 @@ static int v9fs_request(V9fsProxy *proxy, int type, * A file descriptor is returned as response for * T_OPEN,T_CREATE on success */ -if (v9fs_receivefd(proxy-sockfd, retval) 0) { +if (qemu_recv_with_fd(proxy-sockfd, retval, data, sizeof(data)) 0) { goto close_error; } break; -- 1.7.7.6
[Qemu-devel] [PATCH 6/6] migration-local: replace send_pipefd with qemu_send_with_fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 52 +++- 1 files changed, 3 insertions(+), 49 deletions(-) diff --git a/migration-local.c b/migration-local.c index ce4c070..c01ba06 100644 --- a/migration-local.c +++ b/migration-local.c @@ -26,6 +26,7 @@ #include sysemu/sysemu.h #include block/block.h #include qemu/sockets.h +#include qemu/fd-exchange.h #include migration/block.h #include qemu/thread.h #include qmp-commands.h @@ -169,8 +170,6 @@ static int qemu_local_close(void *opaque) return 0; } -static int send_pipefd(int sockfd, int pipefd); - static size_t qemu_local_save_ram(QEMUFile *f, void *opaque, MemoryRegion *mr, ram_addr_t offset, size_t size, int *bytes_sent) @@ -179,13 +178,14 @@ static size_t qemu_local_save_ram(QEMUFile *f, void *opaque, ram_addr_t current_addr = mr-ram_addr + offset; void *ram_addr; ssize_t ret; +char req[1] = { 0x01 }; if (s-unix_page_flipping) { qemu_put_be64(s-file, current_addr | RAM_SAVE_FLAG_HOOK); qemu_fflush(s-file); if (!s-pipefd_passed) { -ret = send_pipefd(s-sockfd, s-pipefd[0]); +ret = qemu_send_with_fd(s-sockfd, s-pipefd[0], req, sizeof(req)); if (ret 0) { fprintf(stderr, failed to pass PIPE\n); return ret; @@ -342,49 +342,3 @@ fail: g_free(s); return NULL; } - - -/* - * Pass a pipe file descriptor to another process. - * - * Return negative value If pipefd 0. Return 0 on - * success. 
- * - */ -static int send_pipefd(int sockfd, int pipefd) -{ -struct msghdr msg; -struct iovec iov[1]; -ssize_t ret; -char req[1] = { 0x01 }; - -union { - struct cmsghdr cm; - char control[CMSG_SPACE(sizeof(int))]; -} control_un; -struct cmsghdr *cmptr; - -msg.msg_control = control_un.control; -msg.msg_controllen = sizeof(control_un.control); - -cmptr = CMSG_FIRSTHDR(msg); -cmptr-cmsg_len = CMSG_LEN(sizeof(int)); -cmptr-cmsg_level = SOL_SOCKET; -cmptr-cmsg_type = SCM_RIGHTS; -*((int *) CMSG_DATA(cmptr)) = pipefd; - -msg.msg_name = NULL; -msg.msg_namelen = 0; - -iov[0].iov_base = req; -iov[0].iov_len = sizeof(req); -msg.msg_iov = iov; -msg.msg_iovlen = 1; - -ret = sendmsg(sockfd, msg, 0); -if (ret = 0) { -DPRINTF(sendmsg error: %s\n, strerror(errno)); -} - -return ret; -} -- 1.7.7.6
[Qemu-devel] [PATCH 1/6] qemu-fd-exchange: provide common methods for exchange fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/qemu/fd-exchange.h | 25 +++ util/Makefile.objs |1 + util/qemu-fd-exchange.c| 97 3 files changed, 123 insertions(+), 0 deletions(-) create mode 100644 include/qemu/fd-exchange.h create mode 100644 util/qemu-fd-exchange.c diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h new file mode 100644 index 000..6929026 --- /dev/null +++ b/include/qemu/fd-exchange.h @@ -0,0 +1,25 @@ +/* + * Internal common methods for exchange of FD + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef FD_EXCHANGE_H +#define FD_EXCHANGE_H + +#include sys/socket.h + +union MsgControl { +struct cmsghdr cmsg; +char control[CMSG_SPACE(sizeof(int))]; +}; + +ssize_t qemu_send_with_fd(int sockfd, int passed_fd, + const void *buf, size_t len); + +ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd, + void *buf, size_t len); + +#endif diff --git a/util/Makefile.objs b/util/Makefile.objs index af3e5cb..2fb42bf 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -13,3 +13,4 @@ util-obj-y += hexdump.o util-obj-y += crc32c.o util-obj-y += throttle.o util-obj-y += getauxval.o +util-obj-y += qemu-fd-exchange.o diff --git a/util/qemu-fd-exchange.c b/util/qemu-fd-exchange.c new file mode 100644 index 000..70a3206 --- /dev/null +++ b/util/qemu-fd-exchange.c @@ -0,0 +1,97 @@ +/* + * Internal common methods for exchange of FD + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include qemu/fd-exchange.h +#include qemu-common.h + + +ssize_t qemu_send_with_fd(int sockfd, int passed_fd, + const void *buf, size_t len) +{ +struct msghdr msg; +struct iovec iov; +struct cmsghdr *cmsg; +union MsgControl msg_control; +int retval; + +iov.iov_base = (int *)buf; +iov.iov_len = len; + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = len; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +if (passed_fd 0) { +*(int *)buf = passed_fd; +} else { +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +cmsg = msg_control.cmsg; +cmsg-cmsg_len = CMSG_LEN(sizeof(passed_fd)); +cmsg-cmsg_level = SOL_SOCKET; +cmsg-cmsg_type = SCM_RIGHTS; +memcpy(CMSG_DATA(cmsg), passed_fd, sizeof(passed_fd)); + +} + +do { +retval = sendmsg(sockfd, msg, 0); +} while (retval 0 errno == EINTR); + +return retval; +} + +ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd, + void *buf, size_t len) +{ +struct iovec iov; +struct msghdr msg; +struct cmsghdr *cmsg; +union MsgControl msg_control; +int retval; +int data = *(int *)buf; + +iov.iov_base = buf; +iov.iov_len = len; + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +do { +retval = recvmsg(sockfd, msg, 0); +} while (retval 0 errno == EINTR); + +if (retval = 0) { +return retval; +} + +if (data != *(int *)buf) { +*passed_fd = data; +return 0; +} + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) { +continue; +} + +memcpy(passed_fd, CMSG_DATA(cmsg), sizeof(*passed_fd)); +return 0; +} + +*passed_fd = -ENFILE; +return retval; +} -- 1.7.7.6
[Qemu-devel] [PATCH 3/6] net/tap: replace recv_fd with qemu_recv_with_fd
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- net/tap.c | 40 +++- 1 files changed, 3 insertions(+), 37 deletions(-) diff --git a/net/tap.c b/net/tap.c index 39c1cda..97ee2e8 100644 --- a/net/tap.c +++ b/net/tap.c @@ -39,6 +39,7 @@ #include sysemu/sysemu.h #include qemu-common.h #include qemu/error-report.h +#include qemu/fd-exchange.h #include net/tap.h @@ -385,40 +386,6 @@ static int launch_script(const char *setup_script, const char *ifname, int fd) return -1; } -static int recv_fd(int c) -{ -int fd; -uint8_t msgbuf[CMSG_SPACE(sizeof(fd))]; -struct msghdr msg = { -.msg_control = msgbuf, -.msg_controllen = sizeof(msgbuf), -}; -struct cmsghdr *cmsg; -struct iovec iov; -uint8_t req[1]; -ssize_t len; - -cmsg = CMSG_FIRSTHDR(msg); -cmsg-cmsg_level = SOL_SOCKET; -cmsg-cmsg_type = SCM_RIGHTS; -cmsg-cmsg_len = CMSG_LEN(sizeof(fd)); -msg.msg_controllen = cmsg-cmsg_len; - -iov.iov_base = req; -iov.iov_len = sizeof(req); - -msg.msg_iov = iov; -msg.msg_iovlen = 1; - -len = recvmsg(c, msg, 0); -if (len 0) { -memcpy(fd, CMSG_DATA(cmsg), sizeof(fd)); -return fd; -} - -return len; -} - static int net_bridge_run_helper(const char *helper, const char *bridge) { sigset_t oldmask, mask; @@ -489,12 +456,11 @@ static int net_bridge_run_helper(const char *helper, const char *bridge) } else if (pid 0) { int fd; +char req[1] = { 0x00 }; close(sv[1]); -do { -fd = recv_fd(sv[0]); -} while (fd == -1 errno == EINTR); +qemu_recv_with_fd(sv[0], fd, req, sizeof(req)); close(sv[0]); -- 1.7.7.6
Re: [Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()
On 12/02/2013 05:33 PM, Daniel P. Berrange wrote: On Mon, Dec 02, 2013 at 05:19:06PM +0800, Lei Li wrote: This patch adds send_pipefd() to pass the pipe file descriptor to destination process. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 46 ++ 1 files changed, 46 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 929ed60..f479530 100644 --- a/migration-local.c +++ b/migration-local.c @@ -167,3 +167,49 @@ fail: g_free(s); return NULL; } + + +/* + * Pass a pipe file descriptor to another process. + * + * Return negative value If pipefd 0. Return 0 on + * success. + * + */ +static int send_pipefd(int sockfd, int pipefd) +{ +struct msghdr msg; +struct iovec iov[1]; +ssize_t ret; +char req[1] = { 0x01 }; + +union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; +} control_un; +struct cmsghdr *cmptr; + +msg.msg_control = control_un.control; +msg.msg_controllen = sizeof(control_un.control); + +cmptr = CMSG_FIRSTHDR(msg); +cmptr-cmsg_len = CMSG_LEN(sizeof(int)); +cmptr-cmsg_level = SOL_SOCKET; +cmptr-cmsg_type = SCM_RIGHTS; +*((int *) CMSG_DATA(cmptr)) = pipefd; + +msg.msg_name = NULL; +msg.msg_namelen = 0; + +iov[0].iov_base = req; +iov[0].iov_len = sizeof(req); +msg.msg_iov = iov; +msg.msg_iovlen = 1; + +ret = sendmsg(sockfd, msg, 0); +if (ret = 0) { +DPRINTF(sendmsg error: %s\n, strerror(errno)); +} + +return ret; +} Just a reminder about my comments from previous posting. This is introducing a 3rd private function for sending FDs. The existing code should be refactored into qemu-socket.{c,h} and shared. Hi Daniel, Yes, I remembered your suggestion. As my reply in the previous version, I'll make this refactoring in a separate thread. There are some differences between these private functions (like data type and length of bytes transmitted), may need a little time to get the common method settle down, and would be better to do some test to make sure there is no impact on them. 
And now this is a complete series as an experimental version, do you mind if the refactoring would be posted after this series? Daniel -- Lei
Re: [Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()
On 12/03/2013 07:35 PM, Daniel P. Berrange wrote: On Tue, Dec 03, 2013 at 07:19:40PM +0800, Lei Li wrote: On 12/02/2013 05:33 PM, Daniel P. Berrange wrote: On Mon, Dec 02, 2013 at 05:19:06PM +0800, Lei Li wrote: This patch adds send_pipefd() to pass the pipe file descriptor to destination process. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 46 ++ 1 files changed, 46 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 929ed60..f479530 100644 --- a/migration-local.c +++ b/migration-local.c @@ -167,3 +167,49 @@ fail: g_free(s); return NULL; } + + +/* + * Pass a pipe file descriptor to another process. + * + * Return negative value If pipefd 0. Return 0 on + * success. + * + */ +static int send_pipefd(int sockfd, int pipefd) +{ +struct msghdr msg; +struct iovec iov[1]; +ssize_t ret; +char req[1] = { 0x01 }; + +union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; +} control_un; +struct cmsghdr *cmptr; + +msg.msg_control = control_un.control; +msg.msg_controllen = sizeof(control_un.control); + +cmptr = CMSG_FIRSTHDR(msg); +cmptr-cmsg_len = CMSG_LEN(sizeof(int)); +cmptr-cmsg_level = SOL_SOCKET; +cmptr-cmsg_type = SCM_RIGHTS; +*((int *) CMSG_DATA(cmptr)) = pipefd; + +msg.msg_name = NULL; +msg.msg_namelen = 0; + +iov[0].iov_base = req; +iov[0].iov_len = sizeof(req); +msg.msg_iov = iov; +msg.msg_iovlen = 1; + +ret = sendmsg(sockfd, msg, 0); +if (ret = 0) { +DPRINTF(sendmsg error: %s\n, strerror(errno)); +} + +return ret; +} Just a reminder about my comments from previous posting. This is introducing a 3rd private function for sending FDs. The existing code should be refactored into qemu-socket.{c,h} and shared. Hi Daniel, Yes, I remembered your suggestion. As my reply in the previous version, I'll make this refactoring in a separate thread. 
There are some differences between these private functions (like data type and length of bytes transmitted), may need a little time to get the common method settle down, and would be better to do some test to make sure there is no impact on them. And now this is a complete series as an experimental version, do you mind if the refactoring would be posted after this series? IMHO the refactoring should be a pre-requisite of this series. I've seen too many times where future refactoring was promised but never arrived because the motivation to fix it is gone once the main series is committed. It is up to QEMU maintainers though - this is just my personal opinion. It's just that this is already in good shape, and the refactoring may need a little more time, since some details might need to be considered and would be better discussed in a separate thread. I am happy to take any chance to contribute to the community, as I can learn a lot from you guys and it's a really good experience that my work could be useful to lots of people. And I believe this is not my last patch for it. :) Regards, Daniel -- Lei
Re: [Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()
On 12/03/2013 07:52 PM, Paolo Bonzini wrote: Il 03/12/2013 12:19, Lei Li ha scritto: On 12/02/2013 05:33 PM, Daniel P. Berrange wrote: On Mon, Dec 02, 2013 at 05:19:06PM +0800, Lei Li wrote: This patch adds send_pipefd() to pass the pipe file descriptor to destination process. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 46 ++ 1 files changed, 46 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 929ed60..f479530 100644 --- a/migration-local.c +++ b/migration-local.c @@ -167,3 +167,49 @@ fail: g_free(s); return NULL; } + + +/* + * Pass a pipe file descriptor to another process. + * + * Return negative value If pipefd 0. Return 0 on + * success. + * + */ +static int send_pipefd(int sockfd, int pipefd) +{ +struct msghdr msg; +struct iovec iov[1]; +ssize_t ret; +char req[1] = { 0x01 }; + +union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; +} control_un; +struct cmsghdr *cmptr; + +msg.msg_control = control_un.control; +msg.msg_controllen = sizeof(control_un.control); + +cmptr = CMSG_FIRSTHDR(msg); +cmptr-cmsg_len = CMSG_LEN(sizeof(int)); +cmptr-cmsg_level = SOL_SOCKET; +cmptr-cmsg_type = SCM_RIGHTS; +*((int *) CMSG_DATA(cmptr)) = pipefd; + +msg.msg_name = NULL; +msg.msg_namelen = 0; + +iov[0].iov_base = req; +iov[0].iov_len = sizeof(req); +msg.msg_iov = iov; +msg.msg_iovlen = 1; + +ret = sendmsg(sockfd, msg, 0); +if (ret = 0) { +DPRINTF(sendmsg error: %s\n, strerror(errno)); +} + +return ret; +} Just a reminder about my comments from previous posting. This is introducing a 3rd private function for sending FDs. The existing code should be refactored into qemu-socket.{c,h} and shared. Hi Daniel, Yes, I remembered your suggestion. As my reply in the previous version, I'll make this refactoring in a separate thread. 
There are some differences between these private functions (like data type and length of bytes transmitted), may need a little time to get the common method settle down, and would be better to do some test to make sure there is no impact on them. You would have to implement it in such a way that the buffer is specified in the function, for example: ssize_t qemu_send_with_fd(int sockfd, int passed_fd, const void *buf, size_t len); ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd, void *buf, size_t len); The functions can go in util/ (I think not in qemu-socket.c, a new file is preferable). I don't think it's particularly important, but it's definitely welcome. Hi Paolo, Thanks for your specific suggestion! As it needs to test the related code (tap/bridge, Proxy FS, flipping migration), I will work on it after I am back from my vacation next week. :-) Paolo -- Lei
Re: [Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()
On 11/29/2013 07:14 PM, Daniel P. Berrange wrote: On Fri, Nov 29, 2013 at 06:06:13PM +0800, Lei Li wrote: This patch adds send_pipefd() to pass the pipe file descriptor to destination process. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 46 ++ 1 files changed, 46 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 929ed60..f479530 100644 --- a/migration-local.c +++ b/migration-local.c @@ -167,3 +167,49 @@ fail: g_free(s); return NULL; } + + +/* + * Pass a pipe file descriptor to another process. + * + * Return negative value If pipefd 0. Return 0 on + * success. + * + */ +static int send_pipefd(int sockfd, int pipefd) +{ +struct msghdr msg; +struct iovec iov[1]; +ssize_t ret; +char req[1] = { 0x01 }; + +union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; +} control_un; +struct cmsghdr *cmptr; + +msg.msg_control = control_un.control; +msg.msg_controllen = sizeof(control_un.control); + +cmptr = CMSG_FIRSTHDR(msg); +cmptr-cmsg_len = CMSG_LEN(sizeof(int)); +cmptr-cmsg_level = SOL_SOCKET; +cmptr-cmsg_type = SCM_RIGHTS; +*((int *) CMSG_DATA(cmptr)) = pipefd; + +msg.msg_name = NULL; +msg.msg_namelen = 0; + +iov[0].iov_base = req; +iov[0].iov_len = sizeof(req); +msg.msg_iov = iov; +msg.msg_iovlen = 1; + +ret = sendmsg(sockfd, msg, 0); +if (ret = 0) { +DPRINTF(sendmsg error: %s\n, strerror(errno)); +} + +return ret; +} There are already two copies of this function in QEMU, not to mention several copies of code for receving FDs. Rather than adding yet more copies of this functionality it would be much better to add 2 methods to util/qemu-sockets.{c,h} for sending and receiving file descriptors and update all existing code to use them. Hi Daniel, Make sense, sounds like a good plan to me. Just take a quick look, seems there are some differences between them, I will have a try in a separate thread after back from my vacation next week. Thanks for your suggestion. Daniel -- Lei
Re: [Qemu-devel] [PATCH 0/17 v4] Localhost migration with side channel for ram
On 11/29/2013 06:26 PM, Paolo Bonzini wrote: Il 29/11/2013 11:06, Lei Li ha scritto: This patch series tries to introduce a mechanism using side channel pipe for RAM via SCM_RIGHTS with unix domain socket protocol migration. This side channel is used for the page flipping by vmsplice, which is the internal mechanism for localhost migration that we are trying to add to QEMU. The background info and previous patch series for reference, Localhost migration http://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg02916.html migration: Introduce side channel for RAM http://lists.gnu.org/archive/html/qemu-devel/2013-09/msg04043.html I have picked patches from the localhost migration series and rebased it on the series of side channel, now it is a complete series that passed the basic test. One change: please rename the capability to x-unix-page-flipping for now. No need to rename the function migrate_unix_page_flipping(), only the capability name in qapi-schema.json (and references to the enum MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING). With that change, v5 will get Reviewed-by: Paolo Bonzini pbonz...@redhat.com. Hi Paolo, Will update soon, thanks! Thanks! Paolo Please let me know if there is anything that needs to be fixed or improved. Your suggestions and comments are very welcome, and thanks to Paolo for his continued review and useful suggestions. Changes since V3: Address comments from Paolo including: - Get rid of useless check in send_pipefd() and the override of before_ram_iterate, send pipefd in the first save_page call, qemu_get_byte() in the first ram_load correspondingly. - Add new argument ram_addr_t to hook_ram_load to cut half of the data transferred on the socket. - Add transition from 'debug' to 'memory-stale'. - Other minor fixes. Changes since V2: Address comments from Paolo including: - Doc improvement for QAPI. - Use callback get_buffer as the only receiver. 
- Rename the new RunState flipping-migrate to memory-stale, and add transition from 'prelaunch' to 'memory-stale'. - Other minor fixes. Changes since V1: Address suggestions from Paolo Bonzini including: - Use Unix socket QEMUFile as basis of code and adjust the way of overriding RDMA hooks. - Involve the vmsplice for page flipping. - Add new RunState RUN_STATE_FLIPPING_MIGRATE and add it to runstate_needs_reset() for the adjustment of the current migration process with page flipping. Lei Li (17): QAPI: introduce migration capability unix_page_flipping migration: add migrate_unix_page_flipping() qmp-commands.hx: add missing docs for migration capabilities migration-local: add QEMUFileLocal with socket based QEMUFile migration-local: introduce qemu_fopen_socket_local() migration-local: add send_pipefd() save_page: replace block_offset with a MemoryRegion migration-local: override save_page for page transmit savevm: adjust ram_control_save_page with page flipping add unix_msgfd_lookup() to callback get_buffer add argument ram_addr_t to hook_ram_load migration-local: override hook_ram_load migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local add new RunState RUN_STATE_MEMORY_STALE migration-unix: page flipping support on unix outgoing migration: adjust migration_thread() process for unix_page_flipping hmp: better format for info migrate_capabilities Makefile.target | 1 + arch_init.c | 4 +- migration-local.c | 512 ++ hmp.c | 5 +- include/migration/migration.h | 3 + include/migration/qemu-file.h | 2 + migration-unix.c | 27 ++- migration-rdma.c | 4 +- migration.c | 18 +- qapi-schema.json | 18 +- qmp-commands.hx | 8 + savevm.c | 21 +- vl.c | 12 +- 13 files changed, 617 insertions(+), 27 deletions(-) create mode 100644 migration-local.c -- Lei
[Qemu-devel] [PATCH 02/17] migration: add migrate_unix_page_flipping()
Add migrate_unix_page_flipping() to check if MIGRATION_CAPABILITY_X_UNIX_PAGE_FLIPPING is enabled. Reviewed-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/migration/migration.h |3 +++ migration.c |9 + 2 files changed, 12 insertions(+), 0 deletions(-) diff --git a/include/migration/migration.h b/include/migration/migration.h index 140e6b4..7e5d01a 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -131,10 +131,13 @@ void migrate_add_blocker(Error *reason); void migrate_del_blocker(Error *reason); bool migrate_rdma_pin_all(void); + bool migrate_zero_blocks(void); bool migrate_auto_converge(void); +bool migrate_unix_page_flipping(void); + int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, uint8_t *dst, int dlen); int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen); diff --git a/migration.c b/migration.c index 2b1ab20..e012cd4 100644 --- a/migration.c +++ b/migration.c @@ -541,6 +541,15 @@ int64_t migrate_xbzrle_cache_size(void) return s->xbzrle_cache_size; } +bool migrate_unix_page_flipping(void) +{ +MigrationState *s; + +s = migrate_get_current(); + +return s->enabled_capabilities[MIGRATION_CAPABILITY_X_UNIX_PAGE_FLIPPING]; +} + /* migration thread support */ static void *migration_thread(void *opaque) -- 1.7.7.6
[Qemu-devel] [PATCH 01/17] QAPI: introduce migration capability x_unix_page_flipping
Introduce x_unix_page_flipping to MigrationCapability for localhost migration. Signed-off-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qapi-schema.json | 12 +++- 1 files changed, 11 insertions(+), 1 deletions(-) diff --git a/qapi-schema.json b/qapi-schema.json index 83fa485..ea910ef 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -685,10 +685,20 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # +# @x-unix-page-flipping: If enabled, QEMU can optimize migration when the +# destination is a QEMU process that runs on the same host as +# the source (as is the case for live upgrade). If the migration +# transport is a Unix socket, QEMU will flip RAM pages directly to +# the destination, so that memory is only allocated twice for the +# source and destination processes. Disabled by default. +# Experimental: will get rid of the x tag after further testing with +# the new vmsplice. (since 2.0) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] } + 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks', + 'x-unix-page-flipping'] } ## # @MigrationCapabilityStatus -- 1.7.7.6
[Qemu-devel] [PATCH 04/17] migration-local: add QEMUFileLocal with socket based QEMUFile
This patch adds QEMUFileLocal with copy of socket based QEMUFile, will be used as the basis code for Unix socket protocol migration and page flipping migration. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- Makefile.target |1 + migration-local.c | 123 + 2 files changed, 124 insertions(+), 0 deletions(-) create mode 100644 migration-local.c diff --git a/Makefile.target b/Makefile.target index af6ac7e..aa09960 100644 --- a/Makefile.target +++ b/Makefile.target @@ -117,6 +117,7 @@ obj-$(CONFIG_KVM) += kvm-all.o obj-y += memory.o savevm.o cputlb.o obj-y += memory_mapping.o obj-y += dump.o +obj-y += migration-local.o LIBS+=$(libs_softmmu) # xen support diff --git a/migration-local.c b/migration-local.c new file mode 100644 index 000..ca01a20 --- /dev/null +++ b/migration-local.c @@ -0,0 +1,123 @@ +/* + * QEMU localhost migration with page flipping + * + * Copyright IBM, Corp. 2013 + * + * Authors: + * Lei Li li...@linux.vnet.ibm.com + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include config-host.h +#include qemu-common.h +#include migration/migration.h +#include exec/cpu-common.h +#include config.h +#include exec/cpu-all.h +#include exec/memory.h +#include exec/memory-internal.h +#include monitor/monitor.h +#include migration/qemu-file.h +#include qemu/iov.h +#include sysemu/arch_init.h +#include sysemu/sysemu.h +#include block/block.h +#include qemu/sockets.h +#include migration/block.h +#include qemu/thread.h +#include qmp-commands.h +#include trace.h +#include qemu/osdep.h + +//#define DEBUG_MIGRATION_LOCAL + +#ifdef DEBUG_MIGRATION_LOCAL +#define DPRINTF(fmt, ...) \ +do { printf(migration-local: fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) 
\ +do { } while (0) +#endif + + +typedef struct QEMUFileLocal { +QEMUFile *file; +int sockfd; +int pipefd[2]; +int pipefd_passed; +int pipefd_received; +bool unix_page_flipping; +} QEMUFileLocal; + +static int qemu_local_get_sockfd(void *opaque) +{ +QEMUFileLocal *s = opaque; + +return s-sockfd; +} + +static int qemu_local_get_buffer(void *opaque, uint8_t *buf, + int64_t pos, int size) +{ +QEMUFileLocal *s = opaque; +ssize_t len; + +for (;;) { +len = qemu_recv(s-sockfd, buf, size, 0); +if (len != -1) { +break; +} + +if (socket_error() == EAGAIN) { +yield_until_fd_readable(s-sockfd); +} else if (socket_error() != EINTR) { +break; +} +} + +if (len == -1) { +len = -socket_error(); +} + +return len; +} + +static ssize_t qemu_local_writev_buffer(void *opaque, struct iovec *iov, +int iovcnt, int64_t pos) +{ +QEMUFileLocal *s = opaque; +ssize_t len; +ssize_t size = iov_size(iov, iovcnt); + +len = iov_send(s-sockfd, iov, iovcnt, 0, size); +if (len size) { +len = -socket_error(); +} + +return len; +} + +static int qemu_local_close(void *opaque) +{ +QEMUFileLocal *s = opaque; + +closesocket(s-sockfd); +g_free(s); + +return 0; +} + +static const QEMUFileOps pipe_read_ops = { +.get_fd= qemu_local_get_sockfd, +.get_buffer= qemu_local_get_buffer, +.close = qemu_local_close, +}; + +static const QEMUFileOps pipe_write_ops = { +.get_fd = qemu_local_get_sockfd, +.writev_buffer = qemu_local_writev_buffer, +.close = qemu_local_close, +}; -- 1.7.7.6
[Qemu-devel] [PATCH 0/17 v5] Localhost migration with side channel for ram
This patch series tries to introduce a mechanism using side channel pipe for RAM via SCM_RIGHTS with unix domain socket protocol migration. This side channel is used for the page flipping by vmsplice, which is the internal mechanism for localhost migration that we are trying to add to QEMU. The background info and previous patch series for reference, Localhost migration http://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg02916.html migration: Introduce side channel for RAM http://lists.gnu.org/archive/html/qemu-devel/2013-09/msg04043.html I have picked patches from the localhost migration series and rebased it on the series of side channel, now it is a complete series that passed the basic test. Please let me know if there is anything that needs to be fixed or improved. Your suggestions and comments are very welcome, and thanks to Paolo for his continued review and useful suggestions. Changes since V4: Rename the capability to x-unix-page-flipping for now. (Paolo) Changes since V3: Address comments from Paolo including: - Get rid of useless check in send_pipefd() and the override of before_ram_iterate, send pipefd in the first save_page call, qemu_get_byte() in the first ram_load correspondingly. - Add new argument ram_addr_t to hook_ram_load to cut half of the data transferred on the socket. - Add transition from 'debug' to 'memory-stale'. - Other minor fixes. Changes since V2: Address comments from Paolo including: - Doc improvement for QAPI. - Use callback get_buffer as the only receiver. - Rename the new RunState flipping-migrate to memory-stale, and add transition from 'prelaunch' to 'memory-stale'. - Other minor fixes. Changes since V1: Address suggestions from Paolo Bonzini including: - Use Unix socket QEMUFile as basis of code and adjust the way of overriding RDMA hooks. - Involve the vmsplice for page flipping. 
- Add new RunState RUN_STATE_FLIPPING_MIGRATE and add it to runstate_needs_reset() for the adjustment of the current migration process with page flipping. Lei Li (17): QAPI: introduce migration capability unix_page_flipping migration: add migrate_unix_page_flipping() qmp-command.hx: add missing docs for migration capabilities migration-local: add QEMUFileLocal with socket based QEMUFile migration-local: introduce qemu_fopen_socket_local() migration-local: add send_pipefd() save_page: replace block_offset with a MemoryRegion migration-local: override save_page for page transmit savevm: adjust ram_control_save_page with page flipping add unix_msgfd_lookup() to callback get_buffer add argument ram_addr_t to hook_ram_load migration-local: override hook_ram_load migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local add new RunState RUN_STATE_MEMORY_STALE migration-unix: page flipping support on unix outgoing migration: adjust migration_thread() process for unix_page_flipping hmp: better format for info migrate_capabilities Makefile.target | 1 + arch_init.c | 4 +- migration-local.c | 512 ++ hmp.c | 5 +- include/migration/migration.h | 3 + include/migration/qemu-file.h | 2 + migration-unix.c | 27 ++- migration-rdma.c | 4 +- migration.c | 18 +- qapi-schema.json | 18 +- qmp-commands.hx | 8 + savevm.c | 21 +- vl.c | 12 +- 13 files changed, 617 insertions(+), 27 deletions(-) create mode 100644 migration-local.c
[Qemu-devel] [PATCH 05/17] migration-local: introduce qemu_fopen_socket_local()
Add qemu_fopen_socket_local() to open QEMUFileLocal introduced earlier. It will create a pipe in write mode if unix_page_flipping is enabled, adjust qemu_local_close() to close pipe as well. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/migration/qemu-file.h |2 + migration-local.c | 46 + 2 files changed, 48 insertions(+), 0 deletions(-) diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 0f757fb..f9b104a 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -99,6 +99,8 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode); QEMUFile *qemu_fdopen(int fd, const char *mode); QEMUFile *qemu_fopen_socket(int fd, const char *mode); QEMUFile *qemu_popen_cmd(const char *command, const char *mode); +QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode); + int qemu_get_fd(QEMUFile *f); int qemu_fclose(QEMUFile *f); int64_t qemu_ftell(QEMUFile *f); diff --git a/migration-local.c b/migration-local.c index ca01a20..929ed60 100644 --- a/migration-local.c +++ b/migration-local.c @@ -105,6 +105,12 @@ static int qemu_local_close(void *opaque) QEMUFileLocal *s = opaque; closesocket(s-sockfd); + +if (s-unix_page_flipping) { +close(s-pipefd[0]); +close(s-pipefd[1]); +} + g_free(s); return 0; @@ -121,3 +127,43 @@ static const QEMUFileOps pipe_write_ops = { .writev_buffer = qemu_local_writev_buffer, .close = qemu_local_close, }; + +QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode) +{ +QEMUFileLocal *s; +int pipefd[2]; + +if (qemu_file_mode_is_not_valid(mode)) { +return NULL; +} + +s = g_malloc0(sizeof(QEMUFileLocal)); +s-sockfd = sockfd; + +if (migrate_unix_page_flipping()) { +s-unix_page_flipping = 1; +} + +if (mode[0] == 'w') { +if (s-unix_page_flipping) { +if (pipe(pipefd) 0) { +fprintf(stderr, failed to create PIPE\n); +goto fail; +} + +s-pipefd[0] = pipefd[0]; +s-pipefd[1] = pipefd[1]; +} + +qemu_set_block(s-sockfd); +s-file = qemu_fopen_ops(s, pipe_write_ops); +} else { +s-file = 
qemu_fopen_ops(s, pipe_read_ops); +} + +return s-file; + +fail: +g_free(s); +return NULL; +} -- 1.7.7.6
[Qemu-devel] [PATCH 03/17] qmp-command.hx: add missing docs for migration capabilities
Signed-off-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qmp-commands.hx |8 1 files changed, 8 insertions(+), 0 deletions(-) diff --git a/qmp-commands.hx b/qmp-commands.hx index fba15cd..0df08c0 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -2898,6 +2898,10 @@ migrate-set-capabilities Enable/Disable migration capabilities - xbzrle: XBZRLE support +- x-rdma-pin-all: Pin all pages during RDMA support +- zero-blocks: Compress zero blocks during block migration +- auto-converge: Block VCPU to help convergence of migration +- x-unix-page-flipping: Page flipping for live QEMU upgrade Arguments: @@ -2922,6 +2926,10 @@ Query current migration capabilities - capabilities: migration capabilities state - xbzrle : XBZRLE state (json-bool) + - x-rdma-pin-all: RDMA state (json-bool) + - zero-blocks: zero-blocks state (json-bool) + - auto-converge: Auto converge state (json-bool) + - x-unix-page-flipping: Page flipping state (json-bool) Arguments: -- 1.7.7.6
[Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()
This patch adds send_pipefd() to pass the pipe file descriptor to destination process. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 46 ++ 1 files changed, 46 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 929ed60..f479530 100644 --- a/migration-local.c +++ b/migration-local.c @@ -167,3 +167,49 @@ fail: g_free(s); return NULL; } + + +/* + * Pass a pipe file descriptor to another process. + * + * Return negative value If pipefd 0. Return 0 on + * success. + * + */ +static int send_pipefd(int sockfd, int pipefd) +{ +struct msghdr msg; +struct iovec iov[1]; +ssize_t ret; +char req[1] = { 0x01 }; + +union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; +} control_un; +struct cmsghdr *cmptr; + +msg.msg_control = control_un.control; +msg.msg_controllen = sizeof(control_un.control); + +cmptr = CMSG_FIRSTHDR(msg); +cmptr-cmsg_len = CMSG_LEN(sizeof(int)); +cmptr-cmsg_level = SOL_SOCKET; +cmptr-cmsg_type = SCM_RIGHTS; +*((int *) CMSG_DATA(cmptr)) = pipefd; + +msg.msg_name = NULL; +msg.msg_namelen = 0; + +iov[0].iov_base = req; +iov[0].iov_len = sizeof(req); +msg.msg_iov = iov; +msg.msg_iovlen = 1; + +ret = sendmsg(sockfd, msg, 0); +if (ret = 0) { +DPRINTF(sendmsg error: %s\n, strerror(errno)); +} + +return ret; +} -- 1.7.7.6
[Qemu-devel] [PATCH 08/17] migration-local: override save_page for page transmit
This patch implements save_page callback for the outside of page flipping. It will write the address of the page on the Unix socket and flip the page data on pipe by vmsplice(). Every page address would have a header flag RAM_SAVE_FLAG_HOOK, which will trigger the load hook to receive it in incoming side as well. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 63 + 1 files changed, 63 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index f479530..9453ec8 100644 --- a/migration-local.c +++ b/migration-local.c @@ -116,6 +116,68 @@ static int qemu_local_close(void *opaque) return 0; } +static int send_pipefd(int sockfd, int pipefd); + +static size_t qemu_local_save_ram(QEMUFile *f, void *opaque, + MemoryRegion *mr, ram_addr_t offset, + size_t size, int *bytes_sent) +{ +QEMUFileLocal *s = opaque; +ram_addr_t current_addr = mr-ram_addr + offset; +void *ram_addr; +ssize_t ret; + +if (s-unix_page_flipping) { +qemu_put_be64(s-file, current_addr | RAM_SAVE_FLAG_HOOK); +qemu_fflush(s-file); + +if (!s-pipefd_passed) { +ret = send_pipefd(s-sockfd, s-pipefd[0]); +if (ret 0) { +fprintf(stderr, failed to pass PIPE\n); +return ret; +} +s-pipefd_passed = true; +} + +ram_addr = memory_region_get_ram_ptr(mr) + offset; + +/* vmsplice page data to pipe */ +struct iovec iov = { +.iov_base = ram_addr, +.iov_len = size, +}; + +/* + * The flag SPLICE_F_MOVE is introduced in kernel for the page + * flipping feature in QEMU, which will move pages rather than + * copying, previously unused. + * + * If a move is not possible the kernel will transparently fall + * back to copying data. + * + * For older kernels the SPLICE_F_MOVE would be ignored and a copy + * would occur. 
+ */ + +ret = vmsplice(s-pipefd[1], iov, 1, SPLICE_F_GIFT | SPLICE_F_MOVE); +if (ret == -1) { +if (errno != EAGAIN errno != EINTR) { +fprintf(stderr, vmsplice save error: %s\n, strerror(errno)); +return ret; +} +} else { +if (bytes_sent) { +*bytes_sent = size; +} +DPRINTF(block_offset: %lu, offset: %lu\n, mr-ram_addr, offset); +return 0; +} +} + +return RAM_SAVE_CONTROL_NOT_SUPP; +} + static const QEMUFileOps pipe_read_ops = { .get_fd= qemu_local_get_sockfd, .get_buffer= qemu_local_get_buffer, @@ -126,6 +188,7 @@ static const QEMUFileOps pipe_write_ops = { .get_fd = qemu_local_get_sockfd, .writev_buffer = qemu_local_writev_buffer, .close = qemu_local_close, +.save_page = qemu_local_save_ram }; QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode) -- 1.7.7.6
[Qemu-devel] [PATCH 09/17] savevm: adjust ram_control_save_page for page flipping
As callback save_page will always be opened by qemu_fopen_socket_local(), and without unix_page_flipping it will return RAM_SAVE_CONTROL_NOT_SUPP, it leads to a wrong qemu_file_set_error() based on the current logic. So this patch adds RAM_SAVE_CONTROL_NOT_SUPP to the check. Reviewed-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- savevm.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/savevm.c b/savevm.c index 06c1f29..137e74f 100644 --- a/savevm.c +++ b/savevm.c @@ -668,7 +668,8 @@ size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, int ret = f-ops-save_page(f, f-opaque, mr, offset, size, bytes_sent); -if (ret != RAM_SAVE_CONTROL_DELAYED) { +if (ret != RAM_SAVE_CONTROL_DELAYED +ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (bytes_sent *bytes_sent 0) { qemu_update_position(f, *bytes_sent); } else if (ret 0) { -- 1.7.7.6
[Qemu-devel] [PATCH 12/17] migration-local: override hook_ram_load
Override hook_ram_load to receive the pipe file descriptor passed by source process and page address which will be extracted to vmsplice the page data from pipe. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 59 + 1 files changed, 59 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 5f98a01..ce4c070 100644 --- a/migration-local.c +++ b/migration-local.c @@ -231,10 +231,69 @@ static size_t qemu_local_save_ram(QEMUFile *f, void *opaque, return RAM_SAVE_CONTROL_NOT_SUPP; } +static int qemu_local_ram_load(QEMUFile *f, void *opaque, + ram_addr_t addr, uint64_t flags) +{ +QEMUFileLocal *s = opaque; +struct iovec iov; +ssize_t ret = -EINVAL; + +if (!s-pipefd_received) { +/* + * send_pipefd was called at this point, and it wrote one + * byte to the stream. + */ +qemu_get_byte(s-file); +s-pipefd_received = true; +} + +if (s-pipefd_passed) { +void *host; +/* + * Extract the page address from the 8-byte record and + * read the page data from the pipe. + */ +host = qemu_get_ram_ptr(addr); + +iov.iov_base = host; +iov.iov_len = TARGET_PAGE_SIZE; + +/* + * The flag SPLICE_F_MOVE is introduced in kernel for the page + * flipping feature in QEMU, which will move pages rather than + * copying, previously unused. + * + * If a move is not possible the kernel will transparently fall + * back to copying data. + * + * For older kernels the SPLICE_F_MOVE would be ignored and a copy + * would occur. 
+ */ + +ret = vmsplice(s-pipefd[0], iov, 1, SPLICE_F_MOVE); +if (ret == -1) { +if (errno != EAGAIN errno != EINTR) { +fprintf(stderr, vmsplice() load error: %s, strerror(errno)); +return ret; +} +DPRINTF(vmsplice load error\n); +} else if (ret == 0) { +DPRINTF(stderr, load_page: zero read\n); +} + +DPRINTF(vmsplice (read): %zu\n, ret); +return ret; +} + +return -EINVAL; +} + + static const QEMUFileOps pipe_read_ops = { .get_fd= qemu_local_get_sockfd, .get_buffer= qemu_local_get_buffer, .close = qemu_local_close, +.hook_ram_load = qemu_local_ram_load }; static const QEMUFileOps pipe_write_ops = { -- 1.7.7.6
[Qemu-devel] [PATCH 07/17] save_page: replace block_offset with a MemoryRegion
This patch exports MemoryRegion to save_page hook, replacing argument ram_addr_t block_offset with a MemoryRegion suggested by Paolo Bonzini. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- arch_init.c |4 ++-- include/migration/migration.h |2 +- include/migration/qemu-file.h |8 migration-rdma.c |4 ++-- savevm.c |8 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch_init.c b/arch_init.c index e0acbc5..daaa519 100644 --- a/arch_init.c +++ b/arch_init.c @@ -485,8 +485,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage) /* In doubt sent page as normal */ bytes_sent = -1; -ret = ram_control_save_page(f, block-offset, - offset, TARGET_PAGE_SIZE, bytes_sent); +ret = ram_control_save_page(f, mr, offset, TARGET_PAGE_SIZE, +bytes_sent); if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { diff --git a/include/migration/migration.h b/include/migration/migration.h index 7e5d01a..ca852a8 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -161,7 +161,7 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags); #define RAM_SAVE_CONTROL_NOT_SUPP -1000 #define RAM_SAVE_CONTROL_DELAYED -2000 -size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, +size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, size_t size, int *bytes_sent); diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index f9b104a..6646e89 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -77,10 +77,10 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); * is saved (such as RDMA, for example.) 
*/ typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque, - ram_addr_t block_offset, - ram_addr_t offset, - size_t size, - int *bytes_sent); + MemoryRegion *mr, + ram_addr_t offset, + size_t size, + int *bytes_sent); typedef struct QEMUFileOps { QEMUFilePutBufferFunc *put_buffer; diff --git a/migration-rdma.c b/migration-rdma.c index f94f3b4..ae04de4 100644 --- a/migration-rdma.c +++ b/migration-rdma.c @@ -2699,7 +2699,7 @@ static int qemu_rdma_close(void *opaque) * the protocol because most transfers are sent asynchronously. */ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, - ram_addr_t block_offset, ram_addr_t offset, + MemoryRegion *mr, ram_addr_t offset, size_t size, int *bytes_sent) { QEMUFileRDMA *rfile = opaque; @@ -2716,7 +2716,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, * is full, or the page doen't belong to the current chunk, * an actual RDMA write will occur and a new chunk will be formed. */ -ret = qemu_rdma_write(f, rdma, block_offset, offset, size); +ret = qemu_rdma_write(f, rdma, mr-ram_addr, offset, size); if (ret 0) { fprintf(stderr, rdma migration: write error! %d\n, ret); goto err; diff --git a/savevm.c b/savevm.c index 3f912dd..06c1f29 100644 --- a/savevm.c +++ b/savevm.c @@ -661,12 +661,12 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags) } } -size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, - ram_addr_t offset, size_t size, int *bytes_sent) +size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, + size_t size, int *bytes_sent) { if (f-ops-save_page) { -int ret = f-ops-save_page(f, f-opaque, block_offset, -offset, size, bytes_sent); +int ret = f-ops-save_page(f, f-opaque, mr, offset, +size, bytes_sent); if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_sent *bytes_sent 0) { -- 1.7.7.6
[Qemu-devel] [PATCH 14/17] add new RunState RUN_STATE_MEMORY_STALE
Introduce new RunState RUN_STATE_MEMORY_STALE and add it to runstate_needs_reset(). Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qapi-schema.json |7 +-- vl.c | 13 - 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/qapi-schema.json b/qapi-schema.json index ea910ef..6ff46ff 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -176,12 +176,15 @@ # @watchdog: the watchdog action is configured to pause and has been triggered # # @guest-panicked: guest has been panicked as a result of guest OS panic +# +# @memory-stale: guest is paused to start unix_page_flipping migration +# process, the destination QEMU will has the newer contents of the memory ## { 'enum': 'RunState', 'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused', 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm', -'running', 'save-vm', 'shutdown', 'suspended', 'watchdog', -'guest-panicked' ] } +'running', 'save-vm', 'shutdown', 'suspended', 'memory-stale', +'watchdog', 'guest-panicked' ] } ## # @SnapshotInfo diff --git a/vl.c b/vl.c index 8d5d874..3ea96b2 100644 --- a/vl.c +++ b/vl.c @@ -601,6 +601,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PAUSED, RUN_STATE_RUNNING }, { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_PAUSED, RUN_STATE_MEMORY_STALE }, { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE }, @@ -608,6 +609,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING }, { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE }, { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE }, +{ RUN_STATE_PRELAUNCH, RUN_STATE_MEMORY_STALE }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE }, @@ -624,23 +626,31 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN }, { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG }, { RUN_STATE_RUNNING, 
RUN_STATE_GUEST_PANICKED }, +{ RUN_STATE_RUNNING, RUN_STATE_MEMORY_STALE }, { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING }, { RUN_STATE_SHUTDOWN, RUN_STATE_PAUSED }, { RUN_STATE_SHUTDOWN, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_SHUTDOWN, RUN_STATE_MEMORY_STALE }, { RUN_STATE_DEBUG, RUN_STATE_SUSPENDED }, +{ RUN_STATE_DEBUG, RUN_STATE_MEMORY_STALE }, { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED }, { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING }, { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_SUSPENDED, RUN_STATE_MEMORY_STALE }, { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING }, { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_WATCHDOG, RUN_STATE_MEMORY_STALE }, { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING }, { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_GUEST_PANICKED, RUN_STATE_MEMORY_STALE }, +{ RUN_STATE_MEMORY_STALE, RUN_STATE_RUNNING }, +{ RUN_STATE_MEMORY_STALE, RUN_STATE_POSTMIGRATE }, { RUN_STATE_MAX, RUN_STATE_MAX }, }; @@ -685,7 +695,8 @@ int runstate_is_running(void) bool runstate_needs_reset(void) { return runstate_check(RUN_STATE_INTERNAL_ERROR) || -runstate_check(RUN_STATE_SHUTDOWN); +runstate_check(RUN_STATE_SHUTDOWN) || +runstate_check(RUN_STATE_MEMORY_STALE); } StatusInfo *qmp_query_status(Error **errp) -- 1.7.7.6
[Qemu-devel] [PATCH 17/17] hmp: better format for info migrate_capabilities
As there might be more capabilities introduced, better to display it in lines. Reviewed-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- hmp.c |5 ++--- 1 files changed, 2 insertions(+), 3 deletions(-) diff --git a/hmp.c b/hmp.c index 32ee285..dcfa2f9 100644 --- a/hmp.c +++ b/hmp.c @@ -226,13 +226,12 @@ void hmp_info_migrate_capabilities(Monitor *mon, const QDict *qdict) caps = qmp_query_migrate_capabilities(NULL); if (caps) { -monitor_printf(mon, capabilities: ); +monitor_printf(mon, Capabilities:\n); for (cap = caps; cap; cap = cap-next) { -monitor_printf(mon, %s: %s , +monitor_printf(mon, %s: %s\n, MigrationCapability_lookup[cap-value-capability], cap-value-state ? on : off); } -monitor_printf(mon, \n); } qapi_free_MigrationCapabilityStatusList(caps); -- 1.7.7.6
[Qemu-devel] [PATCH 15/17] migration-unix: page flipping support on unix outgoing
Add page flipping support on unix outgoing part by stopping VM with the new RunState RUN_STATE_MEMORY_STALE before invoking migration if unix_page_flipping enabled. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-unix.c | 11 +++ 1 files changed, 11 insertions(+), 0 deletions(-) diff --git a/migration-unix.c b/migration-unix.c index 9beeafe..cbf2087 100644 --- a/migration-unix.c +++ b/migration-unix.c @@ -19,6 +19,7 @@ #include migration/migration.h #include migration/qemu-file.h #include block/block.h +#include sysemu/sysemu.h //#define DEBUG_MIGRATION_UNIX @@ -33,6 +34,7 @@ static void unix_wait_for_connect(int fd, void *opaque) { MigrationState *s = opaque; +int ret; if (fd 0) { DPRINTF(migrate connect error\n); @@ -47,6 +49,15 @@ static void unix_wait_for_connect(int fd, void *opaque) goto fail; } +/* Stop VM before invoking migration if unix_page_flipping enabled */ +if (migrate_unix_page_flipping()) { +ret = vm_stop_force_state(RUN_STATE_MEMORY_STALE); +if (ret 0) { +DPRINTF(failed to stop VM\n); +goto fail; +} +} + migrate_fd_connect(s); return; } -- 1.7.7.6
[Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration.c |7 +-- 1 files changed, 5 insertions(+), 2 deletions(-) diff --git a/migration.c b/migration.c index e012cd4..7e0ec33 100644 --- a/migration.c +++ b/migration.c @@ -582,7 +582,7 @@ static void *migration_thread(void *opaque) if (pending_size pending_size = max_size) { qemu_savevm_state_iterate(s-file); } else { -int ret; +int ret = 0; DPRINTF(done iterating\n); qemu_mutex_lock_iothread(); @@ -590,7 +590,10 @@ static void *migration_thread(void *opaque) qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); old_vm_running = runstate_is_running(); -ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); +if (!runstate_needs_reset()) { +ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); +} + if (ret = 0) { qemu_file_set_rate_limit(s-file, INT_MAX); qemu_savevm_state_complete(s-file); -- 1.7.7.6
[Qemu-devel] [PATCH 10/17] add unix_msgfd_lookup() to callback get_buffer
The control message for the exchange of the pipe file descriptor should be received by recvmsg, and it might be swallowed into the stream by qemu_recv() when receiving by two callbacks. So this patch adds unix_msgfd_lookup() to callback get_buffer as the only receiver, where the pipe file descriptor would be caught. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 59 ++-- 1 files changed, 56 insertions(+), 3 deletions(-) diff --git a/migration-local.c b/migration-local.c index 9453ec8..5f98a01 100644 --- a/migration-local.c +++ b/migration-local.c @@ -59,16 +59,69 @@ static int qemu_local_get_sockfd(void *opaque) return s-sockfd; } +static int unix_msgfd_lookup(void *opaque, struct msghdr *msg) +{ +QEMUFileLocal *s = opaque; +struct cmsghdr *cmsg; +bool found = false; + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) +continue; + +/* PIPE file descriptor to be received */ +s-pipefd[0] = *((int *)CMSG_DATA(cmsg)); +} + +if (s-pipefd[0] 0) { +fprintf(stderr, no pipe fd can be received\n); +return found; +} + +DPRINTF(pipefd successfully received\n); +return s-pipefd[0]; +} + static int qemu_local_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) { QEMUFileLocal *s = opaque; ssize_t len; +struct msghdr msg = { NULL, }; +struct iovec iov[1]; +union { +struct cmsghdr cmsg; +char control[CMSG_SPACE(sizeof(int))]; +} msg_control; + +iov[0].iov_base = buf; +iov[0].iov_len = size; + +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); for (;;) { -len = qemu_recv(s-sockfd, buf, size, 0); -if (len != -1) { -break; +if (!s-pipefd_passed) { +/* + * recvmsg is called here to catch the control message for + * the exchange of PIPE file descriptor until it is received. 
+ */ +len = recvmsg(s-sockfd, msg, 0); +if (len != -1) { +if (unix_msgfd_lookup(s, msg) 0) { +s-pipefd_passed = 1; +} +break; +} +} else { +len = qemu_recv(s-sockfd, buf, size, 0); +if (len != -1) { +break; +} } if (socket_error() == EAGAIN) { -- 1.7.7.6
[Qemu-devel] [PATCH 11/17] add argument ram_addr_t to hook_ram_load
Adds argument ram_addr_t to hook_ram_load, and replaces QEMURamHookFunc with QEMURamLoadHookFunc for it. With this new argument, it becomes possible to cut almost half of the data transferred on the Unix socket used by page flipping migration. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- arch_init.c |2 +- include/migration/migration.h |2 +- include/migration/qemu-file.h | 11 ++- migration-rdma.c |2 +- savevm.c |4 ++-- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch_init.c b/arch_init.c index daaa519..0621893 100644 --- a/arch_init.c +++ b/arch_init.c @@ -945,7 +945,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) goto done; } } else if (flags RAM_SAVE_FLAG_HOOK) { -ram_control_load_hook(f, flags); +ram_control_load_hook(f, addr, flags); } error = qemu_file_get_error(f); if (error) { diff --git a/include/migration/migration.h b/include/migration/migration.h index ca852a8..300e52c 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -149,7 +149,7 @@ int64_t xbzrle_cache_resize(int64_t new_size); void ram_control_before_iterate(QEMUFile *f, uint64_t flags); void ram_control_after_iterate(QEMUFile *f, uint64_t flags); -void ram_control_load_hook(QEMUFile *f, uint64_t flags); +void ram_control_load_hook(QEMUFile *f, ram_addr_t addr, uint64_t flags); /* Whenever this is found in the data stream, the flags * will be passed to ram_control_load_hook in the incoming-migration diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 6646e89..176c2d9 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -65,6 +65,15 @@ typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov, typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); /* + * This function provides load hook for RAM migration, allows + * override of where the RAM page is loaded (such as page + * flipping for example). 
+ */ +typedef int (QEMURamLoadHookFunc)(QEMUFile *f, void *opaque, + ram_addr_t addr, + uint64_t flags); + +/* * Constants used by ram_control_* hooks */ #define RAM_CONTROL_SETUP0 @@ -90,7 +99,7 @@ typedef struct QEMUFileOps { QEMUFileWritevBufferFunc *writev_buffer; QEMURamHookFunc *before_ram_iterate; QEMURamHookFunc *after_ram_iterate; -QEMURamHookFunc *hook_ram_load; +QEMURamLoadHookFunc *hook_ram_load; QEMURamSaveFunc *save_page; } QEMUFileOps; diff --git a/migration-rdma.c b/migration-rdma.c index ae04de4..732ec1a 100644 --- a/migration-rdma.c +++ b/migration-rdma.c @@ -2938,7 +2938,7 @@ err_rdma_dest_wait: * Keep doing this until the source tells us to stop. */ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, - uint64_t flags) + ram_addr_t offset, uint64_t flags) { RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult), .type = RDMA_CONTROL_REGISTER_RESULT, diff --git a/savevm.c b/savevm.c index 137e74f..75e397c 100644 --- a/savevm.c +++ b/savevm.c @@ -647,12 +647,12 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t flags) } } -void ram_control_load_hook(QEMUFile *f, uint64_t flags) +void ram_control_load_hook(QEMUFile *f, ram_addr_t offset, uint64_t flags) { int ret = -EINVAL; if (f-ops-hook_ram_load) { -ret = f-ops-hook_ram_load(f, f-opaque, flags); +ret = f-ops-hook_ram_load(f, f-opaque, offset, flags); if (ret 0) { qemu_file_set_error(f, ret); } -- 1.7.7.6
[Qemu-devel] [PATCH 05/17] migration-local: introduce qemu_fopen_socket_local()
Add qemu_fopen_socket_local() to open QEMUFileLocal introduced earlier. It will create a pipe in write mode if unix_page_flipping is enabled, adjust qemu_local_close() to close pipe as well. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/migration/qemu-file.h |2 + migration-local.c | 46 + 2 files changed, 48 insertions(+), 0 deletions(-) diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 0f757fb..f9b104a 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -99,6 +99,8 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode); QEMUFile *qemu_fdopen(int fd, const char *mode); QEMUFile *qemu_fopen_socket(int fd, const char *mode); QEMUFile *qemu_popen_cmd(const char *command, const char *mode); +QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode); + int qemu_get_fd(QEMUFile *f); int qemu_fclose(QEMUFile *f); int64_t qemu_ftell(QEMUFile *f); diff --git a/migration-local.c b/migration-local.c index ca01a20..929ed60 100644 --- a/migration-local.c +++ b/migration-local.c @@ -105,6 +105,12 @@ static int qemu_local_close(void *opaque) QEMUFileLocal *s = opaque; closesocket(s-sockfd); + +if (s-unix_page_flipping) { +close(s-pipefd[0]); +close(s-pipefd[1]); +} + g_free(s); return 0; @@ -121,3 +127,43 @@ static const QEMUFileOps pipe_write_ops = { .writev_buffer = qemu_local_writev_buffer, .close = qemu_local_close, }; + +QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode) +{ +QEMUFileLocal *s; +int pipefd[2]; + +if (qemu_file_mode_is_not_valid(mode)) { +return NULL; +} + +s = g_malloc0(sizeof(QEMUFileLocal)); +s-sockfd = sockfd; + +if (migrate_unix_page_flipping()) { +s-unix_page_flipping = 1; +} + +if (mode[0] == 'w') { +if (s-unix_page_flipping) { +if (pipe(pipefd) 0) { +fprintf(stderr, failed to create PIPE\n); +goto fail; +} + +s-pipefd[0] = pipefd[0]; +s-pipefd[1] = pipefd[1]; +} + +qemu_set_block(s-sockfd); +s-file = qemu_fopen_ops(s, pipe_write_ops); +} else { +s-file = 
qemu_fopen_ops(s, pipe_read_ops); +} + +return s-file; + +fail: +g_free(s); +return NULL; +} -- 1.7.7.6
[Qemu-devel] [PATCH 01/17] QAPI: introduce migration capability unix_page_flipping
Introduce unix_page_flipping to MigrationCapability for localhost migration. Signed-off-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qapi-schema.json | 10 +- 1 files changed, 9 insertions(+), 1 deletions(-) diff --git a/qapi-schema.json b/qapi-schema.json index 83fa485..b290a0f 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -685,10 +685,18 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # +# @unix-page-flipping: If enabled, QEMU can optimize migration when the +# destination is a QEMU process that runs on the same host as +# the source (as is the case for live upgrade). If the migration +# transport is a Unix socket, QEMU will flip RAM pages directly to +# the destination, so that memory is only allocated twice for the +# source and destination processes. Disabled by default. (since 1.8) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] } + 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks', + 'unix-page-flipping'] } ## # @MigrationCapabilityStatus -- 1.7.7.6
[Qemu-devel] [PATCH 0/17 v4] Localhost migration with side channel for ram
This patch series tries to introduce a mechanism using a side channel pipe for RAM via SCM_RIGHTS with unix domain socket protocol migration. This side channel is used for the page flipping by vmsplice, which is the internal mechanism for localhost migration that we are trying to add to QEMU. The background info and previous patch series, for reference: Localhost migration http://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg02916.html migration: Introduce side channel for RAM http://lists.gnu.org/archive/html/qemu-devel/2013-09/msg04043.html I have picked patches from the localhost migration series and rebased them on the side channel series; it is now a complete series that passed the basic test. Please let me know if there is anything that needs to be fixed or improved. Your suggestions and comments are very welcome, and thanks to Paolo for his continued review and useful suggestions. Changes since V3: Address comments from Paolo including: - Get rid of useless check in send_pipefd() and the override of before_ram_iterate, send pipefd in the first save_page call, qemu_get_byte() in the first ram_load correspondingly. - Add new argument ram_addr_t to hook_ram_load to cut half of the data transferred on the socket. - Add transition from 'debug' to 'memory-stale'. - Other minor fixes. Changes since V2: Address comments from Paolo including: - Doc improvement for QAPI. - Use callback get_buffer as the only receiver. - Rename the new RunState flipping-migrate to memory-stale, and add transition from 'prelaunch' to 'memory-stale'. - Other minor fixes. Changes since V1: Address suggestions from Paolo Bonzini including: - Use Unix socket QEMUFile as basis of code and adjust the way of overriding RDMA hooks. - Involve the vmsplice for page flipping. - Add new RunState RUN_STATE_FLIPPING_MIGRATE and add it to runstate_needs_reset() for the adjustment of the current migration process with page flipping. 
Lei Li (17): QAPI: introduce migration capability unix_page_flipping migration: add migrate_unix_page_flipping() qmp-commands.hx: add missing docs for migration capabilities migration-local: add QEMUFileLocal with socket based QEMUFile migration-local: introduce qemu_fopen_socket_local() migration-local: add send_pipefd() save_page: replace block_offset with a MemoryRegion migration-local: override save_page for page transmit savevm: adjust ram_control_save_page with page flipping add unix_msgfd_lookup() to callback get_buffer add argument ram_addr_t to hook_ram_load migration-local: override hook_ram_load migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local add new RunState RUN_STATE_MEMORY_STALE migration-unix: page flipping support on unix outgoing migration: adjust migration_thread() process for unix_page_flipping hmp: better format for info migrate_capabilities Makefile.target | 1 + arch_init.c | 4 +- migration-local.c | 512 ++ hmp.c | 5 +- include/migration/migration.h | 3 + include/migration/qemu-file.h | 2 + migration-unix.c | 27 ++- migration-rdma.c | 4 +- migration.c | 18 +- qapi-schema.json | 18 +- qmp-commands.hx | 8 + savevm.c | 21 +- vl.c | 12 +- 13 files changed, 617 insertions(+), 27 deletions(-) create mode 100644 migration-local.c
[Qemu-devel] [PATCH 03/17] qmp-commands.hx: add missing docs for migration capabilities
Signed-off-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qmp-commands.hx |8 1 files changed, 8 insertions(+), 0 deletions(-) diff --git a/qmp-commands.hx b/qmp-commands.hx index fba15cd..dcec433 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -2898,6 +2898,10 @@ migrate-set-capabilities Enable/Disable migration capabilities - xbzrle: XBZRLE support +- x-rdma-pin-all: Pin all pages during RDMA support +- zero-blocks: Compress zero blocks during block migration +- auto-converge: Block VCPU to help convergence of migration +- unix-page-flipping: Page flipping for live QEMU upgrade Arguments: @@ -2922,6 +2926,10 @@ Query current migration capabilities - capabilities: migration capabilities state - xbzrle : XBZRLE state (json-bool) + - x-rdma-pin-all: RDMA state (json-bool) + - zero-blocks: zero-blocks state (json-bool) + - auto-converge: Auto converge state (json-bool) + - unix-page-flipping: Page flipping state (json-bool) Arguments: -- 1.7.7.6
[Qemu-devel] [PATCH 02/17] migration: add migrate_unix_page_flipping()
Add migrate_unix_page_flipping() to check if MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING is enabled. Reviewed-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/migration/migration.h |3 +++ migration.c |9 + 2 files changed, 12 insertions(+), 0 deletions(-) diff --git a/include/migration/migration.h b/include/migration/migration.h index 140e6b4..7e5d01a 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -131,10 +131,13 @@ void migrate_add_blocker(Error *reason); void migrate_del_blocker(Error *reason); bool migrate_rdma_pin_all(void); + bool migrate_zero_blocks(void); bool migrate_auto_converge(void); +bool migrate_unix_page_flipping(void); + int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, uint8_t *dst, int dlen); int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen); diff --git a/migration.c b/migration.c index 2b1ab20..4ac466b 100644 --- a/migration.c +++ b/migration.c @@ -541,6 +541,15 @@ int64_t migrate_xbzrle_cache_size(void) return s-xbzrle_cache_size; } +bool migrate_unix_page_flipping(void) +{ +MigrationState *s; + +s = migrate_get_current(); + +return s-enabled_capabilities[MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING]; +} + /* migration thread support */ static void *migration_thread(void *opaque) -- 1.7.7.6
[Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()
This patch adds send_pipefd() to pass the pipe file descriptor to destination process. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 46 ++ 1 files changed, 46 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 929ed60..f479530 100644 --- a/migration-local.c +++ b/migration-local.c @@ -167,3 +167,49 @@ fail: g_free(s); return NULL; } + + +/* + * Pass a pipe file descriptor to another process. + * + * Return negative value If pipefd 0. Return 0 on + * success. + * + */ +static int send_pipefd(int sockfd, int pipefd) +{ +struct msghdr msg; +struct iovec iov[1]; +ssize_t ret; +char req[1] = { 0x01 }; + +union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; +} control_un; +struct cmsghdr *cmptr; + +msg.msg_control = control_un.control; +msg.msg_controllen = sizeof(control_un.control); + +cmptr = CMSG_FIRSTHDR(msg); +cmptr-cmsg_len = CMSG_LEN(sizeof(int)); +cmptr-cmsg_level = SOL_SOCKET; +cmptr-cmsg_type = SCM_RIGHTS; +*((int *) CMSG_DATA(cmptr)) = pipefd; + +msg.msg_name = NULL; +msg.msg_namelen = 0; + +iov[0].iov_base = req; +iov[0].iov_len = sizeof(req); +msg.msg_iov = iov; +msg.msg_iovlen = 1; + +ret = sendmsg(sockfd, msg, 0); +if (ret = 0) { +DPRINTF(sendmsg error: %s\n, strerror(errno)); +} + +return ret; +} -- 1.7.7.6
[Qemu-devel] [PATCH 10/17] add unix_msgfd_lookup() to callback get_buffer
The control message for exchange of pipe file descriptor should be received by recvmsg, and it might be eaten to stream file by qemu_recv() when receiving by two callbacks. So this patch adds unix_msgfd_lookup() to callback get_buffer as the only one receiver, where the pipe file descriptor would be caughted. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 59 ++-- 1 files changed, 56 insertions(+), 3 deletions(-) diff --git a/migration-local.c b/migration-local.c index 0a41c69..76ec306 100644 --- a/migration-local.c +++ b/migration-local.c @@ -59,16 +59,69 @@ static int qemu_local_get_sockfd(void *opaque) return s-sockfd; } +static int unix_msgfd_lookup(void *opaque, struct msghdr *msg) +{ +QEMUFileLocal *s = opaque; +struct cmsghdr *cmsg; +bool found = false; + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) +continue; + +/* PIPE file descriptor to be received */ +s-pipefd[0] = *((int *)CMSG_DATA(cmsg)); +} + +if (s-pipefd[0] 0) { +fprintf(stderr, no pipe fd can be received\n); +return found; +} + +DPRINTF(pipefd successfully received\n); +return s-pipefd[0]; +} + static int qemu_local_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) { QEMUFileLocal *s = opaque; ssize_t len; +struct msghdr msg = { NULL, }; +struct iovec iov[1]; +union { +struct cmsghdr cmsg; +char control[CMSG_SPACE(sizeof(int))]; +} msg_control; + +iov[0].iov_base = buf; +iov[0].iov_len = size; + +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); for (;;) { -len = qemu_recv(s-sockfd, buf, size, 0); -if (len != -1) { -break; +if (!s-pipefd_passed) { +/* + * recvmsg is called here to catch the control message for + * the exchange of PIPE file descriptor until it is received. 
+ */ +len = recvmsg(s-sockfd, msg, 0); +if (len != -1) { +if (unix_msgfd_lookup(s, msg) 0) { +s-pipefd_passed = 1; +} +break; +} +} else { +len = qemu_recv(s-sockfd, buf, size, 0); +if (len != -1) { +break; +} } if (socket_error() == EAGAIN) { -- 1.7.7.6
[Qemu-devel] [PATCH 14/17] add new RunState RUN_STATE_MEMORY_STALE
Introduce new RanState RAN_STATE_MEMORY_STALE and add it to runstate_needs_reset(). Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qapi-schema.json |7 +-- vl.c | 13 - 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/qapi-schema.json b/qapi-schema.json index b290a0f..4d9e712 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -176,12 +176,15 @@ # @watchdog: the watchdog action is configured to pause and has been triggered # # @guest-panicked: guest has been panicked as a result of guest OS panic +# +# @memory-stale: guest is paused to start unix_page_flipping migration +# process, the destination QEMU will has the newer contents of the memory ## { 'enum': 'RunState', 'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused', 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm', -'running', 'save-vm', 'shutdown', 'suspended', 'watchdog', -'guest-panicked' ] } +'running', 'save-vm', 'shutdown', 'suspended', 'memory-stale', +'watchdog', 'guest-panicked' ] } ## # @SnapshotInfo diff --git a/vl.c b/vl.c index 8d5d874..3ea96b2 100644 --- a/vl.c +++ b/vl.c @@ -601,6 +601,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PAUSED, RUN_STATE_RUNNING }, { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_PAUSED, RUN_STATE_MEMORY_STALE }, { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE }, @@ -608,6 +609,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING }, { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE }, { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE }, +{ RUN_STATE_PRELAUNCH, RUN_STATE_MEMORY_STALE }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE }, @@ -624,23 +626,31 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN }, { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG }, { RUN_STATE_RUNNING, 
RUN_STATE_GUEST_PANICKED }, +{ RUN_STATE_RUNNING, RUN_STATE_MEMORY_STALE }, { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING }, { RUN_STATE_SHUTDOWN, RUN_STATE_PAUSED }, { RUN_STATE_SHUTDOWN, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_SHUTDOWN, RUN_STATE_MEMORY_STALE }, { RUN_STATE_DEBUG, RUN_STATE_SUSPENDED }, +{ RUN_STATE_DEBUG, RUN_STATE_MEMORY_STALE }, { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED }, { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING }, { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_SUSPENDED, RUN_STATE_MEMORY_STALE }, { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING }, { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_WATCHDOG, RUN_STATE_MEMORY_STALE }, { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING }, { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_GUEST_PANICKED, RUN_STATE_MEMORY_STALE }, +{ RUN_STATE_MEMORY_STALE, RUN_STATE_RUNNING }, +{ RUN_STATE_MEMORY_STALE, RUN_STATE_POSTMIGRATE }, { RUN_STATE_MAX, RUN_STATE_MAX }, }; @@ -685,7 +695,8 @@ int runstate_is_running(void) bool runstate_needs_reset(void) { return runstate_check(RUN_STATE_INTERNAL_ERROR) || -runstate_check(RUN_STATE_SHUTDOWN); +runstate_check(RUN_STATE_SHUTDOWN) || +runstate_check(RUN_STATE_MEMORY_STALE); } StatusInfo *qmp_query_status(Error **errp) -- 1.7.7.6
[Qemu-devel] [PATCH 07/17] save_page: replace block_offset with a MemoryRegion
This patch exports MemoryRegion to save_page hook, replacing argument ram_addr_t block_offset with a MemoryRegion suggested by Paolo Bonzini. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- arch_init.c |4 ++-- include/migration/migration.h |2 +- include/migration/qemu-file.h |8 migration-rdma.c |4 ++-- savevm.c |8 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch_init.c b/arch_init.c index e0acbc5..daaa519 100644 --- a/arch_init.c +++ b/arch_init.c @@ -485,8 +485,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage) /* In doubt sent page as normal */ bytes_sent = -1; -ret = ram_control_save_page(f, block-offset, - offset, TARGET_PAGE_SIZE, bytes_sent); +ret = ram_control_save_page(f, mr, offset, TARGET_PAGE_SIZE, +bytes_sent); if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { diff --git a/include/migration/migration.h b/include/migration/migration.h index 7e5d01a..ca852a8 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -161,7 +161,7 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags); #define RAM_SAVE_CONTROL_NOT_SUPP -1000 #define RAM_SAVE_CONTROL_DELAYED -2000 -size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, +size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, size_t size, int *bytes_sent); diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index f9b104a..6646e89 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -77,10 +77,10 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); * is saved (such as RDMA, for example.) 
*/ typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque, - ram_addr_t block_offset, - ram_addr_t offset, - size_t size, - int *bytes_sent); + MemoryRegion *mr, + ram_addr_t offset, + size_t size, + int *bytes_sent); typedef struct QEMUFileOps { QEMUFilePutBufferFunc *put_buffer; diff --git a/migration-rdma.c b/migration-rdma.c index f94f3b4..ae04de4 100644 --- a/migration-rdma.c +++ b/migration-rdma.c @@ -2699,7 +2699,7 @@ static int qemu_rdma_close(void *opaque) * the protocol because most transfers are sent asynchronously. */ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, - ram_addr_t block_offset, ram_addr_t offset, + MemoryRegion *mr, ram_addr_t offset, size_t size, int *bytes_sent) { QEMUFileRDMA *rfile = opaque; @@ -2716,7 +2716,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, * is full, or the page doen't belong to the current chunk, * an actual RDMA write will occur and a new chunk will be formed. */ -ret = qemu_rdma_write(f, rdma, block_offset, offset, size); +ret = qemu_rdma_write(f, rdma, mr-ram_addr, offset, size); if (ret 0) { fprintf(stderr, rdma migration: write error! %d\n, ret); goto err; diff --git a/savevm.c b/savevm.c index 3f912dd..06c1f29 100644 --- a/savevm.c +++ b/savevm.c @@ -661,12 +661,12 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags) } } -size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, - ram_addr_t offset, size_t size, int *bytes_sent) +size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, + size_t size, int *bytes_sent) { if (f-ops-save_page) { -int ret = f-ops-save_page(f, f-opaque, block_offset, -offset, size, bytes_sent); +int ret = f-ops-save_page(f, f-opaque, mr, offset, +size, bytes_sent); if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_sent *bytes_sent 0) { -- 1.7.7.6
[Qemu-devel] [PATCH 11/17] add argument ram_addr_t to hook_ram_load
Adds argument ram_addr_t to hook_ram_load, and replaces QEMURamHookFunc with QEMURamLoadHookFunc for it. With this new argument, it allows cutting almost half of the data transferred on the Unix socket used by page flipping migration. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- arch_init.c |2 +- include/migration/migration.h |2 +- include/migration/qemu-file.h | 11 ++- migration-rdma.c |2 +- savevm.c |4 ++-- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch_init.c b/arch_init.c index daaa519..0621893 100644 --- a/arch_init.c +++ b/arch_init.c @@ -945,7 +945,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) goto done; } } else if (flags RAM_SAVE_FLAG_HOOK) { -ram_control_load_hook(f, flags); +ram_control_load_hook(f, addr, flags); } error = qemu_file_get_error(f); if (error) { diff --git a/include/migration/migration.h b/include/migration/migration.h index ca852a8..300e52c 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -149,7 +149,7 @@ int64_t xbzrle_cache_resize(int64_t new_size); void ram_control_before_iterate(QEMUFile *f, uint64_t flags); void ram_control_after_iterate(QEMUFile *f, uint64_t flags); -void ram_control_load_hook(QEMUFile *f, uint64_t flags); +void ram_control_load_hook(QEMUFile *f, ram_addr_t addr, uint64_t flags); /* Whenever this is found in the data stream, the flags * will be passed to ram_control_load_hook in the incoming-migration diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 6646e89..176c2d9 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -65,6 +65,15 @@ typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov, typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); /* + * This function provides load hook for RAM migration, allows + * override of where the RAM page is loaded (such as page + * flipping for example). 
+ */ +typedef int (QEMURamLoadHookFunc)(QEMUFile *f, void *opaque, + ram_addr_t addr, + uint64_t flags); + +/* * Constants used by ram_control_* hooks */ #define RAM_CONTROL_SETUP0 @@ -90,7 +99,7 @@ typedef struct QEMUFileOps { QEMUFileWritevBufferFunc *writev_buffer; QEMURamHookFunc *before_ram_iterate; QEMURamHookFunc *after_ram_iterate; -QEMURamHookFunc *hook_ram_load; +QEMURamLoadHookFunc *hook_ram_load; QEMURamSaveFunc *save_page; } QEMUFileOps; diff --git a/migration-rdma.c b/migration-rdma.c index ae04de4..732ec1a 100644 --- a/migration-rdma.c +++ b/migration-rdma.c @@ -2938,7 +2938,7 @@ err_rdma_dest_wait: * Keep doing this until the source tells us to stop. */ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, - uint64_t flags) + ram_addr_t offset, uint64_t flags) { RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult), .type = RDMA_CONTROL_REGISTER_RESULT, diff --git a/savevm.c b/savevm.c index 137e74f..75e397c 100644 --- a/savevm.c +++ b/savevm.c @@ -647,12 +647,12 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t flags) } } -void ram_control_load_hook(QEMUFile *f, uint64_t flags) +void ram_control_load_hook(QEMUFile *f, ram_addr_t offset, uint64_t flags) { int ret = -EINVAL; if (f-ops-hook_ram_load) { -ret = f-ops-hook_ram_load(f, f-opaque, flags); +ret = f-ops-hook_ram_load(f, f-opaque, offset, flags); if (ret 0) { qemu_file_set_error(f, ret); } -- 1.7.7.6
[Qemu-devel] [PATCH 08/17] migration-local: override save_page for page transmit
This patch implements save_page callback for the outside of page flipping. It will write the address of the page on the Unix socket and flip the page data on pipe by vmsplice(). Every page address would have a header flag RAM_SAVE_FLAG_HOOK, which will trigger the load hook to receive it in incoming side as well. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 63 + 1 files changed, 63 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index f479530..0a41c69 100644 --- a/migration-local.c +++ b/migration-local.c @@ -116,6 +116,68 @@ static int qemu_local_close(void *opaque) return 0; } +static int send_pipefd(int sockfd, int pipefd); + +static size_t qemu_local_save_ram(QEMUFile *f, void *opaque, + MemoryRegion *mr, ram_addr_t offset, + size_t size, int *bytes_sent) +{ +QEMUFileLocal *s = opaque; +ram_addr_t current_addr = mr-ram_addr + offset; +void *ram_addr; +ssize_t ret; + +if (s-unix_page_flipping) { +qemu_put_be64(s-file, current_addr | RAM_SAVE_FLAG_HOOK); +qemu_fflush(s-file); + +if (!s-pipefd_passed) { +ret = send_pipefd(s-sockfd, s-pipefd[0]); +if (ret 0) { +fprintf(stderr, failed to pass PIPE\n); +return ret; +} +s-pipefd_passed = true; +} + +ram_addr = memory_region_get_ram_ptr(mr) + offset; + +/* vmsplice page data to pipe */ +struct iovec iov = { +.iov_base = ram_addr, +.iov_len = size, +}; + +/* + * The flag SPLICE_F_MOVE is introduced in kernel for the page + * flipping feature in QEMU, which will movie pages rather than + * copying, previously unused. + * + * If a move is not possible the kernel will transparently falls + * back to copying data. + * + * For older kernels the SPLICE_F_MOVE would be ignored and a copy + * would occur. 
+ */ + +ret = vmsplice(s-pipefd[1], iov, 1, SPLICE_F_GIFT | SPLICE_F_MOVE); +if (ret == -1) { +if (errno != EAGAIN errno != EINTR) { +fprintf(stderr, vmsplice save error: %s\n, strerror(errno)); +return ret; +} +} else { +if (bytes_sent) { +*bytes_sent = 1; +} +DPRINTF(block_offset: %lu, offset: %lu\n, mr-ram_addr, offset); +return 0; +} +} + +return RAM_SAVE_CONTROL_NOT_SUPP; +} + static const QEMUFileOps pipe_read_ops = { .get_fd= qemu_local_get_sockfd, .get_buffer= qemu_local_get_buffer, @@ -126,6 +188,7 @@ static const QEMUFileOps pipe_write_ops = { .get_fd = qemu_local_get_sockfd, .writev_buffer = qemu_local_writev_buffer, .close = qemu_local_close, +.save_page = qemu_local_save_ram }; QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode) -- 1.7.7.6
[Qemu-devel] [PATCH 17/17] hmp: better format for info migrate_capabilities
As there might be more capabilities introduced, better to display it in lines. Reviewed-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- hmp.c |5 ++--- 1 files changed, 2 insertions(+), 3 deletions(-) diff --git a/hmp.c b/hmp.c index 32ee285..dcfa2f9 100644 --- a/hmp.c +++ b/hmp.c @@ -226,13 +226,12 @@ void hmp_info_migrate_capabilities(Monitor *mon, const QDict *qdict) caps = qmp_query_migrate_capabilities(NULL); if (caps) { -monitor_printf(mon, capabilities: ); +monitor_printf(mon, Capabilities:\n); for (cap = caps; cap; cap = cap-next) { -monitor_printf(mon, %s: %s , +monitor_printf(mon, %s: %s\n, MigrationCapability_lookup[cap-value-capability], cap-value-state ? on : off); } -monitor_printf(mon, \n); } qapi_free_MigrationCapabilityStatusList(caps); -- 1.7.7.6
[Qemu-devel] [PATCH 09/17] savevm: adjust ram_control_save_page for page flipping
As callback save_page will always be opened by qemu_fopen_socket_local(), and without unix_page_flipping it will return RAM_SAVE_CONTROL_NOT_SUPP, it leads to a wrong qemu_file_set_error() based on the current logic. So this patch adds RAM_SAVE_CONTROL_NOT_SUPP to the check. Reviewed-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- savevm.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/savevm.c b/savevm.c index 06c1f29..137e74f 100644 --- a/savevm.c +++ b/savevm.c @@ -668,7 +668,8 @@ size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, int ret = f-ops-save_page(f, f-opaque, mr, offset, size, bytes_sent); -if (ret != RAM_SAVE_CONTROL_DELAYED) { +if (ret != RAM_SAVE_CONTROL_DELAYED +ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (bytes_sent *bytes_sent 0) { qemu_update_position(f, *bytes_sent); } else if (ret 0) { -- 1.7.7.6
[Qemu-devel] [PATCH 13/17] migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local
Replace qemu_fopen_socket with qemu_fopen_socket_local in Unix protocol migration. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-unix.c | 18 ++ 1 files changed, 14 insertions(+), 4 deletions(-) diff --git a/migration-unix.c b/migration-unix.c index 651fc5b..9beeafe 100644 --- a/migration-unix.c +++ b/migration-unix.c @@ -37,12 +37,22 @@ static void unix_wait_for_connect(int fd, void *opaque) if (fd 0) { DPRINTF(migrate connect error\n); s-file = NULL; -migrate_fd_error(s); +goto fail; } else { DPRINTF(migrate connect success\n); -s-file = qemu_fopen_socket(fd, wb); + +s-file = qemu_fopen_socket_local(fd, wb); +if (s-file == NULL) { +fprintf(stderr, failed to open Unix socket\n); +goto fail; +} + migrate_fd_connect(s); +return; } + +fail: +migrate_fd_error(s); } void unix_start_outgoing_migration(MigrationState *s, const char *path, Error **errp) @@ -71,9 +81,9 @@ static void unix_accept_incoming_migration(void *opaque) goto out; } -f = qemu_fopen_socket(c, rb); +f = qemu_fopen_socket_local(c, rb); if (f == NULL) { -fprintf(stderr, could not qemu_fopen socket\n); +fprintf(stderr, failed to open Unix socket\n); goto out; } -- 1.7.7.6
[Qemu-devel] [PATCH 15/17] migration-unix: page flipping support on unix outgoing
Add page flipping support on unix outgoing part by stopping VM with the new RunState RUN_STATE_MEMORY_STALE before invoking migration if unix_page_flipping enabled. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-unix.c | 11 +++ 1 files changed, 11 insertions(+), 0 deletions(-) diff --git a/migration-unix.c b/migration-unix.c index 9beeafe..cbf2087 100644 --- a/migration-unix.c +++ b/migration-unix.c @@ -19,6 +19,7 @@ #include migration/migration.h #include migration/qemu-file.h #include block/block.h +#include sysemu/sysemu.h //#define DEBUG_MIGRATION_UNIX @@ -33,6 +34,7 @@ static void unix_wait_for_connect(int fd, void *opaque) { MigrationState *s = opaque; +int ret; if (fd 0) { DPRINTF(migrate connect error\n); @@ -47,6 +49,15 @@ static void unix_wait_for_connect(int fd, void *opaque) goto fail; } +/* Stop VM before invoking migration if unix_page_flipping enabled */ +if (migrate_unix_page_flipping()) { +ret = vm_stop_force_state(RUN_STATE_MEMORY_STALE); +if (ret 0) { +DPRINTF(failed to stop VM\n); +goto fail; +} +} + migrate_fd_connect(s); return; } -- 1.7.7.6
[Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration.c |7 +-- 1 files changed, 5 insertions(+), 2 deletions(-) diff --git a/migration.c b/migration.c index 4ac466b..68b5b02 100644 --- a/migration.c +++ b/migration.c @@ -582,7 +582,7 @@ static void *migration_thread(void *opaque) if (pending_size pending_size = max_size) { qemu_savevm_state_iterate(s-file); } else { -int ret; +int ret = 0; DPRINTF(done iterating\n); qemu_mutex_lock_iothread(); @@ -590,7 +590,10 @@ static void *migration_thread(void *opaque) qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); old_vm_running = runstate_is_running(); -ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); +if (!runstate_needs_reset()) { +ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); +} + if (ret = 0) { qemu_file_set_rate_limit(s-file, INT_MAX); qemu_savevm_state_complete(s-file); -- 1.7.7.6
[Qemu-devel] [PATCH 04/17] migration-local: add QEMUFileLocal with socket based QEMUFile
This patch adds QEMUFileLocal with copy of socket based QEMUFile, will be used as the basis code for Unix socket protocol migration and page flipping migration. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- Makefile.target |1 + migration-local.c | 123 + 2 files changed, 124 insertions(+), 0 deletions(-) create mode 100644 migration-local.c diff --git a/Makefile.target b/Makefile.target index af6ac7e..aa09960 100644 --- a/Makefile.target +++ b/Makefile.target @@ -117,6 +117,7 @@ obj-$(CONFIG_KVM) += kvm-all.o obj-y += memory.o savevm.o cputlb.o obj-y += memory_mapping.o obj-y += dump.o +obj-y += migration-local.o LIBS+=$(libs_softmmu) # xen support diff --git a/migration-local.c b/migration-local.c new file mode 100644 index 000..ca01a20 --- /dev/null +++ b/migration-local.c @@ -0,0 +1,123 @@ +/* + * QEMU localhost migration with page flipping + * + * Copyright IBM, Corp. 2013 + * + * Authors: + * Lei Li li...@linux.vnet.ibm.com + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include config-host.h +#include qemu-common.h +#include migration/migration.h +#include exec/cpu-common.h +#include config.h +#include exec/cpu-all.h +#include exec/memory.h +#include exec/memory-internal.h +#include monitor/monitor.h +#include migration/qemu-file.h +#include qemu/iov.h +#include sysemu/arch_init.h +#include sysemu/sysemu.h +#include block/block.h +#include qemu/sockets.h +#include migration/block.h +#include qemu/thread.h +#include qmp-commands.h +#include trace.h +#include qemu/osdep.h + +//#define DEBUG_MIGRATION_LOCAL + +#ifdef DEBUG_MIGRATION_LOCAL +#define DPRINTF(fmt, ...) \ +do { printf(migration-local: fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) 
\ +do { } while (0) +#endif + + +typedef struct QEMUFileLocal { +QEMUFile *file; +int sockfd; +int pipefd[2]; +int pipefd_passed; +int pipefd_received; +bool unix_page_flipping; +} QEMUFileLocal; + +static int qemu_local_get_sockfd(void *opaque) +{ +QEMUFileLocal *s = opaque; + +return s-sockfd; +} + +static int qemu_local_get_buffer(void *opaque, uint8_t *buf, + int64_t pos, int size) +{ +QEMUFileLocal *s = opaque; +ssize_t len; + +for (;;) { +len = qemu_recv(s-sockfd, buf, size, 0); +if (len != -1) { +break; +} + +if (socket_error() == EAGAIN) { +yield_until_fd_readable(s-sockfd); +} else if (socket_error() != EINTR) { +break; +} +} + +if (len == -1) { +len = -socket_error(); +} + +return len; +} + +static ssize_t qemu_local_writev_buffer(void *opaque, struct iovec *iov, +int iovcnt, int64_t pos) +{ +QEMUFileLocal *s = opaque; +ssize_t len; +ssize_t size = iov_size(iov, iovcnt); + +len = iov_send(s-sockfd, iov, iovcnt, 0, size); +if (len size) { +len = -socket_error(); +} + +return len; +} + +static int qemu_local_close(void *opaque) +{ +QEMUFileLocal *s = opaque; + +closesocket(s-sockfd); +g_free(s); + +return 0; +} + +static const QEMUFileOps pipe_read_ops = { +.get_fd= qemu_local_get_sockfd, +.get_buffer= qemu_local_get_buffer, +.close = qemu_local_close, +}; + +static const QEMUFileOps pipe_write_ops = { +.get_fd = qemu_local_get_sockfd, +.writev_buffer = qemu_local_writev_buffer, +.close = qemu_local_close, +}; -- 1.7.7.6
[Qemu-devel] [PATCH 12/17] migration-local: override hook_ram_load
Override hook_ram_load to receive the pipe file descriptor passed by source process and page address which will be extracted to vmsplice the page data from pipe. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 59 + 1 files changed, 59 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 76ec306..b086f38 100644 --- a/migration-local.c +++ b/migration-local.c @@ -231,10 +231,69 @@ static size_t qemu_local_save_ram(QEMUFile *f, void *opaque, return RAM_SAVE_CONTROL_NOT_SUPP; } +static int qemu_local_ram_load(QEMUFile *f, void *opaque, + ram_addr_t addr, uint64_t flags) +{ +QEMUFileLocal *s = opaque; +struct iovec iov; +ssize_t ret = -EINVAL; + +if (!s-pipefd_received) { +/* + * send_pipefd was called at this point, and it wrote one + * byte to the stream. + */ +qemu_get_byte(s-file); +s-pipefd_received = true; +} + +if (s-pipefd_passed) { +void *host; +/* + * Extract the page address from the 8-byte record and + * read the page data from the pipe. + */ +host = qemu_get_ram_ptr(addr); + +iov.iov_base = host; +iov.iov_len = TARGET_PAGE_SIZE; + +/* + * The flag SPLICE_F_MOVE is introduced in kernel for the page + * flipping feature in QEMU, which will movie pages rather than + * copying, previously unused. + * + * If a move is not possible the kernel will transparently falls + * back to copying data. + * + * For older kernels the SPLICE_F_MOVE would be ignored and a copy + * would occur. 
+ */ + +ret = vmsplice(s-pipefd[0], iov, 1, SPLICE_F_MOVE); +if (ret == -1) { +if (errno != EAGAIN errno != EINTR) { +fprintf(stderr, vmsplice() load error: %s, strerror(errno)); +return ret; +} +DPRINTF(vmsplice load error\n); +} else if (ret == 0) { +DPRINTF(stderr, load_page: zero read\n); +} + +DPRINTF(vmsplice (read): %zu\n, ret); +return ret; +} + +return -EINVAL; +} + + static const QEMUFileOps pipe_read_ops = { .get_fd= qemu_local_get_sockfd, .get_buffer= qemu_local_get_buffer, .close = qemu_local_close, +.hook_ram_load = qemu_local_ram_load }; static const QEMUFileOps pipe_write_ops = { -- 1.7.7.6
Re: [Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping
On 11/26/2013 10:11 PM, Paolo Bonzini wrote: Il 26/11/2013 14:53, Lei Li ha scritto: 1) ram_save_setup stage, it will send all the bytes in this stages to destination, and send_pipefd by ram_control_before_iterate at the end of it. ram_save_setup runs doesn't send anything from guest RAM. It sends the lengths of the various blocks. As you said, at the end of ram_save_setup you send the pipefd. ram_save_iterate runs before ram_save_complete. ram_save_iterate and ram_save_complete write data with exactly the same format. Both of them can use ram_save_page It should not matter if some pages are sent as part of ram_save_iterate and others as part of ram_save_complete. One possibility is that you are hitting a bug due to the way you ignore the 0x01 byte that send_pipefd places on the socket. Oops. I might have said this before thinking about postcopy and/or before seeing the benchmark results from Juan's patches. If this part of the patch is just an optimization, I'd rather leave it out for now. I am afraid that page flipping can not proceed correctly without this.. I really would like to understand why, because it really shouldn't (this shouldn't be a place where you need a hook). Hi Paolo, Sorry for the late reply. Yes, you are right!! I just have a try with this adjustment removed, it works well... I remembered that it can not proceed correctly when debugging in previous version without this as in theory it should like your explanation above. I guess the only answer is that there was a bug regarding the one byte fd control message just like the possibility you listed! Paolo -- Lei
Re: [Qemu-devel] [PATCH 0/17 v3] Localhost migration with side channel for ram
On 11/25/2013 05:48 PM, Paolo Bonzini wrote: Il 25/11/2013 08:29, Lei Li ha scritto: In this case, if the migration would fail just because the misconfiguration of device state on destination, in the meantime the outgoing migration has no aware of this failure, I think it should add such handling (like synchronize of the device state list in incoming side?) to the current migration protocol as it is kind of missing... It can not just rely on the resume of source guest for such failure... or maybe it should be handled in management app to force the configuration right? It is already handled by libvirt, indeed. Basically, -incoming without -S is a broken option because of the missing handshake at the end of migration. With -S something else (either a human or a program) can check that everything went well and choose whether to restart the source or the destination. I see, thanks for your explanation. :-) BTW, do you think we should add such handling to the current migration protocol? Postcopy would fix this (assuming the postcopy phase is reliable) by migrating device data before any page flipping occurs. Are you suggesting that page flipping should be coupled with the postcopy migration for live upgrade of QEMU as your comments in the previous version? In order to make live upgrade reliable, it should. The whole procedure for page flipping migration is straight forward, and the cases of failure I listed are in theory, which never happened at least since many times I have tested (except the case you raised above). But I agree with you on coupling with postcopy migration to make it more reliable, specially for the undetected problems. For this, I am not quite sure I understand it correctly, seems the latest update of post copy migration was sent on last Oct, would you please give some insights on what else could I do for the coupling with postcopy migration? 
If not, page flipping is now implemented as a migration capability, and it is in good shape already, as per your comments on the previous version. Although it still needs a little more time to get the numbers for the new vmsplice, I'd like to ask your opinion: do you consider that it could be merged as an experimental version for now? Paolo -- Lei
Re: [Qemu-devel] [PATCH 08/17] add unix_msgfd_lookup() to callback get_buffer
On 11/21/2013 05:11 PM, Lei Li wrote: The control message for exchange of pipe file descriptor should be received by recvmsg, and it might be eaten to stream file by qemu_recv() when receiving by two callbacks. So this patch adds unix_msgfd_lookup() to callback get_buffer as the only one receiver, where the pipe file descriptor would be caughted. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 68 ++-- 1 files changed, 65 insertions(+), 3 deletions(-) diff --git a/migration-local.c b/migration-local.c index e028beb..0f0896b 100644 --- a/migration-local.c +++ b/migration-local.c @@ -50,6 +50,8 @@ typedef struct QEMUFileLocal { bool unix_page_flipping; } QEMUFileLocal; +static bool pipefd_passed; + static int qemu_local_get_sockfd(void *opaque) { QEMUFileLocal *s = opaque; @@ -57,16 +59,76 @@ static int qemu_local_get_sockfd(void *opaque) return s-sockfd; } +static int unix_msgfd_lookup(void *opaque, struct msghdr *msg) +{ +QEMUFileLocal *s = opaque; +struct cmsghdr *cmsg; +bool found = false; + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) +continue; + +/* PIPE file descriptor to be received */ +s-pipefd[0] = *((int *)CMSG_DATA(cmsg)); +} + +if (s-pipefd[0] = 0) { And this should be if (s-pipefd[0] 0).. 
+fprintf(stderr, no pipe fd can be received\n); +return found; +} + +DPRINTF(pipefd successfully received\n); +return s-pipefd[0]; +} + static int qemu_local_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) { QEMUFileLocal *s = opaque; ssize_t len; +struct msghdr msg = { NULL, }; +struct iovec iov[1]; +union { +struct cmsghdr cmsg; +char control[CMSG_SPACE(sizeof(int))]; +} msg_control; + +iov[0].iov_base = buf; +iov[0].iov_len = size; + +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); for (;;) { -len = qemu_recv(s-sockfd, buf, size, 0); -if (len != -1) { -break; +if (!pipefd_passed) { +/* + * recvmsg is called here to catch the control message for + * the exchange of PIPE file descriptor until it is received. + */ +len = recvmsg(s-sockfd, msg, 0); +if (len != -1) { +if (unix_msgfd_lookup(s, msg) 0) { +pipefd_passed = 1; +/* + * Do not count one byte taken by the PIPE file + * descriptor. + */ +len--; +} else { +len = -1; +} Just found that this 'else' should go away as it will break the normal Unix migration since pipefd_passed will always be 0 for it. I have fixed this in my code, seems I mis-send it for some reason, sorry for this...:-[ +break; +} +} else { +len = qemu_recv(s-sockfd, buf, size, 0); +if (len != -1) { +break; +} } if (socket_error() == EAGAIN) { -- Lei
Re: [Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping
On 11/26/2013 07:32 PM, Paolo Bonzini wrote: Il 21/11/2013 10:11, Lei Li ha scritto: Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration.c | 10 +++--- 1 files changed, 7 insertions(+), 3 deletions(-) diff --git a/migration.c b/migration.c index 4ac466b..0f98ac1 100644 --- a/migration.c +++ b/migration.c @@ -579,10 +579,11 @@ static void *migration_thread(void *opaque) pending_size = qemu_savevm_state_pending(s-file, max_size); DPRINTF(pending size % PRIu64 max % PRIu64 \n, pending_size, max_size); -if (pending_size pending_size = max_size) { +if (pending_size pending_size = max_size +!runstate_needs_reset()) { qemu_savevm_state_iterate(s-file); I'm not sure why you need this. The adjustment here is to avoid the iteration stage for page flipping. Because pending_size = ram_save_remaining() * TARGET_PAGE_SIZE which is not 0 and pending_size max_size (0) at start. In the previous version it was like this: if (pending_size pending_size = max_size !migrate_unix_page_flipping()) { And you said 'This is a bit ugly but I understand the need. Perhaps !runstate_needs_reset() like below?' :) } else { -int ret; +int ret = 0; DPRINTF(done iterating\n); qemu_mutex_lock_iothread(); @@ -590,7 +591,10 @@ static void *migration_thread(void *opaque) qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); old_vm_running = runstate_is_running(); -ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); +if (!runstate_needs_reset()) { +ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); +} This however is okay. Paolo if (ret = 0) { qemu_file_set_rate_limit(s-file, INT_MAX); qemu_savevm_state_complete(s-file); -- Lei
Re: [Qemu-devel] [PATCH 10/17] migration-local: override save_page for page transmit
On 11/26/2013 07:22 PM, Paolo Bonzini wrote: Il 21/11/2013 10:11, Lei Li ha scritto: This patch implements save_page callback for the outside of page flipping. It will write the address of the page on the Unix socket and flip the page data on pipe by vmsplice(). Every page address would have a header flag RAM_SAVE_FLAG_HOOK, which will trigger the load hook to receive it in incoming side as well. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 54 + 1 files changed, 54 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 0f0896b..14207e9 100644 --- a/migration-local.c +++ b/migration-local.c @@ -200,6 +200,59 @@ static int qemu_local_send_pipefd(QEMUFile *f, void *opaque, return 0; } +static size_t qemu_local_save_ram(QEMUFile *f, void *opaque, + MemoryRegion *mr, ram_addr_t offset, + size_t size, int *bytes_sent) +{ +QEMUFileLocal *s = opaque; +ram_addr_t current_addr = mr-ram_addr + offset; +void *ram_addr; +ssize_t ret; + +if (s-unix_page_flipping) { +qemu_fflush(s-file); +qemu_put_be64(s-file, RAM_SAVE_FLAG_HOOK); + +/* Write page address to unix socket */ +qemu_put_be64(s-file, current_addr); + You can write current_addr | RAM_SAVE_FLAG_HOOK. The value will be in the flags argument of the hook_ram_load, you can extract it with flags ~RAM_SAVE_FLAG_HOOK. This cuts by half the data written to the Unix socket. OK, thanks. Paolo +ram_addr = memory_region_get_ram_ptr(mr) + offset; + +/* vmsplice page data to pipe */ +struct iovec iov = { +.iov_base = ram_addr, +.iov_len = size, +}; + +/* + * The flag SPLICE_F_MOVE is introduced in kernel for the page + * flipping feature in QEMU, which will movie pages rather than + * copying, previously unused. + * + * If a move is not possible the kernel will transparently falls + * back to copying data. + * + * For older kernels the SPLICE_F_MOVE would be ignored and a copy + * would occur. 
+ */ +ret = vmsplice(s-pipefd[1], iov, 1, SPLICE_F_GIFT | SPLICE_F_MOVE); +if (ret == -1) { +if (errno != EAGAIN errno != EINTR) { +fprintf(stderr, vmsplice save error: %s\n, strerror(errno)); +return ret; +} +} else { +if (bytes_sent) { +*bytes_sent = 1; +} +DPRINTF(block_offset: %lu, offset: %lu\n, block_offset, offset); +return 0; +} +} + +return RAM_SAVE_CONTROL_NOT_SUPP; +} + static const QEMUFileOps pipe_read_ops = { .get_fd= qemu_local_get_sockfd, .get_buffer= qemu_local_get_buffer, @@ -211,6 +264,7 @@ static const QEMUFileOps pipe_write_ops = { .writev_buffer = qemu_local_writev_buffer, .close = qemu_local_close, .before_ram_iterate = qemu_local_send_pipefd, +.save_page = qemu_local_save_ram }; QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode) -- Lei
Re: [Qemu-devel] [PATCH 12/17] migration-local: override hook_ram_load
On 11/26/2013 07:25 PM, Paolo Bonzini wrote: Il 21/11/2013 10:11, Lei Li ha scritto: +static int qemu_local_ram_load(QEMUFile *f, void *opaque, + uint64_t flags) +{ +QEMUFileLocal *s = opaque; +ram_addr_t addr; +struct iovec iov; +ssize_t ret = -EINVAL; + +/* + * PIPE file descriptor will be received by another callback + * get_buffer. + */ +if (pipefd_passed) { +void *host; +/* + * Extract the page address from the 8-byte record and + * read the page data from the pipe. + */ +addr = qemu_get_be64(s-file); +host = qemu_get_ram_ptr(addr); + +iov.iov_base = host; +iov.iov_len = TARGET_PAGE_SIZE; + +/* The flag SPLICE_F_MOVE is introduced in kernel for the page + * flipping feature in QEMU, which will movie pages rather than + * copying, previously unused. + * + * If a move is not possible the kernel will transparently falls + * back to copying data. + * + * For older kernels the SPLICE_F_MOVE would be ignored and a copy + * would occur. + */ +ret = vmsplice(s-pipefd[0], iov, 1, SPLICE_F_MOVE); +if (ret == -1) { +if (errno != EAGAIN errno != EINTR) { +fprintf(stderr, vmsplice() load error: %s, strerror(errno)); +return ret; +} +DPRINTF(vmsplice load error\n); +} else if (ret == 0) { +DPRINTF(stderr, load_page: zero read\n); +} + +DPRINTF(vmsplice (read): %zu\n, ret); +return ret; +} + +return 0; +} I think you need to return -EINVAL if there is no pipe. Yes, you are right.. Paolo -- Lei
Re: [Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping
On 11/26/2013 08:54 PM, Paolo Bonzini wrote: Il 26/11/2013 13:03, Lei Li ha scritto: +if (pending_size pending_size = max_size +!runstate_needs_reset()) { qemu_savevm_state_iterate(s-file); I'm not sure why you need this. The adjustment here is to avoid the iteration stage for page flipping. Because pending_size = ram_save_remaining() * TARGET_PAGE_SIZE which is not 0 and pending_size max_size (0) at start. It's still not clear to me that avoiding the iteration stage is The purpose of it is not just for optimization, but to avoid the iteration for better alignment. The current flow of page flipping basically has two stages: 1) ram_save_setup stage, it will send all the bytes in this stages to destination, and send_pipefd by ram_control_before_iterate at the end of it. 2) ram_save_complete, it will start to transmit the ram page in ram_save_block, and send the device state after that. So it needs to adjust the current migration process to avoid the iteration stage. necessary. I think it's just an optimization to avoid scanning the bitmap, but: (1) Juan's bitmap optimization will make this mostly unnecessary (2) getting good downtime from page flipping will require postcopy anyway. And you said 'This is a bit ugly but I understand the need. Perhaps !runstate_needs_reset() like below?' :) Oops. I might have said this before thinking about postcopy and/or before seeing the benchmark results from Juan's patches. If this part of the patch is just an optimization, I'd rather leave it out for now. I am afraid that page flipping can not proceed correctly without this.. Thanks for putting up with me. :) Paolo -- Lei
Re: [Qemu-devel] [PATCH 08/17] add unix_msgfd_lookup() to callback get_buffer
On 11/26/2013 07:31 PM, Paolo Bonzini wrote: Il 21/11/2013 10:11, Lei Li ha scritto: +/* + * recvmsg is called here to catch the control message for + * the exchange of PIPE file descriptor until it is received. + */ +len = recvmsg(s-sockfd, msg, 0); +if (len != -1) { +if (unix_msgfd_lookup(s, msg) 0) { +pipefd_passed = 1; +/* + * Do not count one byte taken by the PIPE file + * descriptor. + */ +len--; I think adding a byte in the middle of the stream is not reliable. Rather, you should transmit the socket always at the same place, for example in the first call of qemu_local_save_ram, after it has written the 64-bit field. I guess by 'transmit the socket' you mean transmit the fd? Sorry, I don't quite understand your suggestion here.. Do you mean calling send_pipefd in the first call of qemu_local_save_ram, after it has written the 64-bit field, and in this way getting rid of qemu_local_send_pipefd? Currently, the fd control message is sent at the end of the stream in ram_save_setup stage, followed by the ram page. The control message of fd is always at the same place. The matching code in qemu_local_ram_load will be like this: static int qemu_local_ram_load(QEMUFile *f, void *opaque, uint64_t flags) { QEMUFileLocal *s = opaque; ram_addr_t addr; struct iovec iov; ssize_t ret = -EINVAL; if (!s-pipefd_received) { /* * send_pipefd was called at this point, and it wrote one byte * to the stream. */ qemu_get_byte(s); s-pipefd_received = true; } if (pipefd_passed) { ... } return -EINVAL; } Also, please move pipefd_passed within QEMUFileLocal. Thanks, Paolo -- Lei
Re: [Qemu-devel] [PATCH 14/17] add new RunState RUN_STATE_MEMORY_STALE
On 11/26/2013 08:28 PM, Paolo Bonzini wrote: Il 21/11/2013 10:11, Lei Li ha scritto: { RUN_STATE_DEBUG, RUN_STATE_SUSPENDED }, DEBUG -> MEMORY_STALE is missing. Good catch, I will add it, thanks. :) Paolo { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED }, { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING }, { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_SUSPENDED, RUN_STATE_MEMORY_STALE }, -- Lei
Re: [Qemu-devel] [PATCH 0/17 v3] Localhost migration with side channel for ram
On 11/22/2013 07:36 PM, Paolo Bonzini wrote: Il 22/11/2013 12:29, Lei Li ha scritto: During the page flipping migration, ram page of source guest would be flipped to the destination, that's why the source guest can not be resumed. AFAICT, the page flipping migration may fail at the connection stage (including the exchange of pipe fd) and migration register stage (say any blocker like unsupported migration device), Unfortunately, some migration problems (e.g. misconfiguration of the destination QEMU) cannot be detected until the device data is migrated. This happens after RAM migration, so there is indeed a reliability problem. Hi Paolo, 'Some migration problems cannot be detected until the device data is migrated', do you mean that the outgoing migration has no idea the failure of incoming side caused by the misconfiguration of the destination QEMU? In this case, if the migration would fail just because the misconfiguration of device state on destination, in the meantime the outgoing migration has no aware of this failure, I think it should add such handling (like synchronize of the device state list in incoming side?) to the current migration protocol as it is kind of missing... It can not just rely on the resume of source guest for such failure... or maybe it should be handled in management app to force the configuration right? Postcopy would fix this (assuming the postcopy phase is reliable) by migrating device data before any page flipping occurs. Are you suggesting that page flipping should be coupled with the postcopy migration for live upgrade of QEMU as your comments in the previous version? Paolo but it could be resumed for such situation since the memory has not been flipped to another content. Once the connection is successfully setup, it would proceed the transmission of ram page which hardly fails. And for the failure handling in Libvirt, ZhengSheng has proposed that restarts the old QEMU instead of resume. 
I know 'hardly' is not a good answer to your concern, but it is the cost of the limited memory IMO. So if downtime is the key to the user, or if there is *zero* tolerance for restarting QEMU, page flipping migration might not be a good choice. From the perspective of a management app like Libvirt, the 'live upgrade' of QEMU will be done through localhost migration, and there are other migration solutions which have lower downtime, like the real live migration and the postcopy migration that Paolo mentioned in the previous version [3]. Why not have more than one choice for it? -- Lei
Re: [Qemu-devel] [PATCH 0/17 v3] Localhost migration with side channel for ram
On 11/21/2013 06:19 PM, Daniel P. Berrange wrote: On Thu, Nov 21, 2013 at 05:11:23PM +0800, Lei Li wrote: This patch series tries to introduce a mechanism using side channel pipe for RAM via SCM_RIGHTS with unix domain socket protocol migration. This side channel is used for the page flipping by vmsplice, which is the internal mechanism for localhost migration that we are trying to add to QEMU. The backgroud info and previous patch series for reference, Localhost migration http://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg02916.html migration: Introduce side channel for RAM http://lists.gnu.org/archive/html/qemu-devel/2013-09/msg04043.html I have picked patches from the localhost migration series and rebased it on the series of side channel, now it is a complete series that passed the basic test. Please let me know if there is anything needs to be fixed or improved. Your suggestions and comments are very welcome, and thanks to Paolo for his continued review and useful suggestions. In discussions about supporting this for libvirt, we were told that when this localhost migration fails, you cannot re-start the guest on the original source QEMU. If this is true, this implementation is not satisfactory IMHO. One of the main motivations of this feature is to allow for in-place live upgrades of QEMU binaries, for people who can't tolerate the downtime of restarting their guests, and whom don't have a spare host to migrate them to. If people are using this because they can't tolerate any downtime of the guest, then we need to be able to fully deal with failure to complete migration by switching back to the original QEMU process, as we can do with normal non-localhost migration. 
Hi Daniel, Page flipping is introduced here not primarily for low downtime, but more to avoid requiring that there is enough free memory to fit an additional copy of the largest guest, which is the requirement today with the current localhost migration, as Anthony explained in the first proposal version [1]. Of course low downtime is also important to the page flipping migration, as its use case is to allow 'live' upgrade of a running QEMU instance, so we expect page flipping through vmsplice to be fast enough to meet it. As an initial implementation of this feature right now, the downtime is not good, but we are working on it, as there has been some work on the kernel side [2]. During the page flipping migration, the ram pages of the source guest are flipped to the destination; that's why the source guest can not be resumed. AFAICT, the page flipping migration may fail at the connection stage (including the exchange of pipe fd) and the migration register stage (say any blocker like an unsupported migration device), but the guest could be resumed in such a situation since the memory has not been flipped to another content. Once the connection is successfully set up, it proceeds to the transmission of ram pages, which hardly fails. And for the failure handling in Libvirt, ZhengSheng has proposed restarting the old QEMU instead of resuming it. I know 'hardly' is not a good answer to your concern, but it is the cost of the limited memory IMO. So if downtime is the key to the user, or if there is *zero* tolerance for restarting QEMU, page flipping migration might not be a good choice. From the perspective of a management app like Libvirt, the 'live upgrade' of QEMU will be done through localhost migration, and there are other migration solutions which have lower downtime, like the real live migration and the postcopy migration that Paolo mentioned in the previous version [3]. Why not have more than one choice for it? 
[1]http://lists.gnu.org/archive/html/qemu-devel/2013-06/msg02577.html [2]http://article.gmane.org/gmane.linux.kernel/1574277 [3]http://lists.gnu.org/archive/html/qemu-devel/2013-10/msg03212.html Regards, Daniel -- Lei
Re: [Qemu-devel] [PATCH 0/17 v2] Localhost migration with side channel for ram
On 10/25/2013 08:24 PM, Lei Li wrote: On 10/25/2013 03:30 PM, Paolo Bonzini wrote: Il 25/10/2013 06:58, Lei Li ha scritto: Right now just has inaccurate numbers without the new vmsplice, which based on the result from info migrate, as the guest ram size increases, although the 'total time' is number of times less compared with the current live migration, but the 'downtime' performs badly. Of course. For a 1GB ram guest, total time: 702 milliseconds downtime: 692 milliseconds And when the ram size of guest increasesexponentially, those numbers are proportional to it. I will make a list of the performance with the new vmsplice later, I am sure it'd be much better than this at least. Yes, please. Is the memory usage is still 2x without vmsplice? I think you have a nice proof of concept, but on the other hand this probably needs to be coupled with some kind of postcopy live migration, that is: * the source starts sending data * but the destination starts running immediately * if the machine needs a page that is missing, the destination asks the source to send it * as soon as it arrives, the destination can restart Using postcopy is problematic for reliability: if the destination fails, the virtual machine is lost because the source doesn't have the latest content of memory. However, this is a much, much smaller problem for live QEMU upgrade where the network cannot fail. If you do this, you can achieve pretty much instantaneous live upgrade, well within your original 200 ms goals. But the flipping code with vmsplice should be needed anyway to avoid doubling memory usage, and Yes, I have read the postcopy migration patches, it does perform very good on downtime, as just send the vmstates then switch the execution to destination host. And as you pointed out, it can not avoid doubling memory usage. The numbers list above are based on the old vmsplice as I have not yet worked on the benchmark for performance, it actually copys data rather than moving. 
As the feedback for this version is positive, I am now trying to get a real result out with the new vmsplice. BTW, the kernel side is looking for a huge page solution to improve performance. The recent kernel patches are at this link: http://article.gmane.org/gmane.linux.kernel/1574277 Hi Paolo, I have been working on the performance benchmark, but I am afraid that it may take a bit more time, as there are some problems with the new vmsplice which the kernel side is working on right now. I will soon post a v3 of the series with your comments on the previous version addressed. it's looking pretty good in this version already! I'm relieved that the RDMA code was designed right! I am happy with it too. :) Those RDMA hooks really make things more flexible! Paolo -- Lei
[Qemu-devel] [PATCH 02/17] migration: add migrate_unix_page_flipping()
Add migrate_unix_page_flipping() to check if MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING is enabled. Reviewed-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/migration/migration.h |3 +++ migration.c |9 + 2 files changed, 12 insertions(+), 0 deletions(-) diff --git a/include/migration/migration.h b/include/migration/migration.h index 140e6b4..7e5d01a 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -131,10 +131,13 @@ void migrate_add_blocker(Error *reason); void migrate_del_blocker(Error *reason); bool migrate_rdma_pin_all(void); + bool migrate_zero_blocks(void); bool migrate_auto_converge(void); +bool migrate_unix_page_flipping(void); + int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, uint8_t *dst, int dlen); int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen); diff --git a/migration.c b/migration.c index 2b1ab20..4ac466b 100644 --- a/migration.c +++ b/migration.c @@ -541,6 +541,15 @@ int64_t migrate_xbzrle_cache_size(void) return s-xbzrle_cache_size; } +bool migrate_unix_page_flipping(void) +{ +MigrationState *s; + +s = migrate_get_current(); + +return s-enabled_capabilities[MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING]; +} + /* migration thread support */ static void *migration_thread(void *opaque) -- 1.7.7.6
[Qemu-devel] [PATCH 08/17] add unix_msgfd_lookup() to callback get_buffer
The control message for exchange of pipe file descriptor should be received by recvmsg, and it might be eaten to stream file by qemu_recv() when receiving by two callbacks. So this patch adds unix_msgfd_lookup() to callback get_buffer as the only one receiver, where the pipe file descriptor would be caught. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 68 ++-- 1 files changed, 65 insertions(+), 3 deletions(-) diff --git a/migration-local.c b/migration-local.c index e028beb..0f0896b 100644 --- a/migration-local.c +++ b/migration-local.c @@ -50,6 +50,8 @@ typedef struct QEMUFileLocal { bool unix_page_flipping; } QEMUFileLocal; +static bool pipefd_passed; + static int qemu_local_get_sockfd(void *opaque) { QEMUFileLocal *s = opaque; @@ -57,16 +59,76 @@ static int qemu_local_get_sockfd(void *opaque) return s-sockfd; } +static int unix_msgfd_lookup(void *opaque, struct msghdr *msg) +{ +QEMUFileLocal *s = opaque; +struct cmsghdr *cmsg; +bool found = false; + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) +continue; + +/* PIPE file descriptor to be received */ +s-pipefd[0] = *((int *)CMSG_DATA(cmsg)); +} + +if (s-pipefd[0] = 0) { +fprintf(stderr, no pipe fd can be received\n); +return found; +} + +DPRINTF(pipefd successfully received\n); +return s-pipefd[0]; +} + static int qemu_local_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) { QEMUFileLocal *s = opaque; ssize_t len; +struct msghdr msg = { NULL, }; +struct iovec iov[1]; +union { +struct cmsghdr cmsg; +char control[CMSG_SPACE(sizeof(int))]; +} msg_control; + +iov[0].iov_base = buf; +iov[0].iov_len = size; + +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); for (;;) { -len = qemu_recv(s-sockfd, buf, size, 0); -if (len != -1) { -break; +if (!pipefd_passed) { +/* + * recvmsg is 
called here to catch the control message for + * the exchange of PIPE file descriptor until it is received. + */ +len = recvmsg(s-sockfd, msg, 0); +if (len != -1) { +if (unix_msgfd_lookup(s, msg) 0) { +pipefd_passed = 1; +/* + * Do not count one byte taken by the PIPE file + * descriptor. + */ +len--; +} else { +len = -1; +} +break; +} +} else { +len = qemu_recv(s-sockfd, buf, size, 0); +if (len != -1) { +break; +} } if (socket_error() == EAGAIN) { -- 1.7.7.6
[Qemu-devel] [PATCH 01/17] QAPI: introduce migration capability unix_page_flipping
Introduce unix_page_flipping to MigrationCapability for localhost migration. Signed-off-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qapi-schema.json | 10 +- 1 files changed, 9 insertions(+), 1 deletions(-) diff --git a/qapi-schema.json b/qapi-schema.json index 83fa485..b290a0f 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -685,10 +685,18 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # +# @unix-page-flipping: If enabled, QEMU can optimize migration when the +# destination is a QEMU process that runs on the same host as +# the source (as is the case for live upgrade). If the migration +# transport is a Unix socket, QEMU will flip RAM pages directly to +# the destination, so that memory is only allocated twice for the +# source and destination processes. Disabled by default. (since 1.8) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] } + 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks', + 'unix-page-flipping'] } ## # @MigrationCapabilityStatus -- 1.7.7.6
[Qemu-devel] [PATCH 05/17] migration-local: introduce qemu_fopen_socket_local()
Add qemu_fopen_socket_local() to open QEMUFileLocal introduced earlier. It will create a pipe in write mode if unix_page_flipping is enabled, adjust qemu_local_close() to close pipe as well. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/migration/qemu-file.h |2 + migration-local.c | 46 + 2 files changed, 48 insertions(+), 0 deletions(-) diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 0f757fb..f9b104a 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -99,6 +99,8 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode); QEMUFile *qemu_fdopen(int fd, const char *mode); QEMUFile *qemu_fopen_socket(int fd, const char *mode); QEMUFile *qemu_popen_cmd(const char *command, const char *mode); +QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode); + int qemu_get_fd(QEMUFile *f); int qemu_fclose(QEMUFile *f); int64_t qemu_ftell(QEMUFile *f); diff --git a/migration-local.c b/migration-local.c index 8b9e10e..28da05b 100644 --- a/migration-local.c +++ b/migration-local.c @@ -103,6 +103,12 @@ static int qemu_local_close(void *opaque) QEMUFileLocal *s = opaque; closesocket(s-sockfd); + +if (s-unix_page_flipping) { +close(s-pipefd[0]); +close(s-pipefd[1]); +} + g_free(s); return 0; @@ -119,3 +125,43 @@ static const QEMUFileOps pipe_write_ops = { .writev_buffer = qemu_local_writev_buffer, .close = qemu_local_close, }; + +QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode) +{ +QEMUFileLocal *s; +int pipefd[2]; + +if (qemu_file_mode_is_not_valid(mode)) { +return NULL; +} + +s = g_malloc0(sizeof(QEMUFileLocal)); +s-sockfd = sockfd; + +if (migrate_unix_page_flipping()) { +s-unix_page_flipping = 1; +} + +if (mode[0] == 'w') { +if (s-unix_page_flipping) { +if (pipe(pipefd) 0) { +fprintf(stderr, failed to create PIPE\n); +goto fail; +} + +s-pipefd[0] = pipefd[0]; +s-pipefd[1] = pipefd[1]; +} + +qemu_set_block(s-sockfd); +s-file = qemu_fopen_ops(s, pipe_write_ops); +} else { +s-file = 
qemu_fopen_ops(s, pipe_read_ops); +} + +return s-file; + +fail: +g_free(s); +return NULL; +} -- 1.7.7.6
[Qemu-devel] [PATCH 03/17] qmp-commands.hx: add missing docs for migration capabilities
Signed-off-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qmp-commands.hx |8 1 files changed, 8 insertions(+), 0 deletions(-) diff --git a/qmp-commands.hx b/qmp-commands.hx index fba15cd..dcec433 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -2898,6 +2898,10 @@ migrate-set-capabilities Enable/Disable migration capabilities - xbzrle: XBZRLE support +- x-rdma-pin-all: Pin all pages during RDMA support +- zero-blocks: Compress zero blocks during block migration +- auto-converge: Block VCPU to help convergence of migration +- unix-page-flipping: Page flipping for live QEMU upgrade Arguments: @@ -2922,6 +2926,10 @@ Query current migration capabilities - capabilities: migration capabilities state - xbzrle : XBZRLE state (json-bool) + - x-rdma-pin-all: RDMA state (json-bool) + - zero-blocks: zero-blocks state (json-bool) + - auto-converge: Auto converge state (json-bool) + - unix-page-flipping: Page flipping state (json-bool) Arguments: -- 1.7.7.6
[Qemu-devel] [PATCH 07/17] migration-local: override before_ram_iterate to send pipefd
Override before_ram_iterate to send pipefd. It will qemu_fflush the stream QEMUFile and send it in RAM_CONTROL_SETUP stage. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 25 + 1 files changed, 25 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index f4265a1..e028beb 100644 --- a/migration-local.c +++ b/migration-local.c @@ -114,6 +114,30 @@ static int qemu_local_close(void *opaque) return 0; } +static int send_pipefd(int sockfd, int pipefd); + +static int qemu_local_send_pipefd(QEMUFile *f, void *opaque, + uint64_t flags) +{ +QEMUFileLocal *s = opaque; +int ret; + +if (s-unix_page_flipping) { +/* Avoid sending pipe fd again in ram_save_complete() stage */ +if (flags == RAM_CONTROL_SETUP) { +qemu_fflush(f); +ret = send_pipefd(s-sockfd, s-pipefd[0]); +if (ret 0) { +fprintf(stderr, failed to pass PIPE\n); +return ret; +} +DPRINTF(PIPE fd was sent\n); +} +} + +return 0; +} + static const QEMUFileOps pipe_read_ops = { .get_fd= qemu_local_get_sockfd, .get_buffer= qemu_local_get_buffer, @@ -124,6 +148,7 @@ static const QEMUFileOps pipe_write_ops = { .get_fd = qemu_local_get_sockfd, .writev_buffer = qemu_local_writev_buffer, .close = qemu_local_close, +.before_ram_iterate = qemu_local_send_pipefd, }; QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode) -- 1.7.7.6
[Qemu-devel] [PATCH 06/17] migration-local: add send_pipefd()
This patch adds send_pipefd() to pass the pipe file descriptor to destination process. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 53 + 1 files changed, 53 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 28da05b..f4265a1 100644 --- a/migration-local.c +++ b/migration-local.c @@ -165,3 +165,56 @@ fail: g_free(s); return NULL; } + + +/* + * Pass a pipe file descriptor to another process. + * + * Return negative value If pipefd 0. Return 0 on + * success. + * + */ +static int send_pipefd(int sockfd, int pipefd) +{ +struct msghdr msg; +struct iovec iov[1]; +ssize_t ret; + +union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; +} control_un; +struct cmsghdr *cmptr; +char req[1] = { 0x01 }; + +if (pipefd 0) { +msg.msg_control = NULL; +msg.msg_controllen = 0; +/* Negative status means error */ +req[0] = pipefd; +} else { +msg.msg_control = control_un.control; +msg.msg_controllen = sizeof(control_un.control); + +cmptr = CMSG_FIRSTHDR(msg); +cmptr-cmsg_len = CMSG_LEN(sizeof(int)); +cmptr-cmsg_level = SOL_SOCKET; +cmptr-cmsg_type = SCM_RIGHTS; +*((int *) CMSG_DATA(cmptr)) = pipefd; + +msg.msg_name = NULL; +msg.msg_namelen = 0; + +iov[0].iov_base = req; +iov[0].iov_len = sizeof(req); +msg.msg_iov = iov; +msg.msg_iovlen = 1; +} + +ret = sendmsg(sockfd, msg, 0); +if (ret = 0) { +DPRINTF(sendmsg error: %s\n, strerror(errno)); +} + +return ret; +} -- 1.7.7.6
[Qemu-devel] [PATCH 13/17] migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local
Replace qemu_fopen_socket with qemu_fopen_socket_local in Unix protocol migration. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-unix.c | 18 ++ 1 files changed, 14 insertions(+), 4 deletions(-) diff --git a/migration-unix.c b/migration-unix.c index 651fc5b..9beeafe 100644 --- a/migration-unix.c +++ b/migration-unix.c @@ -37,12 +37,22 @@ static void unix_wait_for_connect(int fd, void *opaque) if (fd 0) { DPRINTF(migrate connect error\n); s-file = NULL; -migrate_fd_error(s); +goto fail; } else { DPRINTF(migrate connect success\n); -s-file = qemu_fopen_socket(fd, wb); + +s-file = qemu_fopen_socket_local(fd, wb); +if (s-file == NULL) { +fprintf(stderr, failed to open Unix socket\n); +goto fail; +} + migrate_fd_connect(s); +return; } + +fail: +migrate_fd_error(s); } void unix_start_outgoing_migration(MigrationState *s, const char *path, Error **errp) @@ -71,9 +81,9 @@ static void unix_accept_incoming_migration(void *opaque) goto out; } -f = qemu_fopen_socket(c, rb); +f = qemu_fopen_socket_local(c, rb); if (f == NULL) { -fprintf(stderr, could not qemu_fopen socket\n); +fprintf(stderr, failed to open Unix socket\n); goto out; } -- 1.7.7.6
[Qemu-devel] [PATCH 11/17] savevm: adjust ram_control_save_page for page flipping
As callback save_page will always be opened by qemu_fopen_socket_local(), and without unix_page_flipping it will return RAM_SAVE_CONTROL_NOT_SUPP, it leads to a wrong qemu_file_set_error() based on the current logic. So this patch adds RAM_SAVE_CONTROL_NOT_SUPP to the check. Reviewed-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- savevm.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/savevm.c b/savevm.c index 3ee256e..4576145 100644 --- a/savevm.c +++ b/savevm.c @@ -668,7 +668,8 @@ size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, int ret = f-ops-save_page(f, f-opaque, mr, offset, size, bytes_sent); -if (ret != RAM_SAVE_CONTROL_DELAYED) { +if (ret != RAM_SAVE_CONTROL_DELAYED +ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (bytes_sent *bytes_sent 0) { qemu_update_position(f, *bytes_sent); } else if (ret 0) { -- 1.7.7.6
[Qemu-devel] [PATCH 15/17] migration-unix: page flipping support on unix outgoing
Add page flipping support on unix outgoing part by stopping VM with the new RunState RUN_STATE_MEMORY_STALE before invoking migration if unix_page_flipping enabled. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-unix.c | 11 +++ 1 files changed, 11 insertions(+), 0 deletions(-) diff --git a/migration-unix.c b/migration-unix.c index 9beeafe..cbf2087 100644 --- a/migration-unix.c +++ b/migration-unix.c @@ -19,6 +19,7 @@ #include migration/migration.h #include migration/qemu-file.h #include block/block.h +#include sysemu/sysemu.h //#define DEBUG_MIGRATION_UNIX @@ -33,6 +34,7 @@ static void unix_wait_for_connect(int fd, void *opaque) { MigrationState *s = opaque; +int ret; if (fd 0) { DPRINTF(migrate connect error\n); @@ -47,6 +49,15 @@ static void unix_wait_for_connect(int fd, void *opaque) goto fail; } +/* Stop VM before invoking migration if unix_page_flipping enabled */ +if (migrate_unix_page_flipping()) { +ret = vm_stop_force_state(RUN_STATE_MEMORY_STALE); +if (ret 0) { +DPRINTF(failed to stop VM\n); +goto fail; +} +} + migrate_fd_connect(s); return; } -- 1.7.7.6
[Qemu-devel] [PATCH 16/17] migration: adjust migration_thread() process for page flipping
Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration.c | 10 +++--- 1 files changed, 7 insertions(+), 3 deletions(-) diff --git a/migration.c b/migration.c index 4ac466b..0f98ac1 100644 --- a/migration.c +++ b/migration.c @@ -579,10 +579,11 @@ static void *migration_thread(void *opaque) pending_size = qemu_savevm_state_pending(s-file, max_size); DPRINTF(pending size % PRIu64 max % PRIu64 \n, pending_size, max_size); -if (pending_size pending_size = max_size) { +if (pending_size pending_size = max_size +!runstate_needs_reset()) { qemu_savevm_state_iterate(s-file); } else { -int ret; +int ret = 0; DPRINTF(done iterating\n); qemu_mutex_lock_iothread(); @@ -590,7 +591,10 @@ static void *migration_thread(void *opaque) qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); old_vm_running = runstate_is_running(); -ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); +if (!runstate_needs_reset()) { +ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); +} + if (ret = 0) { qemu_file_set_rate_limit(s-file, INT_MAX); qemu_savevm_state_complete(s-file); -- 1.7.7.6
[Qemu-devel] [PATCH 09/17] save_page: replace block_offset with a MemoryRegion
This patch exports MemoryRegion to save_page hook, replacing argument ram_addr_t block_offset with a MemoryRegion suggested by Paolo Bonzini. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- arch_init.c |4 ++-- include/migration/migration.h |2 +- include/migration/qemu-file.h |8 migration-rdma.c |4 ++-- savevm.c |8 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch_init.c b/arch_init.c index e0acbc5..daaa519 100644 --- a/arch_init.c +++ b/arch_init.c @@ -485,8 +485,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage) /* In doubt sent page as normal */ bytes_sent = -1; -ret = ram_control_save_page(f, block-offset, - offset, TARGET_PAGE_SIZE, bytes_sent); +ret = ram_control_save_page(f, mr, offset, TARGET_PAGE_SIZE, +bytes_sent); if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { diff --git a/include/migration/migration.h b/include/migration/migration.h index 7e5d01a..ca852a8 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -161,7 +161,7 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags); #define RAM_SAVE_CONTROL_NOT_SUPP -1000 #define RAM_SAVE_CONTROL_DELAYED -2000 -size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, +size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, size_t size, int *bytes_sent); diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index f9b104a..6646e89 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -77,10 +77,10 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); * is saved (such as RDMA, for example.) 
*/ typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque, - ram_addr_t block_offset, - ram_addr_t offset, - size_t size, - int *bytes_sent); + MemoryRegion *mr, + ram_addr_t offset, + size_t size, + int *bytes_sent); typedef struct QEMUFileOps { QEMUFilePutBufferFunc *put_buffer; diff --git a/migration-rdma.c b/migration-rdma.c index f94f3b4..ae04de4 100644 --- a/migration-rdma.c +++ b/migration-rdma.c @@ -2699,7 +2699,7 @@ static int qemu_rdma_close(void *opaque) * the protocol because most transfers are sent asynchronously. */ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, - ram_addr_t block_offset, ram_addr_t offset, + MemoryRegion *mr, ram_addr_t offset, size_t size, int *bytes_sent) { QEMUFileRDMA *rfile = opaque; @@ -2716,7 +2716,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, * is full, or the page doen't belong to the current chunk, * an actual RDMA write will occur and a new chunk will be formed. */ -ret = qemu_rdma_write(f, rdma, block_offset, offset, size); +ret = qemu_rdma_write(f, rdma, mr-ram_addr, offset, size); if (ret 0) { fprintf(stderr, rdma migration: write error! %d\n, ret); goto err; diff --git a/savevm.c b/savevm.c index 2f631d4..3ee256e 100644 --- a/savevm.c +++ b/savevm.c @@ -661,12 +661,12 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags) } } -size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, - ram_addr_t offset, size_t size, int *bytes_sent) +size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, + size_t size, int *bytes_sent) { if (f-ops-save_page) { -int ret = f-ops-save_page(f, f-opaque, block_offset, -offset, size, bytes_sent); +int ret = f-ops-save_page(f, f-opaque, mr, offset, +size, bytes_sent); if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_sent *bytes_sent 0) { -- 1.7.7.6
[Qemu-devel] [PATCH 17/17] hmp: better format for info migrate_capabilities
As there might be more capabilities introduced, better to display it in lines. Reviewed-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- hmp.c |5 ++--- 1 files changed, 2 insertions(+), 3 deletions(-) diff --git a/hmp.c b/hmp.c index 32ee285..dcfa2f9 100644 --- a/hmp.c +++ b/hmp.c @@ -226,13 +226,12 @@ void hmp_info_migrate_capabilities(Monitor *mon, const QDict *qdict) caps = qmp_query_migrate_capabilities(NULL); if (caps) { -monitor_printf(mon, capabilities: ); +monitor_printf(mon, Capabilities:\n); for (cap = caps; cap; cap = cap-next) { -monitor_printf(mon, %s: %s , +monitor_printf(mon, %s: %s\n, MigrationCapability_lookup[cap-value-capability], cap-value-state ? on : off); } -monitor_printf(mon, \n); } qapi_free_MigrationCapabilityStatusList(caps); -- 1.7.7.6
[Qemu-devel] [PATCH 04/17] migration-local: add QEMUFileLocal with socket based QEMUFile
This patch adds QEMUFileLocal with copy of socket based QEMUFile, will be used as the basis code for Unix socket protocol migration and page flipping migration. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- Makefile.target |1 + migration-local.c | 121 + 2 files changed, 122 insertions(+), 0 deletions(-) create mode 100644 migration-local.c diff --git a/Makefile.target b/Makefile.target index af6ac7e..aa09960 100644 --- a/Makefile.target +++ b/Makefile.target @@ -117,6 +117,7 @@ obj-$(CONFIG_KVM) += kvm-all.o obj-y += memory.o savevm.o cputlb.o obj-y += memory_mapping.o obj-y += dump.o +obj-y += migration-local.o LIBS+=$(libs_softmmu) # xen support diff --git a/migration-local.c b/migration-local.c new file mode 100644 index 000..8b9e10e --- /dev/null +++ b/migration-local.c @@ -0,0 +1,121 @@ +/* + * QEMU localhost migration with page flipping + * + * Copyright IBM, Corp. 2013 + * + * Authors: + * Lei Li li...@linux.vnet.ibm.com + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include config-host.h +#include qemu-common.h +#include migration/migration.h +#include exec/cpu-common.h +#include config.h +#include exec/cpu-all.h +#include exec/memory.h +#include exec/memory-internal.h +#include monitor/monitor.h +#include migration/qemu-file.h +#include qemu/iov.h +#include sysemu/arch_init.h +#include sysemu/sysemu.h +#include block/block.h +#include qemu/sockets.h +#include migration/block.h +#include qemu/thread.h +#include qmp-commands.h +#include trace.h +#include qemu/osdep.h + +//#define DEBUG_MIGRATION_LOCAL + +#ifdef DEBUG_MIGRATION_LOCAL +#define DPRINTF(fmt, ...) \ +do { printf(migration-local: fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) 
\ +do { } while (0) +#endif + + +typedef struct QEMUFileLocal { +QEMUFile *file; +int sockfd; +int pipefd[2]; +bool unix_page_flipping; +} QEMUFileLocal; + +static int qemu_local_get_sockfd(void *opaque) +{ +QEMUFileLocal *s = opaque; + +return s-sockfd; +} + +static int qemu_local_get_buffer(void *opaque, uint8_t *buf, + int64_t pos, int size) +{ +QEMUFileLocal *s = opaque; +ssize_t len; + +for (;;) { +len = qemu_recv(s-sockfd, buf, size, 0); +if (len != -1) { +break; +} + +if (socket_error() == EAGAIN) { +yield_until_fd_readable(s-sockfd); +} else if (socket_error() != EINTR) { +break; +} +} + +if (len == -1) { +len = -socket_error(); +} + +return len; +} + +static ssize_t qemu_local_writev_buffer(void *opaque, struct iovec *iov, +int iovcnt, int64_t pos) +{ +QEMUFileLocal *s = opaque; +ssize_t len; +ssize_t size = iov_size(iov, iovcnt); + +len = iov_send(s-sockfd, iov, iovcnt, 0, size); +if (len size) { +len = -socket_error(); +} + +return len; +} + +static int qemu_local_close(void *opaque) +{ +QEMUFileLocal *s = opaque; + +closesocket(s-sockfd); +g_free(s); + +return 0; +} + +static const QEMUFileOps pipe_read_ops = { +.get_fd= qemu_local_get_sockfd, +.get_buffer= qemu_local_get_buffer, +.close = qemu_local_close, +}; + +static const QEMUFileOps pipe_write_ops = { +.get_fd = qemu_local_get_sockfd, +.writev_buffer = qemu_local_writev_buffer, +.close = qemu_local_close, +}; -- 1.7.7.6
[Qemu-devel] [PATCH 0/17 v3] Localhost migration with side channel for ram
This patch series tries to introduce a mechanism using side channel pipe for RAM via SCM_RIGHTS with unix domain socket protocol migration. This side channel is used for the page flipping by vmsplice, which is the internal mechanism for localhost migration that we are trying to add to QEMU. The background info and previous patch series for reference, Localhost migration http://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg02916.html migration: Introduce side channel for RAM http://lists.gnu.org/archive/html/qemu-devel/2013-09/msg04043.html I have picked patches from the localhost migration series and rebased it on the series of side channel, now it is a complete series that passed the basic test. Please let me know if there is anything that needs to be fixed or improved. Your suggestions and comments are very welcome, and thanks to Paolo for his continued review and useful suggestions. Changes since V2: Address comments from Paolo including: - Doc improvement for QAPI. - Use callback get_buffer as the only one receiver. - Rename the new RunState flipping-migrate to memory-stale, and add transition from 'prelaunch' to 'memory-stale'. - Other minor fixes. Changes since V1: Address suggestions from Paolo Bonzini including: - Use Unix socket QEMUFile as basis of code and adjust the way of overriding RDMA hooks. - Involve the vmsplice for page flipping. - Add new RunState RUN_STATE_FLIPPING_MIGRATE and add it to runstate_needs_reset() for the adjustment of the current migration process with page flipping.
Lei Li (17): QAPI: introduce migration capability unix_page_flipping migration: add migrate_unix_page_flipping() qmp-commands.hx: add missing docs for migration capabilities migration-local: add QEMUFileLocal with socket based QEMUFile migration-local: introduce qemu_fopen_socket_local() migration-local: add send_pipefd() migration-local: override before_ram_iterate to send pipefd add unix_msgfd_lookup() to callback get_buffer save_page: replace block_offset with a MemoryRegion migration-local: override save_page for page transmit savevm: adjust ram_control_save_page with page flipping migration-local: override hook_ram_load migration-unix: replace qemu_fopen_socket with qemu_fopen_socket_local add new RunState RUN_STATE_MEMORY_STALE migration-unix: page flipping support on unix outgoing migration: adjust migration_thread() process for unix_page_flipping hmp: better format for info migrate_capabilities Makefile.target | 1 + arch_init.c | 4 +- migration-local.c | 512 ++ hmp.c | 5 +- include/migration/migration.h | 3 + include/migration/qemu-file.h | 2 + migration-unix.c | 27 ++- migration-rdma.c | 4 +- migration.c | 18 +- qapi-schema.json | 18 +- qmp-commands.hx | 8 + savevm.c | 21 +- vl.c | 12 +- 13 files changed, 617 insertions(+), 27 deletions(-) create mode 100644 migration-local.c
[Qemu-devel] [PATCH 14/17] add new RunState RUN_STATE_MEMORY_STALE
Introduce new RanState RAN_STATE_MEMORY_STALE and add it to runstate_needs_reset(). Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qapi-schema.json |7 +-- vl.c | 12 +++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/qapi-schema.json b/qapi-schema.json index b290a0f..640a380 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -176,12 +176,15 @@ # @watchdog: the watchdog action is configured to pause and has been triggered # # @guest-panicked: guest has been panicked as a result of guest OS panic +# +# @memory-stale: guest is paused to transmit memory, the destination guest +# will has the newer contents of it. ## { 'enum': 'RunState', 'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused', 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm', -'running', 'save-vm', 'shutdown', 'suspended', 'watchdog', -'guest-panicked' ] } +'running', 'save-vm', 'shutdown', 'suspended', 'memory-stale', +'watchdog', 'guest-panicked' ] } ## # @SnapshotInfo diff --git a/vl.c b/vl.c index 8d5d874..0f38405 100644 --- a/vl.c +++ b/vl.c @@ -601,6 +601,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PAUSED, RUN_STATE_RUNNING }, { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_PAUSED, RUN_STATE_MEMORY_STALE }, { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE }, @@ -608,6 +609,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING }, { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE }, { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE }, +{ RUN_STATE_PRELAUNCH, RUN_STATE_MEMORY_STALE }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE }, @@ -624,23 +626,30 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN }, { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG }, { RUN_STATE_RUNNING, RUN_STATE_GUEST_PANICKED }, +{ 
RUN_STATE_RUNNING, RUN_STATE_MEMORY_STALE }, { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING }, { RUN_STATE_SHUTDOWN, RUN_STATE_PAUSED }, { RUN_STATE_SHUTDOWN, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_SHUTDOWN, RUN_STATE_MEMORY_STALE }, { RUN_STATE_DEBUG, RUN_STATE_SUSPENDED }, { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED }, { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING }, { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_SUSPENDED, RUN_STATE_MEMORY_STALE }, { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING }, { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_WATCHDOG, RUN_STATE_MEMORY_STALE }, { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING }, { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_GUEST_PANICKED, RUN_STATE_MEMORY_STALE }, +{ RUN_STATE_MEMORY_STALE, RUN_STATE_RUNNING }, +{ RUN_STATE_MEMORY_STALE, RUN_STATE_POSTMIGRATE }, { RUN_STATE_MAX, RUN_STATE_MAX }, }; @@ -685,7 +694,8 @@ int runstate_is_running(void) bool runstate_needs_reset(void) { return runstate_check(RUN_STATE_INTERNAL_ERROR) || -runstate_check(RUN_STATE_SHUTDOWN); +runstate_check(RUN_STATE_SHUTDOWN) || +runstate_check(RUN_STATE_MEMORY_STALE); } StatusInfo *qmp_query_status(Error **errp) -- 1.7.7.6
[Qemu-devel] [PATCH 10/17] migration-local: override save_page for page transmit
This patch implements save_page callback for the outside of page flipping. It will write the address of the page on the Unix socket and flip the page data on pipe by vmsplice(). Every page address would have a header flag RAM_SAVE_FLAG_HOOK, which will trigger the load hook to receive it in incoming side as well. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 54 + 1 files changed, 54 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 0f0896b..14207e9 100644 --- a/migration-local.c +++ b/migration-local.c @@ -200,6 +200,59 @@ static int qemu_local_send_pipefd(QEMUFile *f, void *opaque, return 0; } +static size_t qemu_local_save_ram(QEMUFile *f, void *opaque, + MemoryRegion *mr, ram_addr_t offset, + size_t size, int *bytes_sent) +{ +QEMUFileLocal *s = opaque; +ram_addr_t current_addr = mr-ram_addr + offset; +void *ram_addr; +ssize_t ret; + +if (s-unix_page_flipping) { +qemu_fflush(s-file); +qemu_put_be64(s-file, RAM_SAVE_FLAG_HOOK); + +/* Write page address to unix socket */ +qemu_put_be64(s-file, current_addr); + +ram_addr = memory_region_get_ram_ptr(mr) + offset; + +/* vmsplice page data to pipe */ +struct iovec iov = { +.iov_base = ram_addr, +.iov_len = size, +}; + +/* + * The flag SPLICE_F_MOVE is introduced in kernel for the page + * flipping feature in QEMU, which will movie pages rather than + * copying, previously unused. + * + * If a move is not possible the kernel will transparently falls + * back to copying data. + * + * For older kernels the SPLICE_F_MOVE would be ignored and a copy + * would occur. 
+ */ +ret = vmsplice(s-pipefd[1], iov, 1, SPLICE_F_GIFT | SPLICE_F_MOVE); +if (ret == -1) { +if (errno != EAGAIN errno != EINTR) { +fprintf(stderr, vmsplice save error: %s\n, strerror(errno)); +return ret; +} +} else { +if (bytes_sent) { +*bytes_sent = 1; +} +DPRINTF(block_offset: %lu, offset: %lu\n, block_offset, offset); +return 0; +} +} + +return RAM_SAVE_CONTROL_NOT_SUPP; +} + static const QEMUFileOps pipe_read_ops = { .get_fd= qemu_local_get_sockfd, .get_buffer= qemu_local_get_buffer, @@ -211,6 +264,7 @@ static const QEMUFileOps pipe_write_ops = { .writev_buffer = qemu_local_writev_buffer, .close = qemu_local_close, .before_ram_iterate = qemu_local_send_pipefd, +.save_page = qemu_local_save_ram }; QEMUFile *qemu_fopen_socket_local(int sockfd, const char *mode) -- 1.7.7.6
[Qemu-devel] [PATCH 12/17] migration-local: override hook_ram_load
Override hook_ram_load to receive the pipe file descriptor passed by source process and page address which will be extracted to vmsplice the page data from pipe. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- migration-local.c | 55 + 1 files changed, 55 insertions(+), 0 deletions(-) diff --git a/migration-local.c b/migration-local.c index 14207e9..8ac0af5 100644 --- a/migration-local.c +++ b/migration-local.c @@ -253,10 +253,65 @@ static size_t qemu_local_save_ram(QEMUFile *f, void *opaque, return RAM_SAVE_CONTROL_NOT_SUPP; } +static int qemu_local_ram_load(QEMUFile *f, void *opaque, + uint64_t flags) +{ +QEMUFileLocal *s = opaque; +ram_addr_t addr; +struct iovec iov; +ssize_t ret = -EINVAL; + +/* + * PIPE file descriptor will be received by another callback + * get_buffer. + */ +if (pipefd_passed) { +void *host; +/* + * Extract the page address from the 8-byte record and + * read the page data from the pipe. + */ +addr = qemu_get_be64(s-file); +host = qemu_get_ram_ptr(addr); + +iov.iov_base = host; +iov.iov_len = TARGET_PAGE_SIZE; + +/* The flag SPLICE_F_MOVE is introduced in kernel for the page + * flipping feature in QEMU, which will movie pages rather than + * copying, previously unused. + * + * If a move is not possible the kernel will transparently falls + * back to copying data. + * + * For older kernels the SPLICE_F_MOVE would be ignored and a copy + * would occur. 
+ */ +ret = vmsplice(s-pipefd[0], iov, 1, SPLICE_F_MOVE); +if (ret == -1) { +if (errno != EAGAIN errno != EINTR) { +fprintf(stderr, vmsplice() load error: %s, strerror(errno)); +return ret; +} +DPRINTF(vmsplice load error\n); +} else if (ret == 0) { +DPRINTF(stderr, load_page: zero read\n); +} + +DPRINTF(vmsplice (read): %zu\n, ret); +return ret; +} + +return 0; +} + + + static const QEMUFileOps pipe_read_ops = { .get_fd= qemu_local_get_sockfd, .get_buffer= qemu_local_get_buffer, .close = qemu_local_close, +.hook_ram_load = qemu_local_ram_load }; static const QEMUFileOps pipe_write_ops = { -- 1.7.7.6
[Qemu-devel] [PATCH resend] save_page: replace block_offset with a MemoryRegion
This patch exports MemoryRegion to save_page hook, replacing argument ram_addr_t block_offset with a MemoryRegion suggested by Paolo Bonzini. Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- arch_init.c |4 ++-- include/migration/migration.h |2 +- include/migration/qemu-file.h |8 migration-rdma.c |4 ++-- savevm.c |8 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch_init.c b/arch_init.c index 7545d96..a9b97be 100644 --- a/arch_init.c +++ b/arch_init.c @@ -485,8 +485,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage) /* In doubt sent page as normal */ bytes_sent = -1; -ret = ram_control_save_page(f, block-offset, - offset, TARGET_PAGE_SIZE, bytes_sent); +ret = ram_control_save_page(f, mr, offset, TARGET_PAGE_SIZE, +bytes_sent); if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { diff --git a/include/migration/migration.h b/include/migration/migration.h index 7e5d01a..ca852a8 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -161,7 +161,7 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags); #define RAM_SAVE_CONTROL_NOT_SUPP -1000 #define RAM_SAVE_CONTROL_DELAYED -2000 -size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, +size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, size_t size, int *bytes_sent); diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 0f757fb..d73dc4b 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -77,10 +77,10 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); * is saved (such as RDMA, for example.) 
*/ typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque, - ram_addr_t block_offset, - ram_addr_t offset, - size_t size, - int *bytes_sent); + MemoryRegion *mr, + ram_addr_t offset, + size_t size, + int *bytes_sent); typedef struct QEMUFileOps { QEMUFilePutBufferFunc *put_buffer; diff --git a/migration-rdma.c b/migration-rdma.c index f94f3b4..ae04de4 100644 --- a/migration-rdma.c +++ b/migration-rdma.c @@ -2699,7 +2699,7 @@ static int qemu_rdma_close(void *opaque) * the protocol because most transfers are sent asynchronously. */ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, - ram_addr_t block_offset, ram_addr_t offset, + MemoryRegion *mr, ram_addr_t offset, size_t size, int *bytes_sent) { QEMUFileRDMA *rfile = opaque; @@ -2716,7 +2716,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, * is full, or the page doen't belong to the current chunk, * an actual RDMA write will occur and a new chunk will be formed. */ -ret = qemu_rdma_write(f, rdma, block_offset, offset, size); +ret = qemu_rdma_write(f, rdma, mr-ram_addr, offset, size); if (ret 0) { fprintf(stderr, rdma migration: write error! %d\n, ret); goto err; diff --git a/savevm.c b/savevm.c index 2f631d4..3ee256e 100644 --- a/savevm.c +++ b/savevm.c @@ -661,12 +661,12 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags) } } -size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, - ram_addr_t offset, size_t size, int *bytes_sent) +size_t ram_control_save_page(QEMUFile *f, MemoryRegion *mr, ram_addr_t offset, + size_t size, int *bytes_sent) { if (f-ops-save_page) { -int ret = f-ops-save_page(f, f-opaque, block_offset, -offset, size, bytes_sent); +int ret = f-ops-save_page(f, f-opaque, mr, offset, +size, bytes_sent); if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_sent *bytes_sent 0) { -- 1.7.7.6
Re: [Qemu-devel] [PATCH resend] sdl: Reverse support for video mode setting
Ping^2 On 10/24/2013 08:21 PM, Lei Li wrote: This patch has been confirmed by the reporter himself as link below, https://bugs.launchpad.net/qemu/+bug/1216368 It has been on the mailing list for a while, could it be merged? PING... On 09/04/2013 05:07 PM, Lei Li wrote: Currently, If the setting of video mode failed, qemu will exit. It should go back to the previous setting if the new screen resolution failed. This patch fixes LP#1216368, add support to revert to existing surface for the failure of video mode setting. Reported-by: Sascha Krissler sas...@srlabs.de Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- ui/sdl.c | 23 +++ 1 files changed, 19 insertions(+), 4 deletions(-) diff --git a/ui/sdl.c b/ui/sdl.c index 39a42d6..9d8583c 100644 --- a/ui/sdl.c +++ b/ui/sdl.c @@ -86,6 +86,7 @@ static void sdl_update(DisplayChangeListener *dcl, static void do_sdl_resize(int width, int height, int bpp) { int flags; +SDL_Surface *tmp_screen; //printf(resizing to %d %d\n, w, h); @@ -98,12 +99,26 @@ static void do_sdl_resize(int width, int height, int bpp) if (gui_noframe) flags |= SDL_NOFRAME; -real_screen = SDL_SetVideoMode(width, height, bpp, flags); +tmp_screen = SDL_SetVideoMode(width, height, bpp, flags); if (!real_screen) { -fprintf(stderr, Could not open SDL display (%dx%dx%d): %s\n, width, -height, bpp, SDL_GetError()); -exit(1); +if (!tmp_screen) { +fprintf(stderr, Could not open SDL display (%dx%dx%d): %s\n, +width, height, bpp, SDL_GetError()); +exit(1); +} +} else { +/* + * Revert to the previous video mode if the change of resizing or + * resolution failed. + */ +if (!tmp_screen) { +fprintf(stderr, Failed to set SDL display (%dx%dx%d): %s\n, +width, height, bpp, SDL_GetError()); +return; +} } + +real_screen = tmp_screen; } static void sdl_switch(DisplayChangeListener *dcl, -- Lei
Re: [Qemu-devel] [PATCH 09/17] migration-local: override before_ram_iterate to send pipefd
On 10/25/2013 03:23 PM, Paolo Bonzini wrote: Il 25/10/2013 05:38, Lei Li ha scritto: Just want to confirm, normally, should I take these 'otherwise looks good/ok' as a 'Reviewed-by' from you if the other comment is fixed in the updated version? Depends on how much the patch changes... right now I'm still expecting some changes so I didn't really look much at the patch and didn't test it. I prefer to take a more complete look at v3 before giving a formal Reviewed-by. I see, thanks for your explanation. Paolo -- Lei
Re: [Qemu-devel] [PATCH 14/17] add new RanState RAN_STATE_FLIPPING_MIGRATE
On 10/25/2013 03:31 PM, Paolo Bonzini wrote: Il 25/10/2013 05:30, Lei Li ha scritto: I am not sure about the name; for one thing, the new state would apply also to postcopy migration. About the name, how about 'live-upgrade'? OK, I'll add the transition between postcopy and this new state. Note I didn't mean postmigrate. For a description of postcopy, see my answer to the cover letter (patch Yes, I've realized that I misunderstood it... 0). The new state means somebody else has newer contents of the memory. Perhaps stale? And should it also apply from 'prelaunch' to 'flipping-migrate' too? Yes, it should. Good catch! Paolo -- Lei
Re: [Qemu-devel] [PATCH 0/17 v2] Localhost migration with side channel for ram
On 10/25/2013 03:30 PM, Paolo Bonzini wrote: Il 25/10/2013 06:58, Lei Li ha scritto: Right now I just have inaccurate numbers without the new vmsplice, based on the result from info migrate: as the guest ram size increases, the 'total time' is several times less compared with the current live migration, but the 'downtime' performs badly. Of course. For a 1GB ram guest, total time: 702 milliseconds downtime: 692 milliseconds And when the ram size of guest increases exponentially, those numbers are proportional to it. I will make a list of the performance with the new vmsplice later, I am sure it'd be much better than this at least. Yes, please. Is the memory usage still 2x without vmsplice? I think you have a nice proof of concept, but on the other hand this probably needs to be coupled with some kind of postcopy live migration, that is: * the source starts sending data * but the destination starts running immediately * if the machine needs a page that is missing, the destination asks the source to send it * as soon as it arrives, the destination can restart Using postcopy is problematic for reliability: if the destination fails, the virtual machine is lost because the source doesn't have the latest content of memory. However, this is a much, much smaller problem for live QEMU upgrade where the network cannot fail. If you do this, you can achieve pretty much instantaneous live upgrade, well within your original 200 ms goals. But the flipping code with vmsplice should be needed anyway to avoid doubling memory usage, and Yes, I have read the postcopy migration patches, it does perform very well on downtime, as it just sends the vmstates and then switches the execution to the destination host. And as you pointed out, it can not avoid doubling memory usage. The numbers listed above are based on the old vmsplice as I have not yet worked on the benchmark for performance, it actually copies data rather than moving it.
As the feedback for this version is positive, now I am trying to get a real result out with the new vmsplice. BTW, the kernel side is looking at a huge page solution to improve performance. The recent kernel patches are at this link: http://article.gmane.org/gmane.linux.kernel/1574277 it's looking pretty good in this version already! I'm relieved that the RDMA code was designed right! I am happy with it too. :) Those RDMA hooks really make things more flexible! Paolo -- Lei
[Qemu-devel] [PATCH 0/3 for 1.7] migration: introduce page flipping capability
This series is extracted from the latest localhost migration with side channel for ram patch set, with the comments from Paolo addressed. It is sent separately according to his suggestion. Localhost migration with side channel for ram: http://lists.gnu.org/archive/html/qemu-devel/2013-10/msg02787.html Lei Li (3): QAPI: introduce magration capability unix_page_flipping migration: add migrate_unix_page_flipping() qmp-command.hx: add missing docs for migration capabilites
[Qemu-devel] [PATCH 3/3] qmp-command.hx: add missing docs for migration capabilites
Signed-off-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qmp-commands.hx |8 1 files changed, 8 insertions(+), 0 deletions(-) diff --git a/qmp-commands.hx b/qmp-commands.hx index fba15cd..dcec433 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -2898,6 +2898,10 @@ migrate-set-capabilities Enable/Disable migration capabilities - xbzrle: XBZRLE support +- x-rdma-pin-all: Pin all pages during RDMA support +- zero-blocks: Compress zero blocks during block migration +- auto-converge: Block VCPU to help convergence of migration +- unix-page-flipping: Page flipping for live QEMU upgrade Arguments: @@ -2922,6 +2926,10 @@ Query current migration capabilities - capabilities: migration capabilities state - xbzrle : XBZRLE state (json-bool) + - x-rdma-pin-all: RDMA state (json-bool) + - zero-blocks: zero-blocks state (json-bool) + - auto-converge: Auto converge state (json-bool) + - unix-page-flipping: Page flipping state (json-bool) Arguments: -- 1.7.7.6
[Qemu-devel] [PATCH 1/3] QAPI: introduce magration capability unix_page_flipping
Introduce unix_page_flipping to MigrationCapability for localhost migration. Signed-off-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- qapi-schema.json | 10 +- 1 files changed, 9 insertions(+), 1 deletions(-) diff --git a/qapi-schema.json b/qapi-schema.json index 60f3fd1..7cb88af 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -661,10 +661,18 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # +# @unix-page-flipping: If enabled, QEMU can optimize migration when the +# destination is a QEMU process that runs on the same host as +# the source (as is the case for live upgrade). If the migration +# transport is a Unix socket, QEMU will flip RAM pages directly to +# the destination, so that memory is only allocated twice for the +# source and destination processes. Disabled by default. (since 1.8) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] } + 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks', + 'unix-page-flipping'] } ## # @MigrationCapabilityStatus -- 1.7.7.6
[Qemu-devel] [PATCH 2/3] migration: add migrate_unix_page_flipping()
Add migrate_unix_page_flipping() to check if MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING is enabled. Reviewed-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- include/migration/migration.h |3 +++ migration.c |9 + 2 files changed, 12 insertions(+), 0 deletions(-) diff --git a/include/migration/migration.h b/include/migration/migration.h index 140e6b4..7e5d01a 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -131,10 +131,13 @@ void migrate_add_blocker(Error *reason); void migrate_del_blocker(Error *reason); bool migrate_rdma_pin_all(void); + bool migrate_zero_blocks(void); bool migrate_auto_converge(void); +bool migrate_unix_page_flipping(void); + int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, uint8_t *dst, int dlen); int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen); diff --git a/migration.c b/migration.c index 2b1ab20..4ac466b 100644 --- a/migration.c +++ b/migration.c @@ -541,6 +541,15 @@ int64_t migrate_xbzrle_cache_size(void) return s-xbzrle_cache_size; } +bool migrate_unix_page_flipping(void) +{ +MigrationState *s; + +s = migrate_get_current(); + +return s-enabled_capabilities[MIGRATION_CAPABILITY_UNIX_PAGE_FLIPPING]; +} + /* migration thread support */ static void *migration_thread(void *opaque) -- 1.7.7.6
Re: [Qemu-devel] [PATCH resend] sdl: Reverse support for video mode setting
This patch has been confirmed by the reporter himself as link below, https://bugs.launchpad.net/qemu/+bug/1216368 It has been on the mailing list for a while, could it be merged? PING... On 09/04/2013 05:07 PM, Lei Li wrote: Currently, If the setting of video mode failed, qemu will exit. It should go back to the previous setting if the new screen resolution failed. This patch fixes LP#1216368, add support to revert to existing surface for the failure of video mode setting. Reported-by: Sascha Krissler sas...@srlabs.de Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- ui/sdl.c | 23 +++ 1 files changed, 19 insertions(+), 4 deletions(-) diff --git a/ui/sdl.c b/ui/sdl.c index 39a42d6..9d8583c 100644 --- a/ui/sdl.c +++ b/ui/sdl.c @@ -86,6 +86,7 @@ static void sdl_update(DisplayChangeListener *dcl, static void do_sdl_resize(int width, int height, int bpp) { int flags; +SDL_Surface *tmp_screen; //printf(resizing to %d %d\n, w, h); @@ -98,12 +99,26 @@ static void do_sdl_resize(int width, int height, int bpp) if (gui_noframe) flags |= SDL_NOFRAME; -real_screen = SDL_SetVideoMode(width, height, bpp, flags); +tmp_screen = SDL_SetVideoMode(width, height, bpp, flags); if (!real_screen) { - fprintf(stderr, Could not open SDL display (%dx%dx%d): %s\n, width, - height, bpp, SDL_GetError()); -exit(1); +if (!tmp_screen) { +fprintf(stderr, Could not open SDL display (%dx%dx%d): %s\n, +width, height, bpp, SDL_GetError()); +exit(1); +} +} else { +/* + * Revert to the previous video mode if the change of resizing or + * resolution failed. + */ +if (!tmp_screen) { +fprintf(stderr, Failed to set SDL display (%dx%dx%d): %s\n, +width, height, bpp, SDL_GetError()); +return; +} } + +real_screen = tmp_screen; } static void sdl_switch(DisplayChangeListener *dcl, -- Lei
Re: [Qemu-devel] [PATCH 01/17] rename is_active to is_block_active
On 10/24/2013 09:46 PM, Paolo Bonzini wrote: Il 22/10/2013 04:25, Lei Li ha scritto: is_active is used to identify block migration, rename to is_block_active to make it more clear. No, is_active is used to identify whether a set of SaveVMHandlers is active. The default is true, so only block migration is using it. But we could use it in the future for other features (probably using migration capabilities instead of a flag as is the case for block). It updates my knowledge. Thanks for your clarifying! Paolo Signed-off-by: Lei Li li...@linux.vnet.ibm.com --- block-migration.c |2 +- include/migration/vmstate.h |2 +- savevm.c| 16 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/block-migration.c b/block-migration.c index daf9ec1..b637695 100644 --- a/block-migration.c +++ b/block-migration.c @@ -834,7 +834,7 @@ SaveVMHandlers savevm_block_handlers = { .save_live_pending = block_save_pending, .load_state = block_load, .cancel = block_migration_cancel, -.is_active = block_is_active, +.is_block_active = block_is_active, }; void blk_mig_init(void) diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 9d09e60..c634d65 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -42,7 +42,7 @@ typedef struct SaveVMHandlers { int (*save_live_complete)(QEMUFile *f, void *opaque); /* This runs both outside and inside the iothread lock. */ -bool (*is_active)(void *opaque); +bool (*is_block_active)(void *opaque); /* This runs outside the iothread lock in the migration case, and * within the lock in the savevm case. 
The callback had better only diff --git a/savevm.c b/savevm.c index 2f631d4..56b8643 100644 --- a/savevm.c +++ b/savevm.c @@ -1867,8 +1867,8 @@ void qemu_savevm_state_begin(QEMUFile *f, if (!se-ops || !se-ops-save_live_setup) { continue; } -if (se-ops se-ops-is_active) { -if (!se-ops-is_active(se-opaque)) { +if (se-ops se-ops-is_block_active) { +if (!se-ops-is_block_active(se-opaque)) { continue; } } @@ -1907,8 +1907,8 @@ int qemu_savevm_state_iterate(QEMUFile *f) if (!se-ops || !se-ops-save_live_iterate) { continue; } -if (se-ops se-ops-is_active) { -if (!se-ops-is_active(se-opaque)) { +if (se-ops se-ops-is_block_active) { +if (!se-ops-is_block_active(se-opaque)) { continue; } } @@ -1948,8 +1948,8 @@ void qemu_savevm_state_complete(QEMUFile *f) if (!se-ops || !se-ops-save_live_complete) { continue; } -if (se-ops se-ops-is_active) { -if (!se-ops-is_active(se-opaque)) { +if (se-ops se-ops-is_block_active) { +if (!se-ops-is_block_active(se-opaque)) { continue; } } @@ -2002,8 +2002,8 @@ uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size) if (!se-ops || !se-ops-save_live_pending) { continue; } -if (se-ops se-ops-is_active) { -if (!se-ops-is_active(se-opaque)) { +if (se-ops se-ops-is_block_active) { +if (!se-ops-is_block_active(se-opaque)) { continue; } } -- Lei