From: "Michael R. Hines" <mrhi...@us.ibm.com>
Signed-off-by: Michael R. Hines <mrhi...@us.ibm.com>
---
arch_init.c | 116 +++++++++++++++++++++++++++++++++++++++--
include/migration/qemu-file.h | 1 +
savevm.c | 90 +++++++++++++++++++++++++++-----
3 files changed, 189 insertions(+), 18 deletions(-)
diff --git a/arch_init.c b/arch_init.c
index dada6de..7633fa6 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -42,6 +42,7 @@
#include "migration/migration.h"
#include "exec/gdbstub.h"
#include "hw/smbios.h"
+#include "qemu/rdma.h"
#include "exec/address-spaces.h"
#include "hw/pcspk.h"
#include "migration/page_cache.h"
@@ -113,6 +114,7 @@ const uint32_t arch_type = QEMU_ARCH;
#define RAM_SAVE_FLAG_EOS 0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE 0x40
+#define RAM_SAVE_FLAG_RDMA 0x80
#ifdef __ALTIVEC__
#include <altivec.h>
@@ -434,6 +436,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
int bytes_sent = 0;
MemoryRegion *mr;
ram_addr_t current_addr;
+ static int not_sent = 1;
if (!block)
block = QTAILQ_FIRST(&ram_list.blocks);
@@ -457,23 +460,75 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
int cont = (block == last_sent_block) ?
RAM_SAVE_FLAG_CONTINUE : 0;
+ current_addr = block->offset + offset;
p = memory_region_get_ram_ptr(mr) + offset;
/* In doubt sent page as normal */
bytes_sent = -1;
- if (is_dup_page(p)) {
+
+ /*
+ * RFC RDMA: The empirical cost of searching for zero pages here
+ * plus the cost of communicating with the other side
+ * seems to take significantly more time than simply
+ * dumping the page into remote memory.
+ */
+ if (!qemu_rdma_migration_enabled() && is_dup_page(p)) {
acct_info.dup_pages++;
bytes_sent = save_block_hdr(f, block, offset, cont,
RAM_SAVE_FLAG_COMPRESS);
qemu_put_byte(f, *p);
bytes_sent += 1;
+ /*
+ * RFC RDMA: Same comment as above. time(run-length encoding)
+ * + time(communication) is too big. RDMA throughput tanks
+ * when this feature is enabled. But there's no need
+ * to change the code since the feature is optional.
+ */
} else if (migrate_use_xbzrle()) {
- current_addr = block->offset + offset;
bytes_sent = save_xbzrle_page(f, p, current_addr, block,
offset, cont, last_stage);
if (!last_stage) {
p = get_cached_data(XBZRLE.cache, current_addr);
}
+ } else if (qemu_rdma_migration_enabled()) {
+ int ret;
+
+ /*
+ * RFC RDMA: This bad hack was to cause the loop on the
+ * receiving side to break. Comments are welcome
+ * on how to get rid of it.
+ */
+ if (not_sent == 1) {
+ not_sent = 0;
+ bytes_sent = save_block_hdr(f, block, offset,
+ cont, RAM_SAVE_FLAG_RDMA);
+ }
+ acct_info.norm_pages++;
+ /*
+ * use RDMA to send page
+ */
+ if (qemu_rdma_migration_write(&rdma_mdata, current_addr,
+ TARGET_PAGE_SIZE)) {
+ fprintf(stderr, "rdma migration: write error!\n");
+ qemu_file_set_error(f, -EIO);
+ return 0;
+ }
+
+ /*
+ * do some polling
+ */
+ while (1) {
+ ret = qemu_rdma_migration_poll(&rdma_mdata);
+ if (ret == QEMU_RDMA_MIGRATION_WRID_NONE) {
+ break;
+ }
+ if (ret < 0) {
+ fprintf(stderr, "rdma migration: polling error!\n");
+ qemu_file_set_error(f, -EIO);
+ return 0;
+ }
+ }
+ bytes_sent += TARGET_PAGE_SIZE;
}
/* XBZRLE overflow or normal page */
@@ -601,12 +656,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
return 0;
}
+
+int tprate = 1000;
+
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
int ret;
int i;
- int64_t t0;
- int total_sent = 0;
+ int64_t t0, tp0;
+ int total_sent = 0, last_total_sent = 0;
qemu_mutex_lock_ramlist();
@@ -625,23 +683,55 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
break;
}
total_sent += bytes_sent;
+ last_total_sent += bytes_sent;
acct_info.iterations++;
/* we want to check in the 1st loop, just in case it was the 1st time
and we had to sync the dirty bitmap.
qemu_get_clock_ns() is a bit expensive, so we only check each some
iterations
*/
+
+ /*
+ * RFC RDMA: Can we have something like this to periodically print
+ * out throughput.
+ * This is just a rough-sketch that partially worked for me.
+ * I assume there a better way that everyone would prefer.
+ * Perhaps we could set a QMP command that toggled a "periodic printing"
+ * option that allowed more details to be printed on stdout.....?
+ */
if ((i & 63) == 0) {
- uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
+ uint64_t curr = qemu_get_clock_ns(rt_clock);
+ uint64_t t1 = (curr - t0) / 1000000;
+ double tp;
if (t1 > MAX_WAIT) {
DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
t1, i);
break;
}
+
+ if ((i % tprate) == 0) {
+ uint64_t tp1 = (curr - tp0) / 1000000;
+ tp = ((double) last_total_sent * 8.0 /
+ ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
+ printf("throughput: %f mbps\n", tp);
+ last_total_sent = 0;
+ tp0 = curr;
+ }
}
i++;
}
+ /* flush buffer write */
+ if (qemu_rdma_migration_enabled()) {
+ int resp;
+ resp = qemu_rdma_migration_write_flush(&rdma_mdata);
+ if (resp < 0) {
+ fprintf(stderr, "rdma migration: write flush error!\n");
+ qemu_file_set_error(f, -EIO);
+ return 0;
+ }
+ }
+
qemu_mutex_unlock_ramlist();
if (ret < 0) {
@@ -863,6 +953,22 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
ret = -EINVAL;
goto done;
}
+ } else if (flags & RAM_SAVE_FLAG_RDMA) {
+ /*
+ * RFC RDMA: This bad hack was to cause the loop to break.
+ * Comments are welcome on how to get rid of it.
+ * Communicating here is unnecessary because the
+ * RDMA page has already arrived.
+ * Comments are welcome on how to get rid of this.
+ */
+ if (!qemu_rdma_migration_enabled()) {
+ return -EINVAL;
+ }
+ void *host = host_from_stream_offset(f, addr, flags);
+ if (!host) {
+ return -EINVAL;
+ }
+ /* rdma page is already here, nothing to do */
}
error = qemu_file_get_error(f);
if (error) {
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 68deefb..7c9968e 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -112,6 +112,7 @@ int qemu_file_rate_limit(QEMUFile *f);
int64_t qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
int64_t qemu_file_get_rate_limit(QEMUFile *f);
int qemu_file_get_error(QEMUFile *f);
+void qemu_file_set_error(QEMUFile *f, int ret);
static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
{
diff --git a/savevm.c b/savevm.c
index 304d1ef..071196e 100644
--- a/savevm.c
+++ b/savevm.c
@@ -24,6 +24,7 @@
#include "config-host.h"
#include "qemu-common.h"
+#include "qemu/rdma.h"
#include "hw/hw.h"
#include "hw/qdev.h"
#include "net/net.h"
@@ -50,7 +51,7 @@
#define ARP_OP_REQUEST_REV 0x3
static int announce_self_create(uint8_t *buf,
- uint8_t *mac_addr)
+ uint8_t *mac_addr)
{
/* Ethernet header. */
memset(buf, 0xff, 6); /* destination MAC addr */
@@ -97,16 +98,16 @@ static void qemu_announce_self_once(void *opaque)
qemu_mod_timer(timer, qemu_get_clock_ms(rt_clock) +
50 + (SELF_ANNOUNCE_ROUNDS - count - 1) * 100);
} else {
- qemu_del_timer(timer);
- qemu_free_timer(timer);
+ qemu_del_timer(timer);
+ qemu_free_timer(timer);
}
}
void qemu_announce_self(void)
{
- static QEMUTimer *timer;
- timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer);
- qemu_announce_self_once(&timer);
+ static QEMUTimer *timer;
+ timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer);
+ qemu_announce_self_once(&timer);
}
/***********************************************************/
@@ -299,8 +300,8 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
QEMUFileStdio *s;
if (mode == NULL ||
- (mode[0] != 'r' && mode[0] != 'w') ||
- mode[1] != 'b' || mode[2] != 0) {
+ (mode[0] != 'r' && mode[0] != 'w') ||
+ mode[1] != 'b' || mode[2] != 0) {
fprintf(stderr, "qemu_fdopen: Argument validity check failed\n");
return NULL;
}
@@ -342,8 +343,8 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode)
QEMUFileStdio *s;
if (mode == NULL ||
- (mode[0] != 'r' && mode[0] != 'w') ||
- mode[1] != 'b' || mode[2] != 0) {
+ (mode[0] != 'r' && mode[0] != 'w') ||
+ mode[1] != 'b' || mode[2] != 0) {
fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
return NULL;
}
@@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f)
return f->last_error;
}
-static void qemu_file_set_error(QEMUFile *f, int ret)
+void qemu_file_set_error(QEMUFile *f, int ret)
{
if (f->last_error == 0) {
f->last_error = ret;
@@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f)
{
SaveStateEntry *se;
int ret = 1;
+ static int first_time = 1;
QTAILQ_FOREACH(se, &savevm_handlers, entry) {
if (!se->ops || !se->ops->save_live_iterate) {
@@ -1643,8 +1645,36 @@ int qemu_savevm_state_iterate(QEMUFile *f)
}
}
if (ret != 0) {
+#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC
+ /*
+ * We use two "sync" infiniband messages during migration.
+ * One at the beginning and one at the end, just to be thorough.
+ * This is the first one.
+ */
+ if (first_time && qemu_rdma_migration_enabled()) {
+ int r;
+ first_time = 0;
+ if (qemu_rdma_migration_post_send_sync(&rdma_mdata,
+ QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC)) {
+ fprintf(stderr,
+ "rdma migration: error posting extra send sync!\n");
+ return -EIO;
+ }
+
+ r = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
+ QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC);
+ if (r < 0) {
+ fprintf(stderr,
+ "rdma migration: qemu_savevm_state_iterate"
+ " sync polling error!\n");
+ return -EIO;
+ }
+ }
+#endif
+
return ret;
}
+
ret = qemu_file_get_error(f);
if (ret != 0) {
qemu_savevm_state_cancel();
@@ -1684,7 +1714,7 @@ int qemu_savevm_state_complete(QEMUFile *f)
int len;
if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
- continue;
+ continue;
}
trace_savevm_section_start();
/* Section type */
@@ -1703,8 +1733,32 @@ int qemu_savevm_state_complete(QEMUFile *f)
trace_savevm_section_end(se->section_id);
}
+ /*
+ * We use two "sync" infiniband messages during migration.
+ * One at the beginning and one at the end, just to be thorough.
+ * This is the second one.
+ */
+ if (qemu_rdma_migration_enabled()) {
+ if (qemu_rdma_migration_post_send_sync(&rdma_mdata,
+ QEMU_RDMA_MIGRATION_WRID_SEND_SYNC)) {
+ fprintf(stderr, "rdma migration: error posting send sync!\n");
+ return -EIO;
+ }
+ }
+
qemu_put_byte(f, QEMU_VM_EOF);
+ /* wait for RDMA sync message to complete */
+ if (qemu_rdma_migration_enabled()) {
+ int ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
+ QEMU_RDMA_MIGRATION_WRID_SEND_SYNC);
+ if (ret < 0) {
+ fprintf(stderr, "rdma migration: qemu_savevm_state_full"
+ " sync polling error!\n");
+ return -EIO;
+ }
+ }
+
return qemu_file_get_error(f);
}
@@ -2014,8 +2068,18 @@ int qemu_loadvm_state(QEMUFile *f)
cpu_synchronize_all_post_init();
- ret = 0;
+ /* wait for RDMA sync message */
+ if (qemu_rdma_migration_enabled()) {
+ ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
+ QEMU_RDMA_MIGRATION_WRID_RECV_SYNC);
+ if (ret < 0) {
+ fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header"
+ " sync polling error!\n");
+ goto out;
+ }
+ }
+ ret = 0;
out:
QLIST_FOREACH_SAFE(le, &loadvm_handlers, entry, new_le) {
QLIST_REMOVE(le, entry);