From: "Michael R. Hines" <mrhi...@us.ibm.com>
Signed-off-by: Michael R. Hines <mrhi...@us.ibm.com> --- arch_init.c | 116 +++++++++++++++++++++++++++++++++++++++-- include/migration/qemu-file.h | 1 + savevm.c | 90 +++++++++++++++++++++++++++----- 3 files changed, 189 insertions(+), 18 deletions(-) diff --git a/arch_init.c b/arch_init.c index dada6de..7633fa6 100644 --- a/arch_init.c +++ b/arch_init.c @@ -42,6 +42,7 @@ #include "migration/migration.h" #include "exec/gdbstub.h" #include "hw/smbios.h" +#include "qemu/rdma.h" #include "exec/address-spaces.h" #include "hw/pcspk.h" #include "migration/page_cache.h" @@ -113,6 +114,7 @@ const uint32_t arch_type = QEMU_ARCH; #define RAM_SAVE_FLAG_EOS 0x10 #define RAM_SAVE_FLAG_CONTINUE 0x20 #define RAM_SAVE_FLAG_XBZRLE 0x40 +#define RAM_SAVE_FLAG_RDMA 0x80 #ifdef __ALTIVEC__ #include <altivec.h> @@ -434,6 +436,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage) int bytes_sent = 0; MemoryRegion *mr; ram_addr_t current_addr; + static int not_sent = 1; if (!block) block = QTAILQ_FIRST(&ram_list.blocks); @@ -457,23 +460,75 @@ static int ram_save_block(QEMUFile *f, bool last_stage) int cont = (block == last_sent_block) ? RAM_SAVE_FLAG_CONTINUE : 0; + current_addr = block->offset + offset; p = memory_region_get_ram_ptr(mr) + offset; /* In doubt sent page as normal */ bytes_sent = -1; - if (is_dup_page(p)) { + + /* + * RFC RDMA: The empirical cost of searching for zero pages here + * plus the cost of communicating with the other side + * seems to take significantly more time than simply + * dumping the page into remote memory. + */ + if (!qemu_rdma_migration_enabled() && is_dup_page(p)) { acct_info.dup_pages++; bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_COMPRESS); qemu_put_byte(f, *p); bytes_sent += 1; + /* + * RFC RDMA: Same comment as above. time(run-length encoding) + * + time(communication) is too big. RDMA throughput tanks + * when this feature is enabled. But there's no need + * to change the code since the feature is optional. 
+ */ } else if (migrate_use_xbzrle()) { - current_addr = block->offset + offset; bytes_sent = save_xbzrle_page(f, p, current_addr, block, offset, cont, last_stage); if (!last_stage) { p = get_cached_data(XBZRLE.cache, current_addr); } + } else if (qemu_rdma_migration_enabled()) { + int ret; + + /* + * RFC RDMA: This bad hack was to cause the loop on the + * receiving side to break. Comments are welcome + * on how to get rid of it. + */ + if (not_sent == 1) { + not_sent = 0; + bytes_sent = save_block_hdr(f, block, offset, + cont, RAM_SAVE_FLAG_RDMA); + } + acct_info.norm_pages++; + /* + * use RDMA to send page + */ + if (qemu_rdma_migration_write(&rdma_mdata, current_addr, + TARGET_PAGE_SIZE)) { + fprintf(stderr, "rdma migration: write error!\n"); + qemu_file_set_error(f, -EIO); + return 0; + } + + /* + * do some polling + */ + while (1) { + ret = qemu_rdma_migration_poll(&rdma_mdata); + if (ret == QEMU_RDMA_MIGRATION_WRID_NONE) { + break; + } + if (ret < 0) { + fprintf(stderr, "rdma migration: polling error!\n"); + qemu_file_set_error(f, -EIO); + return 0; + } + } + bytes_sent += TARGET_PAGE_SIZE; } /* XBZRLE overflow or normal page */ @@ -601,12 +656,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque) return 0; } + +int tprate = 1000; + static int ram_save_iterate(QEMUFile *f, void *opaque) { int ret; int i; - int64_t t0; - int total_sent = 0; + int64_t t0, tp0; + int total_sent = 0, last_total_sent = 0; qemu_mutex_lock_ramlist(); @@ -625,23 +683,55 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) break; } total_sent += bytes_sent; + last_total_sent += bytes_sent; acct_info.iterations++; /* we want to check in the 1st loop, just in case it was the 1st time and we had to sync the dirty bitmap. qemu_get_clock_ns() is a bit expensive, so we only check each some iterations */ + + /* + * RFC RDMA: Can we have something like this to periodically print + * out throughput. + * This is just a rough-sketch that partially worked for me. 
+ * I assume there is a better way that everyone would prefer. + * Perhaps we could set a QMP command that toggled a "periodic printing" + * option that allowed more details to be printed on stdout.....? + */ if ((i & 63) == 0) { - uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000; + uint64_t curr = qemu_get_clock_ns(rt_clock); + uint64_t t1 = (curr - t0) / 1000000; + double tp; if (t1 > MAX_WAIT) { DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n", t1, i); break; } + + if ((i % tprate) == 0) { + uint64_t tp1 = (curr - tp0) / 1000000; + tp = ((double) last_total_sent * 8.0 / + ((double) tp1 / 1000.0)) / 1000.0 / 1000.0; + printf("throughput: %f mbps\n", tp); + last_total_sent = 0; + tp0 = curr; + } } i++; } + /* flush buffer write */ + if (qemu_rdma_migration_enabled()) { + int resp; + resp = qemu_rdma_migration_write_flush(&rdma_mdata); + if (resp < 0) { + fprintf(stderr, "rdma migration: write flush error!\n"); + qemu_file_set_error(f, -EIO); + return 0; + } + } + qemu_mutex_unlock_ramlist(); if (ret < 0) { @@ -863,6 +953,22 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) ret = -EINVAL; goto done; } + } else if (flags & RAM_SAVE_FLAG_RDMA) { + /* + * RFC RDMA: This bad hack was to cause the loop break. + * Comments are welcome on how to get rid of it. + * Communicating here is unnecessary because the + * RDMA page has already arrived. + * Comments are welcome on how to get rid of this. 
+ */ + if (!qemu_rdma_migration_enabled()) { + return -EINVAL; + } + void *host = host_from_stream_offset(f, addr, flags); + if (!host) { + return -EINVAL; + } + /* rdma page is already here, nothing to do */ } error = qemu_file_get_error(f); if (error) { diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 68deefb..7c9968e 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -112,6 +112,7 @@ int qemu_file_rate_limit(QEMUFile *f); int64_t qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate); int64_t qemu_file_get_rate_limit(QEMUFile *f); int qemu_file_get_error(QEMUFile *f); +void qemu_file_set_error(QEMUFile *f, int ret); static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) { diff --git a/savevm.c b/savevm.c index 304d1ef..071196e 100644 --- a/savevm.c +++ b/savevm.c @@ -24,6 +24,7 @@ #include "config-host.h" #include "qemu-common.h" +#include "qemu/rdma.h" #include "hw/hw.h" #include "hw/qdev.h" #include "net/net.h" @@ -50,7 +51,7 @@ #define ARP_OP_REQUEST_REV 0x3 static int announce_self_create(uint8_t *buf, - uint8_t *mac_addr) + uint8_t *mac_addr) { /* Ethernet header. 
*/ memset(buf, 0xff, 6); /* destination MAC addr */ @@ -97,16 +98,16 @@ static void qemu_announce_self_once(void *opaque) qemu_mod_timer(timer, qemu_get_clock_ms(rt_clock) + 50 + (SELF_ANNOUNCE_ROUNDS - count - 1) * 100); } else { - qemu_del_timer(timer); - qemu_free_timer(timer); + qemu_del_timer(timer); + qemu_free_timer(timer); } } void qemu_announce_self(void) { - static QEMUTimer *timer; - timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer); - qemu_announce_self_once(&timer); + static QEMUTimer *timer; + timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer); + qemu_announce_self_once(&timer); } /***********************************************************/ @@ -299,8 +300,8 @@ QEMUFile *qemu_fdopen(int fd, const char *mode) QEMUFileStdio *s; if (mode == NULL || - (mode[0] != 'r' && mode[0] != 'w') || - mode[1] != 'b' || mode[2] != 0) { + (mode[0] != 'r' && mode[0] != 'w') || + mode[1] != 'b' || mode[2] != 0) { fprintf(stderr, "qemu_fdopen: Argument validity check failed\n"); return NULL; } @@ -342,8 +343,8 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode) QEMUFileStdio *s; if (mode == NULL || - (mode[0] != 'r' && mode[0] != 'w') || - mode[1] != 'b' || mode[2] != 0) { + (mode[0] != 'r' && mode[0] != 'w') || + mode[1] != 'b' || mode[2] != 0) { fprintf(stderr, "qemu_fopen: Argument validity check failed\n"); return NULL; } @@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f) return f->last_error; } -static void qemu_file_set_error(QEMUFile *f, int ret) +void qemu_file_set_error(QEMUFile *f, int ret) { if (f->last_error == 0) { f->last_error = ret; @@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f) { SaveStateEntry *se; int ret = 1; + static int first_time = 1; QTAILQ_FOREACH(se, &savevm_handlers, entry) { if (!se->ops || !se->ops->save_live_iterate) { @@ -1643,8 +1645,36 @@ int qemu_savevm_state_iterate(QEMUFile *f) } } if (ret != 0) { +#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC + /* + * We use two "sync" 
infiniband messages that happen during migration. + * One at the beginning and one at the end, just to be thorough. + * This is the first one. + */ + if (first_time && qemu_rdma_migration_enabled()) { + int r; + first_time = 0; + if (qemu_rdma_migration_post_send_sync(&rdma_mdata, + QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC)) { + fprintf(stderr, + "rdma migration: error posting extra send sync!\n"); + return -EIO; + } + + r = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, + QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC); + if (r < 0) { + fprintf(stderr, + "rdma migration: qemu_savevm_state_iterate" + " sync polling error!\n"); + return -EIO; + } + } +#endif + + return ret; + } + ret = qemu_file_get_error(f); if (ret != 0) { qemu_savevm_state_cancel(); @@ -1684,7 +1714,7 @@ int qemu_savevm_state_complete(QEMUFile *f) int len; if ((!se->ops || !se->ops->save_state) && !se->vmsd) { - continue; + continue; } trace_savevm_section_start(); /* Section type */ @@ -1703,8 +1733,32 @@ int qemu_savevm_state_complete(QEMUFile *f) trace_savevm_section_end(se->section_id); } + /* + * We use two "sync" infiniband messages that happen during migration. + * One at the beginning and one at the end, just to be thorough. + * This is the second one. 
+ */ + if (qemu_rdma_migration_enabled()) { + if (qemu_rdma_migration_post_send_sync(&rdma_mdata, + QEMU_RDMA_MIGRATION_WRID_SEND_SYNC)) { + fprintf(stderr, "rdma migration: error posting send sync!\n"); + return -EIO; + } + } + qemu_put_byte(f, QEMU_VM_EOF); + /* wait for RDMA sync message to complete */ + if (qemu_rdma_migration_enabled()) { + int ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, + QEMU_RDMA_MIGRATION_WRID_SEND_SYNC); + if (ret < 0) { + fprintf(stderr, "rdma migration: qemu_savevm_state_full" + " sync polling error!\n"); + return -EIO; + } + } + return qemu_file_get_error(f); } @@ -2014,8 +2068,18 @@ int qemu_loadvm_state(QEMUFile *f) cpu_synchronize_all_post_init(); - ret = 0; + /* wait for RDMA sync message */ + if (qemu_rdma_migration_enabled()) { + ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, + QEMU_RDMA_MIGRATION_WRID_RECV_SYNC); + if (ret < 0) { + fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header" + " sync polling error!\n"); + goto out; + } + } + ret = 0; out: QLIST_FOREACH_SAFE(le, &loadvm_handlers, entry, new_le) { QLIST_REMOVE(le, entry); -- 1.7.10.4