From: "Michael R. Hines" <mrhi...@us.ibm.com>

This patch allows the preparation of the migration_bitmap
to be parallelized. For very large VMs, this preparation alone can
take on the order of tens of milliseconds, which shows up directly
as downtime.

We count the number of host cores first, and then hand out chunks
of the log-dirty bitmap to one thread per core. Each thread scans
for dirty bits in parallel.

Signed-off-by: Michael R. Hines <mrhi...@us.ibm.com>
---
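
Reviewer note: migration_bitmap_distribute_work() below splits the
bytemap evenly among the workers and gives the remainder of the
division to the last worker. A minimal standalone sketch of just that
split, assuming one worker per core (split_pages() and its parameters
are hypothetical, for illustration only, not part of this patch):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Print the page range each worker would scan. */
    static void split_pages(uint64_t pages, int workers)
    {
        uint64_t inc = pages / workers;      /* chunk size per worker */
        uint64_t rem = pages % workers;      /* leftover pages        */
        int core;

        for (core = 0; core < workers; core++) {
            uint64_t start = core * inc;
            uint64_t stop = start + inc;
            if (core == workers - 1) {
                stop += rem;                 /* last worker takes the rest */
            }
            printf("worker %d: pages [%" PRIu64 ", %" PRIu64 ")\n",
                   core, start, stop);
        }
    }

    int main(void)
    {
        split_pages(1048576, 7);             /* 4GB guest, 8-core host */
        return 0;
    }

For a 4GB guest (1048576 4KB pages) on an 8-core host, each of the 7
workers scans 149796 pages and the last one also picks up the 4
leftover pages.
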
 arch_init.c                   | 228 +++++++++++++++++++++++++++++++++++++++---
 include/migration/migration.h |  10 ++
 include/qemu-common.h         |  12 +++
 qapi-schema.json              |  73 +++++++++++++-
 vl.c                          |  33 ++++++
 5 files changed, 340 insertions(+), 16 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 7545d96..4a71311 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -189,6 +189,8 @@ typedef struct AccountingInfo {
     uint64_t skipped_pages;
     uint64_t norm_pages;
     uint64_t iterations;
+    uint64_t log_dirty_time;
+    uint64_t migration_bitmap_time;
     uint64_t xbzrle_bytes;
     uint64_t xbzrle_pages;
     uint64_t xbzrle_cache_miss;
@@ -232,6 +234,16 @@ uint64_t norm_mig_pages_transferred(void)
     return acct_info.norm_pages;
 }
 
+uint64_t norm_mig_log_dirty_time(void)
+{
+    return acct_info.log_dirty_time;
+}
+
+uint64_t norm_mig_bitmap_time(void)
+{
+    return acct_info.migration_bitmap_time;
+}
+
 uint64_t xbzrle_mig_bytes_transferred(void)
 {
     return acct_info.xbzrle_bytes;
@@ -362,15 +374,189 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
 static inline bool migration_bitmap_set_dirty(MemoryRegion *mr,
                                               ram_addr_t offset)
 {
-    bool ret;
-    int nr = (mr->ram_addr + offset) >> TARGET_PAGE_BITS;
+    return test_and_set_bit((mr->ram_addr + offset) >> TARGET_PAGE_BITS,
+                            migration_bitmap);
+}
+
+typedef struct BitmapWalkerParams {
+    QemuMutex ready_mutex;
+    QemuMutex done_mutex;
+    QemuCond cond;
+    QemuThread walker;
+    MigrationState *s;
+    int core_id;
+    int keep_running;
+    ram_addr_t start;
+    ram_addr_t stop;
+    RAMBlock *block;
+    uint64_t dirty_pages;
+} BitmapWalkerParams;
 
-    ret = test_and_set_bit(nr, migration_bitmap);
+static int nb_bitmap_workers = 0;
 
-    if (!ret) {
-        migration_dirty_pages++;
+static BitmapWalkerParams *bitmap_walkers = NULL;
+
+/*
+ * Bitmap workers: This is a temporary performance-driven
+ * workaround for the slowness (10s of milliseconds) incurred
+ * during calls to migration_bitmap_sync().
+ *
+ * Ideally, migration_bitmap_sync() would use the GET_LOG_DIRTY
+ * bitmap from KVM directly, but it cannot today because that bitmap
+ * is not retrieved as a single memory allocation; a couple of
+ * transformations into a 'unified' bitmap are required before the
+ * migration code can make good use of it.
+ *
+ * Bitmap workers perform this transformation in parallel
+ * in a multi-threaded fashion until a patch is ready to process
+ * the bitmaps from GET_LOG_DIRTY directly.
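+ *
+ * Protocol, per sync round and per worker:
+ *   1. dispatcher fills in {block, start, stop} and releases ready_mutex
+ *   2. worker wakes and scans its chunk into migration_bitmap
+ *   3. dispatcher joins the worker and accumulates bwp->dirty_pages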
+ */
+static uint64_t migration_bitmap_sync_range(RAMBlock *block,
+                                            ram_addr_t start, ram_addr_t stop)
+{
+    ram_addr_t addr;
+    uint64_t dirty_pages = 0;
+
+    for (addr = start; addr < stop; addr += TARGET_PAGE_SIZE) {
+        if (memory_region_test_and_clear_dirty(block->mr,
+                                               addr, TARGET_PAGE_SIZE,
+                                               DIRTY_MEMORY_MIGRATION)) {
+            if (!migration_bitmap_set_dirty(block->mr, addr)) {
+                dirty_pages++;
+            }
+        }
+    }
+
+    return dirty_pages;
+}
+
+/*
+ * The worker sleeps until it gets some work to transform a 
+ * chunk of bitmap from KVM to the migration_bitmap.
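+ *
+ * Handshake: the dispatcher holds ready_mutex while the worker is idle.
+ * The dispatcher's cond_wait() releases ready_mutex, letting the worker
+ * in; the worker takes done_mutex, acknowledges on cond, scans its
+ * chunk, then sleeps in cond_wait() on done_mutex until joined.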
+ */
+void *migration_bitmap_worker(void *opaque)
+{
+    BitmapWalkerParams *bwp = opaque;
+
+    do {
+        qemu_mutex_lock(&bwp->ready_mutex);
+        qemu_mutex_lock(&bwp->done_mutex);
+        qemu_mutex_unlock(&bwp->ready_mutex);
+        qemu_cond_signal(&bwp->cond);
+
+        if (!bwp->keep_running) {
+            break;
+        }
+
+        bwp->dirty_pages = migration_bitmap_sync_range(bwp->block, bwp->start,
+                                                       bwp->stop);
+
+        qemu_cond_wait(&bwp->cond, &bwp->done_mutex);
+        qemu_mutex_unlock(&bwp->done_mutex);
+    } while (bwp->keep_running);
+
+    return NULL;
+}
+
+void migration_bitmap_worker_start(MigrationState *s)
+{
+    int core;
+
+    /*
+     * N - 1 host CPUs are reserved for N - 1 worker threads that
+     * process the pc.ram bytemap => migration_bitmap.
+     * The migration thread runs on the remaining CPU, which
+     * processes the remaining, smaller RAMBlocks.
+     */
+    nb_bitmap_workers = getNumCores() - 1;
+
+    bitmap_walkers = g_malloc0(sizeof(struct BitmapWalkerParams) *
+                               nb_bitmap_workers);
+
+    for (core = 0; core < nb_bitmap_workers; core++) {
+        BitmapWalkerParams *bwp = &bitmap_walkers[core];
+        bwp->core_id = core;
+        bwp->keep_running = 1;
+        bwp->s = s;
+        qemu_cond_init(&bwp->cond);
+        qemu_mutex_init(&bwp->ready_mutex);
+        qemu_mutex_init(&bwp->done_mutex);
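+        /* Hold ready_mutex so the worker blocks until work is dispatched. */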
+        qemu_mutex_lock(&bwp->ready_mutex);
+    }
+
+    for (core = 0; core < nb_bitmap_workers; core++) {
+        BitmapWalkerParams *bwp = &bitmap_walkers[core];
+        qemu_thread_create(&bwp->walker,
+            migration_bitmap_worker, bwp, QEMU_THREAD_DETACHED);
+    }
+}
+
+void migration_bitmap_worker_stop(MigrationState *s)
+{
+    int core;
+
+    for (core = 0; core < nb_bitmap_workers; core++) {
+        BitmapWalkerParams *bwp = &bitmap_walkers[core];
+        bwp->keep_running = 0;
+        qemu_mutex_unlock(&bwp->ready_mutex);
+    }
+
+    DPRINTF("Bitmap workers stopped.\n");
+
+    g_free(bitmap_walkers);
+    bitmap_walkers = NULL;
+    nb_bitmap_workers = 0;
+}
+
+
+static void migration_bitmap_distribute_specific_worker(MigrationState *s,
+                RAMBlock *block, int core,
+                ram_addr_t start, ram_addr_t stop)
+{
+    BitmapWalkerParams *bwp = &bitmap_walkers[core];
+
+    bwp->start = start;
+    bwp->stop = stop;
+    bwp->block = block;
+
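+    /*
+     * Atomically release ready_mutex (admitting the worker) and wait
+     * for the worker's acknowledgment before returning.
+     */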
+    qemu_cond_wait(&bwp->cond, &bwp->ready_mutex);
+}
+
+static void migration_bitmap_join_worker(MigrationState *s, int core)
+{
+    BitmapWalkerParams *bwp = &bitmap_walkers[core];
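+    /*
+     * The worker holds done_mutex for the duration of its scan, so this
+     * lock is acquired only once the worker is waiting in cond_wait().
+     */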
+    qemu_mutex_lock(&bwp->done_mutex);
+    qemu_cond_signal(&bwp->cond);
+    qemu_mutex_unlock(&bwp->done_mutex);
+    migration_dirty_pages += bwp->dirty_pages;
+}
+
+/*
+ * Chop up the QEMU 'bytemap' built around GET_LOG_DIRTY and hand out
+ * the migration_bitmap population work to all the workers.
+ *
+ * If there are N CPUs on the host, there will be N - 1 workers, each
+ * processing an equal chunk of the RAM block's bytemap.
+ */
+static void migration_bitmap_distribute_work(MigrationState *s,
+                                             RAMBlock *block)
+{
+    uint64_t pages = block->length / TARGET_PAGE_SIZE;
+    uint64_t inc = pages / nb_bitmap_workers;
+    uint64_t remainder = pages % nb_bitmap_workers;
+    int core;
+
+    for (core = 0; core < nb_bitmap_workers; core++) {
+        ram_addr_t start = core * inc, stop = core * inc + inc;
+
+        if (core == (nb_bitmap_workers - 1)) {
+            stop += remainder;
+        }
+
+        start *= TARGET_PAGE_SIZE;
+        stop *= TARGET_PAGE_SIZE;
+
+        migration_bitmap_distribute_specific_worker(s, block, core, start,
+                                                    stop);
     }
-    return ret;
 }
 
 /* Needs iothread lock! */
@@ -378,7 +564,6 @@ static inline bool migration_bitmap_set_dirty(MemoryRegion *mr,
 static void migration_bitmap_sync(void)
 {
     RAMBlock *block;
-    ram_addr_t addr;
     uint64_t num_dirty_pages_init = migration_dirty_pages;
     MigrationState *s = migrate_get_current();
     static int64_t start_time;
@@ -386,33 +571,46 @@ static void migration_bitmap_sync(void)
     static int64_t num_dirty_pages_period;
     int64_t end_time;
     int64_t bytes_xfer_now;
+    int64_t begin_time;
+    int64_t dirty_time;
 
     if (!bytes_xfer_prev) {
         bytes_xfer_prev = ram_bytes_transferred();
     }
 
+    begin_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     if (!start_time) {
         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     }
-
     trace_migration_bitmap_sync_start();
     address_space_sync_dirty_bitmap(&address_space_memory);
 
+    dirty_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
     QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) {
-            if (memory_region_test_and_clear_dirty(block->mr,
-                                                   addr, TARGET_PAGE_SIZE,
-                                                   DIRTY_MEMORY_MIGRATION)) {
-                migration_bitmap_set_dirty(block->mr, addr);
-            }
+        if (!strcmp(block->idstr, "pc.ram") && nb_bitmap_workers) {
+            migration_bitmap_distribute_work(s, block);
+            continue;
         }
+        migration_dirty_pages += migration_bitmap_sync_range(block, 0,
+                                                             block->length);
     }
+
+    if (nb_bitmap_workers) {
+        int core;
+        for (core = 0; core < nb_bitmap_workers; core++) {
+            migration_bitmap_join_worker(s, core);
+        }
+    }
+
     trace_migration_bitmap_sync_end(migration_dirty_pages
                                     - num_dirty_pages_init);
     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 
-    /* more than 1 second = 1000 millisecons */
+    acct_info.log_dirty_time += dirty_time - begin_time;
+    acct_info.migration_bitmap_time += end_time - dirty_time;
+
+    /* more than 1 second = 1000 milliseconds */
     if (end_time > start_time + 1000) {
         if (migrate_auto_converge()) {
             /* The following detection logic can be refined later. For now:
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 140e6b4..3ffc433 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -45,6 +45,10 @@ struct MigrationState
     int64_t total_time;
     int64_t downtime;
     int64_t expected_downtime;
+    int64_t xmit_time;
+    int64_t ram_copy_time;
+    int64_t log_dirty_time;
+    int64_t bitmap_time;
     int64_t dirty_pages_rate;
     int64_t dirty_bytes_rate;
     bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
@@ -109,10 +113,16 @@ uint64_t skipped_mig_bytes_transferred(void);
 uint64_t skipped_mig_pages_transferred(void);
 uint64_t norm_mig_bytes_transferred(void);
 uint64_t norm_mig_pages_transferred(void);
+uint64_t norm_mig_log_dirty_time(void);
+uint64_t norm_mig_bitmap_time(void);
 uint64_t xbzrle_mig_bytes_transferred(void);
 uint64_t xbzrle_mig_pages_transferred(void);
 uint64_t xbzrle_mig_pages_overflow(void);
 uint64_t xbzrle_mig_pages_cache_miss(void);
+void *migration_bitmap_worker(void *opaque);
+void migration_bitmap_worker_start(MigrationState *s);
+void migration_bitmap_worker_stop(MigrationState *s);
+void migrate_set_state(MigrationState *s, int old_state, int new_state);
 
 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
 
diff --git a/include/qemu-common.h b/include/qemu-common.h
index 5054836..936dc02 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -478,4 +478,16 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len);
  */
 int parse_debug_env(const char *name, int max, int initial);
 
+/*
+ * Prototype and headers for querying the number of host processors.
+ */
+int getNumCores(void);
+#if defined(WIN32)
+#include <windows.h>
+#elif defined(CONFIG_BSD)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#elif defined(CONFIG_LINUX)
+#include <unistd.h>
+#endif
 #endif
diff --git a/qapi-schema.json b/qapi-schema.json
index 60f3fd1..aac0894 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -577,6 +577,36 @@
            'cache-miss': 'int', 'overflow': 'int' } }
 
 ##
+# @MCStats
+#
+# Detailed Micro Checkpointing (MC) statistics
+#
+# @mbps: throughput of transmitting the last MC
+#
+# @xmit-time: milliseconds to transmit the last MC
+#
+# @log-dirty-time: milliseconds to GET_LOG_DIRTY for the last MC
+#
+# @migration-bitmap-time: milliseconds to prepare dirty bitmap for last MC
+#
+# @ram-copy-time: milliseconds to ram_save_live() last MC to staging memory
+#
+# @copy-mbps: throughput of ram_save_live() to staging memory for the last MC
+#
+# @checkpoints: cumulative total number of MCs generated
+#
+# Since: 1.8
+##
+{ 'type': 'MCStats',
+  'data': {'mbps': 'number',
+           'xmit-time': 'uint64',
+           'log-dirty-time': 'uint64',
+           'migration-bitmap-time': 'uint64',
+           'ram-copy-time': 'uint64',
+           'checkpoints' : 'uint64',
+           'copy-mbps': 'number' }}
+
+##
 # @MigrationInfo
 #
 # Information about current migration process.
@@ -622,6 +652,7 @@
   'data': {'*status': 'str', '*ram': 'MigrationStats',
            '*disk': 'MigrationStats',
            '*xbzrle-cache': 'XBZRLECacheStats',
+           '*mc': 'MCStats',
            '*total-time': 'int',
            '*expected-downtime': 'int',
            '*downtime': 'int',
@@ -661,10 +692,50 @@
 # @auto-converge: If enabled, QEMU will automatically throttle down the guest
 #          to speed up convergence of RAM migration. (since 1.6)
 #
+# @x-mc: The migration will never end, and the VM will instead be continuously
+#          micro-checkpointed (MC). Use the command migrate-set-mc-delay to
+#          control the frequency at which the checkpoints occur.
+#          Disabled by default. (Since 1.8)
+#
+# @mc-net-disable: Disable buffering of outbound network traffic while
+#          Micro-Checkpointing (@x-mc) is active.
+#          Buffering is enabled by default. Disabling it will make the MC
+#          protocol inconsistent and potentially break network connections
+#          upon an actual failure.
+#          Only for performance testing. (Since 1.8)
+#
+# @mc-rdma-copy: MC requires creating a local in-memory checkpoint before
+#          transmission to the destination. This requires heavy use of
+#          memcpy() which dominates the processor pipeline. This option
+#          makes use of *local* RDMA to perform the copy instead of the CPU.
+#          Enabled by default if the migration transport is RDMA,
+#          disabled by default otherwise. (Since 1.8)
+#
+# @bitworkers: Allow the QEMU migration bitmap to be scanned in parallel
+#          by using multiple processors on the host machine.
+#          This capability has no effect without also enabling @x-mc.
+#          Enabled by default. (Since 1.8)
+#
+# @rdma-keepalive: RDMA connections do not time out by themselves if a peer
+#         has disconnected prematurely or failed. User-level keepalives
+#         allow the migration to abort cleanly if there is a problem with the
+#         destination host. For debugging, this can be problematic as
+#         the keepalive may cause the peer to abort prematurely if we are
+#         at a GDB breakpoint, for example.
+#         Enabled by default. (Since 1.8)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] }
+  'data': ['xbzrle',
+           'x-rdma-pin-all',
+           'auto-converge',
+           'zero-blocks',
+           'x-mc',
+           'mc-net-disable',
+           'mc-rdma-copy',
+           'bitworkers',
+           'rdma-keepalive'
+          ] }
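+
+# Example of enabling @bitworkers (hypothetical session; the capability
+# names above are introduced by this series and may still change):
+#
+# -> { "execute": "migrate-set-capabilities", "arguments":
+#      { "capabilities": [ { "capability": "bitworkers", "state": true } ] } }
+# <- { "return": {} }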
 
 ##
 # @MigrationCapabilityStatus
diff --git a/vl.c b/vl.c
index b42ac67..e2ba2e8 100644
--- a/vl.c
+++ b/vl.c
@@ -2818,6 +2818,39 @@ static int object_create(QemuOpts *opts, void *opaque)
     return 0;
 }
 
+/*
+ * Currently only used for migration_bitmap_sync(),
+ * but can be queried by anyone in the future.
+ */
+int getNumCores(void)
+{
+    uint32_t count = 0;
+#if defined(WIN32)
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo(&sysinfo);
+    count = sysinfo.dwNumberOfProcessors;
+#elif defined(CONFIG_BSD)
+    int nm[2];
+    size_t len = sizeof(count);
+    nm[0] = CTL_HW;
+    nm[1] = HW_AVAILCPU;
+    sysctl(nm, 2, &count, &len, NULL, 0);
+
+    if (count < 1) {
+        nm[1] = HW_NCPU;
+        sysctl(nm, 2, &count, &len, NULL, 0);
+        if (count < 1) {
+            count = 1;
+        }
+    }
+#elif defined(CONFIG_LINUX)
+    long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+    count = (ncpus < 1) ? 1 : ncpus;
+#else
+    count = 1;
+#endif
+    return count;
+}
+
 int main(int argc, char **argv, char **envp)
 {
     int i;
-- 
1.8.1.2

