If failover is requested, the PVM or SVM performs some cleanup work, exits COLO mode, and resumes normal execution.
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com> Signed-off-by: Li Zhijian <lizhij...@cn.fujitsu.com> Signed-off-by: Lai Jiangshan <la...@cn.fujitsu.com> --- include/migration/migration-colo.h | 14 ++++ include/migration/migration-failover.h | 2 + migration/colo-comm.c | 10 +++ migration/colo-failover.c | 12 +++- migration/colo.c | 122 ++++++++++++++++++++++++++++++++- stubs/migration-colo.c | 5 ++ 6 files changed, 163 insertions(+), 2 deletions(-) diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h index 27a515a..3bdd1ae 100644 --- a/include/migration/migration-colo.h +++ b/include/migration/migration-colo.h @@ -21,6 +21,13 @@ void colo_info_mig_init(void); +/* Checkpoint control, called in migration/checkpoint thread */ +enum { + COLO_UNPROTECTED_MODE = 0, + COLO_PRIMARY_MODE, + COLO_SECONDARY_MODE, +}; + struct colo_incoming { QEMUFile *file; QemuThread thread; @@ -35,8 +42,15 @@ bool loadvm_enable_colo(void); void loadvm_exit_colo(void); void *colo_process_incoming_checkpoints(void *opaque); bool loadvm_in_colo_state(void); + +int get_colo_mode(void); + /* ram cache */ void create_and_init_ram_cache(void); void colo_flush_ram_cache(void); void release_ram_cache(void); + +/* failover */ +void colo_do_failover(MigrationState *s); + #endif diff --git a/include/migration/migration-failover.h b/include/migration/migration-failover.h index 5fd376a..385fab3 100644 --- a/include/migration/migration-failover.h +++ b/include/migration/migration-failover.h @@ -16,5 +16,7 @@ #include "qemu-common.h" void failover_request_set(void); +void failover_request_clear(void); +bool failover_request_is_set(void); #endif diff --git a/migration/colo-comm.c b/migration/colo-comm.c index 038d12f..57bc6cd 100644 --- a/migration/colo-comm.c +++ b/migration/colo-comm.c @@ -32,6 +32,16 @@ static void colo_info_save(QEMUFile *f, void *opaque) } /* restore */ +int get_colo_mode(void) +{ + if (migrate_in_colo_state()) { + return COLO_PRIMARY_MODE; 
+ } else if (loadvm_in_colo_state()) { + return COLO_SECONDARY_MODE; + } else { + return COLO_UNPROTECTED_MODE; + } +} static int colo_info_load(QEMUFile *f, void *opaque, int version_id) { int value = qemu_get_byte(f); diff --git a/migration/colo-failover.c b/migration/colo-failover.c index af78054..850b05c 100644 --- a/migration/colo-failover.c +++ b/migration/colo-failover.c @@ -22,7 +22,7 @@ static void colo_failover_bh(void *opaque) { qemu_bh_delete(failover_bh); failover_bh = NULL; - /*TODO: Do failover work */ + colo_do_failover(NULL); } void failover_request_set(void) @@ -32,6 +32,16 @@ void failover_request_set(void) qemu_bh_schedule(failover_bh); } +void failover_request_clear(void) +{ + failover_request = false; +} + +bool failover_request_is_set(void) +{ + return failover_request; +} + void qmp_colo_lost_heartbeat(Error **errp) { failover_request_set(); diff --git a/migration/colo.c b/migration/colo.c index cd84e4d..bcde1ec 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -65,6 +65,68 @@ static Coroutine *colo; #define COLO_BUFFER_BASE_SIZE (1000*1000*4ULL) QEMUSizedBuffer *colo_buffer; +static bool colo_runstate_is_stopped(void) +{ + return runstate_check(RUN_STATE_COLO) || !runstate_is_running(); +} + +/* + * there are two way to entry this function + * 1. From colo checkpoint incoming thread, in this case + * we should protect it by iothread lock + * 2. From user command, because hmp/qmp command + * was happened in main loop, iothread lock will cause a + * dead lock. 
+ */ +static void slave_do_failover(void) +{ + DPRINTF("do_failover!\n"); + + colo = NULL; + + if (!autostart) { + error_report("\"-S\" qemu option will be ignored in colo slave side"); + /* recover runstate to normal migration finish state */ + autostart = true; + } + + /* On slave side, jump to incoming co */ + if (migration_incoming_co) { + qemu_coroutine_enter(migration_incoming_co, NULL); + } +} + +static void master_do_failover(void) +{ + MigrationState *s = migrate_get_current(); + + if (!colo_runstate_is_stopped()) { + vm_stop_force_state(RUN_STATE_COLO); + } + + if (s->state != MIG_STATE_ERROR) { + migrate_set_state(s, MIG_STATE_COLO, MIG_STATE_COMPLETED); + } + + vm_start(); +} + +static bool failover_completed; +void colo_do_failover(MigrationState *s) +{ + /* Make sure vm stopped while failover */ + if (!colo_runstate_is_stopped()) { + vm_stop_force_state(RUN_STATE_COLO); + } + + if (get_colo_mode() == COLO_SECONDARY_MODE) { + slave_do_failover(); + } else { + master_do_failover(); + } + failover_completed = true; +} + /* colo checkpoint control helper */ static int colo_ctl_put(QEMUFile *f, uint64_t request) { @@ -142,11 +204,23 @@ static int do_colo_transaction(MigrationState *s, QEMUFile *control) goto out; } + if (failover_request_is_set()) { + ret = -1; + goto out; + } /* suspend and save vm state to colo buffer */ qemu_mutex_lock_iothread(); vm_stop_force_state(RUN_STATE_COLO); qemu_mutex_unlock_iothread(); DPRINTF("vm is stoped\n"); + /* + * failover request bh could be called after + * vm_stop_force_state so we check failover_request_is_set() again. 
+ */ + if (failover_request_is_set()) { + ret = -1; + goto out; + } /* Disable block migration */ s->params.blk = 0; @@ -242,7 +316,18 @@ static void *colo_thread(void *opaque) } out: - migrate_set_state(s, MIG_STATE_COLO, MIG_STATE_COMPLETED); + fprintf(stderr, "colo: some error happens in colo_thread\n"); + qemu_mutex_lock_iothread(); + if (!failover_request_is_set()) { + error_report("master takeover from checkpoint channel"); + failover_request_set(); + } + qemu_mutex_unlock_iothread(); + + while (!failover_completed) { + ; + } + failover_request_clear(); if (colo_buffer) { qsb_free(colo_buffer); @@ -284,6 +369,11 @@ void colo_init_checkpointer(MigrationState *s) qemu_bh_schedule(colo_bh); } +bool loadvm_in_colo_state(void) +{ + return colo != NULL; +} + /* * return: * 0: start a checkpoint @@ -347,6 +437,10 @@ void *colo_process_incoming_checkpoints(void *opaque) if (slave_wait_new_checkpoint(f)) { break; } + if (failover_request_is_set()) { + error_report("failover request from heartbeat channel"); + goto out; + } /* suspend guest */ qemu_mutex_lock_iothread(); @@ -415,6 +509,32 @@ void *colo_process_incoming_checkpoints(void *opaque) } out: + fprintf(stderr, "Detect some error or get a failover request\n"); + /* determine whether we need to failover */ + if (!failover_request_is_set()) { + /* + * TODO: Here, maybe we should raise a qmp event to the user, + * It can help user to know what happens, and help deciding whether to + * do failover. 
+ */ + usleep(2000 * 1000); + } + /* check flag again*/ + if (!failover_request_is_set()) { + /* + * We assume that master is still alive according to heartbeat, + * just kill slave + */ + error_report("SVM is going to exit!"); + exit(1); + } else { + /* if we went here, means master may dead, we are doing failover */ + while (!failover_completed) { + ; + } + failover_request_clear(); + } + colo = NULL; if (fb) { diff --git a/stubs/migration-colo.c b/stubs/migration-colo.c index a690b04..c3514c8 100644 --- a/stubs/migration-colo.c +++ b/stubs/migration-colo.c @@ -27,6 +27,11 @@ bool migrate_in_colo_state(void) return false; } +bool loadvm_in_colo_state(void) +{ + return false; +} + void qmp_colo_lost_heartbeat(Error **errp) { error_setg(errp, "COLO is not supported, please rerun configure" -- 1.7.12.4