Use the return-path capability with colo and reuse the opened return path
file on both primary and secondary side.

This fixes a crash in colo where migration_cancel() races with colo closing
s->rp_state.from_dst_file.

Signed-off-by: Lukas Straub <[email protected]>
---
 docs/system/qemu-colo.rst          |  4 ++--
 migration/colo.c                   | 26 +++++++++-----------------
 migration/options.c                | 10 +++++++++-
 tests/qtest/migration/colo-tests.c |  1 +
 tests/qtest/migration/framework.c  | 13 +++++++++++++
 5 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/docs/system/qemu-colo.rst b/docs/system/qemu-colo.rst
index 
f7d3b6439cf3401a58c412634239d1a43999a10e..9ec3eb79b006a24e0d3a360f1c8f49a68f4d8086
 100644
--- a/docs/system/qemu-colo.rst
+++ b/docs/system/qemu-colo.rst
@@ -225,7 +225,7 @@ any IP's here, except for the ``$primary_ip`` variable::
 **3.** On Secondary VM's QEMU monitor, issue command::
 
     {"execute":"qmp_capabilities"}
-    {"execute": "migrate-set-capabilities", "arguments": {"capabilities": [ 
{"capability": "x-colo", "state": true } ] } }
+    {"execute": "migrate-set-capabilities", "arguments": {"capabilities": [ 
{"capability": "return-path", "state": true }, {"capability": "x-colo", 
"state": true } ] } }
     {"execute": "nbd-server-start", "arguments": {"addr": {"type": "inet", 
"data": {"host": "0.0.0.0", "port": "9999"} } } }
     {"execute": "nbd-server-add", "arguments": {"device": "parent0", 
"writable": true } }
 
@@ -242,7 +242,7 @@ Note:
     {"execute":"qmp_capabilities"}
     {"execute": "blockdev-add", "arguments": {"driver": "nbd", "node-name": 
"nbd0", "server": {"type": "inet", "host": "127.0.0.2", "port": "9999"}, 
"export": "parent0", "detect-zeroes": "on"} }
     {"execute": "x-blockdev-change", "arguments":{"parent": "colo-disk0", 
"node": "nbd0" } }
-    {"execute": "migrate-set-capabilities", "arguments": {"capabilities": [ 
{"capability": "x-colo", "state": true } ] } }
+    {"execute": "migrate-set-capabilities", "arguments": {"capabilities": [ 
{"capability": "return-path", "state": true }, {"capability": "x-colo", 
"state": true } ] } }
     {"execute": "migrate", "arguments": {"uri": "tcp:127.0.0.2:9998" } }
 
 Note:
diff --git a/migration/colo.c b/migration/colo.c
index 
dc47d03874039b686d2a4072ac4e6c77e4ff1f87..2d36f933cf155c1084565162294a61d32e28fe86
 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -539,6 +539,8 @@ static void colo_process_checkpoint(MigrationState *s)
     Error *local_err = NULL;
     int ret;
 
+    assert(s->rp_state.from_dst_file);
+    assert(!s->rp_state.rp_thread_created);
     if (get_colo_mode() != COLO_MODE_PRIMARY) {
         error_report("COLO mode must be COLO_MODE_PRIMARY");
         return;
@@ -546,12 +548,6 @@ static void colo_process_checkpoint(MigrationState *s)
 
     failover_init_state();
 
-    s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
-    if (!s->rp_state.from_dst_file) {
-        error_report("Open QEMUFile from_dst_file failed");
-        goto out;
-    }
-
     packets_compare_notifier.notify = colo_compare_notify_checkpoint;
     colo_compare_register_notifier(&packets_compare_notifier);
 
@@ -636,16 +632,6 @@ out:
     colo_compare_unregister_notifier(&packets_compare_notifier);
     timer_free(s->colo_delay_timer);
     qemu_event_destroy(&s->colo_checkpoint_event);
-
-    /*
-     * Must be called after failover BH is completed,
-     * Or the failover BH may shutdown the wrong fd that
-     * re-used by other threads after we release here.
-     */
-    if (s->rp_state.from_dst_file) {
-        qemu_fclose(s->rp_state.from_dst_file);
-        s->rp_state.from_dst_file = NULL;
-    }
 }
 
 void migrate_start_colo_process(MigrationState *s)
@@ -838,6 +824,7 @@ static void *colo_process_incoming_thread(void *opaque)
     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_COLO);
 
+    assert(mis->to_src_file);
     if (get_colo_mode() != COLO_MODE_SECONDARY) {
         error_report("COLO mode must be COLO_MODE_SECONDARY");
         return NULL;
@@ -854,7 +841,6 @@ static void *colo_process_incoming_thread(void *opaque)
 
     failover_init_state();
 
-    mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
     /*
      * Note: the communication between Primary side and Secondary side
      * should be sequential, we set the fd to unblocked in migration incoming
@@ -866,6 +852,12 @@ static void *colo_process_incoming_thread(void *opaque)
         goto out;
     }
 
+    /*
+     * rp thread still running on primary side, shut it down to go into
+     * colo state.
+     */
+    migrate_send_rp_shut(mis, 0);
+
     colo_incoming_start_dirty_log();
 
     bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
diff --git a/migration/options.c b/migration/options.c
index 
1ffe85a2d8ccc8de9cc04d3c12293538a367abf7..f33b2979290f30ecfd0ffc4654498f25750f6720
 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -575,7 +575,15 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, 
Error **errp)
     ERRP_GUARD();
     MigrationIncomingState *mis = migration_incoming_get_current();
 
-#ifndef CONFIG_REPLICATION
+#ifdef CONFIG_REPLICATION
+    if (new_caps[MIGRATION_CAPABILITY_X_COLO]) {
+        if (!new_caps[MIGRATION_CAPABILITY_RETURN_PATH]) {
+            error_setg(errp, "Capability 'x-colo' requires capability "
+                             "'return-path'");
+            return false;
+        }
+    }
+#else
     if (new_caps[MIGRATION_CAPABILITY_X_COLO]) {
         error_setg(errp, "QEMU compiled without replication module"
                    " can't enable COLO");
diff --git a/tests/qtest/migration/colo-tests.c 
b/tests/qtest/migration/colo-tests.c
index 
598a1d3821ed0a90318732702027cebad47352fd..366b9f7ac6d56a6e74a29d3e062124ad4234608e
 100644
--- a/tests/qtest/migration/colo-tests.c
+++ b/tests/qtest/migration/colo-tests.c
@@ -46,6 +46,7 @@ static int test_colo_common(MigrateCommon *args,
      * used in production.
      */
     args->start.oob = true;
+    args->start.caps[MIGRATION_CAPABILITY_RETURN_PATH] = true;
     args->start.caps[MIGRATION_CAPABILITY_X_COLO] = true;
 
     if (migrate_start(&from, &to, args->listen_uri, &args->start)) {
diff --git a/tests/qtest/migration/framework.c 
b/tests/qtest/migration/framework.c
index 
2a3efeb80740cbfe876f649f0af685c5839e00a2..0bfc241914462c023ceab2fab0386009dc5a4320
 100644
--- a/tests/qtest/migration/framework.c
+++ b/tests/qtest/migration/framework.c
@@ -216,6 +216,19 @@ static void migrate_start_set_capabilities(QTestState 
*from, QTestState *to,
      * MigrationCapability_lookup and MIGRATION_CAPABILITY_ constants
      * are from qapi-types-migration.h.
      */
+
+    /*
+     * Enable return path first, since other features depend on it.
+     */
+    if (args->caps[MIGRATION_CAPABILITY_RETURN_PATH]) {
+        if (from) {
+            migrate_set_capability(from, "return-path", true);
+        }
+        if (to) {
+            migrate_set_capability(to, "return-path", true);
+        }
+    }
+
     for (uint8_t i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
         if (!args->caps[i]) {
             continue;

-- 
2.39.5


Reply via email to