The current implementation of exclusive work can suffer from high contention when the number of guest CPUs is large (> 16 or so). The reason is that all CPUs end up waiting on the same condvar/mutex pair, which unnecessarily slows down their wakeup.
Fix it by having them wait on their "local" cpu->lock. This shifts the burden of waking up threads to the exclusive thread, but this is preferable to the existing solution because it induces a lot less contention. Some perf numbers when compiling a linux kernel with `make tinyconfig' in an aarch64 guest: - Host: 32-core Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz - Workload: Linux kernel compilation with `make -j $n' in a VM with $n vCPUs [ Y axis: speedup over $n=1 ] 8 +----------------------------------------------------------------+ | + + + + #D############+ + + | | ##*B********* D#############D | 7 |-+ ##**XXXXXXXXXX *** +-| | ##**X B********** | | ##** *** | 6 |-+ X##* B +-| | X##* | | XD* | 5 |-+ X# +-| | X# | | *# | 4 |-+ *# +-| | *# | | XB# | | XD | 3 |-+ X# +-| | ## | | ## | 2 |-+ # +-| | # | | # | 1 |-+ D +-| | after ##D## | | + + + + + + + before **B** | 0 +----------------------------------------------------------------+ 1 4 8 12 16 20 24 28 32 Guest vCPUs png: https://imgur.com/jskOcxR With this change we can't obtain an additional speedup, although we mitigate the performance collapse. This is due to the heavy-duty nature of async safe work, and the frequency at which it is run. A proper fix would reduce the overhead of safe async work. Signed-off-by: Emilio G. 
Cota <c...@braap.org> --- include/qom/cpu.h | 4 +- cpus-common.c | 146 ++++++++++++++++++---------------------------- 2 files changed, 61 insertions(+), 89 deletions(-) diff --git a/include/qom/cpu.h b/include/qom/cpu.h index 0e90d9d093..204bc94056 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -345,7 +345,6 @@ struct CPUState { HANDLE hThread; #endif int thread_id; - bool running, has_waiter; bool thread_kicked; bool crash_occurred; bool exit_request; @@ -367,6 +366,9 @@ struct CPUState { bool stop; bool stopped; bool unplug; + bool running; + bool exclusive_waiter; + bool exclusive_ongoing; CPUAddressSpace *cpu_ases; int num_ases; diff --git a/cpus-common.c b/cpus-common.c index ad8a8ef535..cffb2b71ac 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -24,22 +24,22 @@ #include "sysemu/cpus.h" static QemuMutex qemu_cpu_list_lock; -static QemuCond exclusive_cond; static QemuCond exclusive_resume; /* >= 1 if a thread is inside start_exclusive/end_exclusive. Written * under qemu_cpu_list_lock, read with atomic operations. */ static int pending_cpus; +static bool exclusive_work_ongoing; void qemu_init_cpu_list(void) { /* This is needed because qemu_init_cpu_list is also called by the * child process in a fork. */ pending_cpus = 0; + exclusive_work_ongoing = false; qemu_mutex_init(&qemu_cpu_list_lock); - qemu_cond_init(&exclusive_cond); qemu_cond_init(&exclusive_resume); } @@ -77,7 +77,7 @@ static void finish_safe_work(CPUState *cpu) must be held. 
*/ static inline void exclusive_idle(void) { - while (pending_cpus) { + while (exclusive_work_ongoing) { qemu_cond_wait(&exclusive_resume, &qemu_cpu_list_lock); } } @@ -91,6 +91,10 @@ void cpu_list_add(CPUState *cpu) } else { assert(!cpu_index_auto_assigned); } + + /* make sure no exclusive jobs are running before touching the list */ + exclusive_idle(); + QTAILQ_INSERT_TAIL_RCU(&cpus, cpu, node); qemu_mutex_unlock(&qemu_cpu_list_lock); @@ -108,6 +112,9 @@ void cpu_list_remove(CPUState *cpu) assert(!(cpu_index_auto_assigned && cpu != QTAILQ_LAST(&cpus, CPUTailQ))); + /* make sure no exclusive jobs are running before touching the list */ + exclusive_idle(); + QTAILQ_REMOVE_RCU(&cpus, cpu, node); cpu->cpu_index = UNASSIGNED_CPU_INDEX; qemu_mutex_unlock(&qemu_cpu_list_lock); @@ -214,120 +221,83 @@ void async_run_on_cpu_no_bql(CPUState *cpu, run_on_cpu_func func, void start_exclusive(void) { CPUState *other_cpu; - int running_cpus; qemu_mutex_lock(&qemu_cpu_list_lock); exclusive_idle(); + exclusive_work_ongoing = true; + qemu_mutex_unlock(&qemu_cpu_list_lock); /* Make all other cpus stop executing. */ - atomic_set(&pending_cpus, 1); - - /* Write pending_cpus before reading other_cpu->running. 
*/ - smp_mb(); - running_cpus = 0; CPU_FOREACH(other_cpu) { - if (atomic_read(&other_cpu->running)) { - other_cpu->has_waiter = true; - running_cpus++; + cpu_mutex_lock(other_cpu); + if (other_cpu->running) { + g_assert(!other_cpu->exclusive_waiter); + other_cpu->exclusive_waiter = true; qemu_cpu_kick(other_cpu); } + other_cpu->exclusive_ongoing = true; + cpu_mutex_unlock(other_cpu); } - atomic_set(&pending_cpus, running_cpus + 1); - while (pending_cpus > 1) { - qemu_cond_wait(&exclusive_cond, &qemu_cpu_list_lock); + /* wait for CPUs that were running to clear us */ + CPU_FOREACH(other_cpu) { + cpu_mutex_lock(other_cpu); + while (other_cpu->exclusive_waiter) { + qemu_cond_wait(&other_cpu->cond, &other_cpu->lock); + } + cpu_mutex_unlock(other_cpu); } - - /* Can release mutex, no one will enter another exclusive - * section until end_exclusive resets pending_cpus to 0. - */ - qemu_mutex_unlock(&qemu_cpu_list_lock); } /* Finish an exclusive operation. */ void end_exclusive(void) { + CPUState *other_cpu; + + CPU_FOREACH(other_cpu) { + cpu_mutex_lock(other_cpu); + g_assert(!other_cpu->exclusive_waiter); + g_assert(other_cpu->exclusive_ongoing); + other_cpu->exclusive_ongoing = false; + qemu_cond_signal(&other_cpu->cond); + cpu_mutex_unlock(other_cpu); + } + qemu_mutex_lock(&qemu_cpu_list_lock); - atomic_set(&pending_cpus, 0); + exclusive_work_ongoing = false; qemu_cond_broadcast(&exclusive_resume); qemu_mutex_unlock(&qemu_cpu_list_lock); } +static void cpu_exec_exclusive_locked(CPUState *cpu) +{ + g_assert(cpu_mutex_locked(cpu)); + + if (cpu->exclusive_waiter) { + cpu->exclusive_waiter = false; + qemu_cond_signal(&cpu->cond); + } + while (cpu->exclusive_ongoing) { + qemu_cond_wait(&cpu->cond, &cpu->lock); + } +} + /* Wait for exclusive ops to finish, and begin cpu execution. */ void cpu_exec_start(CPUState *cpu) { - atomic_set(&cpu->running, true); - - /* Write cpu->running before reading pending_cpus. */ - smp_mb(); - - /* 1. 
start_exclusive saw cpu->running == true and pending_cpus >= 1. - * After taking the lock we'll see cpu->has_waiter == true and run---not - * for long because start_exclusive kicked us. cpu_exec_end will - * decrement pending_cpus and signal the waiter. - * - * 2. start_exclusive saw cpu->running == false but pending_cpus >= 1. - * This includes the case when an exclusive item is running now. - * Then we'll see cpu->has_waiter == false and wait for the item to - * complete. - * - * 3. pending_cpus == 0. Then start_exclusive is definitely going to - * see cpu->running == true, and it will kick the CPU. - */ - if (unlikely(atomic_read(&pending_cpus))) { - qemu_mutex_lock(&qemu_cpu_list_lock); - if (!cpu->has_waiter) { - /* Not counted in pending_cpus, let the exclusive item - * run. Since we have the lock, just set cpu->running to true - * while holding it; no need to check pending_cpus again. - */ - atomic_set(&cpu->running, false); - exclusive_idle(); - /* Now pending_cpus is zero. */ - atomic_set(&cpu->running, true); - } else { - /* Counted in pending_cpus, go ahead and release the - * waiter at cpu_exec_end. - */ - } - qemu_mutex_unlock(&qemu_cpu_list_lock); - } + cpu_mutex_lock(cpu); + cpu_exec_exclusive_locked(cpu); + cpu->running = true; + cpu_mutex_unlock(cpu); } /* Mark cpu as not executing, and release pending exclusive ops. */ void cpu_exec_end(CPUState *cpu) { - atomic_set(&cpu->running, false); - - /* Write cpu->running before reading pending_cpus. */ - smp_mb(); - - /* 1. start_exclusive saw cpu->running == true. Then it will increment - * pending_cpus and wait for exclusive_cond. After taking the lock - * we'll see cpu->has_waiter == true. - * - * 2. start_exclusive saw cpu->running == false but here pending_cpus >= 1. - * This includes the case when an exclusive item started after setting - * cpu->running to false and before we read pending_cpus. Then we'll see - * cpu->has_waiter == false and not touch pending_cpus. 
The next call to - * cpu_exec_start will run exclusive_idle if still necessary, thus waiting - * for the item to complete. - * - * 3. pending_cpus == 0. Then start_exclusive is definitely going to - * see cpu->running == false, and it can ignore this CPU until the - * next cpu_exec_start. - */ - if (unlikely(atomic_read(&pending_cpus))) { - qemu_mutex_lock(&qemu_cpu_list_lock); - if (cpu->has_waiter) { - cpu->has_waiter = false; - atomic_set(&pending_cpus, pending_cpus - 1); - if (pending_cpus == 1) { - qemu_cond_signal(&exclusive_cond); - } - } - qemu_mutex_unlock(&qemu_cpu_list_lock); - } + cpu_mutex_lock(cpu); + cpu->running = false; + cpu_exec_exclusive_locked(cpu); + cpu_mutex_unlock(cpu); } void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, -- 2.17.1