In order to break the big lock, use a per-CPU lock (cpu_lock) in kvm_cpu_exec() to protect env->apic_state and the related fields in env from racing accesses by other vCPUs. We also need to protect against concurrent run_on_cpu().
Race condition can be like this: 1. vcpu-1 IPI vcpu-2 vcpu-3 IPI vcpu-2 Open window exists for accessing to vcpu-2's apic_state & env 2. run_on_cpu() write env->queued_work_last, while flush_queued_work() read Signed-off-by: Liu Ping Fan <pingf...@linux.vnet.ibm.com> --- cpus.c | 6 ++++-- hw/apic.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- hw/pc.c | 8 +++++++- kvm-all.c | 13 +++++++++++-- 4 files changed, 76 insertions(+), 9 deletions(-) diff --git a/cpus.c b/cpus.c index 554f7bc..ac99afe 100644 --- a/cpus.c +++ b/cpus.c @@ -649,6 +649,7 @@ void run_on_cpu(CPUArchState *env, void (*func)(void *data), void *data) wi.func = func; wi.data = data; + qemu_mutex_lock(env->cpu_lock); if (!env->queued_work_first) { env->queued_work_first = &wi; } else { @@ -657,6 +658,7 @@ void run_on_cpu(CPUArchState *env, void (*func)(void *data), void *data) env->queued_work_last = &wi; wi.next = NULL; wi.done = false; + qemu_mutex_unlock(env->cpu_lock); qemu_cpu_kick(env); while (!wi.done) { @@ -718,7 +720,7 @@ static void qemu_tcg_wait_io_event(void) static void qemu_kvm_wait_io_event(CPUArchState *env) { while (cpu_thread_is_idle(env)) { - qemu_cond_wait(env->halt_cond, &qemu_global_mutex); + qemu_cond_wait(env->halt_cond, env->cpu_lock); } qemu_kvm_eat_signals(env); @@ -729,8 +731,8 @@ static void *qemu_kvm_cpu_thread_fn(void *arg) { CPUArchState *env = arg; int r; + qemu_mutex_lock_cpu(env); - qemu_mutex_lock(&qemu_global_mutex); qemu_thread_get_self(env->thread); env->thread_id = qemu_get_thread_id(); cpu_single_env = env; diff --git a/hw/apic.c b/hw/apic.c index 4eeaf88..b999a40 100644 --- a/hw/apic.c +++ b/hw/apic.c @@ -22,6 +22,7 @@ #include "host-utils.h" #include "trace.h" #include "pc.h" +#include "qemu-thread.h" #define MAX_APIC_WORDS 8 @@ -94,6 +95,7 @@ static int get_highest_priority_int(uint32_t *tab) return -1; } +/* Caller must hold the lock */ static void apic_sync_vapic(APICCommonState *s, int sync_type) { VAPICState vapic_state; @@ -141,11 
+143,13 @@ static void apic_sync_vapic(APICCommonState *s, int sync_type) } } +/* Caller must hold lock */ static void apic_vapic_base_update(APICCommonState *s) { apic_sync_vapic(s, SYNC_TO_VAPIC); } +/* Caller must hold the lock */ static void apic_local_deliver(APICCommonState *s, int vector) { uint32_t lvt = s->lvt[vector]; @@ -175,9 +179,11 @@ static void apic_local_deliver(APICCommonState *s, int vector) (lvt & APIC_LVT_LEVEL_TRIGGER)) trigger_mode = APIC_TRIGGER_LEVEL; apic_set_irq(s, lvt & 0xff, trigger_mode); + break; } } +/* Caller must hold the lock */ void apic_deliver_pic_intr(DeviceState *d, int level) { APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); @@ -200,9 +206,12 @@ void apic_deliver_pic_intr(DeviceState *d, int level) } } +/* Must hold lock */ static void apic_external_nmi(APICCommonState *s) { + qemu_mutex_lock_cpu(s->cpu_env); apic_local_deliver(s, APIC_LVT_LINT1); + qemu_mutex_unlock_cpu(s->cpu_env); } #define foreach_apic(apic, deliver_bitmask, code) \ @@ -215,7 +224,9 @@ static void apic_external_nmi(APICCommonState *s) if (__mask & (1 << __j)) {\ apic = local_apics[__i * 32 + __j];\ if (apic) {\ + qemu_mutex_lock_cpu(apic->cpu_env);\ code;\ + qemu_mutex_unlock_cpu(apic->cpu_env);\ }\ }\ }\ @@ -244,7 +255,9 @@ static void apic_bus_deliver(const uint32_t *deliver_bitmask, if (d >= 0) { apic_iter = local_apics[d]; if (apic_iter) { + qemu_mutex_lock_cpu(apic_iter->cpu_env); apic_set_irq(apic_iter, vector_num, trigger_mode); + qemu_mutex_unlock_cpu(apic_iter->cpu_env); } } } @@ -293,6 +306,7 @@ void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode, apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num, trigger_mode); } +/* Caller must hold lock */ static void apic_set_base(APICCommonState *s, uint64_t val) { s->apicbase = (val & 0xfffff000) | @@ -305,6 +319,7 @@ static void apic_set_base(APICCommonState *s, uint64_t val) } } +/* caller must hold lock */ static void apic_set_tpr(APICCommonState *s, 
uint8_t val) { /* Updates from cr8 are ignored while the VAPIC is active */ @@ -314,12 +329,14 @@ static void apic_set_tpr(APICCommonState *s, uint8_t val) } } +/* caller must hold lock */ static uint8_t apic_get_tpr(APICCommonState *s) { apic_sync_vapic(s, SYNC_FROM_VAPIC); return s->tpr >> 4; } +/* Caller must hold the lock */ static int apic_get_ppr(APICCommonState *s) { int tpr, isrv, ppr; @@ -348,6 +365,7 @@ static int apic_get_arb_pri(APICCommonState *s) * 0 - no interrupt, * >0 - interrupt number */ +/* Caller must hold cpu_lock */ static int apic_irq_pending(APICCommonState *s) { int irrv, ppr; @@ -363,6 +381,7 @@ static int apic_irq_pending(APICCommonState *s) return irrv; } +/* caller must ensure the lock has been taken */ /* signal the CPU if an irq is pending */ static void apic_update_irq(APICCommonState *s) { @@ -380,11 +399,13 @@ static void apic_update_irq(APICCommonState *s) void apic_poll_irq(DeviceState *d) { APICCommonState *s = APIC_COMMON(d); - + qemu_mutex_lock_cpu(s->cpu_env); apic_sync_vapic(s, SYNC_FROM_VAPIC); apic_update_irq(s); + qemu_mutex_unlock_cpu(s->cpu_env); } +/* caller must hold the lock */ static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode) { apic_report_irq_delivered(!get_bit(s->irr, vector_num)); @@ -407,6 +428,7 @@ static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode) apic_update_irq(s); } +/* caller must hold the lock */ static void apic_eoi(APICCommonState *s) { int isrv; @@ -415,6 +437,7 @@ static void apic_eoi(APICCommonState *s) return; reset_bit(s->isr, isrv); if (!(s->spurious_vec & APIC_SV_DIRECTED_IO) && get_bit(s->tmr, isrv)) { + //fix me,need to take extra lock for the bus? 
ioapic_eoi_broadcast(isrv); } apic_sync_vapic(s, SYNC_FROM_VAPIC | SYNC_TO_VAPIC); @@ -480,18 +503,23 @@ static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask, static void apic_startup(APICCommonState *s, int vector_num) { s->sipi_vector = vector_num; + qemu_mutex_lock_cpu(s->cpu_env); cpu_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI); + qemu_mutex_unlock_cpu(s->cpu_env); } void apic_sipi(DeviceState *d) { APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); - + qemu_mutex_lock_cpu(s->cpu_env); cpu_reset_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI); - if (!s->wait_for_sipi) + if (!s->wait_for_sipi) { + qemu_mutex_unlock_cpu(s->cpu_env); return; + } cpu_x86_load_seg_cache_sipi(s->cpu_env, s->sipi_vector); + qemu_mutex_unlock_cpu(s->cpu_env); s->wait_for_sipi = 0; } @@ -543,6 +571,7 @@ static void apic_deliver(DeviceState *d, uint8_t dest, uint8_t dest_mode, apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num, trigger_mode); } +/* Caller must take lock */ int apic_get_interrupt(DeviceState *d) { APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); @@ -572,6 +601,7 @@ int apic_get_interrupt(DeviceState *d) return intno; } +/* Caller must hold the cpu_lock */ int apic_accept_pic_intr(DeviceState *d) { APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); @@ -589,6 +619,7 @@ int apic_accept_pic_intr(DeviceState *d) return 0; } +/* Caller hold lock */ static uint32_t apic_get_current_count(APICCommonState *s) { int64_t d; @@ -619,9 +650,10 @@ static void apic_timer_update(APICCommonState *s, int64_t current_time) static void apic_timer(void *opaque) { APICCommonState *s = opaque; - + qemu_mutex_lock_cpu(s->cpu_env); apic_local_deliver(s, APIC_LVT_TIMER); apic_timer_update(s, s->next_time); + qemu_mutex_unlock_cpu(s->cpu_env); } static uint32_t apic_mem_readb(void *opaque, target_phys_addr_t addr) @@ -664,18 +696,22 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr) val = 0x11 | ((APIC_LVT_NB - 1) << 16); 
/* version 0x11 */ break; case 0x08: + qemu_mutex_lock_cpu(s->cpu_env); apic_sync_vapic(s, SYNC_FROM_VAPIC); if (apic_report_tpr_access) { cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_READ); } val = s->tpr; + qemu_mutex_unlock_cpu(s->cpu_env); break; case 0x09: val = apic_get_arb_pri(s); break; case 0x0a: /* ppr */ + qemu_mutex_lock_cpu(s->cpu_env); val = apic_get_ppr(s); + qemu_mutex_unlock_cpu(s->cpu_env); break; case 0x0b: val = 0; @@ -712,7 +748,9 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr) val = s->initial_count; break; case 0x39: + qemu_mutex_lock_cpu(s->cpu_env); val = apic_get_current_count(s); + qemu_mutex_unlock_cpu(s->cpu_env); break; case 0x3e: val = s->divide_conf; @@ -767,18 +805,22 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) case 0x03: break; case 0x08: + qemu_mutex_lock_cpu(s->cpu_env); if (apic_report_tpr_access) { cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_WRITE); } s->tpr = val; apic_sync_vapic(s, SYNC_TO_VAPIC); apic_update_irq(s); + qemu_mutex_unlock_cpu(s->cpu_env); break; case 0x09: case 0x0a: break; case 0x0b: /* EOI */ + qemu_mutex_lock_cpu(s->cpu_env); apic_eoi(s); + qemu_mutex_unlock_cpu(s->cpu_env); break; case 0x0d: s->log_dest = val >> 24; @@ -787,8 +829,10 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) s->dest_mode = val >> 28; break; case 0x0f: + qemu_mutex_lock_cpu(s->cpu_env); s->spurious_vec = val & 0x1ff; apic_update_irq(s); + qemu_mutex_unlock_cpu(s->cpu_env); break; case 0x10 ... 0x17: case 0x18 ... 0x1f: @@ -807,24 +851,30 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) case 0x32 ... 
0x37: { int n = index - 0x32; + qemu_mutex_lock_cpu(s->cpu_env); s->lvt[n] = val; if (n == APIC_LVT_TIMER) apic_timer_update(s, qemu_get_clock_ns(vm_clock)); + qemu_mutex_unlock_cpu(s->cpu_env); } break; case 0x38: + qemu_mutex_lock_cpu(s->cpu_env); s->initial_count = val; s->initial_count_load_time = qemu_get_clock_ns(vm_clock); apic_timer_update(s, s->initial_count_load_time); + qemu_mutex_unlock_cpu(s->cpu_env); break; case 0x39: break; case 0x3e: { int v; + qemu_mutex_lock_cpu(s->cpu_env); s->divide_conf = val & 0xb; v = (s->divide_conf & 3) | ((s->divide_conf >> 1) & 4); s->count_shift = (v + 1) & 7; + qemu_mutex_unlock_cpu(s->cpu_env); } break; default: diff --git a/hw/pc.c b/hw/pc.c index 4d34a33..5e7350c 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -147,7 +147,7 @@ void cpu_smm_update(CPUX86State *env) smm_set(!!(env->hflags & HF_SMM_MASK), smm_arg); } - +/* Called by kvm_cpu_exec(), already with lock protection */ /* IRQ handling */ int cpu_get_pic_interrupt(CPUX86State *env) { @@ -173,9 +173,15 @@ static void pic_irq_request(void *opaque, int irq, int level) DPRINTF("pic_irqs: %s irq %d\n", level? 
"raise" : "lower", irq); if (env->apic_state) { while (env) { + if (!qemu_cpu_is_self(env)) { + qemu_mutex_lock_cpu(env); + } if (apic_accept_pic_intr(env->apic_state)) { apic_deliver_pic_intr(env->apic_state, level); } + if (!qemu_cpu_is_self(env)) { + qemu_mutex_unlock_cpu(env); + } env = env->next_cpu; } } else { diff --git a/kvm-all.c b/kvm-all.c index 9b73ccf..dc9aa54 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -29,6 +29,7 @@ #include "bswap.h" #include "memory.h" #include "exec-memory.h" +#include "qemu-thread.h" /* This check must be after config-host.h is included */ #ifdef CONFIG_EVENTFD @@ -1252,12 +1253,15 @@ int kvm_cpu_exec(CPUArchState *env) */ qemu_cpu_kick_self(); } - qemu_mutex_unlock_iothread(); + qemu_mutex_unlock(env->cpu_lock); run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); - qemu_mutex_lock_iothread(); + qemu_mutex_lock(env->cpu_lock); kvm_arch_post_run(env, run); + qemu_mutex_unlock_cpu(env); + + qemu_mutex_lock_iothread(); kvm_flush_coalesced_mmio_buffer(); @@ -1265,6 +1269,8 @@ int kvm_cpu_exec(CPUArchState *env) if (run_ret == -EINTR || run_ret == -EAGAIN) { DPRINTF("io window exit\n"); ret = EXCP_INTERRUPT; + qemu_mutex_unlock_iothread(); + qemu_mutex_lock_cpu(env); break; } fprintf(stderr, "error: kvm run failed %s\n", @@ -1312,6 +1318,9 @@ int kvm_cpu_exec(CPUArchState *env) ret = kvm_arch_handle_exit(env, run); break; } + + qemu_mutex_unlock_iothread(); + qemu_mutex_lock_cpu(env); } while (ret == 0); if (ret < 0) { -- 1.7.4.4