On Sat, Jun 23, 2012 at 4:06 AM, Anthony Liguori <anth...@codemonkey.ws> wrote:
> On 06/21/2012 09:49 AM, Liu Ping Fan wrote:
>>
>> In order to break the big lock, using per-cpu_lock in kvm_cpu_exec()
>> to protect the race from other cpu's access to env->apic_state & related
>> field in env.
>> Also, we need to protect against run_on_cpu().
>>
>> Race condition can be like this:
>> 1. vcpu-1 IPI vcpu-2
>>    vcpu-3 IPI vcpu-2
>>    Open window exists for accessing to vcpu-2's apic_state & env
>>
>> 2. run_on_cpu() write env->queued_work_last, while flush_queued_work()
>>    read
>>
>> Signed-off-by: Liu Ping Fan <pingf...@linux.vnet.ibm.com>
>> ---
>>  cpus.c    |    6 ++++--
>>  hw/apic.c |   58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>>  hw/pc.c   |    8 +++++++-
>>  kvm-all.c |   13 +++++++++++--
>>  4 files changed, 76 insertions(+), 9 deletions(-)
>>
>> diff --git a/cpus.c b/cpus.c
>> index 554f7bc..ac99afe 100644
>> --- a/cpus.c
>> +++ b/cpus.c
>> @@ -649,6 +649,7 @@ void run_on_cpu(CPUArchState *env, void (*func)(void *data), void *data)
>>
>>      wi.func = func;
>>      wi.data = data;
>> +    qemu_mutex_lock(env->cpu_lock);
>>      if (!env->queued_work_first) {
>>          env->queued_work_first = &wi;
>>      } else {
>> @@ -657,6 +658,7 @@ void run_on_cpu(CPUArchState *env, void (*func)(void *data), void *data)
>>      env->queued_work_last = &wi;
>>      wi.next = NULL;
>>      wi.done = false;
>> +    qemu_mutex_unlock(env->cpu_lock);
>>
>>      qemu_cpu_kick(env);
>>      while (!wi.done) {
>> @@ -718,7 +720,7 @@ static void qemu_tcg_wait_io_event(void)
>>  static void qemu_kvm_wait_io_event(CPUArchState *env)
>>  {
>>      while (cpu_thread_is_idle(env)) {
>> -        qemu_cond_wait(env->halt_cond, &qemu_global_mutex);
>> +        qemu_cond_wait(env->halt_cond, env->cpu_lock);
>>      }
>>
>>      qemu_kvm_eat_signals(env);
>> @@ -729,8 +731,8 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>>  {
>>      CPUArchState *env = arg;
>>      int r;
>> +    qemu_mutex_lock_cpu(env);
>>
>> -    qemu_mutex_lock(&qemu_global_mutex);
>>      qemu_thread_get_self(env->thread);
>>      env->thread_id = qemu_get_thread_id();
>>      cpu_single_env = env;
>> diff --git a/hw/apic.c b/hw/apic.c
>> index 4eeaf88..b999a40 100644
>> --- a/hw/apic.c
>> +++ b/hw/apic.c
>> @@ -22,6 +22,7 @@
>>  #include "host-utils.h"
>>  #include "trace.h"
>>  #include "pc.h"
>> +#include "qemu-thread.h"
>>
>>  #define MAX_APIC_WORDS 8
>>
>> @@ -94,6 +95,7 @@ static int get_highest_priority_int(uint32_t *tab)
>>      return -1;
>>  }
>>
>> +/* Caller must hold the lock */
>>  static void apic_sync_vapic(APICCommonState *s, int sync_type)
>>  {
>>      VAPICState vapic_state;
>> @@ -141,11 +143,13 @@ static void apic_sync_vapic(APICCommonState *s, int sync_type)
>>      }
>>  }
>>
>> +/* Caller must hold lock */
>>  static void apic_vapic_base_update(APICCommonState *s)
>>  {
>>      apic_sync_vapic(s, SYNC_TO_VAPIC);
>>  }
>>
>> +/* Caller must hold the lock */
>>  static void apic_local_deliver(APICCommonState *s, int vector)
>>  {
>>      uint32_t lvt = s->lvt[vector];
>> @@ -175,9 +179,11 @@ static void apic_local_deliver(APICCommonState *s, int vector)
>>              (lvt & APIC_LVT_LEVEL_TRIGGER))
>>              trigger_mode = APIC_TRIGGER_LEVEL;
>>          apic_set_irq(s, lvt & 0xff, trigger_mode);
>> +        break;
>>      }
>>  }
>>
>> +/* Caller must hold the lock */
>>  void apic_deliver_pic_intr(DeviceState *d, int level)
>>  {
>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>> @@ -200,9 +206,12 @@ void apic_deliver_pic_intr(DeviceState *d, int level)
>>      }
>>  }
>>
>> +/* Must hold lock */
>>  static void apic_external_nmi(APICCommonState *s)
>>  {
>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>      apic_local_deliver(s, APIC_LVT_LINT1);
>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>  }
>>
>>  #define foreach_apic(apic, deliver_bitmask, code) \
>> @@ -215,7 +224,9 @@ static void apic_external_nmi(APICCommonState *s)
>>              if (__mask & (1 << __j)) {\
>>                  apic = local_apics[__i * 32 + __j];\
>>                  if (apic) {\
>> +                    qemu_mutex_lock_cpu(apic->cpu_env);\
>>                      code;\
>> +                    qemu_mutex_unlock_cpu(apic->cpu_env);\
>>                  }\
>>              }\
>>          }\
>> @@ -244,7 +255,9 @@ static void apic_bus_deliver(const uint32_t *deliver_bitmask,
>>              if (d >= 0) {
>>                  apic_iter = local_apics[d];
>>                  if (apic_iter) {
>> +                    qemu_mutex_lock_cpu(apic_iter->cpu_env);
>>                      apic_set_irq(apic_iter, vector_num, trigger_mode);
>> +                    qemu_mutex_unlock_cpu(apic_iter->cpu_env);
>>                  }
>>              }
>>          }
>> @@ -293,6 +306,7 @@ void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode,
>>      apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num, trigger_mode);
>>  }
>>
>> +/* Caller must hold lock */
>>  static void apic_set_base(APICCommonState *s, uint64_t val)
>>  {
>>      s->apicbase = (val & 0xfffff000) |
>>
>> @@ -305,6 +319,7 @@ static void apic_set_base(APICCommonState *s, uint64_t val)
>>      }
>>  }
>>
>> +/* caller must hold lock */
>>  static void apic_set_tpr(APICCommonState *s, uint8_t val)
>>  {
>>      /* Updates from cr8 are ignored while the VAPIC is active */
>> @@ -314,12 +329,14 @@ static void apic_set_tpr(APICCommonState *s, uint8_t val)
>>      }
>>  }
>>
>> +/* caller must hold lock */
>>  static uint8_t apic_get_tpr(APICCommonState *s)
>>  {
>>      apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>      return s->tpr >> 4;
>>  }
>>
>> +/* Caller must hold the lock */
>>  static int apic_get_ppr(APICCommonState *s)
>>  {
>>      int tpr, isrv, ppr;
>> @@ -348,6 +365,7 @@ static int apic_get_arb_pri(APICCommonState *s)
>>   * 0 - no interrupt,
>>   * >0 - interrupt number
>>   */
>> +/* Caller must hold cpu_lock */
>>  static int apic_irq_pending(APICCommonState *s)
>>  {
>>      int irrv, ppr;
>> @@ -363,6 +381,7 @@ static int apic_irq_pending(APICCommonState *s)
>>      return irrv;
>>  }
>>
>> +/* caller must ensure the lock has been taken */
>>  /* signal the CPU if an irq is pending */
>>  static void apic_update_irq(APICCommonState *s)
>>  {
>> @@ -380,11 +399,13 @@ static void apic_update_irq(APICCommonState *s)
>>  void apic_poll_irq(DeviceState *d)
>>  {
>>      APICCommonState *s = APIC_COMMON(d);
>> -
>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>      apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>      apic_update_irq(s);
>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>  }
>>
>> +/* caller must hold the lock */
>>  static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode)
>>  {
>>      apic_report_irq_delivered(!get_bit(s->irr, vector_num));
>> @@ -407,6 +428,7 @@ static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode)
>>      apic_update_irq(s);
>>  }
>>
>> +/* caller must hold the lock */
>>  static void apic_eoi(APICCommonState *s)
>>  {
>>      int isrv;
>> @@ -415,6 +437,7 @@ static void apic_eoi(APICCommonState *s)
>>          return;
>>      reset_bit(s->isr, isrv);
>>      if (!(s->spurious_vec & APIC_SV_DIRECTED_IO) && get_bit(s->tmr, isrv)) {
>> +        //fix me,need to take extra lock for the bus?
>>          ioapic_eoi_broadcast(isrv);
>>      }
>>      apic_sync_vapic(s, SYNC_FROM_VAPIC | SYNC_TO_VAPIC);
>> @@ -480,18 +503,23 @@ static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
>>  static void apic_startup(APICCommonState *s, int vector_num)
>>  {
>>      s->sipi_vector = vector_num;
>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>      cpu_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI);
>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>  }
>>
>>  void apic_sipi(DeviceState *d)
>>  {
>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>> -
>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>      cpu_reset_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI);
>>
>> -    if (!s->wait_for_sipi)
>> +    if (!s->wait_for_sipi) {
>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>          return;
>> +    }
>>      cpu_x86_load_seg_cache_sipi(s->cpu_env, s->sipi_vector);
>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>      s->wait_for_sipi = 0;
>>  }
>>
>> @@ -543,6 +571,7 @@ static void apic_deliver(DeviceState *d, uint8_t dest, uint8_t dest_mode,
>>      apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num, trigger_mode);
>>  }
>>
>> +/* Caller must take lock */
>>  int apic_get_interrupt(DeviceState *d)
>>  {
>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>> @@ -572,6 +601,7 @@ int apic_get_interrupt(DeviceState *d)
>>      return intno;
>>  }
>>
>> +/* Caller must hold the cpu_lock */
>>  int apic_accept_pic_intr(DeviceState *d)
>>  {
>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>> @@ -589,6 +619,7 @@ int apic_accept_pic_intr(DeviceState *d)
>>      return 0;
>>  }
>>
>> +/* Caller hold lock */
>>  static uint32_t apic_get_current_count(APICCommonState *s)
>>  {
>>      int64_t d;
>> @@ -619,9 +650,10 @@ static void apic_timer_update(APICCommonState *s, int64_t current_time)
>>  static void apic_timer(void *opaque)
>>  {
>>      APICCommonState *s = opaque;
>> -
>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>      apic_local_deliver(s, APIC_LVT_TIMER);
>>      apic_timer_update(s, s->next_time);
>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>  }
>>
>>  static uint32_t apic_mem_readb(void *opaque, target_phys_addr_t addr)
>> @@ -664,18 +696,22 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
>>          val = 0x11 | ((APIC_LVT_NB - 1) << 16); /* version 0x11 */
>>          break;
>>      case 0x08:
>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>          apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>          if (apic_report_tpr_access) {
>>              cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_READ);
>>          }
>>          val = s->tpr;
>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>          break;
>>      case 0x09:
>>          val = apic_get_arb_pri(s);
>>          break;
>>      case 0x0a:
>>          /* ppr */
>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>          val = apic_get_ppr(s);
>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>          break;
>>      case 0x0b:
>>          val = 0;
>> @@ -712,7 +748,9 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
>>          val = s->initial_count;
>>          break;
>>      case 0x39:
>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>          val = apic_get_current_count(s);
>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>          break;
>>      case 0x3e:
>>          val = s->divide_conf;
>> @@ -767,18 +805,22 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>      case 0x03:
>>          break;
>>      case 0x08:
>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>          if (apic_report_tpr_access) {
>>              cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_WRITE);
>>          }
>>          s->tpr = val;
>>          apic_sync_vapic(s, SYNC_TO_VAPIC);
>>          apic_update_irq(s);
>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>          break;
>>      case 0x09:
>>      case 0x0a:
>>          break;
>>      case 0x0b: /* EOI */
>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>          apic_eoi(s);
>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>          break;
>>      case 0x0d:
>>          s->log_dest = val >> 24;
>> @@ -787,8 +829,10 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>          s->dest_mode = val >> 28;
>>          break;
>>      case 0x0f:
>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>          s->spurious_vec = val & 0x1ff;
>>          apic_update_irq(s);
>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>          break;
>>      case 0x10 ... 0x17:
>>      case 0x18 ... 0x1f:
>> @@ -807,24 +851,30 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>      case 0x32 ... 0x37:
>>          {
>>              int n = index - 0x32;
>> +            qemu_mutex_lock_cpu(s->cpu_env);
>>              s->lvt[n] = val;
>>              if (n == APIC_LVT_TIMER)
>>                  apic_timer_update(s, qemu_get_clock_ns(vm_clock));
>> +            qemu_mutex_unlock_cpu(s->cpu_env);
>>          }
>>          break;
>>      case 0x38:
>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>          s->initial_count = val;
>>          s->initial_count_load_time = qemu_get_clock_ns(vm_clock);
>>          apic_timer_update(s, s->initial_count_load_time);
>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>          break;
>>      case 0x39:
>>          break;
>>      case 0x3e:
>>          {
>>              int v;
>> +            qemu_mutex_lock_cpu(s->cpu_env);
>>              s->divide_conf = val & 0xb;
>>              v = (s->divide_conf & 3) | ((s->divide_conf >> 1) & 4);
>>              s->count_shift = (v + 1) & 7;
>> +            qemu_mutex_unlock_cpu(s->cpu_env);
>>          }
>>          break;
>>      default:
>> diff --git a/hw/pc.c b/hw/pc.c
>> index 4d34a33..5e7350c 100644
>> --- a/hw/pc.c
>> +++ b/hw/pc.c
>> @@ -147,7 +147,7 @@ void cpu_smm_update(CPUX86State *env)
>>      smm_set(!!(env->hflags & HF_SMM_MASK), smm_arg);
>>  }
>>
>> -
>> +/* Called by kvm_cpu_exec(), already with lock protection */
>>  /* IRQ handling */
>>  int cpu_get_pic_interrupt(CPUX86State *env)
>>  {
>> @@ -173,9 +173,15 @@ static void pic_irq_request(void *opaque, int irq, int level)
>>      DPRINTF("pic_irqs: %s irq %d\n", level ? "raise" : "lower", irq);
>>      if (env->apic_state) {
>>          while (env) {
>> +            if (!qemu_cpu_is_self(env)) {
>> +                qemu_mutex_lock_cpu(env);
>> +            }
>>              if (apic_accept_pic_intr(env->apic_state)) {
>>                  apic_deliver_pic_intr(env->apic_state, level);
>>              }
>> +            if (!qemu_cpu_is_self(env)) {
>> +                qemu_mutex_unlock_cpu(env);
>> +            }
>>              env = env->next_cpu;
>>          }
>>      } else {
>> diff --git a/kvm-all.c b/kvm-all.c
>> index 9b73ccf..dc9aa54 100644
>> --- a/kvm-all.c
>> +++ b/kvm-all.c
>> @@ -29,6 +29,7 @@
>>  #include "bswap.h"
>>  #include "memory.h"
>>  #include "exec-memory.h"
>> +#include "qemu-thread.h"
>>
>>  /* This check must be after config-host.h is included */
>>  #ifdef CONFIG_EVENTFD
>> @@ -1252,12 +1253,15 @@ int kvm_cpu_exec(CPUArchState *env)
>>               */
>>              qemu_cpu_kick_self();
>>          }
>> -        qemu_mutex_unlock_iothread();
>> +        qemu_mutex_unlock(env->cpu_lock);
>>
>>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>
>> -        qemu_mutex_lock_iothread();
>> +        qemu_mutex_lock(env->cpu_lock);
>>          kvm_arch_post_run(env, run);
>> +        qemu_mutex_unlock_cpu(env);
>> +
>> +        qemu_mutex_lock_iothread();
>>
>>          kvm_flush_coalesced_mmio_buffer();
>>
>> @@ -1265,6 +1269,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>          if (run_ret == -EINTR || run_ret == -EAGAIN) {
>>              DPRINTF("io window exit\n");
>>              ret = EXCP_INTERRUPT;
>> +            qemu_mutex_unlock_iothread();
>> +            qemu_mutex_lock_cpu(env);
>>              break;
>>          }
>>          fprintf(stderr, "error: kvm run failed %s\n",
>> @@ -1312,6 +1318,9 @@ int kvm_cpu_exec(CPUArchState *env)
>>              ret = kvm_arch_handle_exit(env, run);
>>              break;
>>          }
>> +
>> +        qemu_mutex_unlock_iothread();
>> +        qemu_mutex_lock_cpu(env);
>>      } while (ret == 0);
>
> This really confuses me.
>
> Shouldn't we be moving qemu_mutex_unlock_iothread() to the very top of
> kvm_cpu_exec()?
>
Sorry, the code is not arranged clearly enough yet, but I think the final
structure will look like this:

    do {
        qemu_mutex_lock_iothread()
        kvm_flush_coalesced_mmio_buffer();
        qemu_mutex_unlock_iothread()

        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            qemu_mutex_lock_iothread()
            kvm_handle_io();
            qemu_mutex_unlock_iothread()
            break;
        case KVM_EXIT_MMIO:
            qemu_mutex_lock_iothread()
            cpu_physical_memory_rw()
            qemu_mutex_unlock_iothread()
            break;
        ................
    } while ()

Right? Can we push qemu_mutex_lock_iothread() out of the do {} loop?
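To spell out the locking I have in mind (only a sketch of the intended lock
ordering, not the final patch: the exit handlers are abbreviated, ret is set
by them exactly as today, the pre/post-run placement is just indicative, and
qemu_mutex_lock_cpu()/qemu_mutex_unlock_cpu() are the helpers introduced by
this series):

    int kvm_cpu_exec(CPUArchState *env)
    {
        struct kvm_run *run = env->kvm_run;
        int ret = 0, run_ret;

        /* Entered with this vcpu's cpu_lock held (taken in
         * qemu_kvm_cpu_thread_fn), not with the BQL. */
        do {
            kvm_arch_pre_run(env, run);
            qemu_mutex_unlock_cpu(env);        /* hold nothing across KVM_RUN */

            run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);

            qemu_mutex_lock_cpu(env);
            kvm_arch_post_run(env, run);
            qemu_mutex_unlock_cpu(env);        /* never take the BQL with cpu_lock held */

            qemu_mutex_lock_iothread();
            kvm_flush_coalesced_mmio_buffer();
            qemu_mutex_unlock_iothread();

            switch (run->exit_reason) {
            case KVM_EXIT_IO:
                qemu_mutex_lock_iothread();
                /* kvm_handle_io(...); */
                qemu_mutex_unlock_iothread();
                break;
            case KVM_EXIT_MMIO:
                qemu_mutex_lock_iothread();
                /* cpu_physical_memory_rw(...); */
                qemu_mutex_unlock_iothread();
                break;
            /* ... other exit reasons, each taking the BQL only while it
             * actually touches device or memory state ... */
            }

            qemu_mutex_lock_cpu(env);          /* back under the per-vcpu lock */
        } while (ret == 0);

        return ret;
    }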
>
> There's only a fixed amount of state that gets manipulated too in
> kvm_cpu_exec() so shouldn't we try to specifically describe what state is
> covered by cpu_lock?
>
In the current code, apart from the race where run_on_cpu() writes
env->queued_work_last while flush_queued_work() reads it, the BQL protects
CPUArchState and the device state from the other threads. As for
CPUArchState, the emulated registers are local to the vcpu thread, so the
only part other threads can touch is the irq emulation state. That consists
of env's interrupt_request, eflags, halted, exit_request and
exception_injected, and all of them are derived from env->apic_state, so we
treat them as one atomic context: while env's irq context is being updated,
apic_state is not allowed to change.

> What's your strategy for ensuring that all accesses to that state is
> protected by cpu_lock?
>
The cpu_lock protects env->apic_state and all of the related irq emulation
context. apic_state is the only entrance through which other threads can
affect this env's irq emulation context, so when another thread wants to
access this_env->apic_state, it must acquire the cpu_lock first (a rough
sketch of that pattern is at the very end of this mail).

Thanks and regards,
pingfan

> Regards,
>
> Anthony Liguori
>
>>
>>      if (ret < 0) {
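P.S. To make the access rule above concrete: any code path that is not
running on the target vcpu's own thread takes that vcpu's lock around the
apic access, as pic_irq_request() and the foreach_apic() users do in this
patch. The helper below does not exist in the patch; it is only an
illustration of the pattern:

    /* Illustration only: every "remote" accessor must take the target
     * vcpu's lock before touching its apic_state or the irq fields
     * derived from it (interrupt_request, halted, exit_request, ...). */
    static void deliver_to_remote_apic(APICCommonState *apic, int vector,
                                       int trigger_mode)
    {
        bool remote = !qemu_cpu_is_self(apic->cpu_env);

        if (remote) {
            qemu_mutex_lock_cpu(apic->cpu_env);
        }
        apic_set_irq(apic, vector, trigger_mode);   /* may end in cpu_interrupt() */
        if (remote) {
            qemu_mutex_unlock_cpu(apic->cpu_env);
        }
    }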