> On 9 Jun 2015, at 11:12, Alex Bennée <alex.ben...@linaro.org> wrote:
>
>
> fred.kon...@greensocs.com writes:
>
>> From: KONRAD Frederic <fred.kon...@greensocs.com>
>>
>> This mechanism replaces the existing load/store exclusive mechanism, which
>> seems to be broken for multithread.
>> It follows the intention of the existing mechanism: it stores the target
>> address and data values during a load operation and checks that they remain
>> unchanged before a store.
>>
>> In common with the older approach, this provides weaker semantics than
>> required, in that a different processor could write the same value with a
>> non-exclusive write; in practice, however, this seems to be irrelevant.
>
> You can WFE on the global monitor and use it for a lockless sleep on a
> flag value. I don't think the Linux kernel does it but certainly
> anything trying to be power efficient may use it.
>

I don't see how this changes the logic, actually. The assumption is - I
believe - that you spin waiting for a mutex to be free using ldrex/strex,
and if you fail to get the mutex you execute a WFE to lower your power
consumption. I don't see how that changes the semantics of the ldrex/strex
part of the equation. As far as I can see, the semantics we have are still
robust enough to handle that.
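For concreteness, the sort of guest code I have in mind looks roughly like
this (a sketch only - assuming an ARMv7 guest and GCC-style extended inline
asm, not lifted from any real kernel; a real implementation would also issue
a dmb barrier after acquisition):

/* Spin on a lock word with ldrex/strex, dropping into WFE while the
 * monitor says the word is still taken. */
static inline void spin_lock(volatile unsigned int *lock)
{
    unsigned int tmp;

    __asm__ __volatile__(
        "1: ldrex   %0, [%1]\n"     /* load lock word, set exclusive monitor */
        "   teq     %0, #0\n"       /* is it free? */
        "   wfene\n"                /* held: sleep until an event */
        "   strexeq %0, %2, [%1]\n" /* free: try to claim it */
        "   teq     %0, #0\n"       /* did the strex succeed? */
        "   bne     1b\n"           /* no: go round again */
        : "=&r" (tmp)
        : "r" (lock), "r" (1)
        : "cc", "memory");
}

The unlock side would write zero and issue dsb; sev to wake the sleepers -
but either way the waiter goes back through ldrex/strex before it owns the
lock, and that is the part this patch has to model.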
>>
>> The old implementation didn't correctly store its values as globals, but
>> rather kept a local copy per CPU.
>>
>> This new mechanism stores the values globally and also uses the atomic
>> cmpxchg macros to ensure atomicity - it is therefore very efficient and
>> thread-safe.
>>
>> Signed-off-by: KONRAD Frederic <fred.kon...@greensocs.com>
>> ---
>>  target-arm/cpu.c       |  21 +++++++
>>  target-arm/cpu.h       |   6 ++
>>  target-arm/helper.c    |  12 ++++
>>  target-arm/helper.h    |   6 ++
>>  target-arm/op_helper.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++-
>>  target-arm/translate.c |  98 ++++++---------------------------
>>  6 files changed, 207 insertions(+), 82 deletions(-)
>>
>> diff --git a/target-arm/cpu.c b/target-arm/cpu.c
>> index 4a888ab..0400061 100644
>> --- a/target-arm/cpu.c
>> +++ b/target-arm/cpu.c
>> @@ -31,6 +31,26 @@
>>  #include "sysemu/kvm.h"
>>  #include "kvm_arm.h"
>>
>> +/* Protect cpu_exclusive_* variable .*/
>> +__thread bool cpu_have_exclusive_lock;
>> +QemuMutex cpu_exclusive_lock;
>> +
>> +inline void arm_exclusive_lock(void)
>> +{
>> +    if (!cpu_have_exclusive_lock) {
>> +        qemu_mutex_lock(&cpu_exclusive_lock);
>> +        cpu_have_exclusive_lock = true;
>> +    }
>> +}
>> +
>> +inline void arm_exclusive_unlock(void)
>> +{
>> +    if (cpu_have_exclusive_lock) {
>> +        cpu_have_exclusive_lock = false;
>> +        qemu_mutex_unlock(&cpu_exclusive_lock);
>> +    }
>> +}
>
> I don't quite follow. If these locks are meant to be protecting access to
> variables then how do they do that? The lock won't block if another
> thread is currently messing with the protected values.
>

We are only protecting the stored exclusive values here, as the current
implementation has them, but this time at least they are CPU specific. It's
not protecting the actual memory address.
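To make the intent concrete, here is the same pattern restated standalone,
with QemuMutex swapped for a plain pthread mutex (an illustration of the
quoted code above, not the patch itself):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t cpu_exclusive_lock = PTHREAD_MUTEX_INITIALIZER;
static __thread bool cpu_have_exclusive_lock;

/* Idempotent per thread: a helper that already holds the lock can call
 * another helper that takes it again without deadlocking, and a single
 * unlock then releases it. */
static void arm_exclusive_lock(void)
{
    if (!cpu_have_exclusive_lock) {
        pthread_mutex_lock(&cpu_exclusive_lock);
        cpu_have_exclusive_lock = true;
    }
}

static void arm_exclusive_unlock(void)
{
    if (cpu_have_exclusive_lock) {
        cpu_have_exclusive_lock = false;
        pthread_mutex_unlock(&cpu_exclusive_lock);
    }
}

The point is that all of the exclusive_addr/exclusive_val state is only ever
touched while this mutex is held; the __thread flag just makes lock/unlock
idempotent within a thread so the helpers can nest them.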
>> +
>>  static void arm_cpu_set_pc(CPUState *cs, vaddr value)
>>  {
>>      ARMCPU *cpu = ARM_CPU(cs);
>> @@ -425,6 +445,7 @@ static void arm_cpu_initfn(Object *obj)
>>          cpu->psci_version = 2; /* TCG implements PSCI 0.2 */
>>          if (!inited) {
>>              inited = true;
>> +            qemu_mutex_init(&cpu_exclusive_lock);
>>              arm_translate_init();
>>          }
>>      }
>> diff --git a/target-arm/cpu.h b/target-arm/cpu.h
>> index 21b5b8e..257ed05 100644
>> --- a/target-arm/cpu.h
>> +++ b/target-arm/cpu.h
>> @@ -506,6 +506,9 @@ static inline bool is_a64(CPUARMState *env)
>>  int cpu_arm_signal_handler(int host_signum, void *pinfo,
>>                             void *puc);
>>
>> +int arm_get_phys_addr(CPUARMState *env, target_ulong address, int access_type,
>> +                      hwaddr *phys_ptr, int *prot, target_ulong *page_size);
>> +
>>  /**
>>   * pmccntr_sync
>>   * @env: CPUARMState
>> @@ -1922,4 +1925,7 @@ enum {
>>      QEMU_PSCI_CONDUIT_HVC = 2,
>>  };
>>
>> +void arm_exclusive_lock(void);
>> +void arm_exclusive_unlock(void);
>> +
>>  #endif
>> diff --git a/target-arm/helper.c b/target-arm/helper.c
>> index 3da0c05..31ee1e5 100644
>> --- a/target-arm/helper.c
>> +++ b/target-arm/helper.c
>> @@ -23,6 +23,14 @@ static inline int get_phys_addr(CPUARMState *env, target_ulong address,
>>  #define PMCRE 0x1
>>  #endif
>>
>> +int arm_get_phys_addr(CPUARMState *env, target_ulong address, int access_type,
>> +                      hwaddr *phys_ptr, int *prot, target_ulong *page_size)
>> +{
>> +    MemTxAttrs attrs = {};
>> +    return get_phys_addr(env, address, access_type, cpu_mmu_index(env),
>> +                         phys_ptr, &attrs, prot, page_size);
>> +}
>> +
>>  static int vfp_gdb_get_reg(CPUARMState *env, uint8_t *buf, int reg)
>>  {
>>      int nregs;
>> @@ -4731,6 +4739,10 @@ void arm_cpu_do_interrupt(CPUState *cs)
>>
>>      arm_log_exception(cs->exception_index);
>>
>> +    arm_exclusive_lock();
>> +    env->exclusive_addr = -1;
>> +    arm_exclusive_unlock();
>> +
>>      if (arm_is_psci_call(cpu, cs->exception_index)) {
>>          arm_handle_psci_call(cpu);
>>          qemu_log_mask(CPU_LOG_INT, "...handled as PSCI call\n");
>> diff --git a/target-arm/helper.h b/target-arm/helper.h
>> index fc885de..63c2809 100644
>> --- a/target-arm/helper.h
>> +++ b/target-arm/helper.h
>> @@ -529,6 +529,12 @@ DEF_HELPER_2(dc_zva, void, env, i64)
>>  DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
>>  DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
>>
>> +DEF_HELPER_4(atomic_cmpxchg64, i32, env, i32, i64, i32)
>> +DEF_HELPER_2(atomic_check, i32, env, i32)
>> +DEF_HELPER_1(atomic_release, void, env)
>> +DEF_HELPER_1(atomic_clear, void, env)
>> +DEF_HELPER_3(atomic_claim, void, env, i32, i64)
>> +
>>  #ifdef TARGET_AARCH64
>>  #include "helper-a64.h"
>>  #endif
>> diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
>> index 7583ae7..81dd403 100644
>> --- a/target-arm/op_helper.c
>> +++ b/target-arm/op_helper.c
>> @@ -30,12 +30,157 @@ static void raise_exception(CPUARMState *env, uint32_t excp,
>>      CPUState *cs = CPU(arm_env_get_cpu(env));
>>
>>      assert(!excp_is_internal(excp));
>> +    arm_exclusive_lock();
>>      cs->exception_index = excp;
>>      env->exception.syndrome = syndrome;
>>      env->exception.target_el = target_el;
>> +    /*
>> +     * We MAY already have the lock - in which case we are exiting the
>> +     * instruction due to an exception. Otherwise we better make sure we are not
>> +     * about to enter a STREX anyway.
>> +     */
>> +    env->exclusive_addr = -1;
>> +    arm_exclusive_unlock();
>>      cpu_loop_exit(cs);
>>  }
>>
>> +/* NB return 1 for fail, 0 for pass */
>> +uint32_t HELPER(atomic_cmpxchg64)(CPUARMState *env, uint32_t addr,
>> +                                  uint64_t newval, uint32_t size)
>> +{
>> +    ARMCPU *cpu = arm_env_get_cpu(env);
>> +    CPUState *cs = CPU(cpu);
>> +
>> +    bool result = false;
>> +    hwaddr len = 8 << size;
>> +
>> +    hwaddr paddr;
>> +    target_ulong page_size;
>> +    int prot;
>> +
>> +    arm_exclusive_lock();
>> +
>> +    if (env->exclusive_addr != addr) {
>> +        arm_exclusive_unlock();
>> +        return 1;
>> +    }
>> +
>> +    if (arm_get_phys_addr(env, addr, 1, &paddr, &prot, &page_size) != 0) {
>> +        tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, cpu_mmu_index(env), 0);
>> +        if (arm_get_phys_addr(env, addr, 1, &paddr, &prot, &page_size) != 0) {
>> +            arm_exclusive_unlock();
>> +            return 1;
>> +        }
>> +    }
>
> Some comments as to what we are doing here would be useful.

In this case, within the lock, we are checking the actual value stored in
the location. At this point we are getting the physical address so that we
can read the data from memory. The other alternative would be to add some
code so that this becomes a 'primitive' in the TCG like the rest of the
loads/stores, but it seemed easier to keep it separate and just do the
lookup here.
We'll add a comment to this effect - if that's clear enough?

Cheers
Mark.
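Condensed to its essentials, the helper does the following (a sketch only -
guest_va_to_host() is a hypothetical stand-in for the arm_get_phys_addr() /
tlb_fill() / address_space_map() sequence above, and the lock helpers are
the ones from cpu.c earlier in the patch):

#include <stdbool.h>
#include <stdint.h>

extern void arm_exclusive_lock(void);             /* from cpu.c above */
extern void arm_exclusive_unlock(void);
extern uint64_t exclusive_addr, exclusive_val;    /* per-CPU in the patch */

extern uint32_t *guest_va_to_host(uint32_t addr); /* hypothetical helper */

/* Returns 0 if the exclusive store was performed, 1 if it failed. */
static uint32_t strex_word(uint32_t addr, uint32_t newval)
{
    arm_exclusive_lock();
    if (exclusive_addr != addr) {                 /* monitor lost since ldrex */
        arm_exclusive_unlock();
        return 1;
    }
    uint32_t *p = guest_va_to_host(addr);         /* VA -> PA -> host pointer */
    uint32_t oldval = (uint32_t)exclusive_val;
    /* The store only happens if the word still holds the value ldrex saw. */
    bool ok = __sync_bool_compare_and_swap(p, oldval, newval);
    exclusive_addr = -1;                          /* monitor consumed either way */
    arm_exclusive_unlock();
    return ok ? 0 : 1;
}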
>> +    switch (size) {
>> +    case 0:
>> +    {
>> +        uint8_t oldval, *p;
>> +        p = address_space_map(cs->as, paddr, &len, true);
>> +        if (len == 8 << size) {
>> +            oldval = (uint8_t)env->exclusive_val;
>> +            result = (atomic_cmpxchg(p, oldval, (uint8_t)newval) == oldval);
>> +        }
>> +        address_space_unmap(cs->as, p, len, true, result ? 8 : 0);
>> +    }
>> +    break;
>> +    case 1:
>> +    {
>> +        uint16_t oldval, *p;
>> +        p = address_space_map(cs->as, paddr, &len, true);
>> +        if (len == 8 << size) {
>> +            oldval = (uint16_t)env->exclusive_val;
>> +            result = (atomic_cmpxchg(p, oldval, (uint16_t)newval) == oldval);
>> +        }
>> +        address_space_unmap(cs->as, p, len, true, result ? 8 : 0);
>> +    }
>> +    break;
>> +    case 2:
>> +    {
>> +        uint32_t oldval, *p;
>> +        p = address_space_map(cs->as, paddr, &len, true);
>> +        if (len == 8 << size) {
>> +            oldval = (uint32_t)env->exclusive_val;
>> +            result = (atomic_cmpxchg(p, oldval, (uint32_t)newval) == oldval);
>> +        }
>> +        address_space_unmap(cs->as, p, len, true, result ? 8 : 0);
>> +    }
>> +    break;
>> +    case 3:
>> +    {
>> +        uint64_t oldval, *p;
>> +        p = address_space_map(cs->as, paddr, &len, true);
>> +        if (len == 8 << size) {
>> +            oldval = (uint64_t)env->exclusive_val;
>> +            result = (atomic_cmpxchg(p, oldval, (uint64_t)newval) == oldval);
>> +        }
>> +        address_space_unmap(cs->as, p, len, true, result ? 8 : 0);
>> +    }
>> +    break;
>> +    default:
>> +        abort();
>> +        break;
>> +    }
>> +
>> +    env->exclusive_addr = -1;
>> +    arm_exclusive_unlock();
>> +    if (result) {
>> +        return 0;
>> +    } else {
>> +        return 1;
>> +    }
>> +}
>> +
>> +
>> +uint32_t HELPER(atomic_check)(CPUARMState *env, uint32_t addr)
>> +{
>> +    arm_exclusive_lock();
>> +    if (env->exclusive_addr == addr) {
>> +        return true;
>> +    }
>> +    /* we failed to keep the address tagged, so we fail */
>> +    env->exclusive_addr = -1; /* for efficiency */
>> +    arm_exclusive_unlock();
>> +    return false;
>> +}
>> +
>> +void HELPER(atomic_release)(CPUARMState *env)
>> +{
>> +    env->exclusive_addr = -1;
>> +    arm_exclusive_unlock();
>> +}
>> +
>> +void HELPER(atomic_clear)(CPUARMState *env)
>> +{
>> +    /* make sure no STREX is about to start */
>> +    arm_exclusive_lock();
>> +    env->exclusive_addr = -1;
>> +    arm_exclusive_unlock();
>> +}
>> +
>> +
>> +void HELPER(atomic_claim)(CPUARMState *env, uint32_t addr, uint64_t val)
>> +{
>> +    CPUState *cpu;
>> +    CPUARMState *current_cpu;
>> +
>> +    /* ensure that there are no STREX's executing */
>> +    arm_exclusive_lock();
>> +
>> +    CPU_FOREACH(cpu) {
>> +        current_cpu = &ARM_CPU(cpu)->env;
>> +        if (current_cpu->exclusive_addr == addr) {
>> +            /* We steal the atomic of this CPU. */
>> +            current_cpu->exclusive_addr = -1;
>> +        }
>> +    }
>> +
>> +    env->exclusive_val = val;
>> +    env->exclusive_addr = addr;
>> +    arm_exclusive_unlock();
>> +}
>> +
>>  static int exception_target_el(CPUARMState *env)
>>  {
>>      int target_el = MAX(1, arm_current_el(env));
>> @@ -582,7 +727,6 @@ void HELPER(exception_return)(CPUARMState *env)
>>
>>      aarch64_save_sp(env, cur_el);
>>
>> -    env->exclusive_addr = -1;
>>
>>      /* We must squash the PSTATE.SS bit to zero unless both of the
>>       * following hold:
>> diff --git a/target-arm/translate.c b/target-arm/translate.c
>> index 39692d7..ba4eb65 100644
>> --- a/target-arm/translate.c
>> +++ b/target-arm/translate.c
>> @@ -65,8 +65,8 @@ TCGv_ptr cpu_env;
>>  static TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
>>  static TCGv_i32 cpu_R[16];
>>  static TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
>> -static TCGv_i64 cpu_exclusive_addr;
>>  static TCGv_i64 cpu_exclusive_val;
>> +static TCGv_i64 cpu_exclusive_addr;
>>  #ifdef CONFIG_USER_ONLY
>>  static TCGv_i64 cpu_exclusive_test;
>>  static TCGv_i32 cpu_exclusive_info;
>> @@ -7391,6 +7391,7 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>>                                 TCGv_i32 addr, int size)
>>  {
>>      TCGv_i32 tmp = tcg_temp_new_i32();
>> +    TCGv_i64 val = tcg_temp_new_i64();
>>
>>      s->is_ldex = true;
>>
>> @@ -7415,20 +7416,20 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>>
>>          tcg_gen_addi_i32(tmp2, addr, 4);
>>          gen_aa32_ld32u(tmp3, tmp2, get_mem_index(s));
>> +        tcg_gen_concat_i32_i64(val, tmp, tmp3);
>>          tcg_temp_free_i32(tmp2);
>> -        tcg_gen_concat_i32_i64(cpu_exclusive_val, tmp, tmp3);
>>          store_reg(s, rt2, tmp3);
>>      } else {
>> -        tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp);
>> +        tcg_gen_extu_i32_i64(val, tmp);
>>      }
>> -
>> +    gen_helper_atomic_claim(cpu_env, addr, val);
>> +    tcg_temp_free_i64(val);
>>      store_reg(s, rt, tmp);
>> -    tcg_gen_extu_i32_i64(cpu_exclusive_addr, addr);
>>  }
>>
>>  static void gen_clrex(DisasContext *s)
>>  {
>> -    tcg_gen_movi_i64(cpu_exclusive_addr, -1);
>> +    gen_helper_atomic_clear(cpu_env);
>>  }
>>
>>  #ifdef CONFIG_USER_ONLY
>> @@ -7445,84 +7446,19 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>>                                  TCGv_i32 addr, int size)
>>  {
>>      TCGv_i32 tmp;
>> -    TCGv_i64 val64, extaddr;
>> -    TCGLabel *done_label;
>> -    TCGLabel *fail_label;
>> -
>> -    /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) {
>> -         [addr] = {Rt};
>> -         {Rd} = 0;
>> -       } else {
>> -         {Rd} = 1;
>> -       } */
>> -    fail_label = gen_new_label();
>> -    done_label = gen_new_label();
>> -    extaddr = tcg_temp_new_i64();
>> -    tcg_gen_extu_i32_i64(extaddr, addr);
>> -    tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
>> -    tcg_temp_free_i64(extaddr);
>> -
>> -    tmp = tcg_temp_new_i32();
>> -    switch (size) {
>> -    case 0:
>> -        gen_aa32_ld8u(tmp, addr, get_mem_index(s));
>> -        break;
>> -    case 1:
>> -        gen_aa32_ld16u(tmp, addr, get_mem_index(s));
>> -        break;
>> -    case 2:
>> -    case 3:
>> -        gen_aa32_ld32u(tmp, addr, get_mem_index(s));
>> -        break;
>> -    default:
>> -        abort();
>> -    }
>> -
>> -    val64 = tcg_temp_new_i64();
>> -    if (size == 3) {
>> -        TCGv_i32 tmp2 = tcg_temp_new_i32();
>> -        TCGv_i32 tmp3 = tcg_temp_new_i32();
>> -        tcg_gen_addi_i32(tmp2, addr, 4);
>> -        gen_aa32_ld32u(tmp3, tmp2, get_mem_index(s));
>> -        tcg_temp_free_i32(tmp2);
>> -        tcg_gen_concat_i32_i64(val64, tmp, tmp3);
>> -        tcg_temp_free_i32(tmp3);
>> -    } else {
>> -        tcg_gen_extu_i32_i64(val64, tmp);
>> -    }
>> -    tcg_temp_free_i32(tmp);
>> -
>> -    tcg_gen_brcond_i64(TCG_COND_NE, val64, cpu_exclusive_val, fail_label);
>> -    tcg_temp_free_i64(val64);
>> +    TCGv_i32 tmp2;
>> +    TCGv_i64 val = tcg_temp_new_i64();
>> +    TCGv_i32 tmp_size = tcg_const_i32(size);
>>
>>      tmp = load_reg(s, rt);
>> -    switch (size) {
>> -    case 0:
>> -        gen_aa32_st8(tmp, addr, get_mem_index(s));
>> -        break;
>> -    case 1:
>> -        gen_aa32_st16(tmp, addr, get_mem_index(s));
>> -        break;
>> -    case 2:
>> -    case 3:
>> -        gen_aa32_st32(tmp, addr, get_mem_index(s));
>> -        break;
>> -    default:
>> -        abort();
>> -    }
>> +    tmp2 = load_reg(s, rt2);
>> +    tcg_gen_concat_i32_i64(val, tmp, tmp2);
>>      tcg_temp_free_i32(tmp);
>> -    if (size == 3) {
>> -        tcg_gen_addi_i32(addr, addr, 4);
>> -        tmp = load_reg(s, rt2);
>> -        gen_aa32_st32(tmp, addr, get_mem_index(s));
>> -        tcg_temp_free_i32(tmp);
>> -    }
>> -    tcg_gen_movi_i32(cpu_R[rd], 0);
>> -    tcg_gen_br(done_label);
>> -    gen_set_label(fail_label);
>> -    tcg_gen_movi_i32(cpu_R[rd], 1);
>> -    gen_set_label(done_label);
>> -    tcg_gen_movi_i64(cpu_exclusive_addr, -1);
>> +    tcg_temp_free_i32(tmp2);
>> +
>> +    gen_helper_atomic_cmpxchg64(cpu_R[rd], cpu_env, addr, val, tmp_size);
>> +    tcg_temp_free_i64(val);
>> +    tcg_temp_free_i32(tmp_size);
>>  }
>>  #endif
>
> --
> Alex Bennée

+44 (0)20 7100 3485 x 210
+33 (0)5 33 52 01 77x 210
+33 (0)603762104
mark.burton