This set of locks will allow us to correctly emulate cmpxchg16 in a parallel TCG. The key observation is that no architecture supports regular (non-locked) 16-byte atomic loads/stores; only "locked" accesses (e.g. via cmpxchg16b on x86) are allowed, and therefore we can emulate them by using locks.
We use a small array of locks so that we can have some scalability. Further improvements are possible (e.g. using a radix tree); but we should have a workload to benchmark in order to justify the additional complexity. Signed-off-by: Emilio G. Cota <c...@braap.org> --- cpu-exec.c | 1 + linux-user/main.c | 1 + tcg/tcg.h | 5 +++++ translate-all.c | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+) diff --git a/cpu-exec.c b/cpu-exec.c index b840e1d..26f3bd6 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -643,6 +643,7 @@ int cpu_exec(CPUState *cpu) #endif /* buggy compiler */ cpu->can_do_io = 1; tb_lock_reset(); + tcg_cmpxchg_lock_reset(); } } /* for(;;) */ diff --git a/linux-user/main.c b/linux-user/main.c index 78d8d04..af9e8e3 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -140,6 +140,7 @@ void fork_end(int child) pthread_cond_init(&exclusive_cond, NULL); pthread_cond_init(&exclusive_resume, NULL); qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); + tcg_cmpxchg_lock_init(); gdbserver_fork(thread_cpu); } else { pthread_mutex_unlock(&exclusive_lock); diff --git a/tcg/tcg.h b/tcg/tcg.h index 1fd7ec3..1c9c8bc 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -650,6 +650,11 @@ void tb_lock(void); void tb_unlock(void); void tb_lock_reset(void); +void tcg_cmpxchg_lock(uintptr_t addr); +void tcg_cmpxchg_unlock(void); +void tcg_cmpxchg_lock_reset(void); +void tcg_cmpxchg_lock_init(void); + static inline void *tcg_malloc(int size) { TCGContext *s = &tcg_ctx; diff --git a/translate-all.c b/translate-all.c index eaa95e4..19432e5 100644 --- a/translate-all.c +++ b/translate-all.c @@ -153,6 +153,44 @@ void tb_lock_reset(void) #endif } +#define TCG_CMPXCHG_NR_LOCKS 16 +static QemuMutex tcg_cmpxchg_locks[TCG_CMPXCHG_NR_LOCKS]; +static __thread QemuMutex *tcg_cmpxchg_curr_lock; + +void tcg_cmpxchg_lock(uintptr_t addr) +{ + assert(tcg_cmpxchg_curr_lock == NULL); + /* choose lock based on cache line address. 
We assume lines are 64 bytes long */ + addr >>= 6; + addr &= TCG_CMPXCHG_NR_LOCKS - 1; + tcg_cmpxchg_curr_lock = &tcg_cmpxchg_locks[addr]; + qemu_mutex_lock(tcg_cmpxchg_curr_lock); +} + +void tcg_cmpxchg_unlock(void) +{ + qemu_mutex_unlock(tcg_cmpxchg_curr_lock); + tcg_cmpxchg_curr_lock = NULL; +} + +void tcg_cmpxchg_lock_reset(void) +{ + if (unlikely(tcg_cmpxchg_curr_lock)) { + tcg_cmpxchg_unlock(); + } +} + +void tcg_cmpxchg_lock_init(void) +{ + int i; + + /* set current to NULL; useful after a child forks in user-mode */ + tcg_cmpxchg_curr_lock = NULL; + for (i = 0; i < TCG_CMPXCHG_NR_LOCKS; i++) { + qemu_mutex_init(&tcg_cmpxchg_locks[i]); + } +} + static TranslationBlock *tb_find_pc(uintptr_t tc_ptr); void cpu_gen_init(void) @@ -731,6 +769,7 @@ static inline void code_gen_alloc(size_t tb_size) tcg_ctx.tb_ctx.tbs = g_new(TranslationBlock, tcg_ctx.code_gen_max_blocks); qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); + tcg_cmpxchg_lock_init(); } static void tb_htable_init(void) -- 2.5.0