This is a first attempt at allowing tb_flush to run without having to stop all CPUs. There are known issues, pointed out below, but this could be a good starting point.
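The idea in a nutshell: keep two static code buffers and flip between them on flush; the retired buffer is only marked reusable after an RCU grace period, i.e. once no CPU can still be executing out of it. For reviewers who want to poke at the scheme in isolation, here is a rough standalone model of the buffer flip. It uses liburcu's classic unprefixed API (build with -lurcu) in place of QEMU's RCU and a plain mutex in place of tb_lock; all names (buf_replace, buf_clear, buf_mask, ...) are made-up analogues of the ones in the patch, not the QEMU code itself:

/* Minimal standalone model of the two-buffer flip.  liburcu stands in
 * for QEMU's RCU, a pthread mutex for tb_lock.
 * Build: gcc -o flip flip.c -lurcu -lpthread
 */
#include <urcu.h>          /* rcu_register_thread, call_rcu, rcu_barrier */
#include <urcu/compiler.h> /* caa_container_of */
#include <pthread.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define BUF_SIZE 4096

static char buf1[BUF_SIZE], buf2[BUF_SIZE];
static int buf_mask = 1; /* 1: buf1 in use; 2: buf2 in use */
static pthread_mutex_t mask_lock = PTHREAD_MUTEX_INITIALIZER;

struct buf_desc {
    struct rcu_head rcu;
    int clear_bit;
};

/* Deferred by call_rcu(): runs after a grace period, i.e. once no
 * reader can still be executing out of the retired buffer. */
static void buf_clear(struct rcu_head *rcu)
{
    struct buf_desc *desc = caa_container_of(rcu, struct buf_desc, rcu);

    pthread_mutex_lock(&mask_lock);
    buf_mask &= ~desc->clear_bit;
    pthread_mutex_unlock(&mask_lock);
    free(desc);
}

/* Called with mask_lock held; retires the current buffer and returns
 * the one to switch to. */
static char *buf_replace(void)
{
    struct buf_desc *desc = calloc(1, sizeof(*desc));

    /* Both bits set would mean a second flush arrived before the
     * previous buffer's grace period ended: buffers far too small. */
    assert(buf_mask == 1 || buf_mask == 2);

    desc->clear_bit = buf_mask;
    call_rcu(&desc->rcu, buf_clear);

    if (buf_mask == 1) {
        buf_mask |= 2;
        return buf2;
    }
    buf_mask |= 1;
    return buf1;
}

int main(void)
{
    rcu_register_thread();

    pthread_mutex_lock(&mask_lock);
    char *cur = buf_replace(); /* "flush": retire buf1, switch to buf2 */
    pthread_mutex_unlock(&mask_lock);
    printf("now filling %s\n", cur == buf2 ? "buf2" : "buf1");

    rcu_barrier(); /* wait for buf_clear() to run before exiting */
    rcu_unregister_thread();
    return 0;
}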
Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

Known issues:

- Basically compile-tested only, since I've only run this with
  single-threaded TCG. I also tried running it with linux-user, but in
  order to trigger tb_flush I had to make code_gen_buffer so small that
  the CPU calling tb_flush would immediately fill the second buffer,
  triggering the assert. If you have a working multi-threaded workload
  that would be good for testing this, please let me know.

- Windows: not even compile-tested!

Signed-off-by: Emilio G. Cota <c...@braap.org>
---
 translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 117 insertions(+), 5 deletions(-)

diff --git a/translate-all.c b/translate-all.c
index bba9b62..4c14b4d 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
 #endif
 
 #ifdef USE_STATIC_CODE_GEN_BUFFER
-static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
+static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
     __attribute__((aligned(CODE_GEN_ALIGN)));
+static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
+    __attribute__((aligned(CODE_GEN_ALIGN)));
+static int static_buf_mask = 1;
+static void *static_buf1;
+static void *static_buf2;
 
 # ifdef _WIN32
 static inline void do_protect(void *addr, long size, int prot)
@@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
 }
 # endif /* WIN32 */
 
-static inline void *alloc_code_gen_buffer(void)
+static void *alloc_static_code_gen_buffer(void *buf)
 {
-    void *buf = static_code_gen_buffer;
     size_t full_size, size;
 
     /* The size of the buffer, rounded down to end on a page boundary. */
-    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
+    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
                  & qemu_real_host_page_mask) - (uintptr_t)buf;
 
     /* Reserve a guard page. */
@@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
 
     return buf;
 }
+
+static inline void *alloc_code_gen_buffer(void)
+{
+    static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
+    static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
+
+    assert(static_buf_mask == 1);
+    return static_buf1;
+}
 #elif defined(_WIN32)
 static inline void *alloc_code_gen_buffer(void)
 {
@@ -829,8 +842,100 @@ static void page_flush_tb(void)
     }
 }
 
+#ifdef USE_STATIC_CODE_GEN_BUFFER
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    int clear_bit;
+};
+
+static void code_gen_buffer_clear(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    tb_lock();
+    static_buf_mask &= ~desc->clear_bit;
+    tb_unlock();
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+    /*
+     * If both bits are set, we have two concurrent flushes. This
+     * can easily happen if the buffers are heavily undersized.
+     */
+    assert(static_buf_mask == 1 || static_buf_mask == 2);
+
+    desc->clear_bit = static_buf_mask;
+    call_rcu1(&desc->rcu, code_gen_buffer_clear);
+
+    if (static_buf_mask == 1) {
+        static_buf_mask |= 2;
+        return static_buf2;
+    }
+    static_buf_mask |= 1;
+    return static_buf1;
+}
+
+#elif defined(_WIN32)
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+};
+
+static void code_gen_buffer_vfree(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    VirtualFree(desc->buf, 0, MEM_RELEASE);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
+
+    return alloc_code_gen_buffer();
+}
+
+#else /* UNIX, dynamically-allocated code buffer */
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+    size_t size;
+};
+
+static void code_gen_buffer_unmap(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    munmap(desc->buf, desc->size + qemu_real_host_page_size);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    desc->size = tcg_ctx.code_gen_buffer_size;
+    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
+
+    return alloc_code_gen_buffer();
+}
+#endif /* USE_STATIC_CODE_GEN_BUFFER */
+
 /* flush all the translation blocks */
-/* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
 #if defined(DEBUG_FLUSH)
@@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
     qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
     page_flush_tb();
 
+    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
     tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    tcg_prologue_init(&tcg_ctx);
 
     /* XXX: flush processor icache at this point if cache flush is expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    /* exit all CPUs so that the old buffer is quickly cleared. */
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    }
 }
 
 #ifdef DEBUG_TB_CHECK
--
2.5.0
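P.S.: for completeness, here is the reader side that makes the grace period meaningful. The scheme relies on vCPU threads executing generated code only inside RCU read-side critical sections (in QEMU proper, cpu_exec() already runs under rcu_read_lock()), so that by the time the call_rcu1() callback fires, no CPU can still be inside the retired buffer. This sketch uses the same liburcu stand-ins as the model above, and 'running' is a made-up stand-in for the real exit condition, not QEMU code:

/* Reader-side counterpart to the model above: generated code only runs
 * between rcu_read_lock()/rcu_read_unlock(), so the grace period behind
 * buf_clear() cannot end while any vCPU thread is still inside the old
 * buffer. */
#include <urcu.h>
#include <pthread.h>

static volatile int running = 1; /* stand-in for the real exit condition */

static void *cpu_thread(void *cpu)
{
    (void)cpu;
    rcu_register_thread();
    while (running) {
        rcu_read_lock();
        /* ...look up and execute TBs out of the current buffer... */
        rcu_read_unlock();
        /* tb_flush()'s cpu_exit() kicks the vCPU out to this point; the
         * next iteration picks up the new buffer, which lets the old
         * buffer's grace period elapse. */
    }
    rcu_unregister_thread();
    return NULL;
}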