On 28 January 2014 17:31, Xin Tong <trent.t...@gmail.com> wrote: > This patch adds a victim TLB to the QEMU system mode TLB. > > QEMU system mode page table walks are expensive. Taken by running QEMU > qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a > 4-level page tables in guest Linux OS takes ~450 X86 instructions on > average.
My review below is largely limited to style issues; I'm assuming rth will do the substantive review. > Signed-off-by: Xin Tong <trent.t...@gmail.com> > > --- > cputlb.c | 50 ++++++++++++++++++++++++++++++++++++- > include/exec/cpu-defs.h | 12 ++++++--- > include/exec/exec-all.h | 2 ++ > include/exec/softmmu_template.h | 55 > ++++++++++++++++++++++++++++++++++++++--- > 4 files changed, 111 insertions(+), 8 deletions(-) > > diff --git a/cputlb.c b/cputlb.c > index b533f3f..caee78e 100644 > --- a/cputlb.c > +++ b/cputlb.c > @@ -34,6 +34,22 @@ > /* statistics */ > int tlb_flush_count; > > +/* swap the 2 given TLB entries as well as their corresponding IOTLB */ > +inline void swap_tlb(CPUTLBEntry *te, CPUTLBEntry *se, hwaddr *iote, > + hwaddr *iose) > +{ > + hwaddr iotmp; > + CPUTLBEntry tmp; > + /* swap tlb */ > + tmp = *te; > + *te = *se; > + *se = tmp; > + /* swap iotlb */ > + iotmp = *iote; > + *iote = *iose; > + *iose = iotmp; > +} > + > /* NOTE: > * If flush_global is true (the usual case), flush all tlb entries. > * If flush_global is false, flush (at least) all tlb entries not > @@ -58,8 +74,10 @@ void tlb_flush(CPUArchState *env, int flush_global) > cpu->current_tb = NULL; > > memset(env->tlb_table, -1, sizeof(env->tlb_table)); > + memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table)); > memset(env->tb_jmp_cache, 0, sizeof(env->tb_jmp_cache)); > > + env->vtlb_index = 0; > env->tlb_flush_addr = -1; > env->tlb_flush_mask = 0; > tlb_flush_count++; > @@ -106,6 +124,14 @@ void tlb_flush_page(CPUArchState *env, target_ulong addr) > tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr); > } > > + /* check whether there are entries that need to be flushed in the vtlb */ > + for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { > + unsigned int k; Just plain "int" is fine. 
> + for (k = 0; k < CPU_VTLB_SIZE; k++) { > + tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr); > + } > + } > + > tb_flush_jmp_cache(env, addr); > } > > @@ -170,6 +196,11 @@ void cpu_tlb_reset_dirty_all(ram_addr_t start1, > ram_addr_t length) > tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i], > start1, length); > } > + > + for (i = 0; i < CPU_VTLB_SIZE; i++) { > + tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i], > + start1, length); > + } > } > } > } > @@ -193,6 +224,13 @@ void tlb_set_dirty(CPUArchState *env, target_ulong vaddr) > for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { > tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr); > } > + > + for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { > + unsigned int k; > + for (k = 0; k < CPU_VTLB_SIZE; k++) { > + tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr); > + } > + } > } > > /* Our TLB does not support large pages, so remember the area covered by > @@ -264,8 +302,18 @@ void tlb_set_page(CPUArchState *env, target_ulong vaddr, > prot, &address); > > index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); > - env->iotlb[mmu_idx][index] = iotlb - vaddr; > te = &env->tlb_table[mmu_idx][index]; > + > + /* do not discard the translation in te, evict it into a victim tlb */ > + unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE; > + env->tlb_v_table[mmu_idx][vidx].addr_read = te->addr_read; > + env->tlb_v_table[mmu_idx][vidx].addr_write = te->addr_write; > + env->tlb_v_table[mmu_idx][vidx].addr_code = te->addr_code; > + env->tlb_v_table[mmu_idx][vidx].addend = te->addend; You're still writing structure assignments out longhand. 
These four lines should all be replaced with env->tlb_v_table[mmu_idx][vidx] = *te; > + env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index]; > + > + /* refill the tlb */ > + env->iotlb[mmu_idx][index] = iotlb - vaddr; > te->addend = addend - vaddr; > if (prot & PAGE_READ) { > te->addr_read = address; > diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h > index 01cd8c7..2631d6b 100644 > --- a/include/exec/cpu-defs.h > +++ b/include/exec/cpu-defs.h > @@ -74,6 +74,8 @@ typedef uint64_t target_ulong; > #if !defined(CONFIG_USER_ONLY) > #define CPU_TLB_BITS 8 > #define CPU_TLB_SIZE (1 << CPU_TLB_BITS) > +/* use a fully associative victim tlb */ > +#define CPU_VTLB_SIZE 8 > > #if HOST_LONG_BITS == 32 && TARGET_LONG_BITS == 32 > #define CPU_TLB_ENTRY_BITS 4 > @@ -103,12 +105,16 @@ typedef struct CPUTLBEntry { > > QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS)); > > +/* The meaning of the MMU modes is defined in the target code. */ Why this addition? It's duplicating a comment that already exists below. > #define CPU_COMMON_TLB \ > /* The meaning of the MMU modes is defined in the target code. */ \ > - CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE]; \ > - hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE]; \ > + CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE]; \ > + CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE]; \ > + hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE]; \ > + hwaddr iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE]; \ Don't try to line up the field names like this -- it just means more code churn, because next time somebody has to add a line here with a typename longer than "CPUTLBEntry" they need to change every line in the macro. Single space is fine. (Lining up the '\' on the right is OK.) 
> target_ulong tlb_flush_addr; \ > - target_ulong tlb_flush_mask; > + target_ulong tlb_flush_mask; \ > + target_ulong vtlb_index; \ > > #else > > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h > index ea90b64..7e88b08 100644 > --- a/include/exec/exec-all.h > +++ b/include/exec/exec-all.h > @@ -102,6 +102,8 @@ void tlb_set_page(CPUArchState *env, target_ulong vaddr, > hwaddr paddr, int prot, > int mmu_idx, target_ulong size); > void tb_invalidate_phys_addr(hwaddr addr); > +/* swap the 2 given tlb entries as well as their iotlb */ > +void swap_tlb(CPUTLBEntry *te, CPUTLBEntry *se, hwaddr *iote, hwaddr *iose); > #else > static inline void tlb_flush_page(CPUArchState *env, target_ulong addr) > { > diff --git a/include/exec/softmmu_template.h b/include/exec/softmmu_template.h > index c6a5440..38d18de 100644 > --- a/include/exec/softmmu_template.h > +++ b/include/exec/softmmu_template.h > @@ -112,6 +112,21 @@ > # define helper_te_st_name helper_le_st_name > #endif > > +/* macro to check the victim tlb */ > +#define HELPER_CHECK_VICTIM_TLB(ACCESS_TYPE) \ > +do { \ > + for (vtlb_idx = CPU_VTLB_SIZE; vtlb_idx >= 0; --vtlb_idx) { \ > + if (ACCESS_TYPE == (addr & TARGET_PAGE_MASK)) { \ > + /* found entry in victim tlb */ \ > + swap_tlb(&env->tlb_table[mmu_idx][index], \ > + &env->tlb_v_table[mmu_idx][vtlb_idx], \ > + &env->iotlb[mmu_idx][index], \ > + &env->iotlb_v[mmu_idx][vtlb_idx]); \ This is the only place swap_tlb gets used, so probably better to ditch that as a separate function and just inline it here. (Then everywhere that uses softmmu_template.h gets the inlined version, rather than cputlb.c getting to inline and the rest not.) > + break; \ > + } \ > + } \ > +} while (0); I think we could make this macro use a bit less ugly. (1) just call it VICTIM_TLB_HIT; 'helper' has a particular meaning in QEMU (function called from generated code), and besides it makes the calling code read a bit less naturally. 
(2) rather than having it take as an argument "env->tlb_v_table[mmu_idx][vtlb_idx].ADDR_READ", just take the "ADDR_READ" part. This makes the callers simpler and avoids giving the impression that the macro is going to simply evaluate its argument once (ie that it is function-like). (3) Make it a gcc statement-expression, so it can return a value. Then you can (a) make the scope of vtlb_idx be only inside the macro, and avoid forcing the caller to define it and (b) have the usage pattern be if (!VICTIM_TLB_HIT(ADDR_READ)) { tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr); } (which means you don't need that "miss victim tlb" comment any more, because it's obvious from the macro name what is happening.) > + > static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env, > hwaddr physaddr, > target_ulong addr, > @@ -141,6 +156,7 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, > target_ulong addr, int mmu_idx, > target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ; > uintptr_t haddr; > DATA_TYPE res; > + int vtlb_idx; > > /* Adjust the given return address. */ > retaddr -= GETPC_ADJ; > @@ -153,7 +169,14 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, > target_ulong addr, int mmu_idx, > do_unaligned_access(env, addr, READ_ACCESS_TYPE, mmu_idx, > retaddr); > } > #endif > - tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr); > + /* we are about to do a page table walk. our last hope is the > + * victim tlb. try to refill from the victim tlb before walking the > + * page table. */ You might as well put this comment in your helper macro; there's no need to repeat it in every callsite.