Disable for all TCG backends for now.

Signed-off-by: Emilio G. Cota <c...@braap.org>
---
 include/exec/cpu-defs.h  |  43 +++++++++++-
 include/exec/cpu_ldst.h  |  21 ++++++
 tcg/aarch64/tcg-target.h |   1 +
 tcg/arm/tcg-target.h     |   1 +
 tcg/i386/tcg-target.h    |   1 +
 tcg/mips/tcg-target.h    |   1 +
 tcg/ppc/tcg-target.h     |   1 +
 tcg/s390/tcg-target.h    |   1 +
 tcg/sparc/tcg-target.h   |   1 +
 tcg/tci/tcg-target.h     |   1 +
 accel/tcg/cputlb.c       | 138 +++++++++++++++++++++++++++++++++++++--
 11 files changed, 201 insertions(+), 9 deletions(-)
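A note for reviewers before the diff: the core of the scheme is that each MMU
index now owns a heap-allocated, power-of-two-sized table, with the size
encoded once in tlb_mask[mmu_idx] as (n_entries - 1) << CPU_TLB_ENTRY_BITS,
so one word serves both as the fast-path index mask and as the source for
size queries. A minimal standalone sketch of that arithmetic (illustrative
only; the file-scope tlb_mask and the set_size helper are simplified
stand-ins for the per-mmu-idx fields this patch adds):

/*
 * Standalone model of the mask encoding used by the patch; not QEMU code.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define CPU_TLB_ENTRY_BITS 5    /* log2(sizeof(CPUTLBEntry)) */
#define TARGET_PAGE_BITS   12

static uintptr_t tlb_mask;      /* one per mmu_idx in the real code */

static void set_size(size_t n_entries)
{
    /* table sizes are always powers of two */
    assert((n_entries & (n_entries - 1)) == 0);
    tlb_mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
}

static uintptr_t tlb_index(uintptr_t addr)
{
    /* recover the entry-count mask from the pre-shifted byte mask */
    uintptr_t size_mask = tlb_mask >> CPU_TLB_ENTRY_BITS;

    return (addr >> TARGET_PAGE_BITS) & size_mask;
}

int main(void)
{
    set_size(1 << 8);                       /* CPU_TLB_DYN_DEFAULT_BITS */
    assert(tlb_index(0x12345000) == (0x12345 & 0xff));
    set_size(1 << 9);                       /* after one grow step */
    assert(tlb_index(0x12345000) == (0x12345 & 0x1ff));
    return 0;
}

Resizing only ever changes tlb_mask and reallocates the arrays; the index
computation itself stays branch-free, which is what keeps the fast path
unchanged in cost.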
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index 4ff62f32bf..40cd5d4774 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -67,6 +67,19 @@ typedef uint64_t target_ulong;
 #define CPU_TLB_ENTRY_BITS 5
 #endif
 
+#if TCG_TARGET_IMPLEMENTS_DYN_TLB
+#define CPU_TLB_DYN_MIN_BITS 6
+#define CPU_TLB_DYN_DEFAULT_BITS 8
+/*
+ * Assuming TARGET_PAGE_BITS==12, with 2**22 entries we can cover 2**(22+12) ==
+ * 2**34 == 16G of address space. This is roughly what one would expect a
+ * TLB to cover in a modern (as of 2018) x86_64 CPU. For instance, Intel
+ * Skylake's Level-2 STLB has 16 1G entries.
+ */
+#define CPU_TLB_DYN_MAX_BITS 22
+
+#else /* !TCG_TARGET_IMPLEMENTS_DYN_TLB */
+
 /* TCG_TARGET_TLB_DISPLACEMENT_BITS is used in CPU_TLB_BITS to ensure that
  * the TLB is not unnecessarily small, but still small enough for the
  * TLB lookup instruction sequence used by the TCG target.
@@ -98,6 +111,7 @@ typedef uint64_t target_ulong;
                                   NB_MMU_MODES <= 8 ? 3 : 4))
 
 #define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
+#endif /* TCG_TARGET_IMPLEMENTS_DYN_TLB */
 
 typedef struct CPUTLBEntry {
     /* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address
@@ -141,13 +155,36 @@ typedef struct CPUIOTLBEntry {
     MemTxAttrs attrs;
 } CPUIOTLBEntry;
 
-#define CPU_COMMON_TLB \
+#if TCG_TARGET_IMPLEMENTS_DYN_TLB
+
+typedef struct CPUTLBDesc {
+    size_t n_used_entries;
+    size_t n_flushes_low_rate;
+} CPUTLBDesc;
+
+#define CPU_TLB                                                         \
+    CPUTLBDesc tlb_desc[NB_MMU_MODES];                                  \
+    /* tlb_mask[i] contains (n_entries - 1) << CPU_TLB_ENTRY_BITS */    \
+    uintptr_t tlb_mask[NB_MMU_MODES];                                   \
+    CPUTLBEntry *tlb_table[NB_MMU_MODES];
+
+#define CPU_IOTLB                               \
+    CPUIOTLBEntry *iotlb[NB_MMU_MODES];
+#else
+#define CPU_TLB                                                         \
+    CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];
+
+#define CPU_IOTLB                               \
+    CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];
+#endif /* TCG_TARGET_IMPLEMENTS_DYN_TLB */
+
+#define CPU_COMMON_TLB                                                  \
     /* The meaning of the MMU modes is defined in the target code. */   \
     /* tlb_lock serializes updates to tlb_table and tlb_v_table */      \
     QemuSpin tlb_lock;                                                  \
-    CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
+    CPU_TLB                                                             \
     CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
-    CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                    \
+    CPU_IOTLB                                                           \
     CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                 \
     size_t tlb_flush_count;                                             \
     target_ulong tlb_flush_addr;                                        \
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index e3d8d738aa..91f29c1188 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -126,6 +126,21 @@ extern __thread uintptr_t helper_retaddr;
 /* The memory helpers for tcg-generated code need tcg_target_long etc. */
 #include "tcg.h"
 
+#if TCG_TARGET_IMPLEMENTS_DYN_TLB
+/* Find the TLB index corresponding to the mmu_idx + address pair. */
+static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
+                                  target_ulong addr)
+{
+    uintptr_t size_mask = env->tlb_mask[mmu_idx] >> CPU_TLB_ENTRY_BITS;
+
+    return (addr >> TARGET_PAGE_BITS) & size_mask;
+}
+
+static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx)
+{
+    return (env->tlb_mask[mmu_idx] >> CPU_TLB_ENTRY_BITS) + 1;
+}
+#else
 /* Find the TLB index corresponding to the mmu_idx + address pair. */
 static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
                                   target_ulong addr)
@@ -133,6 +148,12 @@ static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
     return (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
 }
 
+static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx)
+{
+    return CPU_TLB_SIZE;
+}
+#endif /* TCG_TARGET_IMPLEMENTS_DYN_TLB */
+
 /* Find the TLB entry corresponding to the mmu_idx + address pair. */
 static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
                                      target_ulong addr)
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 9aea1d1771..3060d83d14 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -15,6 +15,7 @@
 #define TCG_TARGET_INSN_UNIT_SIZE  4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
+#define TCG_TARGET_IMPLEMENTS_DYN_TLB 0
 #undef TCG_TARGET_STACK_GROWSUP
 
 typedef enum {
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 94b3578c55..0e8b79d20f 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -60,6 +60,7 @@ extern int arm_arch;
 #undef TCG_TARGET_STACK_GROWSUP
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
+#define TCG_TARGET_IMPLEMENTS_DYN_TLB 0
 
 typedef enum {
     TCG_REG_R0 = 0,
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 9fdf37f23c..9e4bfa90d1 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -27,6 +27,7 @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE  1
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
+#define TCG_TARGET_IMPLEMENTS_DYN_TLB 0
 
 #ifdef __x86_64__
 # define TCG_TARGET_REG_BITS  64
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index a8222476f0..a97f31113e 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -37,6 +37,7 @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
+#define TCG_TARGET_IMPLEMENTS_DYN_TLB 0
 #define TCG_TARGET_NB_REGS 32
 
 typedef enum {
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index be52ad1d2e..8f03328af4 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -34,6 +34,7 @@
 #define TCG_TARGET_NB_REGS 32
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
+#define TCG_TARGET_IMPLEMENTS_DYN_TLB 0
 
 typedef enum {
     TCG_REG_R0,  TCG_REG_R1,  TCG_REG_R2,  TCG_REG_R3,
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index 6f2b06a7d1..df92f3065a 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -27,6 +27,7 @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE 2
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 19
+#define TCG_TARGET_IMPLEMENTS_DYN_TLB 0
 
 typedef enum TCGReg {
     TCG_REG_R0 = 0,
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index d8339bf010..975ddc7b0d 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -29,6 +29,7 @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 32
+#define TCG_TARGET_IMPLEMENTS_DYN_TLB 0
 #define TCG_TARGET_NB_REGS 32
 
 typedef enum {
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index 26140d78cb..bcfd8d69e6 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -43,6 +43,7 @@
 #define TCG_TARGET_INTERPRETER 1
 #define TCG_TARGET_INSN_UNIT_SIZE 1
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 32
+#define TCG_TARGET_IMPLEMENTS_DYN_TLB 0
 
 #if UINTPTR_MAX == UINT32_MAX
 # define TCG_TARGET_REG_BITS 32
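The cputlb.c changes below carry the actual resizing heuristic, applied only
at flush time so the fast path never pays for it: grow 4x when the table is
100% used, 2x above 70% use, and halve only after 100 consecutive flushes at
under 30% use. A compact model of that policy as a pure function
(illustrative only; resize_policy and its parameters are stand-ins, not part
of the patch):

/* Standalone model of the flush-time resize policy; not QEMU code. */
#include <stddef.h>

#define MIN_ENTRIES (1u << 6)    /* CPU_TLB_DYN_MIN_BITS */
#define MAX_ENTRIES (1u << 22)   /* CPU_TLB_DYN_MAX_BITS */

static size_t resize_policy(size_t old_size, size_t n_used,
                            size_t *n_low_flushes)
{
    size_t rate = n_used * 100 / old_size;

    if (rate == 100) {           /* completely full: grow aggressively */
        return old_size << 2 > MAX_ENTRIES ? MAX_ENTRIES : old_size << 2;
    } else if (rate > 70) {      /* nearly full: grow */
        return old_size << 1 > MAX_ENTRIES ? MAX_ENTRIES : old_size << 1;
    } else if (rate < 30) {
        /* shrink only after a sustained streak of lightly used flushes */
        if (++*n_low_flushes == 100) {
            *n_low_flushes = 0;
            return old_size >> 1 < MIN_ENTRIES ? MIN_ENTRIES : old_size >> 1;
        }
    }
    return old_size;
}

int main(void)
{
    size_t size = 1u << 8, low = 0;

    size = resize_policy(size, size, &low);   /* fully used: 256 -> 1024 */
    size = resize_policy(size, 800, &low);    /* 78% used: 1024 -> 2048 */
    return size == 2048 ? 0 : 1;
}

Growing is immediate while shrinking requires a sustained low-use streak;
that hysteresis is what the comment in tlb_mmu_resize_locked() means by
accommodating mixed workloads.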
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 6ee18308d5..b7bc4bb32f 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -74,11 +74,128 @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
 QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
 #define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
 
+#if TCG_TARGET_IMPLEMENTS_DYN_TLB
+static inline size_t sizeof_tlb(CPUArchState *env, uintptr_t mmu_idx)
+{
+    return env->tlb_mask[mmu_idx] + (1 << CPU_TLB_ENTRY_BITS);
+}
+
+static void tlb_dyn_init(CPUArchState *env)
+{
+    int i;
+
+    for (i = 0; i < NB_MMU_MODES; i++) {
+        size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS;
+
+        env->tlb_desc[i].n_used_entries = 0;
+        env->tlb_desc[i].n_flushes_low_rate = 0;
+        env->tlb_mask[i] = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
+        env->tlb_table[i] = g_new(CPUTLBEntry, n_entries);
+        env->iotlb[i] = g_new(CPUIOTLBEntry, n_entries);
+    }
+}
+
+/*
+ * Perform the resizing only on flushes, otherwise we'd have to take a perf
+ * hit by either rehashing the array or unnecessarily flushing it.
+ *
+ * We grow the array aggressively, and reduce the size more slowly. This
+ * accommodates mixed workloads, where some processes might be memory-heavy
+ * while others might not.
+ *
+ * Called with tlb_lock held.
+ */
+static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
+{
+    CPUTLBDesc *desc = &env->tlb_desc[mmu_idx];
+    size_t old_size = tlb_n_entries(env, mmu_idx);
+    size_t rate = desc->n_used_entries * 100 / old_size;
+    size_t new_size = old_size;
+
+    if (rate == 100) {
+        new_size = MIN(old_size << 2, 1 << CPU_TLB_DYN_MAX_BITS);
+    } else if (rate > 70) {
+        new_size = MIN(old_size << 1, 1 << CPU_TLB_DYN_MAX_BITS);
+    } else if (rate < 30) {
+        desc->n_flushes_low_rate++;
+        if (desc->n_flushes_low_rate == 100) {
+            new_size = MAX(old_size >> 1, 1 << CPU_TLB_DYN_MIN_BITS);
+            desc->n_flushes_low_rate = 0;
+        }
+    }
+
+    if (new_size == old_size) {
+        return;
+    }
+    g_free(env->tlb_table[mmu_idx]);
+    g_free(env->iotlb[mmu_idx]);
+
+    /* desc->n_used_entries is cleared by the caller */
+    desc->n_flushes_low_rate = 0;
+    env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;
+    env->tlb_table[mmu_idx] = g_new(CPUTLBEntry, new_size);
+    env->iotlb[mmu_idx] = g_new(CPUIOTLBEntry, new_size);
+}
+
+static inline void tlb_table_flush(CPUArchState *env)
+{
+    int i;
+
+    for (i = 0; i < NB_MMU_MODES; i++) {
+        tlb_mmu_resize_locked(env, i);
+        memset(env->tlb_table[i], -1, sizeof_tlb(env, i));
+        env->tlb_desc[i].n_used_entries = 0;
+    }
+}
+
+static inline void tlb_table_flush_by_mmuidx(CPUArchState *env, int mmu_idx)
+{
+    tlb_mmu_resize_locked(env, mmu_idx);
+    memset(env->tlb_table[mmu_idx], -1, sizeof_tlb(env, mmu_idx));
+    env->tlb_desc[mmu_idx].n_used_entries = 0;
+}
+
+static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
+{
+    env->tlb_desc[mmu_idx].n_used_entries++;
+}
+
+static inline void tlb_n_used_entries_dec(CPUArchState *env, uintptr_t mmu_idx)
+{
+    env->tlb_desc[mmu_idx].n_used_entries--;
+}
+
+#else /* !TCG_TARGET_IMPLEMENTS_DYN_TLB */
+
+static inline void tlb_dyn_init(CPUArchState *env)
+{
+}
+
+static inline void tlb_table_flush(CPUArchState *env)
+{
+    memset(env->tlb_table, -1, sizeof(env->tlb_table));
+}
+
+static inline void tlb_table_flush_by_mmuidx(CPUArchState *env, int mmu_idx)
+{
+    memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+}
+
+static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
+{
+}
+
+static inline void tlb_n_used_entries_dec(CPUArchState *env, uintptr_t mmu_idx)
+{
+}
+#endif /* TCG_TARGET_IMPLEMENTS_DYN_TLB */
+
 void tlb_init(CPUState *cpu)
 {
     CPUArchState *env = cpu->env_ptr;
 
     qemu_spin_init(&env->tlb_lock);
+
+    tlb_dyn_init(env);
 }
 
 /* flush_all_helper: run fn across all cpus
@@ -140,7 +257,7 @@ static void tlb_flush_nocheck(CPUState *cpu)
      * that do not hold the lock are performed by the same owner thread.
      */
     qemu_spin_lock(&env->tlb_lock);
-    memset(env->tlb_table, -1, sizeof(env->tlb_table));
+    tlb_table_flush(env);
     memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
     qemu_spin_unlock(&env->tlb_lock);
 
@@ -201,7 +318,7 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
         if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
             tlb_debug("%d\n", mmu_idx);
 
-            memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+            tlb_table_flush_by_mmuidx(env, mmu_idx);
             memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
         }
     }
@@ -263,12 +380,14 @@ static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
 }
 
 /* Called with tlb_lock held */
-static inline void tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
+static inline bool tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
                                           target_ulong page)
 {
     if (tlb_hit_page_anyprot(tlb_entry, page)) {
         memset(tlb_entry, -1, sizeof(*tlb_entry));
+        return true;
     }
+    return false;
 }
 
 /* Called with tlb_lock held */
@@ -279,7 +398,9 @@ static inline void tlb_flush_vtlb_page_locked(CPUArchState *env, int mmu_idx,
     assert_cpu_is_self(ENV_GET_CPU(env));
 
     for (k = 0; k < CPU_VTLB_SIZE; k++) {
-        tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page);
+        if (tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page)) {
+            tlb_n_used_entries_dec(env, mmu_idx);
+        }
     }
 }
 
@@ -306,7 +427,9 @@ static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
     addr &= TARGET_PAGE_MASK;
     qemu_spin_lock(&env->tlb_lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        tlb_flush_entry_locked(tlb_entry(env, mmu_idx, addr), addr);
+        if (tlb_flush_entry_locked(tlb_entry(env, mmu_idx, addr), addr)) {
+            tlb_n_used_entries_dec(env, mmu_idx);
+        }
         tlb_flush_vtlb_page_locked(env, mmu_idx, addr);
     }
     qemu_spin_unlock(&env->tlb_lock);
@@ -524,8 +647,9 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
     qemu_spin_lock(&env->tlb_lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         unsigned int i;
+        unsigned int n = tlb_n_entries(env, mmu_idx);
 
-        for (i = 0; i < CPU_TLB_SIZE; i++) {
+        for (i = 0; i < n; i++) {
             tlb_reset_dirty_range_locked(&env->tlb_table[mmu_idx][i], start1,
                                          length);
         }
@@ -685,6 +809,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
         /* Evict the old entry into the victim tlb.  */
         copy_tlb_helper_locked(tv, te);
         env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
+        tlb_n_used_entries_dec(env, mmu_idx);
     }
 
     /* refill the tlb */
@@ -736,6 +861,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     }
 
     copy_tlb_helper_locked(te, &tn);
+    tlb_n_used_entries_inc(env, mmu_idx);
     qemu_spin_unlock(&env->tlb_lock);
 }
 
-- 
2.17.1
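A note on what flipping TCG_TARGET_IMPLEMENTS_DYN_TLB to 1 will eventually
require of a backend: the generated fast path can no longer bake
(CPU_TLB_SIZE - 1) into the instruction stream and must instead load
tlb_mask[mmu_idx] at run time. In C terms, the lookup it has to emit might
look like the following (hypothetical helper, not part of this patch; it
assumes the fields and macros added above):

/*
 * Hypothetical rendering of the dynamic fast-path lookup; not in this
 * patch. The mask already has CPU_TLB_ENTRY_BITS pre-shifted into it,
 * so a single AND yields a byte offset into tlb_table[mmu_idx].
 */
static inline CPUTLBEntry *dyn_tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
                                         target_ulong addr)
{
    uintptr_t offset = (addr >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS))
                       & env->tlb_mask[mmu_idx];

    return (CPUTLBEntry *)((uintptr_t)env->tlb_table[mmu_idx] + offset);
}

Shifting by TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS folds the index mask and
the byte-offset scaling into one AND, which appears to be why tlb_mask
stores the pre-shifted value rather than (n_entries - 1) itself.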