Add support for the cr3 cache feature on Intel VMX CPUs. This avoids VM exits on context switch if the cr3 value is cached in one of the entries (currently 4 are present).
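For illustration only, here is a minimal user-space sketch (not part of the patch) of the guest-side lookup that gets wired into pv_mmu_ops.write_cr3. The struct layout mirrors the kvm_cr3_cache definitions added to kvm_para.h below; native_write_cr3() and kvm_hypercall_set_cr3() are printf stand-ins for the real primitives, and the cr3 values are made up:

/*
 * Sketch of the guest-side cr3 cache lookup, assuming the kvm_cr3_cache
 * layout introduced by this patch.  Build with any C99 compiler.
 */
#include <stdint.h>
#include <stdio.h>

#define KVM_CR3_CACHE_SIZE 4

struct kvm_cr3_cache_entry {
	uint64_t guest_cr3;
	uint64_t host_cr3;
};

struct kvm_cr3_cache {
	struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
	uint32_t max_idx;	/* number of entries the host honours */
};

/* Stand-in for the raw cr3 load, which does not trap when the value is cached. */
static void native_write_cr3(uint64_t host_cr3)
{
	printf("cache hit:  load host cr3 %#llx, no VM exit\n",
	       (unsigned long long)host_cr3);
}

/* Stand-in for the KVM_HYPERCALL_SET_CR3 hypercall taken on a miss. */
static void kvm_hypercall_set_cr3(uint64_t guest_cr3)
{
	printf("cache miss: hypercall for guest cr3 %#llx (one VM exit)\n",
	       (unsigned long long)guest_cr3);
}

/* The lookup the patch performs in the guest's write_cr3 path. */
static void kvm_write_cr3(struct kvm_cr3_cache *cache, uint64_t guest_cr3)
{
	uint32_t idx;

	for (idx = 0; idx < cache->max_idx; idx++) {
		if (cache->entry[idx].guest_cr3 == guest_cr3) {
			native_write_cr3(cache->entry[idx].host_cr3);
			return;
		}
	}
	kvm_hypercall_set_cr3(guest_cr3);
}

int main(void)
{
	struct kvm_cr3_cache cache = {
		.entry   = { { .guest_cr3 = 0x1000, .host_cr3 = 0x7f000 } },
		.max_idx = KVM_CR3_CACHE_SIZE,
	};

	kvm_write_cr3(&cache, 0x1000);	/* hit: no VM exit */
	kvm_write_cr3(&cache, 0x2000);	/* miss: falls back to the hypercall */
	return 0;
}

On a hit the cr3 load matches one of the hardware CR3-target values programmed by the host, so no VM exit is taken; on a miss the hypercall hands the guest cr3 to the host, which can then refill a cache entry.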
This is especially important for Xenner, where each guest syscall involves a cr3 switch.

Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
Cc: Anthony Liguori <[EMAIL PROTECTED]>

Index: kvm.paravirt/arch/x86/kernel/kvm.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kernel/kvm.c
+++ kvm.paravirt/arch/x86/kernel/kvm.c
@@ -26,14 +26,16 @@
 #include <linux/cpu.h>
 #include <linux/mm.h>
 #include <linux/hardirq.h>
+#include <asm/tlbflush.h>

 #define MAX_MULTICALL_NR (PAGE_SIZE / sizeof(struct kvm_multicall_entry))

 struct kvm_para_state {
+	struct kvm_cr3_cache cr3_cache;
 	struct kvm_multicall_entry queue[MAX_MULTICALL_NR];
 	int queue_index;
 	enum paravirt_lazy_mode mode;
-};
+} __attribute__ ((aligned(PAGE_SIZE)));

 static DEFINE_PER_CPU(struct kvm_para_state, para_state);

@@ -101,6 +103,98 @@ static void kvm_io_delay(void)
 {
 }

+static void kvm_new_cr3(unsigned long cr3)
+{
+	kvm_hypercall1(KVM_HYPERCALL_SET_CR3, cr3);
+}
+
+/*
+ * Special, register-to-cr3 instruction based hypercall API
+ * variant to the KVM host. This utilizes the cr3 filter capability
+ * of the hardware - if this works out then no VM exit happens,
+ * if a VM exit happens then KVM will get the virtual address too.
+ */
+static void kvm_write_cr3(unsigned long guest_cr3)
+{
+	struct kvm_para_state *para_state = &get_cpu_var(para_state);
+	struct kvm_cr3_cache *cache = &para_state->cr3_cache;
+	int idx;
+
+	/*
+	 * Check the cache (maintained by the host) for a matching
+	 * guest_cr3 => host_cr3 mapping. Use it if found:
+	 */
+	for (idx = 0; idx < cache->max_idx; idx++) {
+		if (cache->entry[idx].guest_cr3 == guest_cr3) {
+			/*
+			 * Cache-hit: we load the cached host-CR3 value.
+			 * This never causes any VM exit. (if it does then the
+			 * hypervisor could do nothing with this instruction
+			 * and the guest OS would be aborted)
+			 */
+			native_write_cr3(cache->entry[idx].host_cr3);
+			goto out;
+		}
+	}
+
+	/*
+	 * Cache-miss. Tell the host the new cr3 via hypercall (to avoid
+	 * aliasing problems with a cached host_cr3 == guest_cr3).
+	 */
+	kvm_new_cr3(guest_cr3);
+out:
+	put_cpu_var(para_state);
+}
+
+/*
+ * Avoid the VM exit upon cr3 load by using the cached
+ * ->active_mm->pgd value:
+ */
+static void kvm_flush_tlb_user(void)
+{
+	kvm_write_cr3(__pa(current->active_mm->pgd));
+}
+
+/*
+ * Disable global pages, do a flush, then enable global pages:
+ */
+static fastcall void kvm_flush_tlb_kernel(void)
+{
+	unsigned long orig_cr4 = read_cr4();
+
+	write_cr4(orig_cr4 & ~X86_CR4_PGE);
+	kvm_flush_tlb_user();
+	write_cr4(orig_cr4);
+}
+
+static void register_cr3_cache(void *cache)
+{
+	struct kvm_para_state *state;
+
+	state = &per_cpu(para_state, raw_smp_processor_id());
+	wrmsrl(KVM_MSR_SET_CR3_CACHE, __pa(&state->cr3_cache));
+}
+
+static unsigned __init kvm_patch(u8 type, u16 clobbers, void *ibuf,
+				 unsigned long addr, unsigned len)
+{
+	switch (type) {
+	case PARAVIRT_PATCH(pv_mmu_ops.write_cr3):
+		return paravirt_patch_default(type, clobbers, ibuf, addr, len);
+	default:
+		return native_patch(type, clobbers, ibuf, addr, len);
+	}
+}
+
+static void __init setup_guest_cr3_cache(void)
+{
+	on_each_cpu(register_cr3_cache, NULL, 0, 1);
+
+	pv_mmu_ops.write_cr3 = kvm_write_cr3;
+	pv_mmu_ops.flush_tlb_user = kvm_flush_tlb_user;
+	pv_mmu_ops.flush_tlb_kernel = kvm_flush_tlb_kernel;
+}
+
 static void kvm_mmu_write(void *dest, const void *src, size_t size)
 {
 	const uint8_t *p = src;
@@ -117,6 +211,28 @@ static void kvm_mmu_write(void *dest, co
 }

 /*
+ * CR3 cache initialization uses on_each_cpu(), so it can't
+ * happen at kvm_guest_init time.
+ */
+int __init kvm_cr3_cache_init(void)
+{
+	unsigned long flags;
+
+	if (!kvm_para_available())
+		return -ENOSYS;
+
+	if (kvm_para_has_feature(KVM_FEATURE_CR3_CACHE)) {
+		setup_guest_cr3_cache();
+		local_irq_save(flags);
+		apply_paravirt(__parainstructions, __parainstructions_end);
+		local_irq_restore(flags);
+	}
+
+	return 0;
+}
+module_init(kvm_cr3_cache_init);
+
+/*
  * We only need to hook operations that are MMU writes. We hook these so that
  * we can use lazy MMU mode to batch these operations.  We could probably
 * improve the performance of the host code if we used some of the information
@@ -236,6 +352,9 @@ static void paravirt_ops_setup(void)
 		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
 		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
 	}
+
+	if (kvm_para_has_feature(KVM_FEATURE_CR3_CACHE))
+		pv_init_ops.patch = kvm_patch;
 }

 void __init kvm_guest_init(void)
Index: kvm.paravirt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/mmu.c
+++ kvm.paravirt/arch/x86/kvm/mmu.c
@@ -258,6 +258,16 @@ static int mmu_topup_memory_cache(struct
 	}
 	return 0;
 }
+static void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cr3_cache *cache;
+
+	if (!vcpu->arch.cr3_cache)
+		return;
+	cache = vcpu->arch.cr3_cache;
+	memset(cache->entry, 0, sizeof(cache->entry));
+}
+

 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 {
@@ -978,7 +988,7 @@ static void nonpaging_new_cr3(struct kvm
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			gfn_t gfn, struct page *page, int level)
 {
-	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+	hpa_t table_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
 	int pt_write = 0;

 	for (; ; level--) {
@@ -1058,49 +1068,71 @@ static void nonpaging_prefetch_page(stru

 static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
-	int i;
+	int i, j;
 	struct kvm_mmu_page *sp;

-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		return;
+	/*
+	 * Skip to the next cr3 filter entry and free it (if it's occupied).
+	 */
+	vcpu->arch.cr3_cache_idx++;
+	if (unlikely(vcpu->arch.cr3_cache_idx >= vcpu->arch.cr3_cache_limit))
+		vcpu->arch.cr3_cache_idx = 0;
+
+	j = vcpu->arch.cr3_cache_idx;
+	/*
+	 * Clear the guest-visible entry.
+	 */
+	if (vcpu->arch.cr3_cache) {
+		vcpu->arch.cr3_cache->entry[j].guest_cr3 = 0;
+		vcpu->arch.cr3_cache->entry[j].host_cr3 = 0;
+	}
 	spin_lock(&vcpu->kvm->mmu_lock);
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
+		hpa_t root = vcpu->arch.mmu.root_hpa[j];
+
+		if (!VALID_PAGE(root)) {
+			spin_unlock(&vcpu->kvm->mmu_lock);
+			return;
+		}
 		sp = page_header(root);
 		--sp->root_count;
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+		vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
 	}
 #endif
-	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-		if (root) {
-			root &= PT64_BASE_ADDR_MASK;
-			sp = page_header(root);
-			--sp->root_count;
+	ASSERT(vcpu->arch.mmu.pae_root[j]);
+	if (VALID_PAGE(vcpu->arch.mmu.pae_root[j][0])) {
+		for (i = 0; i < 4; ++i) {
+			hpa_t root = vcpu->arch.mmu.pae_root[j][i];
+
+			if (root) {
+				root &= PT64_BASE_ADDR_MASK;
+				sp = page_header(root);
+				--sp->root_count;
+			}
+			vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
 		}
-		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 	}
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
 }

 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
-	int i;
+	int i, j;
 	gfn_t root_gfn;
 	struct kvm_mmu_page *sp;
 	int metaphysical = 0;

 	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+	j = vcpu->arch.cr3_cache_idx;
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
+		hpa_t root = vcpu->arch.mmu.root_hpa[j];

 		ASSERT(!VALID_PAGE(root));
 		if (tdp_enabled)
@@ -1110,7 +1142,7 @@ static void mmu_alloc_roots(struct kvm_v
 				      ACC_ALL, NULL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
-		vcpu->arch.mmu.root_hpa = root;
+		vcpu->arch.mmu.root_hpa[j] = root;
 		return;
 	}
 #endif
@@ -1118,7 +1150,7 @@ static void mmu_alloc_roots(struct kvm_v
 	if (tdp_enabled)
 		metaphysical = 1;
 	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
+		hpa_t root = vcpu->arch.mmu.pae_root[j][i];

 		ASSERT(!VALID_PAGE(root));
 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
@@ -1134,9 +1166,9 @@ static void mmu_alloc_roots(struct kvm_v
 				      ACC_ALL, NULL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
-		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+		vcpu->arch.mmu.pae_root[j][i] = root | PT_PRESENT_MASK;
 	}
-	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+	vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
 }

 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -1156,7 +1188,7 @@ static int nonpaging_page_fault(struct k
 		return r;

 	ASSERT(vcpu);
-	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));

 	gfn = gva >> PAGE_SHIFT;

@@ -1196,12 +1228,19 @@ static int tdp_page_fault(struct kvm_vcp

 static void nonpaging_free(struct kvm_vcpu *vcpu)
 {
-	mmu_free_roots(vcpu);
+	int j;
+
+	/*
+	 * This will cycle through all existing roots and free them.
+	 */
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+		mmu_free_roots(vcpu);
 }

 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;

 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = nonpaging_page_fault;
@@ -1210,7 +1249,8 @@ static int nonpaging_init_context(struct
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 	return 0;
 }

@@ -1249,6 +1289,7 @@ static void paging_free(struct kvm_vcpu
 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;

 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
@@ -1258,7 +1299,8 @@ static int paging64_init_context_common(
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 	return 0;
 }

@@ -1270,6 +1312,7 @@ static int paging64_init_context(struct
 static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;

 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
@@ -1278,7 +1321,8 @@ static int paging32_init_context(struct
 	context->prefetch_page = paging32_prefetch_page;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 	return 0;
 }

@@ -1290,13 +1334,15 @@ static int paging32E_init_context(struct
 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;

 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = tdp_page_fault;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->shadow_root_level = TDP_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;

 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -1318,7 +1364,7 @@ static int init_kvm_tdp_mmu(struct kvm_v
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));

 	if (!is_paging(vcpu))
 		return nonpaging_init_context(vcpu);
@@ -1340,11 +1386,14 @@ static int init_kvm_mmu(struct kvm_vcpu

 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
+	int j;
 	ASSERT(vcpu);
-	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
-		vcpu->arch.mmu.free(vcpu);
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	}
+
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+		if (VALID_PAGE(vcpu->arch.mmu.root_hpa[j])) {
+			vcpu->arch.mmu.free(vcpu);
+			vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
+		}
 }

 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -1357,6 +1406,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_reset_context)
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
 	int r;
+	int j = vcpu->arch.cr3_cache_idx;

 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -1365,8 +1415,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	kvm_mmu_free_some_pages(vcpu);
 	mmu_alloc_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
-	kvm_mmu_flush_tlb(vcpu);
+	/* setting CR3 will flush the TLB */
+	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa[j]);
 out:
 	return r;
 }
@@ -1374,7 +1424,9 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);

 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
-	mmu_free_roots(vcpu);
+	int j;
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+		mmu_free_roots(vcpu);
 }

 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
@@ -1546,6 +1598,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
 			kvm_mmu_zap_page(vcpu->kvm, sp);
+			kvm_cr3_cache_clear(vcpu);
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -1607,6 +1660,8 @@ int kvm_mmu_unprotect_page_virt(struct k
 	spin_lock(&vcpu->kvm->mmu_lock);
 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (r)
+		kvm_cr3_cache_clear(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	return r;
 }
@@ -1619,6 +1674,7 @@ void __kvm_mmu_free_some_pages(struct kv
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
 		kvm_mmu_zap_page(vcpu->kvm, sp);
+		kvm_cr3_cache_clear(vcpu);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
 }
@@ -1669,19 +1725,24 @@ EXPORT_SYMBOL_GPL(kvm_enable_tdp);
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_page *sp;
+	int j;

 	while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
 				  struct kvm_mmu_page, link);
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 	}
-	free_page((unsigned long)vcpu->arch.mmu.pae_root);
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+		ASSERT(vcpu->arch.mmu.pae_root[j]);
+		free_page((unsigned long)vcpu->arch.mmu.pae_root[j]);
+		vcpu->arch.mmu.pae_root[j] = NULL;
+	}
 }

 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	struct page *page;
-	int i;
+	int i, j;

 	ASSERT(vcpu);

@@ -1691,17 +1752,23 @@ static int alloc_mmu_pages(struct kvm_vc
 	else
 		vcpu->kvm->arch.n_free_mmu_pages =
 					vcpu->kvm->arch.n_alloc_mmu_pages;
-	/*
-	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
-	 * Therefore we need to allocate shadow page tables in the first
-	 * 4GB of memory, which happens to fit the DMA32 zone.
-	 */
-	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
-	if (!page)
-		goto error_1;
-	vcpu->arch.mmu.pae_root = page_address(page);
-	for (i = 0; i < 4; ++i)
-		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+		/*
+		 * When emulating 32-bit mode, cr3 is only 32 bits even on
+		 * x86_64. Therefore we need to allocate shadow page tables
+		 * in the first 4GB of memory, which happens to fit the DMA32
+		 * zone.
+		 */
+		page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+		if (!page)
+			goto error_1;
+
+		ASSERT(!vcpu->arch.mmu.pae_root[j]);
+		vcpu->arch.mmu.pae_root[j] = page_address(page);
+		for (i = 0; i < 4; ++i)
+			vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
+	}

 	return 0;

@@ -1713,7 +1780,7 @@ error_1:
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));

 	return alloc_mmu_pages(vcpu);
 }
@@ -1721,7 +1788,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu
 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));

 	return init_kvm_mmu(vcpu);
 }
@@ -1881,15 +1948,16 @@ static void audit_mappings(struct kvm_vc
 {
 	unsigned i;

-	if (vcpu->arch.mmu.root_level == 4)
+	if (vcpu->arch.mmu.root_level == 4) {
 		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
+		return;
+	}
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
 		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+			if (vcpu->arch.mmu.pae_root[j][i] & PT_PRESENT_MASK)
 				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
+					vcpu->arch.mmu.pae_root[j][i], i << 30, 2);
+	}
 }

 static int count_rmaps(struct kvm_vcpu *vcpu)
Index: kvm.paravirt/arch/x86/kvm/mmu.h
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/mmu.h
+++ kvm.paravirt/arch/x86/kvm/mmu.h
@@ -17,7 +17,8 @@ static inline void kvm_mmu_free_some_pag

 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
-	if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+	int idx = vcpu->arch.cr3_cache_idx;
+	if (likely(vcpu->arch.mmu.root_hpa[idx] != INVALID_PAGE))
 		return 0;

 	return kvm_mmu_load(vcpu);
Index: kvm.paravirt/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.paravirt/arch/x86/kvm/paging_tmpl.h
@@ -283,10 +283,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 	if (!is_present_pte(walker->ptes[walker->level - 1]))
 		return NULL;

-	shadow_addr = vcpu->arch.mmu.root_hpa;
+	shadow_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
 	level = vcpu->arch.mmu.shadow_root_level;
 	if (level == PT32E_ROOT_LEVEL) {
-		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+		shadow_addr = vcpu->arch.mmu.pae_root[vcpu->arch.cr3_cache_idx][(addr >> 30) & 3];
 		shadow_addr &= PT64_BASE_ADDR_MASK;
 		--level;
 	}
Index: kvm.paravirt/arch/x86/kvm/vmx.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/vmx.c
+++ kvm.paravirt/arch/x86/kvm/vmx.c
@@ -216,6 +216,10 @@ static inline int cpu_has_vmx_vpid(void)
 	return (vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_ENABLE_VPID);
 }
+static inline bool cpu_has_cr3_cache(void)
+{
+	return true;
+}

 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
@@ -785,6 +789,30 @@ static int vmx_get_msr(struct kvm_vcpu *
 	return 0;
 }

+int vmx_cr3_cache_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct page *page;
+	hva_t cr3_cache_hva;
+
+	if (data != PAGE_ALIGN(data) || vcpu->arch.cr3_cache)
+		return -EINVAL;
+
+	down_read(&current->mm->mmap_sem);
+	page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		return -EINVAL;
+	}
+
+	cr3_cache_hva = (hva_t)__va(page_to_phys(page));
+	vcpu->arch.cr3_cache = (void *)cr3_cache_hva;
+	vcpu->arch.cr3_cache->max_idx = vcpu->arch.cr3_cache_limit;
+
+	return 0;
+}
+
 /*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -824,6 +852,9 @@ static int vmx_set_msr(struct kvm_vcpu *
 	case MSR_IA32_TIME_STAMP_COUNTER:
 		guest_write_tsc(data);
 		break;
+	case KVM_MSR_SET_CR3_CACHE:
+		ret = vmx_cr3_cache_msr(vcpu, data);
+		break;
 	default:
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
@@ -1322,10 +1353,23 @@ static void vmx_set_cr0(struct kvm_vcpu

 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+	struct kvm_cr3_cache *cache;
+	int idx;
+
 	vmx_flush_tlb(vcpu);
 	vmcs_writel(GUEST_CR3, cr3);
 	if (vcpu->arch.cr0 & X86_CR0_PE)
 		vmx_fpu_deactivate(vcpu);
+
+	if (!vcpu->arch.cr3_cache)
+		return;
+
+	idx = vcpu->arch.cr3_cache_idx;
+	cache = vcpu->arch.cr3_cache;
+
+	cache->entry[idx].host_cr3 = cr3;
+	cache->entry[idx].guest_cr3 = vcpu->arch.cr3;
+	vmcs_writel(CR3_TARGET_VALUE0 + idx*2, cr3);
 }

 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1503,6 +1547,39 @@ out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
 }
+/*
+ * Set up the cr3 validity hardware cache.
+ */
+static void vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu)
+{
+	unsigned int cr3_target_values, i;
+	u64 msr_val;
+
+	rdmsrl(MSR_IA32_VMX_MISC, msr_val);
+
+	printk("MSR_IA32_VMX_MISC: %016Lx\n", msr_val);
+
+	/*
+	 * 9 bits of "CR3 target values":
+	 */
+	cr3_target_values = (msr_val >> 16) & ((1 << 10) - 1);
+	printk(" cr3 target values: %d\n", cr3_target_values);
+	if (cr3_target_values > KVM_CR3_CACHE_SIZE) {
+		printk("KVM: limiting cr3 cache size from %d to %d\n",
+			cr3_target_values, KVM_CR3_CACHE_SIZE);
+		cr3_target_values = KVM_CR3_CACHE_SIZE;
+	}
+
+	vcpu->arch.cr3_cache_idx = 0;
+	vcpu->arch.cr3_cache_limit = cr3_target_values;
+	/*
+	 * Initialize. TODO: set this to guest physical memory.
+	 */
+	for (i = 0; i < cr3_target_values; i++)
+		vmcs_writel(CR3_TARGET_VALUE0 + i*2, -1UL);
+
+	vmcs_write32(CR3_TARGET_COUNT, cr3_target_values);
+}

 static void seg_setup(int seg)
 {
@@ -1599,7 +1676,7 @@ static int vmx_vcpu_setup(struct vcpu_vm
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);

-	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
+	vmcs_setup_cr3_cache(&vmx->vcpu);

 	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
 	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
@@ -2396,6 +2473,56 @@ static void fixup_rmode_irq(struct vcpu_
 		| vmx->rmode.irq.vector;
 }

+static void kvm_cr3_cache_sync(struct kvm_vcpu *vcpu)
+{
+	void *guest_cr3_hva;
+	hpa_t guest_cr3_hpa;
+	struct kvm_cr3_cache *cache;
+	int j;
+	int idx = vcpu->arch.cr3_cache_idx;
+
+	if (!vcpu->arch.cr3_cache)
+		return;
+
+	guest_cr3_hpa = vmcs_readl(GUEST_CR3);
+	/*
+	 * Are they in sync already?
+	 */
+	if (guest_cr3_hpa == vcpu->arch.mmu.root_hpa[idx])
+		return;
+
+	cache = vcpu->arch.cr3_cache;
+#ifdef CONFIG_X86_64
+	if (vcpu->arch.mmu.shadow_root_level == 4) {
+		for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+			hpa_t root = cache->entry[j].host_cr3;
+			if (root != guest_cr3_hpa)
+				continue;
+			vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+			vcpu->arch.cr3_cache_idx = j;
+			vcpu->arch.mmu.root_hpa[j] = cache->entry[j].host_cr3;
+			++vcpu->stat.cr3_cache_synced;
+			return;
+		}
+		WARN_ON(j == KVM_CR3_CACHE_SIZE-1);
+	}
+#endif
+
+	guest_cr3_hva = __va(guest_cr3_hpa);
+	for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+		u64 *root = vcpu->arch.mmu.pae_root[j];
+		WARN_ON(!root);
+		if (root != guest_cr3_hva)
+			continue;
+		vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+		vcpu->arch.cr3_cache_idx = j;
+		vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
+		++vcpu->stat.cr3_cache_synced;
+		return;
+	}
+	WARN_ON(j == KVM_CR3_CACHE_SIZE-1);
+}
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2406,6 +2533,8 @@ static void vmx_vcpu_run(struct kvm_vcpu
 	 */
 	vmcs_writel(HOST_CR0, read_cr0());

+	WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
+
 	asm(
 		/* Store host registers */
 #ifdef CONFIG_X86_64
@@ -2520,6 +2649,12 @@ static void vmx_vcpu_run(struct kvm_vcpu
 	      , "ebx", "edi", "rsi"
 #endif
 	      );
+	/*
+	 * Figure out whether vcpu->cr3 needs updating because
+	 * the guest made use of the cr3 cache.
+	 */
+	kvm_cr3_cache_sync(vcpu);
+	WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);

 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 	if (vmx->rmode.irq.pending)
@@ -2552,11 +2687,16 @@ static void vmx_free_vmcs(struct kvm_vcp
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *page = NULL;

 	spin_lock(&vmx_vpid_lock);
 	if (vmx->vpid != 0)
 		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
 	spin_unlock(&vmx_vpid_lock);
+	if (vcpu->arch.cr3_cache) {
+		page = virt_to_page(vcpu->arch.cr3_cache);
+		kvm_release_page_dirty(page);
+	}
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->host_msrs);
 	kfree(vmx->guest_msrs);
@@ -2641,6 +2781,7 @@ static struct kvm_x86_ops vmx_x86_ops =
 	.hardware_enable = hardware_enable,
 	.hardware_disable = hardware_disable,
 	.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+	.cpu_has_cr3_cache = cpu_has_cr3_cache,

 	.vcpu_create = vmx_create_vcpu,
 	.vcpu_free = vmx_free_vcpu,
Index: kvm.paravirt/arch/x86/kvm/x86.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/x86.c
+++ kvm.paravirt/arch/x86/kvm/x86.c
@@ -80,6 +80,7 @@ struct kvm_stats_debugfs_item debugfs_en
 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 	{ "multicall", VCPU_STAT(multicall) },
 	{ "multicall_nr", VCPU_STAT(multicall_nr) },
+	{ "cr3_cache_synced", VCPU_STAT(cr3_cache_synced) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -766,10 +767,13 @@ long kvm_arch_dev_ioctl(struct file *fil
 	}
 	case KVM_GET_PARA_FEATURES: {
 		__u32 para_features = KVM_PARA_FEATURES;
+
 		if (tdp_enabled) {
 			para_features &= ~(1UL << KVM_FEATURE_MMU_WRITE);
 			para_features &= ~(1UL << KVM_FEATURE_MULTICALL);
 		}
+		if (!kvm_x86_ops->cpu_has_cr3_cache())
+			para_features &= ~(1UL << KVM_FEATURE_CR3_CACHE);

 		r = -EFAULT;
 		if (copy_to_user(argp, &para_features, sizeof para_features))
@@ -2321,6 +2325,12 @@ static int kvm_hypercall_release_pt(stru
 	return 0;
 }

+static int kvm_hypercall_set_cr3(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+	set_cr3(vcpu, cr3);
+	return 0;
+}
+
 static int dispatch_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
 			      unsigned long a0, unsigned long a1,
 			      unsigned long a2, unsigned long a3)
@@ -2334,6 +2344,8 @@ static int dispatch_hypercall(struct kvm
 		return kvm_hypercall_flush_tlb(vcpu);
 	case KVM_HYPERCALL_RELEASE_PT:
 		return kvm_hypercall_release_pt(vcpu, a0);
+	case KVM_HYPERCALL_SET_CR3:
+		return kvm_hypercall_set_cr3(vcpu, a0);
 	}

 	return -KVM_ENOSYS;
@@ -3245,12 +3257,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *
 {
 	struct page *page;
 	struct kvm *kvm;
-	int r;
+	int r, i;

 	BUG_ON(vcpu->kvm == NULL);
 	kvm = vcpu->kvm;

-	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		vcpu->arch.mmu.root_hpa[i] = INVALID_PAGE;
 	if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
 		vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
 	else
Index: kvm.paravirt/include/asm-x86/kvm_host.h
===================================================================
--- kvm.paravirt.orig/include/asm-x86/kvm_host.h
+++ kvm.paravirt/include/asm-x86/kvm_host.h
@@ -181,11 +181,11 @@ struct kvm_mmu {
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
-	hpa_t root_hpa;
+	hpa_t root_hpa[KVM_CR3_CACHE_SIZE];
 	int root_level;
 	int shadow_root_level;

-	u64 *pae_root;
+	u64 *pae_root[KVM_CR3_CACHE_SIZE];
 };

 struct kvm_vcpu_arch {
@@ -199,6 +199,9 @@ struct kvm_vcpu_arch {
 	unsigned long cr0;
 	unsigned long cr2;
 	unsigned long cr3;
+	struct kvm_cr3_cache *cr3_cache;
+	unsigned int cr3_cache_idx;
+	unsigned int cr3_cache_limit;
 	unsigned long cr4;
 	unsigned long cr8;
 	u64 pdptrs[4]; /* pae */
@@ -323,6 +326,7 @@ struct kvm_vcpu_stat {
 	u32 insn_emulation_fail;
 	u32 multicall;
 	u32 multicall_nr;
+	u32 cr3_cache_synced;
 };

 struct descriptor_table {
@@ -339,6 +343,7 @@ struct kvm_x86_ops {
 	int (*hardware_setup)(void);               /* __init */
 	void (*hardware_unsetup)(void);            /* __exit */
 	bool (*cpu_has_accelerated_tpr)(void);
+	bool (*cpu_has_cr3_cache)(void);

 	/* Create, but do not attach this VCPU */
 	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
Index: kvm.paravirt/include/asm-x86/kvm_para.h
===================================================================
--- kvm.paravirt.orig/include/asm-x86/kvm_para.h
+++ kvm.paravirt/include/asm-x86/kvm_para.h
@@ -8,6 +8,7 @@
 #define KVM_FEATURE_NOP_IO_DELAY	0
 #define KVM_FEATURE_MMU_WRITE		1
 #define KVM_FEATURE_MULTICALL		2
+#define KVM_FEATURE_CR3_CACHE		3

 /* This CPUID returns a feature bitmap in eax.  Before enabling a particular
  * paravirtualization, the appropriate feature bit should be checked.
@@ -19,7 +20,10 @@

 #define KVM_PARA_FEATURES ((1UL << KVM_FEATURE_NOP_IO_DELAY) |	\
 			   (1UL << KVM_FEATURE_MMU_WRITE) |	\
-			   (1UL << KVM_FEATURE_MULTICALL))
+			   (1UL << KVM_FEATURE_MULTICALL) |	\
+			   (1UL << KVM_FEATURE_CR3_CACHE))
+
+#define KVM_MSR_SET_CR3_CACHE 0x87655678

 struct kvm_multicall_entry
 {
@@ -118,4 +122,16 @@ static inline unsigned int kvm_arch_para
 }

 #endif
+#define KVM_CR3_CACHE_SIZE 4
+
+struct kvm_cr3_cache_entry {
+	__u64 guest_cr3;
+	__u64 host_cr3;
+};
+
+struct kvm_cr3_cache {
+	struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
+	__u32 max_idx;
+};
+
 #endif
Index: kvm.paravirt/include/linux/kvm_para.h
===================================================================
--- kvm.paravirt.orig/include/linux/kvm_para.h
+++ kvm.paravirt/include/linux/kvm_para.h
@@ -20,6 +20,7 @@
 #define KVM_HYPERCALL_FLUSH_TLB		3
 #define KVM_HYPERCALL_RELEASE_PT	4
 #define KVM_HYPERCALL_MULTICALL		5
+#define KVM_HYPERCALL_SET_CR3		6

 /*
  * hypercalls use architecture specific
Index: kvm.paravirt/arch/x86/kvm/svm.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/svm.c
+++ kvm.paravirt/arch/x86/kvm/svm.c
@@ -1801,6 +1801,11 @@ static bool svm_cpu_has_accelerated_tpr(
 	return false;
 }

+static bool cpu_has_cr3_cache(void)
+{
+	return false;
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
@@ -1810,6 +1815,7 @@ static struct kvm_x86_ops svm_x86_ops =
 	.hardware_enable = svm_hardware_enable,
 	.hardware_disable = svm_hardware_disable,
 	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+	.cpu_has_cr3_cache = cpu_has_cr3_cache,

 	.vcpu_create = svm_create_vcpu,
 	.vcpu_free = svm_free_vcpu,