From: Marcelo Tosatti <[EMAIL PROTECTED]>

Add support for the cr3 cache feature on Intel VMX CPUs. This avoids
vmexits on context switch if the cr3 value is cached in one of the
entries (currently 4 are present).
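
As an illustration (not part of this patch), a paravirtualized guest could
consume the cache roughly as sketched below. The struct kvm_cr3_cache layout
is the one this patch adds to kvm_para.h; kvm_hypercall_set_cr3() is a
hypothetical wrapper around the KVM_HC_MMU_OP / KVM_MMU_OP_SET_CR3 fallback,
and the #GP path is the race handling described in the changelog.

/* Guest-side sketch: switch cr3 through the cache, else use the hypercall. */
static void guest_write_cr3(struct kvm_cr3_cache *cache, unsigned long guest_cr3)
{
	unsigned int i;

	for (i = 0; i < cache->max_idx; i++) {
		if (cache->entry[i].guest_cr3 != guest_cr3)
			continue;
		/*
		 * host_cr3 matches one of the VMCS CR3-target values, so this
		 * mov does not vmexit.  If the host invalidated the entry in
		 * the meantime, the injected #GP is trapped by the guest and
		 * the hypercall fallback below is used instead.
		 */
		asm volatile("mov %0, %%cr3"
			     : : "r" ((unsigned long)cache->entry[i].host_cr3)
			     : "memory");
		return;
	}
	kvm_hypercall_set_cr3(guest_cr3);	/* hypothetical fallback helper */
}
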
This is especially important for Xenner, where each guest syscall
involves a cr3 switch.

v1->v2:
- handle the race which happens when the guest has the cache cleared in
  the middle of kvm_write_cr3, by injecting a GP and trapping it to fall
  back to the hypercall variant (suggested by Avi).

v2->v3:
- one ioctl per paravirt feature

v3->v4:
- disable if tdp enabled

Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
Signed-off-by: Avi Kivity <[EMAIL PROTECTED]>

---
 arch/x86/kvm/mmu.c         |  196 +++++++++++++++++++++++++++++++------------
 arch/x86/kvm/mmu.h         |    3 +-
 arch/x86/kvm/paging_tmpl.h |    4 +-
 arch/x86/kvm/svm.c         |    6 ++
 arch/x86/kvm/vmx.c         |  152 +++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c         |    9 ++-
 include/asm-x86/kvm_host.h |    9 ++-
 include/asm-x86/kvm_para.h |   21 +++++
 include/linux/kvm.h        |    1 +
 9 files changed, 332 insertions(+), 69 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 14de7dc..11bca62 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -272,6 +272,16 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 	return 0;
 }
 
+static void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cr3_cache *cache;
+
+	if (!vcpu->arch.cr3_cache)
+		return;
+	cache = vcpu->arch.cr3_cache;
+	memset(cache->entry, 0, sizeof(cache->entry));
+}
+
 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 {
 	while (mc->nobjs)
@@ -1127,7 +1137,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			int largepage, gfn_t gfn, struct page *page,
 			int level)
 {
-	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+	hpa_t table_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
 	int pt_write = 0;
 
 	for (; ; level--) {
@@ -1219,53 +1229,75 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 
 static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
-	int i;
+	int i, j;
 	struct kvm_mmu_page *sp;
 
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		return;
+	/*
+	 * Skip to the next cr3 filter entry and free it (if it's occupied).
+	 */
+	vcpu->arch.cr3_cache_idx++;
+	if (unlikely(vcpu->arch.cr3_cache_idx >= vcpu->arch.cr3_cache_limit))
+		vcpu->arch.cr3_cache_idx = 0;
+
+	j = vcpu->arch.cr3_cache_idx;
+	/*
+	 * Clear the guest-visible entry.
+	 */
+	if (vcpu->arch.cr3_cache) {
+		vcpu->arch.cr3_cache->entry[j].guest_cr3 = 0;
+		vcpu->arch.cr3_cache->entry[j].host_cr3 = 0;
+	}
 	spin_lock(&vcpu->kvm->mmu_lock);
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
+		hpa_t root = vcpu->arch.mmu.root_hpa[j];
+
+		if (!VALID_PAGE(root)) {
+			spin_unlock(&vcpu->kvm->mmu_lock);
+			return;
+		}
 
 		sp = page_header(root);
 		--sp->root_count;
 		if (!sp->root_count && sp->role.invalid)
 			kvm_mmu_zap_page(vcpu->kvm, sp);
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+		vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
 	}
 #endif
-	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-		if (root) {
-			root &= PT64_BASE_ADDR_MASK;
-			sp = page_header(root);
-			--sp->root_count;
-			if (!sp->root_count && sp->role.invalid)
-				kvm_mmu_zap_page(vcpu->kvm, sp);
+	ASSERT(vcpu->arch.mmu.pae_root[j]);
+	if (VALID_PAGE(vcpu->arch.mmu.pae_root[j][0])) {
+		for (i = 0; i < 4; ++i) {
+			hpa_t root = vcpu->arch.mmu.pae_root[j][i];
+
+			if (root) {
+				root &= PT64_BASE_ADDR_MASK;
+				sp = page_header(root);
+				--sp->root_count;
+				if (!sp->root_count && sp->role.invalid)
+					kvm_mmu_zap_page(vcpu->kvm, sp);
+			}
+			vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
 		}
-		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 	}
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
 }
 
 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
-	int i;
+	int i, j;
 	gfn_t root_gfn;
 	struct kvm_mmu_page *sp;
 	int metaphysical = 0;
 
 	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+	j = vcpu->arch.cr3_cache_idx;
 
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
+		hpa_t root = vcpu->arch.mmu.root_hpa[j];
 
 		ASSERT(!VALID_PAGE(root));
 		if (tdp_enabled)
@@ -1275,7 +1307,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
-		vcpu->arch.mmu.root_hpa = root;
+		vcpu->arch.mmu.root_hpa[j] = root;
 		return;
 	}
 #endif
@@ -1283,7 +1315,7 @@
 	if (tdp_enabled)
 		metaphysical = 1;
 	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
+		hpa_t root = vcpu->arch.mmu.pae_root[j][i];
 
 		ASSERT(!VALID_PAGE(root));
 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
@@ -1299,9 +1331,9 @@
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
-		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+		vcpu->arch.mmu.pae_root[j][i] = root | PT_PRESENT_MASK;
 	}
-	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+	vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
 }
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -1321,7 +1353,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 		return r;
 
 	ASSERT(vcpu);
-	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa[j]));
 
 	gfn = gva >> PAGE_SHIFT;
 
@@ -1367,12 +1399,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)
 {
-	mmu_free_roots(vcpu);
+	int j;
+
+	/*
+	 * This will cycle through all existing roots and free them.
+	 */
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+		mmu_free_roots(vcpu);
 }
 
 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;
 
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = nonpaging_page_fault;
@@ -1381,7 +1420,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 	return 0;
 }
 
@@ -1420,6 +1460,7 @@ static void paging_free(struct kvm_vcpu *vcpu)
 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;
 
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
@@ -1429,7 +1470,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 	return 0;
 }
 
@@ -1441,6 +1483,7 @@ static int paging64_init_context(struct kvm_vcpu *vcpu)
 static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;
 
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
@@ -1449,7 +1492,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->prefetch_page = paging32_prefetch_page;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 	return 0;
 }
 
@@ -1461,13 +1505,15 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;
 
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = tdp_page_fault;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->shadow_root_level = TDP_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -1489,7 +1535,7 @@
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
 	if (!is_paging(vcpu))
 		return nonpaging_init_context(vcpu);
@@ -1511,11 +1557,14 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
+	int j;
 	ASSERT(vcpu);
-	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
-		vcpu->arch.mmu.free(vcpu);
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	}
+
+	for(j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+		if (VALID_PAGE(vcpu->arch.mmu.root_hpa[j])) {
+			vcpu->arch.mmu.free(vcpu);
+			vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
+		}
 }
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -1528,6 +1577,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
 	int r;
+	int j = vcpu->arch.cr3_cache_idx;
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -1536,8 +1586,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	kvm_mmu_free_some_pages(vcpu);
 	mmu_alloc_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
-	kvm_mmu_flush_tlb(vcpu);
+	/* setting CR3 will flush the TLB */
+	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa[j]);
 out:
 	return r;
 }
@@ -1545,7 +1595,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
-	mmu_free_roots(vcpu);
+	int j;
+
+	kvm_cr3_cache_clear(vcpu);
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+		mmu_free_roots(vcpu);
 }
 
 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
@@ -1727,6 +1781,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
 			kvm_mmu_zap_page(vcpu->kvm, sp);
+			kvm_cr3_cache_clear(vcpu);
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -1788,6 +1843,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (r)
+		kvm_cr3_cache_clear(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	return r;
 }
@@ -1800,6 +1857,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
 		kvm_mmu_zap_page(vcpu->kvm, sp);
+		kvm_cr3_cache_clear(vcpu);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
 }
@@ -1850,19 +1908,24 @@ EXPORT_SYMBOL_GPL(kvm_enable_tdp);
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_page *sp;
+	int j;
 
 	while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
 				  struct kvm_mmu_page, link);
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 	}
-	free_page((unsigned long)vcpu->arch.mmu.pae_root);
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+		ASSERT(vcpu->arch.mmu.pae_root[j]);
+		free_page((unsigned long)vcpu->arch.mmu.pae_root[j]);
+		vcpu->arch.mmu.pae_root[j] = NULL;
+	}
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	struct page *page;
-	int i;
+	int i, j;
 
 	ASSERT(vcpu);
 
@@ -1872,17 +1935,23 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 	else
 		vcpu->kvm->arch.n_free_mmu_pages =
 					vcpu->kvm->arch.n_alloc_mmu_pages;
-	/*
-	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
-	 * Therefore we need to allocate shadow page tables in the first
-	 * 4GB of memory, which happens to fit the DMA32 zone.
-	 */
-	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
-	if (!page)
-		goto error_1;
-	vcpu->arch.mmu.pae_root = page_address(page);
-	for (i = 0; i < 4; ++i)
-		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+		/*
+		 * When emulating 32-bit mode, cr3 is only 32 bits even on
+		 * x86_64. Therefore we need to allocate shadow page tables
+		 * in the first 4GB of memory, which happens to fit the DMA32
+		 * zone.
+		 */
+		page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+		if (!page)
+			goto error_1;
+
+		ASSERT(!vcpu->arch.mmu.pae_root[j]);
+		vcpu->arch.mmu.pae_root[j] = page_address(page);
+		for (i = 0; i < 4; ++i)
+			vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
+	}
 
 	return 0;
 
@@ -1894,7 +1963,7 @@ error_1:
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
 	return alloc_mmu_pages(vcpu);
 }
@@ -1902,7 +1971,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
 	return init_kvm_mmu(vcpu);
 }
@@ -2091,6 +2160,15 @@ static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
 			return 0;
 		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
 	}
+	case KVM_MMU_OP_SET_CR3: {
+		struct kvm_mmu_op_set_cr3 *scr3;
+
+		scr3 = pv_mmu_read_buffer(buffer, sizeof *scr3);
+		if (!scr3)
+			return 0;
+		kvm_set_cr3(vcpu, scr3->cr3);
+		return 1;
+	}
 	default: return 0;
 	}
 }
@@ -2188,15 +2266,17 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
 {
 	unsigned i;
 
-	if (vcpu->arch.mmu.root_level == 4)
+	if (vcpu->arch.mmu.root_level == 4) {
 		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
+		return;
+	}
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
 		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+			if (vcpu->arch.mmu.pae_root[j][i] & PT_PRESENT_MASK)
 				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
+						    vcpu->arch.mmu.pae_root[j][i],
+						    i << 30, 2);
+	}
 }
 
 static int count_rmaps(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e64e9f5..77f6882 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -17,7 +17,8 @@ static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
-	if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+	int idx = vcpu->arch.cr3_cache_idx;
+	if (likely(vcpu->arch.mmu.root_hpa[idx] != INVALID_PAGE))
 		return 0;
 
 	return kvm_mmu_load(vcpu);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 17f9d16..3163c31 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -285,10 +285,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (!is_present_pte(walker->ptes[walker->level - 1]))
 		return NULL;
 
-	shadow_addr = vcpu->arch.mmu.root_hpa;
+	shadow_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
 	level = vcpu->arch.mmu.shadow_root_level;
 	if (level == PT32E_ROOT_LEVEL) {
-		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+		shadow_addr = vcpu->arch.mmu.pae_root[vcpu->arch.cr3_cache_idx][(addr >> 30) & 3];
 		shadow_addr &= PT64_BASE_ADDR_MASK;
 		--level;
 	}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 28ad3c4..7b774b0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1801,6 +1801,11 @@ static bool svm_cpu_has_accelerated_tpr(void)
 	return false;
 }
 
+static int cpu_has_cr3_cache(void)
+{
+	return 0;
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
@@ -1810,6 +1815,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.hardware_enable = svm_hardware_enable,
 	.hardware_disable = svm_hardware_disable,
 	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+	.cpu_has_cr3_cache = cpu_has_cr3_cache,
 
 	.vcpu_create = svm_create_vcpu,
 	.vcpu_free = svm_free_vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 46e0e58..44b1ae0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -215,6 +215,10 @@ static inline int cpu_has_vmx_vpid(void)
 	return (vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_ENABLE_VPID);
 }
+static inline int cpu_has_cr3_cache(void)
+{
+	return 1;
+}
 
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
@@ -785,6 +789,30 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	return 0;
 }
 
+int vmx_cr3_cache_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct page *page;
+	hva_t cr3_cache_hva;
+
+	if (data != PAGE_ALIGN(data) || vcpu->arch.cr3_cache)
+		return -EINVAL;
+
+	down_read(&current->mm->mmap_sem);
+	page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		return -EINVAL;
+	}
+
+	cr3_cache_hva = (hva_t)__va(page_to_phys(page));
+	vcpu->arch.cr3_cache = (void *)cr3_cache_hva;
+	vcpu->arch.cr3_cache->max_idx = vcpu->arch.cr3_cache_limit;
+
+	return 0;
+}
+
 /*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -824,6 +852,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	case MSR_IA32_TIME_STAMP_COUNTER:
 		guest_write_tsc(data);
 		break;
+	case KVM_MSR_SET_CR3_CACHE:
+		ret = vmx_cr3_cache_msr(vcpu, data);
+		break;
 	default:
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
@@ -1322,10 +1353,23 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+	struct kvm_cr3_cache *cache;
+	int idx;
+
 	vmx_flush_tlb(vcpu);
 	vmcs_writel(GUEST_CR3, cr3);
 	if (vcpu->arch.cr0 & X86_CR0_PE)
 		vmx_fpu_deactivate(vcpu);
+
+	if (!vcpu->arch.cr3_cache)
+		return;
+
+	idx = vcpu->arch.cr3_cache_idx;
+	cache = vcpu->arch.cr3_cache;
+
+	cache->entry[idx].host_cr3 = cr3;
+	cache->entry[idx].guest_cr3 = vcpu->arch.cr3;
+	vmcs_writel(CR3_TARGET_VALUE0 + idx*2, cr3);
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1505,6 +1549,39 @@ out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
 }
+/*
+ * Set up the cr3 validity hardware cache.
+ */
+static void vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu)
+{
+	unsigned int cr3_target_values, i;
+	u64 msr_val;
+
+	rdmsrl(MSR_IA32_VMX_MISC, msr_val);
+
+	printk("MSR_IA32_VMX_MISC: %016Lx\n", msr_val);
+
+	/*
+	 * 9 bits of "CR3 target values":
+	 */
+	cr3_target_values = (msr_val >> 16) & ((1 << 10) - 1);
+	printk(" cr3 target values: %d\n", cr3_target_values);
+	if (cr3_target_values > KVM_CR3_CACHE_SIZE) {
+		printk("KVM: limiting cr3 cache size from %d to %d\n",
+			cr3_target_values, KVM_CR3_CACHE_SIZE);
+		cr3_target_values = KVM_CR3_CACHE_SIZE;
+	}
+
+	vcpu->arch.cr3_cache_idx = 0;
+	vcpu->arch.cr3_cache_limit = cr3_target_values;
+	/*
+	 * Initialize. TODO: set this to guest physical memory.
+	 */
+	for (i = 0; i < cr3_target_values; i++)
+		vmcs_writel(CR3_TARGET_VALUE0 + i*2, -1UL);
+
+	vmcs_write32(CR3_TARGET_COUNT, cr3_target_values);
+}
 
 static void seg_setup(int seg)
 {
@@ -1601,7 +1678,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
 
-	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
+	vmcs_setup_cr3_cache(&vmx->vcpu);
 
 	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
 	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
@@ -2032,9 +2109,12 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 3:
-			vcpu_load_rsp_rip(vcpu);
-			kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
-			skip_emulated_instruction(vcpu);
+			if (!vcpu->arch.cr3_cache) {
+				vcpu_load_rsp_rip(vcpu);
+				kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
+				skip_emulated_instruction(vcpu);
+			} else
+				kvm_inject_gp(vcpu, 0);
 			return 1;
 		case 4:
 			vcpu_load_rsp_rip(vcpu);
@@ -2395,6 +2475,56 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 		| vmx->rmode.irq.vector;
 }
 
+static void kvm_cr3_cache_sync(struct kvm_vcpu *vcpu)
+{
+	void *guest_cr3_hva;
+	hpa_t guest_cr3_hpa;
+	struct kvm_cr3_cache *cache;
+	int j;
+	int idx = vcpu->arch.cr3_cache_idx;
+
+	if (!vcpu->arch.cr3_cache)
+		return;
+
+	guest_cr3_hpa = vmcs_readl(GUEST_CR3);
+	/*
+	 * Are they in sync already?
+	 */
+	if (guest_cr3_hpa == vcpu->arch.mmu.root_hpa[idx])
+		return;
+
+	cache = vcpu->arch.cr3_cache;
+#ifdef CONFIG_X86_64
+	if (vcpu->arch.mmu.shadow_root_level == 4) {
+		for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+			hpa_t root = cache->entry[j].host_cr3;
+			if (root != guest_cr3_hpa)
+				continue;
+			vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+			vcpu->arch.cr3_cache_idx = j;
+			vcpu->arch.mmu.root_hpa[j] = cache->entry[j].host_cr3;
+			++vcpu->stat.cr3_cache_synced;
+			return;
+		}
+		WARN_ON(j == KVM_CR3_CACHE_SIZE);
+	}
+#endif
+
+	guest_cr3_hva = __va(guest_cr3_hpa);
+	for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+		u64 *root = vcpu->arch.mmu.pae_root[j];
+		WARN_ON(!root);
+		if (root != guest_cr3_hva)
+			continue;
+		vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+		vcpu->arch.cr3_cache_idx = j;
+		vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
+		++vcpu->stat.cr3_cache_synced;
+		return;
+	}
+	WARN_ON(j == KVM_CR3_CACHE_SIZE);
+}
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2405,6 +2535,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	 */
 	vmcs_writel(HOST_CR0, read_cr0());
 
+	WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
+
 	asm(
 		/* Store host registers */
 #ifdef CONFIG_X86_64
@@ -2519,6 +2651,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	      , "ebx", "edi", "rsi"
 #endif
 	      );
+	/*
+	 * Figure out whether vcpu->cr3 needs updating because
+	 * the guest made use of the cr3 cache.
+	 */
+	kvm_cr3_cache_sync(vcpu);
+	WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 	if (vmx->rmode.irq.pending)
@@ -2551,11 +2689,16 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *page = NULL;
 
 	spin_lock(&vmx_vpid_lock);
 	if (vmx->vpid != 0)
 		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
 	spin_unlock(&vmx_vpid_lock);
+	if (vcpu->arch.cr3_cache) {
+		page = virt_to_page(vcpu->arch.cr3_cache);
+		kvm_release_page_dirty(page);
+	}
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->host_msrs);
 	kfree(vmx->guest_msrs);
@@ -2643,6 +2786,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.hardware_enable = hardware_enable,
 	.hardware_disable = hardware_disable,
 	.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+	.cpu_has_cr3_cache = cpu_has_cr3_cache,
 
 	.vcpu_create = vmx_create_vcpu,
 	.vcpu_free = vmx_free_vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 92a51d3..19cceb2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -80,6 +80,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "fpu_reload", VCPU_STAT(fpu_reload) },
 	{ "insn_emulation", VCPU_STAT(insn_emulation) },
 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+	{ "cr3_cache_synced", VCPU_STAT(cr3_cache_synced) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -820,6 +821,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PV_MMU:
 		r = !tdp_enabled;
 		break;
+	case KVM_CAP_CR3_CACHE:
+		r = !tdp_enabled && kvm_x86_ops->cpu_has_cr3_cache();
+		break;
 	default:
 		r = 0;
 		break;
@@ -3298,12 +3302,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct page *page;
 	struct kvm *kvm;
-	int r;
+	int r, i;
 
 	BUG_ON(vcpu->kvm == NULL);
 	kvm = vcpu->kvm;
 
-	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		vcpu->arch.mmu.root_hpa[i] = INVALID_PAGE;
 	if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
 		vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
 	else
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index d20cabc..f3ca4f6 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -188,11 +188,11 @@ struct kvm_mmu {
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
-	hpa_t root_hpa;
+	hpa_t root_hpa[KVM_CR3_CACHE_SIZE];
 	int root_level;
 	int shadow_root_level;
 
-	u64 *pae_root;
+	u64 *pae_root[KVM_CR3_CACHE_SIZE];
 };
 
 struct kvm_vcpu_arch {
@@ -206,6 +206,9 @@ struct kvm_vcpu_arch {
 	unsigned long cr0;
 	unsigned long cr2;
 	unsigned long cr3;
+	struct kvm_cr3_cache *cr3_cache;
+	unsigned int cr3_cache_idx;
+	unsigned int cr3_cache_limit;
 	unsigned long cr4;
 	unsigned long cr8;
 	u64 pdptrs[4]; /* pae */
@@ -338,6 +341,7 @@ struct kvm_vcpu_stat {
 	u32 insn_emulation;
 	u32 insn_emulation_fail;
 	u32 hypercalls;
+	u32 cr3_cache_synced;
 };
 
 struct descriptor_table {
@@ -354,6 +358,7 @@ struct kvm_x86_ops {
 	int (*hardware_setup)(void);               /* __init */
 	void (*hardware_unsetup)(void);            /* __exit */
 	bool (*cpu_has_accelerated_tpr)(void);
+	int (*cpu_has_cr3_cache)(void);
 
 	/* Create, but do not attach this VCPU */
 	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index 5098459..67f2ad2 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -13,9 +13,12 @@
 #define KVM_FEATURE_CLOCKSOURCE	0
 #define KVM_FEATURE_NOP_IO_DELAY	1
 #define KVM_FEATURE_MMU_OP	2
+#define KVM_FEATURE_MMU_WRITE	2
+#define KVM_FEATURE_CR3_CACHE	3
 
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
+#define KVM_MSR_SET_CR3_CACHE 0x13
 
 #define KVM_MAX_MMU_OP_BATCH           32
 
@@ -23,6 +26,7 @@
 #define KVM_MMU_OP_WRITE_PTE            1
 #define KVM_MMU_OP_FLUSH_TLB	        2
 #define KVM_MMU_OP_RELEASE_PT	        3
+#define KVM_MMU_OP_SET_CR3		4
 
 /* Payload for KVM_HC_MMU_OP */
 struct kvm_mmu_op_header {
@@ -45,6 +49,11 @@ struct kvm_mmu_op_release_pt {
 	__u64 pt_phys;
 };
 
+struct kvm_mmu_op_set_cr3 {
+	struct kvm_mmu_op_header header;
+	__u64 cr3;
+};
+
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
@@ -157,4 +166,16 @@ static inline unsigned int kvm_arch_para_features(void)
 
 #endif
 
+#define KVM_CR3_CACHE_SIZE 4
+
+struct kvm_cr3_cache_entry {
+	__u64 guest_cr3;
+	__u64 host_cr3;
+};
+
+struct kvm_cr3_cache {
+	struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
+	__u32 max_idx;
+};
+
 #endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 074a107..2aebd29 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -238,6 +238,7 @@ struct kvm_vapic_addr {
 #define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
 #define KVM_CAP_NOP_IO_DELAY 11
 #define KVM_CAP_PV_MMU 12
+#define KVM_CAP_CR3_CACHE 13
 
 /*
  * ioctls for VM fds
-- 
1.5.4.2
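
P.S. (illustration only, not part of the patch): guest-side registration of
the cache page could look roughly like the sketch below. vmx_cr3_cache_msr()
above expects a page-aligned guest physical address written to
KVM_MSR_SET_CR3_CACHE; whether the feature is advertised through the KVM
CPUID leaf checked by kvm_para_has_feature() is an assumption here, and the
helper and variable names are illustrative.

#include <linux/errno.h>
#include <asm/kvm_para.h>
#include <asm/msr.h>
#include <asm/page.h>

/* Assumed statically allocated, page-aligned cache page in the guest. */
static struct kvm_cr3_cache cr3_cache __attribute__((__aligned__(PAGE_SIZE)));

static int __init kvm_cr3_cache_register(void)
{
	if (!kvm_para_available() ||
	    !kvm_para_has_feature(KVM_FEATURE_CR3_CACHE))
		return -ENODEV;

	/* Hand the page-aligned GPA of the cache page to the hypervisor. */
	wrmsrl(KVM_MSR_SET_CR3_CACHE, __pa(&cr3_cache));
	return 0;
}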