Add support for the cr3 cache feature on Intel VMX CPUs. This avoids VM
exits on context switch when the cr3 value being loaded is present in one
of the cache entries (currently 4 entries are provided).

This is especially important for Xenner, where each guest syscall involves
a cr3 switch.
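
For orientation (not part of the patch): the guest-side fast path added to
arch/x86/kernel/kvm.c below reduces to roughly the sketch that follows. The
helper name is invented for illustration, and the sketch omits the #GP
fixup that the real inline-asm version needs for the race described under
v1->v2.

	/*
	 * Simplified guest-side cr3 switch (sketch, not patch code): scan the
	 * host-maintained cache for a guest_cr3 -> host_cr3 mapping.  A hit
	 * loads a value that matches one of the VMCS CR3-target slots, so the
	 * mov-to-cr3 does not VM exit; a miss falls back to the hypercall so
	 * the host can install a new mapping.
	 */
	static void kvm_write_cr3_sketch(struct kvm_cr3_cache *cache,
					 unsigned long guest_cr3)
	{
		int idx;

		for (idx = 0; idx < cache->max_idx; idx++) {
			if (cache->entry[idx].guest_cr3 == guest_cr3) {
				native_write_cr3(cache->entry[idx].host_cr3);
				return;
			}
		}
		kvm_hypercall1(KVM_HYPERCALL_SET_CR3, guest_cr3);
	}
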
v1->v2:
- handle the race that occurs when the host clears the cache while the
  guest is in the middle of kvm_write_cr3(), by injecting a #GP and
  trapping on it to fall back to the hypercall variant (suggested by
  Avi); see the sketch below.
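
The host half of that fallback is the handle_cr() change in
arch/x86/kvm/vmx.c below. Condensed into a stand-alone sketch (the function
name here is made up; the body mirrors the patch hunk): once a cr3 cache is
registered, a mov-to-cr3 that still traps means the hardware CR3-target
match failed, i.e. the host invalidated the entry after the guest's
comparison, so KVM injects #GP instead of emulating the load, and the
guest's exception fixup retries via KVM_HYPERCALL_SET_CR3.

	/* Sketch: condensed from the mov-to-cr3 case of handle_cr() below. */
	static int handle_mov_to_cr3_sketch(struct kvm_vcpu *vcpu, int reg)
	{
		if (!vcpu->arch.cr3_cache) {
			/* No cache registered: emulate the load as before. */
			vcpu_load_rsp_rip(vcpu);
			set_cr3(vcpu, vcpu->arch.regs[reg]);
			skip_emulated_instruction(vcpu);
		} else {
			/* Cache in use: push the guest onto its slow path. */
			kvm_inject_gp(vcpu, 0);
		}
		return 1;
	}
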
Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
Cc: Anthony Liguori <[EMAIL PROTECTED]>
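
A note on where "currently 4" comes from: VMX lets the host list a small
number of CR3-target values in the VMCS, and a guest mov-to-cr3 that
matches one of them does not cause a VM exit. The setup side below
(vmcs_setup_cr3_cache() in vmx.c) reads the CPU's CR3-target count from
MSR_IA32_VMX_MISC and caps it at KVM_CR3_CACHE_SIZE (4). Roughly, as a
sketch with an invented name that mirrors the patch (using the 9-bit field
width the patch's own comment refers to):

	static void setup_cr3_targets_sketch(struct kvm_vcpu *vcpu)
	{
		unsigned int count, i;
		u64 vmx_misc;

		/* Bits 24:16 of IA32_VMX_MISC: number of CR3-target values. */
		rdmsrl(MSR_IA32_VMX_MISC, vmx_misc);
		count = (vmx_misc >> 16) & 0x1ff;
		if (count > KVM_CR3_CACHE_SIZE)
			count = KVM_CR3_CACHE_SIZE;

		vcpu->arch.cr3_cache_idx = 0;
		vcpu->arch.cr3_cache_limit = count;

		/* Start with every slot invalid; vmx_set_cr3() fills them in. */
		for (i = 0; i < count; i++)
			vmcs_writel(CR3_TARGET_VALUE0 + i * 2, -1UL);
		vmcs_write32(CR3_TARGET_COUNT, count);
	}
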
Index: kvm.paravirt2/arch/x86/kernel/kvm.c
===================================================================
--- kvm.paravirt2.orig/arch/x86/kernel/kvm.c
+++ kvm.paravirt2/arch/x86/kernel/kvm.c
@@ -26,14 +26,17 @@
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
+#include <asm/tlbflush.h>
+#include <asm/asm.h>
#define MAX_MULTICALL_NR (PAGE_SIZE / sizeof(struct kvm_multicall_entry))
struct kvm_para_state {
+ struct kvm_cr3_cache cr3_cache;
struct kvm_multicall_entry queue[MAX_MULTICALL_NR];
int queue_index;
enum paravirt_lazy_mode mode;
-};
+} __attribute__ ((aligned(PAGE_SIZE)));
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
@@ -104,6 +107,116 @@ static void kvm_io_delay(void)
{
}
+static void kvm_new_cr3(unsigned long cr3)
+{
+ kvm_hypercall1(KVM_HYPERCALL_SET_CR3, cr3);
+}
+
+static unsigned long __force_order;
+
+/*
+ * Special, register-to-cr3 instruction-based hypercall API
+ * variant to the KVM host. This utilizes the cr3 filter capability
+ * of the hardware: if it works out, no VM exit happens; if a VM
+ * exit does happen, KVM gets the virtual address too.
+ */
+static void kvm_write_cr3(unsigned long guest_cr3)
+{
+ struct kvm_para_state *para_state = &get_cpu_var(para_state);
+ struct kvm_cr3_cache *cache = &para_state->cr3_cache;
+ int idx;
+
+ /*
+ * Check the cache (maintained by the host) for a matching
+ * guest_cr3 => host_cr3 mapping. Use it if found:
+ */
+ for (idx = 0; idx < cache->max_idx; idx++) {
+ if (cache->entry[idx].guest_cr3 == guest_cr3) {
+ unsigned long trap;
+
+ /*
+ * Cache-hit: we load the cached host-CR3 value.
+ * Fall back to the hypercall variant if it raced with
+ * the host clearing the cache after guest_cr3
+ * comparison.
+ */
+ __asm__ __volatile__ (
+ " mov %2, %0\n"
+ "0: mov %3, %%cr3\n"
+ "1:\n"
+ ".section .fixup,\"ax\"\n"
+ "2: mov %1, %0\n"
+ " jmp 1b\n"
+ ".previous\n"
+ _ASM_EXTABLE(0b, 2b)
+ : "=&r" (trap)
+ : "n" (1UL), "n" (0UL),
+ "b" (cache->entry[idx].host_cr3),
+ "m" (__force_order));
+ if (!trap)
+ goto out;
+ break;
+ }
+ }
+
+ /*
+ * Cache-miss. Tell the host the new cr3 via hypercall (to avoid
+ * aliasing problems with a cached host_cr3 == guest_cr3).
+ */
+ kvm_new_cr3(guest_cr3);
+out:
+ put_cpu_var(para_state);
+}
+
+/*
+ * Avoid the VM exit upon cr3 load by using the cached
+ * ->active_mm->pgd value:
+ */
+static void kvm_flush_tlb_user(void)
+{
+ kvm_write_cr3(__pa(current->active_mm->pgd));
+}
+
+/*
+ * Disable global pages, do a flush, then enable global pages:
+ */
+static void kvm_flush_tlb_kernel(void)
+{
+ unsigned long orig_cr4 = read_cr4();
+
+ write_cr4(orig_cr4 & ~X86_CR4_PGE);
+ kvm_flush_tlb_user();
+ write_cr4(orig_cr4);
+}
+
+static void register_cr3_cache(void *cache)
+{
+ struct kvm_para_state *state;
+
+ state = &per_cpu(para_state, raw_smp_processor_id());
+ wrmsrl(KVM_MSR_SET_CR3_CACHE, __pa(&state->cr3_cache));
+}
+
+static unsigned __init kvm_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+ switch (type) {
+ case PARAVIRT_PATCH(pv_mmu_ops.write_cr3):
+ return paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ default:
+ return native_patch(type, clobbers, ibuf, addr, len);
+ }
+}
+
+static void __init setup_guest_cr3_cache(void)
+{
+ on_each_cpu(register_cr3_cache, NULL, 0, 1);
+
+ pv_mmu_ops.write_cr3 = kvm_write_cr3;
+ pv_mmu_ops.flush_tlb_user = kvm_flush_tlb_user;
+ pv_mmu_ops.flush_tlb_kernel = kvm_flush_tlb_kernel;
+}
+
static void kvm_mmu_write(void *dest, const void *src, size_t size)
{
const uint8_t *p = src;
@@ -120,6 +233,28 @@ static void kvm_mmu_write(void *dest, co
}
/*
+ * CR3 cache initialization uses on_each_cpu(), so it can't
+ * happen at kvm_guest_init time.
+ */
+int __init kvm_cr3_cache_init(void)
+{
+ unsigned long flags;
+
+ if (!kvm_para_available())
+ return -ENOSYS;
+
+ if (kvm_para_has_feature(KVM_FEATURE_CR3_CACHE)) {
+ setup_guest_cr3_cache();
+ local_irq_save(flags);
+ apply_paravirt(__parainstructions, __parainstructions_end);
+ local_irq_restore(flags);
+ }
+
+ return 0;
+}
+module_init(kvm_cr3_cache_init);
+
+/*
* We only need to hook operations that are MMU writes. We hook these so that
* we can use lazy MMU mode to batch these operations. We could probably
* improve the performance of the host code if we used some of the information
@@ -239,6 +374,9 @@ static void paravirt_ops_setup(void)
pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
}
+
+ if (kvm_para_has_feature(KVM_FEATURE_CR3_CACHE))
+ pv_init_ops.patch = kvm_patch;
}
void __init kvm_guest_init(void)
Index: kvm.paravirt2/arch/x86/kvm/mmu.c
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/mmu.c
+++ kvm.paravirt2/arch/x86/kvm/mmu.c
@@ -258,6 +258,16 @@ static int mmu_topup_memory_cache(struct
}
return 0;
}
+static void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cr3_cache *cache;
+
+ if (!vcpu->arch.cr3_cache)
+ return;
+ cache = vcpu->arch.cr3_cache;
+ memset(cache->entry, 0, sizeof(cache->entry));
+}
+
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
@@ -979,7 +989,7 @@ static void nonpaging_new_cr3(struct kvm
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
gfn_t gfn, struct page *page, int level)
{
- hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+ hpa_t table_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
int pt_write = 0;
for (; ; level--) {
@@ -1059,53 +1069,75 @@ static void nonpaging_prefetch_page(stru
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
- int i;
+ int i, j;
struct kvm_mmu_page *sp;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
+ /*
+ * Skip to the next cr3 filter entry and free it (if it's occupied).
+ */
+ vcpu->arch.cr3_cache_idx++;
+ if (unlikely(vcpu->arch.cr3_cache_idx >= vcpu->arch.cr3_cache_limit))
+ vcpu->arch.cr3_cache_idx = 0;
+
+ j = vcpu->arch.cr3_cache_idx;
+ /*
+ * Clear the guest-visible entry.
+ */
+ if (vcpu->arch.cr3_cache) {
+ vcpu->arch.cr3_cache->entry[j].guest_cr3 = 0;
+ vcpu->arch.cr3_cache->entry[j].host_cr3 = 0;
+ }
spin_lock(&vcpu->kvm->mmu_lock);
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ hpa_t root = vcpu->arch.mmu.root_hpa[j];
+
+ if (!VALID_PAGE(root)) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
kvm_mmu_zap_page(vcpu->kvm, sp);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
}
#endif
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ ASSERT(vcpu->arch.mmu.pae_root[j]);
+ if (VALID_PAGE(vcpu->arch.mmu.pae_root[j][0])) {
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[j][i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ --sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ }
+ vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
}
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
}
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
- int i;
+ int i, j;
gfn_t root_gfn;
struct kvm_mmu_page *sp;
int metaphysical = 0;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+ j = vcpu->arch.cr3_cache_idx;
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ hpa_t root = vcpu->arch.mmu.root_hpa[j];
ASSERT(!VALID_PAGE(root));
if (tdp_enabled)
@@ -1115,7 +1147,7 @@ static void mmu_alloc_roots(struct kvm_v
ACC_ALL, NULL, NULL);
root = __pa(sp->spt);
++sp->root_count;
- vcpu->arch.mmu.root_hpa = root;
+ vcpu->arch.mmu.root_hpa[j] = root;
return;
}
#endif
@@ -1123,7 +1155,7 @@ static void mmu_alloc_roots(struct kvm_v
if (tdp_enabled)
metaphysical = 1;
for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
+ hpa_t root = vcpu->arch.mmu.pae_root[j][i];
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
@@ -1139,9 +1171,9 @@ static void mmu_alloc_roots(struct kvm_v
ACC_ALL, NULL, NULL);
root = __pa(sp->spt);
++sp->root_count;
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ vcpu->arch.mmu.pae_root[j][i] = root | PT_PRESENT_MASK;
}
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -1161,7 +1193,7 @@ static int nonpaging_page_fault(struct k
return r;
ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
gfn = gva >> PAGE_SHIFT;
@@ -1201,12 +1233,19 @@ static int tdp_page_fault(struct kvm_vcp
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
- mmu_free_roots(vcpu);
+ int j;
+
+ /*
+ * This will cycle through all existing roots and free them.
+ */
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ mmu_free_roots(vcpu);
}
static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
@@ -1215,7 +1254,8 @@ static int nonpaging_init_context(struct
context->prefetch_page = nonpaging_prefetch_page;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1254,6 +1294,7 @@ static void paging_free(struct kvm_vcpu
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
@@ -1263,7 +1304,8 @@ static int paging64_init_context_common(
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1275,6 +1317,7 @@ static int paging64_init_context(struct
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
@@ -1283,7 +1326,8 @@ static int paging32_init_context(struct
context->prefetch_page = paging32_prefetch_page;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1295,13 +1339,15 @@ static int paging32E_init_context(struct
static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
context->shadow_root_level = TDP_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
if (!is_paging(vcpu)) {
context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -1323,7 +1369,7 @@ static int init_kvm_tdp_mmu(struct kvm_v
static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
if (!is_paging(vcpu))
return nonpaging_init_context(vcpu);
@@ -1345,11 +1391,14 @@ static int init_kvm_mmu(struct kvm_vcpu
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
+ int j;
ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
- vcpu->arch.mmu.free(vcpu);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- }
+
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa[j])) {
+ vcpu->arch.mmu.free(vcpu);
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
+ }
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -1362,6 +1411,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_reset_context)
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
+ int j = vcpu->arch.cr3_cache_idx;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -1370,8 +1420,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
- kvm_mmu_flush_tlb(vcpu);
+ /* setting CR3 will flush the TLB */
+ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa[j]);
out:
return r;
}
@@ -1379,7 +1429,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- mmu_free_roots(vcpu);
+ int j;
+
+ kvm_cr3_cache_clear(vcpu);
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ mmu_free_roots(vcpu);
}
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
@@ -1551,6 +1605,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_cr3_cache_clear(vcpu);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -1612,6 +1667,8 @@ int kvm_mmu_unprotect_page_virt(struct k
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (r)
+ kvm_cr3_cache_clear(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
}
@@ -1624,6 +1681,7 @@ void __kvm_mmu_free_some_pages(struct kv
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_cr3_cache_clear(vcpu);
++vcpu->kvm->stat.mmu_recycled;
}
}
@@ -1674,19 +1732,24 @@ EXPORT_SYMBOL_GPL(kvm_enable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
+ int j;
while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
}
- free_page((unsigned long)vcpu->arch.mmu.pae_root);
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+ ASSERT(vcpu->arch.mmu.pae_root[j]);
+ free_page((unsigned long)vcpu->arch.mmu.pae_root[j]);
+ vcpu->arch.mmu.pae_root[j] = NULL;
+ }
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
struct page *page;
- int i;
+ int i, j;
ASSERT(vcpu);
@@ -1696,17 +1759,23 @@ static int alloc_mmu_pages(struct kvm_vc
else
vcpu->kvm->arch.n_free_mmu_pages =
vcpu->kvm->arch.n_alloc_mmu_pages;
- /*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
- * Therefore we need to allocate shadow page tables in the first
- * 4GB of memory, which happens to fit the DMA32 zone.
- */
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
- if (!page)
- goto error_1;
- vcpu->arch.mmu.pae_root = page_address(page);
- for (i = 0; i < 4; ++i)
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+ /*
+ * When emulating 32-bit mode, cr3 is only 32 bits even on
+ * x86_64. Therefore we need to allocate shadow page tables
+ * in the first 4GB of memory, which happens to fit the DMA32
+ * zone.
+ */
+ page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ if (!page)
+ goto error_1;
+
+ ASSERT(!vcpu->arch.mmu.pae_root[j]);
+ vcpu->arch.mmu.pae_root[j] = page_address(page);
+ for (i = 0; i < 4; ++i)
+ vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
+ }
return 0;
@@ -1718,7 +1787,7 @@ error_1:
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
return alloc_mmu_pages(vcpu);
}
@@ -1726,7 +1795,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
return init_kvm_mmu(vcpu);
}
@@ -1886,15 +1955,17 @@ static void audit_mappings(struct kvm_vc
{
- unsigned i;
+ unsigned i, j;
- if (vcpu->arch.mmu.root_level == 4)
+ if (vcpu->arch.mmu.root_level == 4) {
audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
+ return;
+ }
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+ if (vcpu->arch.mmu.pae_root[j][i] & PT_PRESENT_MASK)
audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
+ vcpu->arch.mmu.pae_root[j][i],
+ i << 30, 2);
+ }
}
static int count_rmaps(struct kvm_vcpu *vcpu)
Index: kvm.paravirt2/arch/x86/kvm/mmu.h
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/mmu.h
+++ kvm.paravirt2/arch/x86/kvm/mmu.h
@@ -17,7 +17,8 @@ static inline void kvm_mmu_free_some_pag
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
- if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ int idx = vcpu->arch.cr3_cache_idx;
+ if (likely(vcpu->arch.mmu.root_hpa[idx] != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
Index: kvm.paravirt2/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.paravirt2/arch/x86/kvm/paging_tmpl.h
@@ -283,10 +283,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
if (!is_present_pte(walker->ptes[walker->level - 1]))
return NULL;
- shadow_addr = vcpu->arch.mmu.root_hpa;
+ shadow_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
level = vcpu->arch.mmu.shadow_root_level;
if (level == PT32E_ROOT_LEVEL) {
- shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ shadow_addr =
+ vcpu->arch.mmu.pae_root[vcpu->arch.cr3_cache_idx][(addr >> 30) & 3];
shadow_addr &= PT64_BASE_ADDR_MASK;
--level;
}
Index: kvm.paravirt2/arch/x86/kvm/vmx.c
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/vmx.c
+++ kvm.paravirt2/arch/x86/kvm/vmx.c
@@ -216,6 +216,10 @@ static inline int cpu_has_vmx_vpid(void)
return (vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_VPID);
}
+static inline bool cpu_has_cr3_cache(void)
+{
+ return true;
+}
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
@@ -785,6 +789,30 @@ static int vmx_get_msr(struct kvm_vcpu *
return 0;
}
+int vmx_cr3_cache_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct page *page;
+ hva_t cr3_cache_hva;
+
+ if (data != PAGE_ALIGN(data) || vcpu->arch.cr3_cache)
+ return -EINVAL;
+
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
+
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ return -EINVAL;
+ }
+
+ cr3_cache_hva = (hva_t)__va(page_to_phys(page));
+ vcpu->arch.cr3_cache = (void *)cr3_cache_hva;
+ vcpu->arch.cr3_cache->max_idx = vcpu->arch.cr3_cache_limit;
+
+ return 0;
+}
+
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -824,6 +852,9 @@ static int vmx_set_msr(struct kvm_vcpu *
case MSR_IA32_TIME_STAMP_COUNTER:
guest_write_tsc(data);
break;
+ case KVM_MSR_SET_CR3_CACHE:
+ ret = vmx_cr3_cache_msr(vcpu, data);
+ break;
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -1322,10 +1353,23 @@ static void vmx_set_cr0(struct kvm_vcpu
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ struct kvm_cr3_cache *cache;
+ int idx;
+
vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, cr3);
if (vcpu->arch.cr0 & X86_CR0_PE)
vmx_fpu_deactivate(vcpu);
+
+ if (!vcpu->arch.cr3_cache)
+ return;
+
+ idx = vcpu->arch.cr3_cache_idx;
+ cache = vcpu->arch.cr3_cache;
+
+ cache->entry[idx].host_cr3 = cr3;
+ cache->entry[idx].guest_cr3 = vcpu->arch.cr3;
+ vmcs_writel(CR3_TARGET_VALUE0 + idx*2, cr3);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1503,6 +1547,39 @@ out:
up_read(&current->mm->mmap_sem);
return ret;
}
+/*
+ * Set up the cr3 validity hardware cache.
+ */
+static void vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu)
+{
+ unsigned int cr3_target_values, i;
+ u64 msr_val;
+
+ rdmsrl(MSR_IA32_VMX_MISC, msr_val);
+
+ printk("MSR_IA32_VMX_MISC: %016Lx\n", msr_val);
+
+ /*
+ * 9 bits of "CR3 target values":
+ */
+ cr3_target_values = (msr_val >> 16) & ((1 << 9) - 1);
+ printk(" cr3 target values: %d\n", cr3_target_values);
+ if (cr3_target_values > KVM_CR3_CACHE_SIZE) {
+ printk("KVM: limiting cr3 cache size from %d to %d\n",
+ cr3_target_values, KVM_CR3_CACHE_SIZE);
+ cr3_target_values = KVM_CR3_CACHE_SIZE;
+ }
+
+ vcpu->arch.cr3_cache_idx = 0;
+ vcpu->arch.cr3_cache_limit = cr3_target_values;
+ /*
+ * Initialize. TODO: set this to guest physical memory.
+ */
+ for (i = 0; i < cr3_target_values; i++)
+ vmcs_writel(CR3_TARGET_VALUE0 + i*2, -1UL);
+
+ vmcs_write32(CR3_TARGET_COUNT, cr3_target_values);
+}
static void seg_setup(int seg)
{
@@ -1599,7 +1676,7 @@ static int vmx_vcpu_setup(struct vcpu_vm
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
- vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+ vmcs_setup_cr3_cache(&vmx->vcpu);
vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
@@ -2030,9 +2107,12 @@ static int handle_cr(struct kvm_vcpu *vc
skip_emulated_instruction(vcpu);
return 1;
case 3:
- vcpu_load_rsp_rip(vcpu);
- set_cr3(vcpu, vcpu->arch.regs[reg]);
- skip_emulated_instruction(vcpu);
+ if (!vcpu->arch.cr3_cache) {
+ vcpu_load_rsp_rip(vcpu);
+ set_cr3(vcpu, vcpu->arch.regs[reg]);
+ skip_emulated_instruction(vcpu);
+ } else
+ kvm_inject_gp(vcpu, 0);
return 1;
case 4:
vcpu_load_rsp_rip(vcpu);
@@ -2393,6 +2473,56 @@ static void fixup_rmode_irq(struct vcpu_
| vmx->rmode.irq.vector;
}
+static void kvm_cr3_cache_sync(struct kvm_vcpu *vcpu)
+{
+ void *guest_cr3_hva;
+ hpa_t guest_cr3_hpa;
+ struct kvm_cr3_cache *cache;
+ int j;
+ int idx = vcpu->arch.cr3_cache_idx;
+
+ if (!vcpu->arch.cr3_cache)
+ return;
+
+ guest_cr3_hpa = vmcs_readl(GUEST_CR3);
+ /*
+ * Are they in sync already?
+ */
+ if (guest_cr3_hpa == vcpu->arch.mmu.root_hpa[idx])
+ return;
+
+ cache = vcpu->arch.cr3_cache;
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.mmu.shadow_root_level == 4) {
+ for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+ hpa_t root = cache->entry[j].host_cr3;
+ if (root != guest_cr3_hpa)
+ continue;
+ vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+ vcpu->arch.cr3_cache_idx = j;
+ vcpu->arch.mmu.root_hpa[j] = cache->entry[j].host_cr3;
+ ++vcpu->stat.cr3_cache_synced;
+ return;
+ }
+ WARN_ON(j == KVM_CR3_CACHE_SIZE);
+ }
+#endif
+
+ guest_cr3_hva = __va(guest_cr3_hpa);
+ for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+ u64 *root = vcpu->arch.mmu.pae_root[j];
+ WARN_ON(!root);
+ if (root != guest_cr3_hva)
+ continue;
+ vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+ vcpu->arch.cr3_cache_idx = j;
+ vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
+ ++vcpu->stat.cr3_cache_synced;
+ return;
+ }
+ WARN_ON(j == KVM_CR3_CACHE_SIZE);
+}
+
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2403,6 +2533,8 @@ static void vmx_vcpu_run(struct kvm_vcpu
*/
vmcs_writel(HOST_CR0, read_cr0());
+ WARN_ON(vmcs_readl(GUEST_CR3) !=
+ vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
+
asm(
/* Store host registers */
#ifdef CONFIG_X86_64
@@ -2517,6 +2649,12 @@ static void vmx_vcpu_run(struct kvm_vcpu
, "ebx", "edi", "rsi"
#endif
);
+ /*
+ * Figure out whether vcpu->cr3 needs updating because
+ * the guest made use of the cr3 cache.
+ */
+ kvm_cr3_cache_sync(vcpu);
+ WARN_ON(vmcs_readl(GUEST_CR3) !=
+ vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
@@ -2549,11 +2687,16 @@ static void vmx_free_vmcs(struct kvm_vcp
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *page = NULL;
spin_lock(&vmx_vpid_lock);
if (vmx->vpid != 0)
__clear_bit(vmx->vpid, vmx_vpid_bitmap);
spin_unlock(&vmx_vpid_lock);
+ if (vcpu->arch.cr3_cache) {
+ page = virt_to_page(vcpu->arch.cr3_cache);
+ kvm_release_page_dirty(page);
+ }
vmx_free_vmcs(vcpu);
kfree(vmx->host_msrs);
kfree(vmx->guest_msrs);
@@ -2641,6 +2784,7 @@ static struct kvm_x86_ops vmx_x86_ops =
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+ .cpu_has_cr3_cache = cpu_has_cr3_cache,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
Index: kvm.paravirt2/arch/x86/kvm/x86.c
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/x86.c
+++ kvm.paravirt2/arch/x86/kvm/x86.c
@@ -81,6 +81,7 @@ struct kvm_stats_debugfs_item debugfs_en
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "multicall", VCPU_STAT(multicall) },
{ "multicall_nr", VCPU_STAT(multicall_nr) },
+ { "cr3_cache_synced", VCPU_STAT(cr3_cache_synced) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -858,10 +859,13 @@ long kvm_arch_dev_ioctl(struct file *fil
}
case KVM_GET_PARA_FEATURES: {
__u32 para_features = KVM_PARA_FEATURES;
+
if (tdp_enabled) {
para_features &= ~(1UL << KVM_FEATURE_MMU_WRITE);
para_features &= ~(1UL << KVM_FEATURE_MULTICALL);
}
+ if (!kvm_x86_ops->cpu_has_cr3_cache())
+ para_features &= ~(1UL << KVM_FEATURE_CR3_CACHE);
r = -EFAULT;
if (copy_to_user(argp, &para_features, sizeof para_features))
@@ -2416,6 +2420,12 @@ static int kvm_hypercall_release_pt(stru
return 0;
}
+static int kvm_hypercall_set_cr3(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+ set_cr3(vcpu, cr3);
+ return 0;
+}
+
static int dispatch_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
unsigned long a0, unsigned long a1,
unsigned long a2, unsigned long a3)
@@ -2429,6 +2439,8 @@ static int dispatch_hypercall(struct kvm
return kvm_hypercall_flush_tlb(vcpu);
case KVM_HYPERCALL_RELEASE_PT:
return kvm_hypercall_release_pt(vcpu, a0);
+ case KVM_HYPERCALL_SET_CR3:
+ return kvm_hypercall_set_cr3(vcpu, a0);
}
return -KVM_ENOSYS;
@@ -3361,12 +3373,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *
{
struct page *page;
struct kvm *kvm;
- int r;
+ int r, i;
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ vcpu->arch.mmu.root_hpa[i] = INVALID_PAGE;
if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
else
Index: kvm.paravirt2/include/asm-x86/kvm_host.h
===================================================================
--- kvm.paravirt2.orig/include/asm-x86/kvm_host.h
+++ kvm.paravirt2/include/asm-x86/kvm_host.h
@@ -181,11 +181,11 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
- hpa_t root_hpa;
+ hpa_t root_hpa[KVM_CR3_CACHE_SIZE];
int root_level;
int shadow_root_level;
- u64 *pae_root;
+ u64 *pae_root[KVM_CR3_CACHE_SIZE];
};
struct kvm_vcpu_arch {
@@ -199,6 +199,9 @@ struct kvm_vcpu_arch {
unsigned long cr0;
unsigned long cr2;
unsigned long cr3;
+ struct kvm_cr3_cache *cr3_cache;
+ unsigned int cr3_cache_idx;
+ unsigned int cr3_cache_limit;
unsigned long cr4;
unsigned long cr8;
u64 pdptrs[4]; /* pae */
@@ -330,6 +333,7 @@ struct kvm_vcpu_stat {
u32 insn_emulation_fail;
u32 multicall;
u32 multicall_nr;
+ u32 cr3_cache_synced;
};
struct descriptor_table {
@@ -346,6 +350,7 @@ struct kvm_x86_ops {
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
+ bool (*cpu_has_cr3_cache)(void);
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
Index: kvm.paravirt2/include/asm-x86/kvm_para.h
===================================================================
--- kvm.paravirt2.orig/include/asm-x86/kvm_para.h
+++ kvm.paravirt2/include/asm-x86/kvm_para.h
@@ -14,6 +14,7 @@
#define KVM_FEATURE_NOP_IO_DELAY 1
#define KVM_FEATURE_MMU_WRITE 2
#define KVM_FEATURE_MULTICALL 3
+#define KVM_FEATURE_CR3_CACHE 4
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
@@ -53,7 +54,10 @@ extern void kvmclock_init(void);
#define KVM_PARA_FEATURES ((1UL << KVM_FEATURE_NOP_IO_DELAY) | \
(1UL << KVM_FEATURE_CLOCKSOURCE) | \
(1UL << KVM_FEATURE_MMU_WRITE) | \
- (1UL << KVM_FEATURE_MULTICALL))
+ (1UL << KVM_FEATURE_MULTICALL) | \
+ (1UL << KVM_FEATURE_CR3_CACHE))
+
+#define KVM_MSR_SET_CR3_CACHE 0x87655678
/* This instruction is vmcall. On non-VT architectures, it will generate a
* trap that we will then rewrite to the appropriate instruction.
@@ -143,4 +147,16 @@ static inline unsigned int kvm_arch_para
#endif
+#define KVM_CR3_CACHE_SIZE 4
+
+struct kvm_cr3_cache_entry {
+ __u64 guest_cr3;
+ __u64 host_cr3;
+};
+
+struct kvm_cr3_cache {
+ struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
+ __u32 max_idx;
+};
+
#endif
Index: kvm.paravirt2/include/linux/kvm_para.h
===================================================================
--- kvm.paravirt2.orig/include/linux/kvm_para.h
+++ kvm.paravirt2/include/linux/kvm_para.h
@@ -20,6 +20,7 @@
#define KVM_HYPERCALL_FLUSH_TLB 3
#define KVM_HYPERCALL_RELEASE_PT 4
#define KVM_HYPERCALL_MULTICALL 5
+#define KVM_HYPERCALL_SET_CR3 6
/*
* hypercalls use architecture specific
Index: kvm.paravirt2/arch/x86/kvm/svm.c
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/svm.c
+++ kvm.paravirt2/arch/x86/kvm/svm.c
@@ -1801,6 +1801,11 @@ static bool svm_cpu_has_accelerated_tpr(
return false;
}
+static bool cpu_has_cr3_cache(void)
+{
+ return false;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -1810,6 +1815,7 @@ static struct kvm_x86_ops svm_x86_ops =
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+ .cpu_has_cr3_cache = cpu_has_cr3_cache,
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,