Hi,

The CR3 cache feature of VMX CPUs does not seem to increase
context switch performance as significantly as it did in the original
implementation (http://lkml.org/lkml/2007/1/5/205).

The following is similar to the original, but it also caches roots for
4-level page tables on x86-64, and the cache is only cleared when a
shadow page is zapped instead of on every page fault.
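
For reference, here is the guest/host protocol in a nutshell, condensed
from the patch below (same identifiers; not a standalone build unit):

    struct kvm_cr3_cache_entry {
            u64 guest_cr3;
            u64 host_cr3;
    };

    struct kvm_cr3_cache {
            struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
            u32 max_idx;
    };

    /*
     * Guest-side fast path: if the host has already published a
     * guest_cr3 -> host_cr3 mapping, loading the cached host_cr3 hits
     * the VMX CR3-target list and causes no VM exit; a miss does a
     * plain cr3 load, which exits and lets the host refill the cache.
     */
    static void kvm_write_cr3(unsigned long guest_cr3)
    {
            struct kvm_cr3_cache *cache = &get_cpu_var(para_state).cr3_cache;
            int idx;

            for (idx = 0; idx < cache->max_idx; idx++) {
                    if (cache->entry[idx].guest_cr3 == guest_cr3) {
                            native_write_cr3(cache->entry[idx].host_cr3);
                            goto out;
                    }
            }
            native_write_cr3(guest_cr3);    /* miss: VM exit, host refills */
    out:
            put_cpu_var(para_state);
    }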

Tests performed on a 4-way guest.

lat_ctx numbers (output is "nr-procs overhead-in-us"):

cr3-cache:
"size=0k ovr=1.30
2 6.63
"size=0k ovr=1.31
4 7.43
"size=0k ovr=1.32
8 11.02

standard guest:
"size=0k ovr=1.28
2 10.02
"size=0k ovr=1.29
4 10.79
"size=0k ovr=1.30
8 11.20

hackbench numbers (hackbench 1 process 50000)

cr3-cache:
Time: 48.531
Time: 50.665
Time: 51.000

standard guest:
Time: 48.712
Time: 52.653
Time: 55.376

These are nowhere near the results achieved earlier (and kernel compilation
and httperf seem slightly slower, probably due to paravirt overhead).

So the questions are: why is the benefit not similar (are VM exits simply
much cheaper now?), and is it worth merging this given the results?

In addition to this, it is necessary to comment out the

PATCH_SITE(pv_mmu_ops, write_cr3);

line in paravirt_patch_{32,64}.c; otherwise the native write_cr3() will be patched in and used.
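
That is, something like this in the native_patch() switch (the neighbouring
entries are only illustrative and may differ in your tree):

    switch (type) {
            ...
            PATCH_SITE(pv_mmu_ops, read_cr3);
            /* PATCH_SITE(pv_mmu_ops, write_cr3); */
            ...
    }

Leaving that site unpatched keeps write_cr3() as an indirect call through
pv_mmu_ops, so kvm_write_cr3() is actually reached at runtime.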


Index: linux-2.6-x86-kvm/arch/x86/kernel/kvm.c
===================================================================
--- linux-2.6-x86-kvm.orig/arch/x86/kernel/kvm.c
+++ linux-2.6-x86-kvm/arch/x86/kernel/kvm.c
@@ -26,6 +26,15 @@
 #include <linux/cpu.h>
 #include <linux/mm.h>
 
+#include <asm/tlbflush.h>
+
+struct kvm_para_state {
+       struct kvm_cr3_cache cr3_cache;
+       char pad[PAGE_SIZE];
+} __attribute__ ((aligned(PAGE_SIZE)));
+
+static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+
 /*
  * No need for any "IO delay" on KVM
  */
@@ -33,20 +42,110 @@ static void kvm_io_delay(void)
 {
 }
 
+/*
+ * Special, register-to-cr3 instruction based hypercall API
+ * variant to the KVM host. This utilizes the cr3 filter capability
+ * of the hardware - if this works out then no VM exit happens,
+ * if a VM exit happens then KVM will get the virtual address too.
+ */
+static void kvm_write_cr3(unsigned long guest_cr3)
+{
+       struct kvm_para_state *para_state = &get_cpu_var(para_state);
+       struct kvm_cr3_cache *cache = &para_state->cr3_cache;
+       int idx;
+
+       /*
+        * Check the cache (maintained by the host) for a matching
+        * guest_cr3 => host_cr3 mapping. Use it if found:
+        */
+       for (idx = 0; idx < cache->max_idx; idx++) {
+               if (cache->entry[idx].guest_cr3 == guest_cr3) {
+                       /*
+                        * Cache-hit: we load the cached host-CR3 value.
+                        * This never causes any VM exit. (if it does then the
+                        * hypervisor could do nothing with this instruction
+                        * and the guest OS would be aborted)
+                        */
+                       native_write_cr3(cache->entry[idx].host_cr3);
+                       goto out;
+               }
+       }
+
+       /*
+        * Cache-miss. Load the guest-cr3 value into cr3, which will
+        * cause a VM exit to the hypervisor, which then loads the
+        * host cr3 value and updates the cr3_cache.
+        */
+       native_write_cr3(guest_cr3);
+out:
+       put_cpu_var(para_state);
+}
+
+/*
+ * Avoid the VM exit upon cr3 load by using the cached
+ * ->active_mm->pgd value:
+ */
+static void kvm_flush_tlb_user(void)
+{
+       kvm_write_cr3(__pa(current->active_mm->pgd));
+}
+
+static void kvm_flush_tlb_single(unsigned long addr)
+{
+       __flush_tlb_one(addr);
+}
+/*
+ * Disable global pages, do a flush, then enable global pages:
+ */
+static fastcall void kvm_flush_tlb_kernel(void)
+{
+       unsigned long orig_cr4 = read_cr4();
+
+       write_cr4(orig_cr4 & ~X86_CR4_PGE);
+       kvm_flush_tlb_user();
+       write_cr4(orig_cr4);
+}
+
+static void register_cr3_cache(void *cache)
+{
+       struct kvm_para_state *state;
+
+       state = &per_cpu(para_state, raw_smp_processor_id());
+       wrmsrl(KVM_MSR_SET_CR3_CACHE, &state->cr3_cache);
+}
+
+static void setup_guest_cr3_cache(void)
+{
+       on_each_cpu(register_cr3_cache, NULL, 0, 1);
+
+       pv_mmu_ops.write_cr3 = kvm_write_cr3;
+       pv_mmu_ops.flush_tlb_user = kvm_flush_tlb_user;
+       pv_mmu_ops.flush_tlb_single = kvm_flush_tlb_single;
+       pv_mmu_ops.flush_tlb_kernel = kvm_flush_tlb_kernel;
+}
+
 static void paravirt_ops_setup(void)
 {
+       unsigned long flags;
+
        pv_info.name = "KVM";
        pv_info.paravirt_enabled = 1;
 
        if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
                pv_cpu_ops.io_delay = kvm_io_delay;
-
+       if (kvm_para_has_feature(KVM_FEATURE_CR3_CACHE))
+               setup_guest_cr3_cache();
+       local_irq_save(flags);
+       apply_paravirt(__parainstructions, __parainstructions_end);
+       local_irq_restore(flags);
 }
 
-void __init kvm_guest_init(void)
+int __init kvm_guest_init(void)
 {
        if (!kvm_para_available())
-               return;
+               return -ENOSYS;
 
        paravirt_ops_setup();
+       return 0;
 }
+module_init(kvm_guest_init);
Index: linux-2.6-x86-kvm/arch/x86/kvm/mmu.c
===================================================================
--- linux-2.6-x86-kvm.orig/arch/x86/kvm/mmu.c
+++ linux-2.6-x86-kvm/arch/x86/kvm/mmu.c
@@ -257,6 +257,16 @@ static int mmu_topup_memory_cache(struct
        }
        return 0;
 }
+static void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu)
+{
+        struct kvm_cr3_cache *cache;
+
+        if (!vcpu->arch.cr3_cache)
+                return;
+        cache = vcpu->arch.cr3_cache;
+        memset(cache->entry, 0, sizeof(cache->entry));
+}
+
 
 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 {
@@ -667,7 +677,8 @@ static struct kvm_mmu_page *kvm_mmu_look
        index = kvm_page_table_hashfn(gfn);
        bucket = &kvm->arch.mmu_page_hash[index];
        hlist_for_each_entry(sp, node, bucket, hash_link)
-               if (sp->gfn == gfn && !sp->role.metaphysical) {
+               if (sp->gfn == gfn && !sp->role.metaphysical &&
+                   !sp->role.invalid) {
                        pgprintk("%s: found role %x\n",
                                 __FUNCTION__, sp->role.word);
                        return sp;
@@ -795,8 +806,10 @@ static void kvm_mmu_zap_page(struct kvm 
        if (!sp->root_count) {
                hlist_del(&sp->hash_link);
                kvm_mmu_free_page(kvm, sp);
-       } else
+       } else { 
                list_move(&sp->link, &kvm->arch.active_mmu_pages);
+               sp->role.invalid = 1;
+       }
        kvm_mmu_reset_last_pte_updated(kvm);
 }
 
@@ -882,6 +895,7 @@ struct page *gva_to_page(struct kvm_vcpu
                return NULL;
        return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 }
+EXPORT_SYMBOL(gva_to_page);
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                         unsigned pt_access, unsigned pte_access,
@@ -975,7 +989,7 @@ static int __nonpaging_map(struct kvm_vc
                           gfn_t gfn, struct page *page)
 {
        int level = PT32E_ROOT_LEVEL;
-       hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+       hpa_t table_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
        int pt_write = 0;
 
        for (; ; level--) {
@@ -1045,60 +1059,83 @@ static void nonpaging_prefetch_page(stru
 
 static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
-       int i;
+       int i, j = 0;
        struct kvm_mmu_page *sp;
 
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-               return;
+       /*
+        * Skip to the next cr3 filter entry and free it (if it's occupied).
+        */
+       vcpu->arch.cr3_cache_idx++;
+       if (unlikely(vcpu->arch.cr3_cache_idx >= vcpu->arch.cr3_cache_limit))
+               vcpu->arch.cr3_cache_idx = 0;
+       j = vcpu->arch.cr3_cache_idx;
+       /*
+        * Clear the guest-visible entry.
+        */
+       if (vcpu->arch.cr3_cache) {
+               vcpu->arch.cr3_cache->entry[j].guest_cr3 = 0;
+               vcpu->arch.cr3_cache->entry[j].host_cr3 = 0;
+       }
+
        spin_lock(&vcpu->kvm->mmu_lock);
 #ifdef CONFIG_X86_64
        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
+               hpa_t root = vcpu->arch.mmu.root_hpa[j];
+
+               if (!VALID_PAGE(root)) {
+                       spin_unlock(&vcpu->kvm->mmu_lock);
+                       return;
+               }
 
                sp = page_header(root);
                --sp->root_count;
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+               vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
                spin_unlock(&vcpu->kvm->mmu_lock);
                return;
        }
 #endif
-       for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-               if (root) {
-                       root &= PT64_BASE_ADDR_MASK;
-                       sp = page_header(root);
-                       --sp->root_count;
+       ASSERT(vcpu->arch.mmu.pae_root[j]);
+       if (VALID_PAGE(vcpu->arch.mmu.pae_root[j][0])) {
+               vcpu->arch.guest_cr3_gpa[j] = INVALID_PAGE;
+               for (i = 0; i < 4; ++i) {
+                       hpa_t root = vcpu->arch.mmu.pae_root[j][i];
+
+                       if (root) {
+                               root &= PT64_BASE_ADDR_MASK;
+                               sp = page_header(root);
+                               --sp->root_count;
+                       }
+                       vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
                }
-               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
        }
        spin_unlock(&vcpu->kvm->mmu_lock);
-       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+       vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
 }
 
 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
-       int i;
+       int i, j;
        gfn_t root_gfn;
        struct kvm_mmu_page *sp;
 
        root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+       j = vcpu->arch.cr3_cache_idx;
 
 #ifdef CONFIG_X86_64
        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
+               hpa_t root = vcpu->arch.mmu.root_hpa[j];
 
                ASSERT(!VALID_PAGE(root));
                sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
                                      PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
                root = __pa(sp->spt);
                ++sp->root_count;
-               vcpu->arch.mmu.root_hpa = root;
+               vcpu->arch.mmu.root_hpa[j] = root;
                return;
        }
 #endif
        for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
+               hpa_t root = vcpu->arch.mmu.pae_root[j][i];
 
                ASSERT(!VALID_PAGE(root));
                if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
@@ -1114,9 +1151,14 @@ static void mmu_alloc_roots(struct kvm_v
                                      ACC_ALL, NULL, NULL);
                root = __pa(sp->spt);
                ++sp->root_count;
-               vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+               vcpu->arch.mmu.pae_root[j][i] = root | PT_PRESENT_MASK;
        }
-       vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+       vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
+       /*
+        * Store the guest-side address too; we need it after a VM exit
+        * to rediscover which cr3 the guest switched to:
+        */
+       vcpu->arch.guest_cr3_gpa[j] = vcpu->arch.cr3;
 }
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -1136,7 +1178,7 @@ static int nonpaging_page_fault(struct k
                return r;
 
        ASSERT(vcpu);
-       ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
        gfn = gva >> PAGE_SHIFT;
 
@@ -1146,12 +1188,19 @@ static int nonpaging_page_fault(struct k
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)
 {
-       mmu_free_roots(vcpu);
+       int j;
+
+       /*
+        * This will cycle through all existing roots and free them.
+        */
+       for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+               mmu_free_roots(vcpu);
 }
 
 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *context = &vcpu->arch.mmu;
+       int i;
 
        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = nonpaging_page_fault;
@@ -1160,7 +1209,8 @@ static int nonpaging_init_context(struct
        context->prefetch_page = nonpaging_prefetch_page;
        context->root_level = 0;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
-       context->root_hpa = INVALID_PAGE;
+       for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+               context->root_hpa[i] = INVALID_PAGE;
        return 0;
 }
 
@@ -1199,6 +1249,7 @@ static void paging_free(struct kvm_vcpu 
 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 {
        struct kvm_mmu *context = &vcpu->arch.mmu;
+       int i;
 
        ASSERT(is_pae(vcpu));
        context->new_cr3 = paging_new_cr3;
@@ -1208,7 +1259,8 @@ static int paging64_init_context_common(
        context->free = paging_free;
        context->root_level = level;
        context->shadow_root_level = level;
-       context->root_hpa = INVALID_PAGE;
+       for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+               context->root_hpa[i] = INVALID_PAGE;
        return 0;
 }
 
@@ -1220,6 +1272,7 @@ static int paging64_init_context(struct 
 static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *context = &vcpu->arch.mmu;
+       int i;
 
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging32_page_fault;
@@ -1228,7 +1281,8 @@ static int paging32_init_context(struct 
        context->prefetch_page = paging32_prefetch_page;
        context->root_level = PT32_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
-       context->root_hpa = INVALID_PAGE;
+       for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+               context->root_hpa[i] = INVALID_PAGE;
        return 0;
 }
 
@@ -1240,7 +1294,7 @@ static int paging32E_init_context(struct
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
        if (!is_paging(vcpu))
                return nonpaging_init_context(vcpu);
@@ -1254,11 +1308,14 @@ static int init_kvm_mmu(struct kvm_vcpu 
 
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
+       int j;
        ASSERT(vcpu);
-       if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
-               vcpu->arch.mmu.free(vcpu);
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       }
+
+       for(j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+               if (VALID_PAGE(vcpu->arch.mmu.root_hpa[j])) {
+                       vcpu->arch.mmu.free(vcpu);
+                       vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
+               }
 }
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -1271,6 +1328,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_reset_context)
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
        int r;
+       int j = vcpu->arch.cr3_cache_idx;
 
        r = mmu_topup_memory_caches(vcpu);
        if (r)
@@ -1279,8 +1337,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        kvm_mmu_free_some_pages(vcpu);
        mmu_alloc_roots(vcpu);
        spin_unlock(&vcpu->kvm->mmu_lock);
-       kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
-       kvm_mmu_flush_tlb(vcpu);
+       /* setting CR3 will flush the TLB */
+       kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa[j]);
 out:
        return r;
 }
@@ -1288,7 +1346,9 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
-       mmu_free_roots(vcpu);
+       int j;
+       for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+               mmu_free_roots(vcpu);
 }
 
 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
@@ -1449,6 +1509,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *
                         */
                        pgprintk("misaligned: gpa %llx bytes %d role %x\n",
                                 gpa, bytes, sp->role.word);
+                       kvm_cr3_cache_clear(vcpu);
                        kvm_mmu_zap_page(vcpu->kvm, sp);
                        ++vcpu->kvm->stat.mmu_flooded;
                        continue;
@@ -1567,19 +1628,24 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu_page *sp;
+       int j;
 
        while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
                sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
                                  struct kvm_mmu_page, link);
                kvm_mmu_zap_page(vcpu->kvm, sp);
        }
-       free_page((unsigned long)vcpu->arch.mmu.pae_root);
+       for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+               ASSERT(vcpu->arch.mmu.pae_root[j]);
+               free_page((unsigned long)vcpu->arch.mmu.pae_root[j]);
+               vcpu->arch.mmu.pae_root[j] = NULL;
+       }
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 {
        struct page *page;
-       int i;
+       int i, j;
 
        ASSERT(vcpu);
 
@@ -1589,17 +1655,23 @@ static int alloc_mmu_pages(struct kvm_vc
        else
                vcpu->kvm->arch.n_free_mmu_pages =
                                        vcpu->kvm->arch.n_alloc_mmu_pages;
-       /*
-        * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
-        * Therefore we need to allocate shadow page tables in the first
-        * 4GB of memory, which happens to fit the DMA32 zone.
-        */
-       page = alloc_page(GFP_KERNEL | __GFP_DMA32);
-       if (!page)
-               goto error_1;
-       vcpu->arch.mmu.pae_root = page_address(page);
-       for (i = 0; i < 4; ++i)
-               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+       for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+               /*
+                * When emulating 32-bit mode, cr3 is only 32 bits even on
+                * x86_64. Therefore we need to allocate shadow page tables
+                * in the first 4GB of memory, which happens to fit the DMA32
+                * zone.
+                */
+               page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+               if (!page)
+                       goto error_1;
+
+               ASSERT(!vcpu->arch.mmu.pae_root[j]);
+               vcpu->arch.mmu.pae_root[j] = page_address(page);
+               for (i = 0; i < 4; ++i)
+                       vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
+       }
 
        return 0;
 
@@ -1611,7 +1683,7 @@ error_1:
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
        return alloc_mmu_pages(vcpu);
 }
@@ -1619,7 +1691,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu
 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
        return init_kvm_mmu(vcpu);
 }
@@ -1779,15 +1851,16 @@ static void audit_mappings(struct kvm_vc
 {
-       unsigned i;
+       unsigned i, j;
 
-       if (vcpu->arch.mmu.root_level == 4)
+       if (vcpu->arch.mmu.root_level == 4) {
                audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-       else
+               return;
+       }
+       for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
                for (i = 0; i < 4; ++i)
-                       if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+                       if (vcpu->arch.mmu.pae_root[j][i] & PT_PRESENT_MASK)
                                audit_mappings_page(vcpu,
-                                                   vcpu->arch.mmu.pae_root[i],
-                                                   i << 30,
-                                                   2);
+                                                   vcpu->arch.mmu.pae_root[j][i], i << 30, 2);
+       }
 }
 
 static int count_rmaps(struct kvm_vcpu *vcpu)
Index: linux-2.6-x86-kvm/arch/x86/kvm/vmx.c
===================================================================
--- linux-2.6-x86-kvm.orig/arch/x86/kvm/vmx.c
+++ linux-2.6-x86-kvm/arch/x86/kvm/vmx.c
@@ -749,6 +749,30 @@ static int vmx_get_msr(struct kvm_vcpu *
        return 0;
 }
 
+int vmx_cr3_cache_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+       struct page *page;
+       hva_t cr3_cache_hva;
+
+       if (data != PAGE_ALIGN(data)) {
+               printk("data must be aligned!\n");
+               return -EINVAL;
+       }
+
+       down_read(&current->mm->mmap_sem);
+       /*XXX: release on unload */
+       page = gva_to_page(vcpu, data);
+       up_read(&current->mm->mmap_sem);
+       cr3_cache_hva = (hva_t)__va(page_to_phys(page));
+
+       vcpu->arch.cr3_cache = (void *)cr3_cache_hva;
+       vcpu->arch.cr3_cache->max_idx = vcpu->arch.cr3_cache_limit;
+
+       printk(KERN_ERR "using CR3 cache\n");
+
+       return 0;
+}
+
 /*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -788,6 +812,9 @@ static int vmx_set_msr(struct kvm_vcpu *
        case MSR_IA32_TIME_STAMP_COUNTER:
                guest_write_tsc(data);
                break;
+       case KVM_MSR_SET_CR3_CACHE:
+               ret = vmx_cr3_cache_msr(vcpu, data);
+               break;
        default:
                msr = find_msr_entry(vmx, msr_index);
                if (msr) {
@@ -1274,11 +1301,25 @@ static void vmx_set_cr0(struct kvm_vcpu 
                vmx_fpu_activate(vcpu);
 }
 
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3_hpa)
 {
-       vmcs_writel(GUEST_CR3, cr3);
+       struct kvm_cr3_cache *cache;
+       int idx;
+
+       vmcs_writel(GUEST_CR3, cr3_hpa);
        if (vcpu->arch.cr0 & X86_CR0_PE)
                vmx_fpu_deactivate(vcpu);
+
+       if (!vcpu->arch.cr3_cache)
+               return;
+
+       idx = vcpu->arch.cr3_cache_idx;
+       cache = vcpu->arch.cr3_cache;
+
+       cache->entry[idx].host_cr3 = cr3_hpa;
+       cache->entry[idx].guest_cr3 = vcpu->arch.cr3;
+       vmcs_writel(CR3_TARGET_VALUE0 + idx*2, cr3_hpa);
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1491,6 +1532,39 @@ out:
        up_write(&current->mm->mmap_sem);
        return r;
 }
+/*
+ * Set up the cr3 validity hardware cache.
+ */
+static void vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu)
+{
+       unsigned int cr3_target_values, i;
+       u64 msr_val;
+
+       rdmsrl(MSR_IA32_VMX_MISC, msr_val);
+
+       printk("MSR_IA32_VMX_MISC: %016Lx\n", msr_val);
+
+       /*
+        * 9 bits of "CR3 target values":
+        */
+       cr3_target_values = (msr_val >> 16) & ((1 << 9) - 1);
+       printk(" cr3 target values: %d\n", cr3_target_values);
+       if (cr3_target_values > KVM_CR3_CACHE_SIZE) {
+               printk("KVM: limiting cr3 cache size from %d to %d\n",
+                       cr3_target_values, KVM_CR3_CACHE_SIZE);
+               cr3_target_values = KVM_CR3_CACHE_SIZE;
+       }
+
+       vcpu->arch.cr3_cache_idx = 0;
+       vcpu->arch.cr3_cache_limit = cr3_target_values;
+       /*
+        * Initialize. TODO: set this to guest physical memory.
+        */
+       for (i = 0; i < cr3_target_values; i++)
+               vmcs_writel(CR3_TARGET_VALUE0 + i*2, -1UL);
+
+       vmcs_write32(CR3_TARGET_COUNT, cr3_target_values);
+}
 
 /*
  * Sets up the vmcs for emulated real mode.
@@ -1535,7 +1609,7 @@ static int vmx_vcpu_setup(struct vcpu_vm
 
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
-       vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
+       vmcs_setup_cr3_cache(&vmx->vcpu);
 
        vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
        vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
@@ -2333,6 +2407,55 @@ static void fixup_rmode_irq(struct vcpu_
                | vmx->rmode.irq.vector;
 }
 
+static void kvm_cr3_cache_sync(struct kvm_vcpu *vcpu)
+{
+       void *guest_cr3_hva;
+       hpa_t guest_cr3_hpa;
+       struct kvm_cr3_cache *cache;
+       int j;
+       int idx = vcpu->arch.cr3_cache_idx;
+
+       if (!vcpu->arch.cr3_cache)
+               return;
+
+       guest_cr3_hpa = vmcs_readl(GUEST_CR3);
+       /*
+        * Are they in sync already?
+        */
+       if (guest_cr3_hpa == vcpu->arch.mmu.root_hpa[idx])
+               return;
+
+       cache = vcpu->arch.cr3_cache;
+#ifdef CONFIG_X86_64
+       if (vcpu->arch.mmu.shadow_root_level == 4) { 
+               for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+                       hpa_t root = cache->entry[j].host_cr3;
+                       if (root != guest_cr3_hpa)
+                               continue;
+                       vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+                       vcpu->arch.cr3_cache_idx = j;
+                       vcpu->arch.mmu.root_hpa[j] = cache->entry[j].host_cr3;
+                       ++vcpu->stat.cr3_cache_synced;
+                       return;
+               }
+               WARN_ON(j == KVM_CR3_CACHE_SIZE-1);
+       }
+#endif
+
+       guest_cr3_hva = __va(guest_cr3_hpa);
+       for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+               u64 *root = vcpu->arch.mmu.pae_root[j];
+               WARN_ON(!root);
+               if (root != guest_cr3_hva)
+                       continue;
+               vcpu->arch.cr3 = vcpu->arch.guest_cr3_gpa[j];
+               vcpu->arch.cr3_cache_idx = j;
+               vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
+               break;
+       }
+       WARN_ON(j == KVM_CR3_CACHE_SIZE);
+}
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2343,6 +2466,8 @@ static void vmx_vcpu_run(struct kvm_vcpu
         */
        vmcs_writel(HOST_CR0, read_cr0());
 
+       WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
+
        asm(
                /* Store host registers */
 #ifdef CONFIG_X86_64
@@ -2457,6 +2582,12 @@ static void vmx_vcpu_run(struct kvm_vcpu
                , "ebx", "edi", "rsi"
 #endif
              );
+       /*
+        * Figure out whether vcpu->cr3 needs updating because
+        * the guest made use of the cr3 cache.
+        */
+       kvm_cr3_cache_sync(vcpu);
+       WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
 
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
        if (vmx->rmode.irq.pending)
Index: linux-2.6-x86-kvm/include/asm-x86/kvm_host.h
===================================================================
--- linux-2.6-x86-kvm.orig/include/asm-x86/kvm_host.h
+++ linux-2.6-x86-kvm/include/asm-x86/kvm_host.h
@@ -140,6 +140,7 @@ union kvm_mmu_page_role {
                unsigned pad_for_nice_hex_output : 6;
                unsigned metaphysical : 1;
                unsigned access : 3;
+               unsigned invalid : 1;
        };
 };
 
@@ -180,11 +181,11 @@ struct kvm_mmu {
        gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
        void (*prefetch_page)(struct kvm_vcpu *vcpu,
                              struct kvm_mmu_page *page);
-       hpa_t root_hpa;
+       hpa_t root_hpa[KVM_CR3_CACHE_SIZE];
        int root_level;
        int shadow_root_level;
 
-       u64 *pae_root;
+       u64 *pae_root[KVM_CR3_CACHE_SIZE];
 };
 
 struct kvm_vcpu_arch {
@@ -198,6 +199,10 @@ struct kvm_vcpu_arch {
        unsigned long cr0;
        unsigned long cr2;
        unsigned long cr3;
+       struct kvm_cr3_cache *cr3_cache;
+       unsigned int cr3_cache_idx;
+       unsigned int cr3_cache_limit;
+       gpa_t guest_cr3_gpa[KVM_CR3_CACHE_SIZE];
        unsigned long cr4;
        unsigned long cr8;
        u64 pdptrs[4]; /* pae */
@@ -320,6 +325,7 @@ struct kvm_vcpu_stat {
        u32 fpu_reload;
        u32 insn_emulation;
        u32 insn_emulation_fail;
+       u32 cr3_cache_synced;
 };
 
 struct descriptor_table {
Index: linux-2.6-x86-kvm/include/asm-x86/kvm_para.h
===================================================================
--- linux-2.6-x86-kvm.orig/include/asm-x86/kvm_para.h
+++ linux-2.6-x86-kvm/include/asm-x86/kvm_para.h
@@ -6,6 +6,7 @@
  */
 #define KVM_CPUID_SIGNATURE    0x40000000
 #define KVM_FEATURE_NOP_IO_DELAY       0
+#define KVM_FEATURE_CR3_CACHE          1
 
 /* This CPUID returns a feature bitmap in eax.  Before enabling a particular
  * paravirtualization, the appropriate feature bit should be checked.
@@ -15,7 +16,22 @@
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
-#define KVM_PARA_FEATURES (1UL << KVM_FEATURE_NOP_IO_DELAY)
+#define KVM_PARA_FEATURES ((1UL << KVM_FEATURE_NOP_IO_DELAY) |         \
+                          (1UL << KVM_FEATURE_CR3_CACHE))
+
+#define KVM_MSR_SET_CR3_CACHE 0x87655678
+
+#define KVM_CR3_CACHE_SIZE 4
+
+struct kvm_cr3_cache_entry {
+       u64 guest_cr3;
+       u64 host_cr3;
+};
+
+struct kvm_cr3_cache {
+       struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
+       u32 max_idx;
+};
 
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
  * trap that we will then rewrite to the appropriate instruction.
Index: linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
===================================================================
--- linux-2.6-x86-kvm.orig/arch/x86/kvm/paging_tmpl.h
+++ linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
@@ -140,7 +140,7 @@ walk:
        }
 #endif
        ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-              (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+              (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
 
        pt_access = ACC_ALL;
 
@@ -280,10 +280,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
        if (!is_present_pte(walker->ptes[walker->level - 1]))
                return NULL;
 
-       shadow_addr = vcpu->arch.mmu.root_hpa;
+       shadow_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
        level = vcpu->arch.mmu.shadow_root_level;
        if (level == PT32E_ROOT_LEVEL) {
-               shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+               shadow_addr = vcpu->arch.mmu.pae_root[vcpu->arch.cr3_cache_idx][(addr >> 30) & 3];
                shadow_addr &= PT64_BASE_ADDR_MASK;
                --level;
        }
Index: linux-2.6-x86-kvm/arch/x86/kvm/x86.c
===================================================================
--- linux-2.6-x86-kvm.orig/arch/x86/kvm/x86.c
+++ linux-2.6-x86-kvm/arch/x86/kvm/x86.c
@@ -67,6 +67,7 @@ struct kvm_stats_debugfs_item debugfs_en
        { "fpu_reload", VCPU_STAT(fpu_reload) },
        { "insn_emulation", VCPU_STAT(insn_emulation) },
        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+       { "cr3_cache_synced", VCPU_STAT(cr3_cache_synced) },
        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
Index: linux-2.6-x86-kvm/include/linux/kvm_para.h
===================================================================
--- linux-2.6-x86-kvm.orig/include/linux/kvm_para.h
+++ linux-2.6-x86-kvm/include/linux/kvm_para.h
@@ -21,7 +21,7 @@
 
 #ifdef __KERNEL__
 #ifdef CONFIG_KVM_GUEST
-void __init kvm_guest_init(void);
+int __init kvm_guest_init(void);
 #else
 #define kvm_guest_init() do { } while (0)
 #endif
