From: Marcelo Tosatti <[EMAIL PROTECTED]>

Add support for the cr3 cache feature on Intel VMX CPUs. This avoids
vmexits on context switch if the cr3 value is cached in one of the
entries (currently 4 are present).

This is especially important for Xenner, where each guest syscall
involves a cr3 switch.

v1->v2:
- handle the race which happens when the guest has the cache cleared
in the middle of kvm_write_cr3 by injecting a GP and trapping it to
fall back to the hypercall variant (suggested by Avi).

v2->v3:
- one ioctl per paravirt feature

v3->v4:
- disable if tdp enabled

Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
Signed-off-by: Avi Kivity <[EMAIL PROTECTED]>
---
 arch/x86/kvm/mmu.c         |  196 +++++++++++++++++++++++++++++++-------------
 arch/x86/kvm/mmu.h         |    3 +-
 arch/x86/kvm/paging_tmpl.h |    4 +-
 arch/x86/kvm/svm.c         |    6 ++
 arch/x86/kvm/vmx.c         |  152 +++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c         |    9 ++-
 include/asm-x86/kvm_host.h |    9 ++-
 include/asm-x86/kvm_para.h |   21 +++++
 include/linux/kvm.h        |    1 +
 9 files changed, 332 insertions(+), 69 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 14de7dc..11bca62 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -272,6 +272,16 @@ static int mmu_topup_memory_cache(struct 
kvm_mmu_memory_cache *cache,
        return 0;
 }
 
+static void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu)
+{
+        struct kvm_cr3_cache *cache;
+
+        if (!vcpu->arch.cr3_cache)
+                return;
+        cache = vcpu->arch.cr3_cache;
+        memset(cache->entry, 0, sizeof(cache->entry));
+}
+
 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 {
        while (mc->nobjs)
@@ -1127,7 +1137,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
int write,
                           int largepage, gfn_t gfn, struct page *page,
                           int level)
 {
-       hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+       hpa_t table_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
        int pt_write = 0;
 
        for (; ; level--) {
@@ -1219,53 +1229,75 @@ static void nonpaging_prefetch_page(struct kvm_vcpu 
*vcpu,
 
 static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
-       int i;
+       int i, j;
        struct kvm_mmu_page *sp;
 
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-               return;
+       /*
+        * Skip to the next cr3 filter entry and free it (if it's occupied).
+        */
+       vcpu->arch.cr3_cache_idx++;
+       if (unlikely(vcpu->arch.cr3_cache_idx >= vcpu->arch.cr3_cache_limit))
+               vcpu->arch.cr3_cache_idx = 0;
+
+       j = vcpu->arch.cr3_cache_idx;
+       /*
+        * Clear the guest-visible entry.
+        */
+       if (vcpu->arch.cr3_cache) {
+               vcpu->arch.cr3_cache->entry[j].guest_cr3 = 0;
+               vcpu->arch.cr3_cache->entry[j].host_cr3 = 0;
+       }
        spin_lock(&vcpu->kvm->mmu_lock);
 #ifdef CONFIG_X86_64
        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
+               hpa_t root = vcpu->arch.mmu.root_hpa[j];
+
+               if (!VALID_PAGE(root)) {
+                       spin_unlock(&vcpu->kvm->mmu_lock);
+                       return;
+               }
 
                sp = page_header(root);
                --sp->root_count;
                if (!sp->root_count && sp->role.invalid)
                        kvm_mmu_zap_page(vcpu->kvm, sp);
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+               vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
                spin_unlock(&vcpu->kvm->mmu_lock);
                return;
        }
 #endif
-       for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-               if (root) {
-                       root &= PT64_BASE_ADDR_MASK;
-                       sp = page_header(root);
-                       --sp->root_count;
-                       if (!sp->root_count && sp->role.invalid)
-                               kvm_mmu_zap_page(vcpu->kvm, sp);
+       ASSERT(vcpu->arch.mmu.pae_root[j]);
+       if (VALID_PAGE(vcpu->arch.mmu.pae_root[j][0])) {
+               for (i = 0; i < 4; ++i) {
+                       hpa_t root = vcpu->arch.mmu.pae_root[j][i];
+
+                       if (root) {
+                               root &= PT64_BASE_ADDR_MASK;
+                               sp = page_header(root);
+                               --sp->root_count;
+                               if (!sp->root_count && sp->role.invalid)
+                                       kvm_mmu_zap_page(vcpu->kvm, sp);
+                       }
+                       vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
                }
-               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
        }
        spin_unlock(&vcpu->kvm->mmu_lock);
-       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+       vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
 }
 
 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
-       int i;
+       int i, j;
        gfn_t root_gfn;
        struct kvm_mmu_page *sp;
        int metaphysical = 0;
 
        root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+       j = vcpu->arch.cr3_cache_idx;
 
 #ifdef CONFIG_X86_64
        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
+               hpa_t root = vcpu->arch.mmu.root_hpa[j];
 
                ASSERT(!VALID_PAGE(root));
                if (tdp_enabled)
@@ -1275,7 +1307,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
                                      ACC_ALL, NULL);
                root = __pa(sp->spt);
                ++sp->root_count;
-               vcpu->arch.mmu.root_hpa = root;
+               vcpu->arch.mmu.root_hpa[j] = root;
                return;
        }
 #endif
@@ -1283,7 +1315,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
        if (tdp_enabled)
                metaphysical = 1;
        for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
+               hpa_t root = vcpu->arch.mmu.pae_root[j][i];
 
                ASSERT(!VALID_PAGE(root));
                if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
@@ -1299,9 +1331,9 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
                                      ACC_ALL, NULL);
                root = __pa(sp->spt);
                ++sp->root_count;
-               vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+               vcpu->arch.mmu.pae_root[j][i] = root | PT_PRESENT_MASK;
        }
-       vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+       vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
 }
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -1321,7 +1353,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, 
gva_t gva,
                return r;
 
        ASSERT(vcpu);
-       ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa[j]));
 
        gfn = gva >> PAGE_SHIFT;
 
@@ -1367,12 +1399,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t 
gpa,
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)
 {
-       mmu_free_roots(vcpu);
+       int j;
+
+       /*
+        * This will cycle through all existing roots and free them.
+        */
+       for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+               mmu_free_roots(vcpu);
 }
 
 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *context = &vcpu->arch.mmu;
+       int i;
 
        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = nonpaging_page_fault;
@@ -1381,7 +1420,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
        context->prefetch_page = nonpaging_prefetch_page;
        context->root_level = 0;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
-       context->root_hpa = INVALID_PAGE;
+       for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+               context->root_hpa[i] = INVALID_PAGE;
        return 0;
 }
 
@@ -1420,6 +1460,7 @@ static void paging_free(struct kvm_vcpu *vcpu)
 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 {
        struct kvm_mmu *context = &vcpu->arch.mmu;
+       int i;
 
        ASSERT(is_pae(vcpu));
        context->new_cr3 = paging_new_cr3;
@@ -1429,7 +1470,8 @@ static int paging64_init_context_common(struct kvm_vcpu 
*vcpu, int level)
        context->free = paging_free;
        context->root_level = level;
        context->shadow_root_level = level;
-       context->root_hpa = INVALID_PAGE;
+       for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+               context->root_hpa[i] = INVALID_PAGE;
        return 0;
 }
 
@@ -1441,6 +1483,7 @@ static int paging64_init_context(struct kvm_vcpu *vcpu)
 static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *context = &vcpu->arch.mmu;
+       int i;
 
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging32_page_fault;
@@ -1449,7 +1492,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
        context->prefetch_page = paging32_prefetch_page;
        context->root_level = PT32_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
-       context->root_hpa = INVALID_PAGE;
+       for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+               context->root_hpa[i] = INVALID_PAGE;
        return 0;
 }
 
@@ -1461,13 +1505,15 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *context = &vcpu->arch.mmu;
+       int i;
 
        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = tdp_page_fault;
        context->free = nonpaging_free;
        context->prefetch_page = nonpaging_prefetch_page;
        context->shadow_root_level = TDP_ROOT_LEVEL;
-       context->root_hpa = INVALID_PAGE;
+       for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+               context->root_hpa[i] = INVALID_PAGE;
 
        if (!is_paging(vcpu)) {
                context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -1489,7 +1535,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
        if (!is_paging(vcpu))
                return nonpaging_init_context(vcpu);
@@ -1511,11 +1557,14 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
+       int j;
        ASSERT(vcpu);
-       if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
-               vcpu->arch.mmu.free(vcpu);
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       }
+
+       for(j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+               if (VALID_PAGE(vcpu->arch.mmu.root_hpa[j])) {
+                       vcpu->arch.mmu.free(vcpu);
+                       vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
+               }
 }
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -1528,6 +1577,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
        int r;
+       int j = vcpu->arch.cr3_cache_idx;
 
        r = mmu_topup_memory_caches(vcpu);
        if (r)
@@ -1536,8 +1586,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        kvm_mmu_free_some_pages(vcpu);
        mmu_alloc_roots(vcpu);
        spin_unlock(&vcpu->kvm->mmu_lock);
-       kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
-       kvm_mmu_flush_tlb(vcpu);
+       /* setting CR3 will flush the TLB */
+       kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa[j]);
 out:
        return r;
 }
@@ -1545,7 +1595,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
-       mmu_free_roots(vcpu);
+       int j;
+
+       kvm_cr3_cache_clear(vcpu);
+       for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+               mmu_free_roots(vcpu);
 }
 
 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
@@ -1727,6 +1781,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                        pgprintk("misaligned: gpa %llx bytes %d role %x\n",
                                 gpa, bytes, sp->role.word);
                        kvm_mmu_zap_page(vcpu->kvm, sp);
+                       kvm_cr3_cache_clear(vcpu);
                        ++vcpu->kvm->stat.mmu_flooded;
                        continue;
                }
@@ -1788,6 +1843,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, 
gva_t gva)
 
        spin_lock(&vcpu->kvm->mmu_lock);
        r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       if (r)
+               kvm_cr3_cache_clear(vcpu);
        spin_unlock(&vcpu->kvm->mmu_lock);
        return r;
 }
@@ -1800,6 +1857,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
                sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
                                  struct kvm_mmu_page, link);
                kvm_mmu_zap_page(vcpu->kvm, sp);
+               kvm_cr3_cache_clear(vcpu);
                ++vcpu->kvm->stat.mmu_recycled;
        }
 }
@@ -1850,19 +1908,24 @@ EXPORT_SYMBOL_GPL(kvm_enable_tdp);
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu_page *sp;
+       int j;
 
        while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
                sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
                                  struct kvm_mmu_page, link);
                kvm_mmu_zap_page(vcpu->kvm, sp);
        }
-       free_page((unsigned long)vcpu->arch.mmu.pae_root);
+       for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+               ASSERT(vcpu->arch.mmu.pae_root[j]);
+               free_page((unsigned long)vcpu->arch.mmu.pae_root[j]);
+               vcpu->arch.mmu.pae_root[j] = NULL;
+       }
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 {
        struct page *page;
-       int i;
+       int i, j;
 
        ASSERT(vcpu);
 
@@ -1872,17 +1935,23 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
        else
                vcpu->kvm->arch.n_free_mmu_pages =
                                        vcpu->kvm->arch.n_alloc_mmu_pages;
-       /*
-        * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
-        * Therefore we need to allocate shadow page tables in the first
-        * 4GB of memory, which happens to fit the DMA32 zone.
-        */
-       page = alloc_page(GFP_KERNEL | __GFP_DMA32);
-       if (!page)
-               goto error_1;
-       vcpu->arch.mmu.pae_root = page_address(page);
-       for (i = 0; i < 4; ++i)
-               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+       for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+               /*
+                * When emulating 32-bit mode, cr3 is only 32 bits even on
+                * x86_64. Therefore we need to allocate shadow page tables
+                * in the first 4GB of memory, which happens to fit the DMA32
+                * zone.
+                */
+               page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+               if (!page)
+                       goto error_1;
+
+               ASSERT(!vcpu->arch.mmu.pae_root[j]);
+               vcpu->arch.mmu.pae_root[j] = page_address(page);
+               for (i = 0; i < 4; ++i)
+                       vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
+       }
 
        return 0;
 
@@ -1894,7 +1963,7 @@ error_1:
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
        return alloc_mmu_pages(vcpu);
 }
@@ -1902,7 +1971,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
        return init_kvm_mmu(vcpu);
 }
@@ -2091,6 +2160,15 @@ static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
                        return 0;
                return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
        }
+       case KVM_MMU_OP_SET_CR3: {
+               struct kvm_mmu_op_set_cr3 *scr3;
+
+               scr3 = pv_mmu_read_buffer(buffer, sizeof *scr3);
+               if (!scr3)
+                       return 0;
+               kvm_set_cr3(vcpu, scr3->cr3);
+               return 1;
+       }
        default: return 0;
        }
 }
@@ -2188,15 +2266,17 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
 {
        unsigned i;
 
-       if (vcpu->arch.mmu.root_level == 4)
+       if (vcpu->arch.mmu.root_level == 4) {
                audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-       else
+               return;
+       }
+       for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
                for (i = 0; i < 4; ++i)
-                       if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+                       if (vcpu->arch.mmu.pae_root[j][i] & PT_PRESENT_MASK)
                                audit_mappings_page(vcpu,
-                                                   vcpu->arch.mmu.pae_root[i],
-                                                   i << 30,
-                                                   2);
+                                                 vcpu->arch.mmu.pae_root[j][i],
+                                                 i << 30, 2);
+       }
 }
 
 static int count_rmaps(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e64e9f5..77f6882 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -17,7 +17,8 @@ static inline void kvm_mmu_free_some_pages(struct kvm_vcpu 
*vcpu)
 
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
-       if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+       int idx = vcpu->arch.cr3_cache_idx;
+       if (likely(vcpu->arch.mmu.root_hpa[idx] != INVALID_PAGE))
                return 0;
 
        return kvm_mmu_load(vcpu);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 17f9d16..3163c31 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -285,10 +285,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t 
addr,
        if (!is_present_pte(walker->ptes[walker->level - 1]))
                return NULL;
 
-       shadow_addr = vcpu->arch.mmu.root_hpa;
+       shadow_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
        level = vcpu->arch.mmu.shadow_root_level;
        if (level == PT32E_ROOT_LEVEL) {
-               shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+               shadow_addr = 
vcpu->arch.mmu.pae_root[vcpu->arch.cr3_cache_idx][(addr >> 30) & 3];
                shadow_addr &= PT64_BASE_ADDR_MASK;
                --level;
        }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 28ad3c4..7b774b0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1801,6 +1801,11 @@ static bool svm_cpu_has_accelerated_tpr(void)
        return false;
 }
 
+static int cpu_has_cr3_cache(void)
+{
+       return 0;
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
@@ -1810,6 +1815,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .hardware_enable = svm_hardware_enable,
        .hardware_disable = svm_hardware_disable,
        .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+       .cpu_has_cr3_cache = cpu_has_cr3_cache,
 
        .vcpu_create = svm_create_vcpu,
        .vcpu_free = svm_free_vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 46e0e58..44b1ae0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -215,6 +215,10 @@ static inline int cpu_has_vmx_vpid(void)
        return (vmcs_config.cpu_based_2nd_exec_ctrl &
                SECONDARY_EXEC_ENABLE_VPID);
 }
+static inline int cpu_has_cr3_cache(void)
+{
+       return 1;
+}
 
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
@@ -785,6 +789,30 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
        return 0;
 }
 
+int vmx_cr3_cache_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+       struct page *page;
+       hva_t cr3_cache_hva;
+
+       if (data != PAGE_ALIGN(data) || vcpu->arch.cr3_cache)
+               return -EINVAL;
+
+       down_read(&current->mm->mmap_sem);
+       page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+       up_read(&current->mm->mmap_sem);
+
+       if (is_error_page(page)) {
+               kvm_release_page_clean(page);
+               return -EINVAL;
+       }
+
+       cr3_cache_hva = (hva_t)__va(page_to_phys(page));
+       vcpu->arch.cr3_cache = (void *)cr3_cache_hva;
+       vcpu->arch.cr3_cache->max_idx = vcpu->arch.cr3_cache_limit;
+
+       return 0;
+}
+
 /*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -824,6 +852,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 data)
        case MSR_IA32_TIME_STAMP_COUNTER:
                guest_write_tsc(data);
                break;
+       case KVM_MSR_SET_CR3_CACHE:
+               ret = vmx_cr3_cache_msr(vcpu, data);
+               break;
        default:
                msr = find_msr_entry(vmx, msr_index);
                if (msr) {
@@ -1322,10 +1353,23 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned 
long cr0)
 
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+       struct kvm_cr3_cache *cache;
+       int idx;
+
        vmx_flush_tlb(vcpu);
        vmcs_writel(GUEST_CR3, cr3);
        if (vcpu->arch.cr0 & X86_CR0_PE)
                vmx_fpu_deactivate(vcpu);
+
+       if (!vcpu->arch.cr3_cache)
+               return;
+
+       idx = vcpu->arch.cr3_cache_idx;
+       cache = vcpu->arch.cr3_cache;
+
+       cache->entry[idx].host_cr3 = cr3;
+       cache->entry[idx].guest_cr3 = vcpu->arch.cr3;
+       vmcs_writel(CR3_TARGET_VALUE0 + idx*2, cr3);
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1505,6 +1549,39 @@ out:
        up_read(&current->mm->mmap_sem);
        return ret;
 }
+/*
+ * Set up the cr3 validity hardware cache (VMCS CR3-target values/count).
+ */
+static void vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu)
+{
+       unsigned int cr3_target_values, i;
+       u64 msr_val;
+
+       rdmsrl(MSR_IA32_VMX_MISC, msr_val);
+
+       printk("MSR_IA32_VMX_MISC: %016Lx\n", msr_val);
+
+       /*
+        * Bits 24:16 (9 bits) hold the number of "CR3 target values":
+        */
+       cr3_target_values = (msr_val >> 16) & ((1 << 9) - 1);
+       printk(" cr3 target values: %d\n", cr3_target_values);
+       if (cr3_target_values > KVM_CR3_CACHE_SIZE) {
+               printk("KVM: limiting cr3 cache size from %d to %d\n",
+                       cr3_target_values, KVM_CR3_CACHE_SIZE);
+               cr3_target_values = KVM_CR3_CACHE_SIZE;
+       }
+
+       vcpu->arch.cr3_cache_idx = 0;
+       vcpu->arch.cr3_cache_limit = cr3_target_values;
+       /*
+        * Initialize. TODO: set this to guest physical memory.
+        */
+       for (i = 0; i < cr3_target_values; i++)
+               vmcs_writel(CR3_TARGET_VALUE0 + i*2, -1UL);
+
+       vmcs_write32(CR3_TARGET_COUNT, cr3_target_values);
+}
 
 static void seg_setup(int seg)
 {
@@ -1601,7 +1678,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
-       vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
+       vmcs_setup_cr3_cache(&vmx->vcpu);
 
        vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
        vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
@@ -2032,9 +2109,12 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 3:
-                       vcpu_load_rsp_rip(vcpu);
-                       kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
-                       skip_emulated_instruction(vcpu);
+                       if (!vcpu->arch.cr3_cache) {
+                               vcpu_load_rsp_rip(vcpu);
+                               kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
+                               skip_emulated_instruction(vcpu);
+                       } else
+                               kvm_inject_gp(vcpu, 0);
                        return 1;
                case 4:
                        vcpu_load_rsp_rip(vcpu);
@@ -2395,6 +2475,56 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
                | vmx->rmode.irq.vector;
 }
 
+static void kvm_cr3_cache_sync(struct kvm_vcpu *vcpu)
+{
+       void *guest_cr3_hva;
+       hpa_t guest_cr3_hpa;
+       struct kvm_cr3_cache *cache;
+       int j;
+       int idx = vcpu->arch.cr3_cache_idx;
+
+       if (!vcpu->arch.cr3_cache)
+               return;
+
+       guest_cr3_hpa = vmcs_readl(GUEST_CR3);
+       /*
+        * Are they in sync already?
+        */
+       if (guest_cr3_hpa == vcpu->arch.mmu.root_hpa[idx])
+               return;
+
+       cache = vcpu->arch.cr3_cache;
+#ifdef CONFIG_X86_64
+       if (vcpu->arch.mmu.shadow_root_level == 4) {
+               for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+                       hpa_t root = cache->entry[j].host_cr3;
+                       if (root != guest_cr3_hpa)
+                               continue;
+                       vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+                       vcpu->arch.cr3_cache_idx = j;
+                       vcpu->arch.mmu.root_hpa[j] = cache->entry[j].host_cr3;
+                       ++vcpu->stat.cr3_cache_synced;
+                       return;
+               }
+       WARN_ON(j == KVM_CR3_CACHE_SIZE);
+       }
+#endif
+
+       guest_cr3_hva = __va(guest_cr3_hpa);
+       for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+               u64 *root = vcpu->arch.mmu.pae_root[j];
+               WARN_ON(!root);
+               if (root != guest_cr3_hva)
+                       continue;
+               vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+               vcpu->arch.cr3_cache_idx = j;
+               vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
+               ++vcpu->stat.cr3_cache_synced;
+               return;
+       }
+       WARN_ON(j == KVM_CR3_CACHE_SIZE);
+}
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2405,6 +2535,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
         */
        vmcs_writel(HOST_CR0, read_cr0());
 
+       WARN_ON(vmcs_readl(GUEST_CR3) != 
vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
+
        asm(
                /* Store host registers */
 #ifdef CONFIG_X86_64
@@ -2519,6 +2651,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
                , "ebx", "edi", "rsi"
 #endif
              );
+       /*
+        * Figure out whether vcpu->cr3 needs updating because
+        * the guest made use of the cr3 cache.
+        */
+       kvm_cr3_cache_sync(vcpu);
+       WARN_ON(vmcs_readl(GUEST_CR3) != 
vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
 
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
        if (vmx->rmode.irq.pending)
@@ -2551,11 +2689,16 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct page *page = NULL;
 
        spin_lock(&vmx_vpid_lock);
        if (vmx->vpid != 0)
                __clear_bit(vmx->vpid, vmx_vpid_bitmap);
        spin_unlock(&vmx_vpid_lock);
+       if (vcpu->arch.cr3_cache) {
+               page = virt_to_page(vcpu->arch.cr3_cache);
+               kvm_release_page_dirty(page);
+       }
        vmx_free_vmcs(vcpu);
        kfree(vmx->host_msrs);
        kfree(vmx->guest_msrs);
@@ -2643,6 +2786,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .hardware_enable = hardware_enable,
        .hardware_disable = hardware_disable,
        .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+       .cpu_has_cr3_cache = cpu_has_cr3_cache,
 
        .vcpu_create = vmx_create_vcpu,
        .vcpu_free = vmx_free_vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 92a51d3..19cceb2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -80,6 +80,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "fpu_reload", VCPU_STAT(fpu_reload) },
        { "insn_emulation", VCPU_STAT(insn_emulation) },
        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+       { "cr3_cached_synced", VCPU_STAT(cr3_cache_synced) },
        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -820,6 +821,9 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_PV_MMU:
                r = !tdp_enabled;
                break;
+       case KVM_CAP_CR3_CACHE:
+               r = !tdp_enabled && kvm_x86_ops->cpu_has_cr3_cache();
+               break;
        default:
                r = 0;
                break;
@@ -3298,12 +3302,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
        struct page *page;
        struct kvm *kvm;
-       int r;
+       int r, i;
 
        BUG_ON(vcpu->kvm == NULL);
        kvm = vcpu->kvm;
 
-       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+       for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+               vcpu->arch.mmu.root_hpa[i] = INVALID_PAGE;
        if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
                vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
        else
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index d20cabc..f3ca4f6 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -188,11 +188,11 @@ struct kvm_mmu {
        gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
        void (*prefetch_page)(struct kvm_vcpu *vcpu,
                              struct kvm_mmu_page *page);
-       hpa_t root_hpa;
+       hpa_t root_hpa[KVM_CR3_CACHE_SIZE];
        int root_level;
        int shadow_root_level;
 
-       u64 *pae_root;
+       u64 *pae_root[KVM_CR3_CACHE_SIZE];
 };
 
 struct kvm_vcpu_arch {
@@ -206,6 +206,9 @@ struct kvm_vcpu_arch {
        unsigned long cr0;
        unsigned long cr2;
        unsigned long cr3;
+       struct kvm_cr3_cache *cr3_cache;
+       unsigned int cr3_cache_idx;
+       unsigned int cr3_cache_limit;
        unsigned long cr4;
        unsigned long cr8;
        u64 pdptrs[4]; /* pae */
@@ -338,6 +341,7 @@ struct kvm_vcpu_stat {
        u32 insn_emulation;
        u32 insn_emulation_fail;
        u32 hypercalls;
+       u32 cr3_cache_synced;
 };
 
 struct descriptor_table {
@@ -354,6 +358,7 @@ struct kvm_x86_ops {
        int (*hardware_setup)(void);               /* __init */
        void (*hardware_unsetup)(void);            /* __exit */
        bool (*cpu_has_accelerated_tpr)(void);
+       int (*cpu_has_cr3_cache)(void);
 
        /* Create, but do not attach this VCPU */
        struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index 5098459..67f2ad2 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -13,9 +13,12 @@
 #define KVM_FEATURE_CLOCKSOURCE                0
 #define KVM_FEATURE_NOP_IO_DELAY       1
 #define KVM_FEATURE_MMU_OP             2
+#define KVM_FEATURE_MMU_WRITE          2
+#define KVM_FEATURE_CR3_CACHE          3
 
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
+#define KVM_MSR_SET_CR3_CACHE 0x13
 
 #define KVM_MAX_MMU_OP_BATCH           32
 
@@ -23,6 +26,7 @@
 #define KVM_MMU_OP_WRITE_PTE            1
 #define KVM_MMU_OP_FLUSH_TLB           2
 #define KVM_MMU_OP_RELEASE_PT          3
+#define KVM_MMU_OP_SET_CR3              4
 
 /* Payload for KVM_HC_MMU_OP */
 struct kvm_mmu_op_header {
@@ -45,6 +49,11 @@ struct kvm_mmu_op_release_pt {
        __u64 pt_phys;
 };
 
+struct kvm_mmu_op_set_cr3 {
+       struct kvm_mmu_op_header header;
+       __u64 cr3;
+};
+
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
@@ -157,4 +166,16 @@ static inline unsigned int kvm_arch_para_features(void)
 
 #endif
 
+#define KVM_CR3_CACHE_SIZE 4
+
+struct kvm_cr3_cache_entry {
+       __u64 guest_cr3;
+       __u64 host_cr3;
+};
+
+struct kvm_cr3_cache {
+       struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
+       __u32 max_idx;
+};
+
 #endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 074a107..2aebd29 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -238,6 +238,7 @@ struct kvm_vapic_addr {
 #define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
 #define KVM_CAP_NOP_IO_DELAY 11
 #define KVM_CAP_PV_MMU 12
+#define KVM_CAP_CR3_CACHE 13
 
 /*
  * ioctls for VM fds
-- 
1.5.4.2


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel

Reply via email to