Add support for the cr3 cache feature on Intel VMX CPUs. This avoids VM
exits on context switch when the cr3 value being loaded is present in one
of the cache entries (currently 4 entries are provided).

This is especially important for Xenner, where each guest syscall involves
a cr3 switch.
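
For orientation (not part of the patch): the guest-side fast path added to
arch/x86/kernel/kvm.c below reduces to roughly the sketch that follows. The
helper name is invented for illustration, and the sketch omits the #GP
fixup that the real inline-asm version needs for the race described under
v1->v2.

	/*
	 * Simplified guest-side cr3 switch (sketch, not patch code): scan the
	 * host-maintained cache for a guest_cr3 -> host_cr3 mapping.  A hit
	 * loads a value that matches one of the VMCS CR3-target slots, so the
	 * mov-to-cr3 does not VM exit; a miss falls back to the hypercall so
	 * the host can install a new mapping.
	 */
	static void kvm_write_cr3_sketch(struct kvm_cr3_cache *cache,
					 unsigned long guest_cr3)
	{
		int idx;

		for (idx = 0; idx < cache->max_idx; idx++) {
			if (cache->entry[idx].guest_cr3 == guest_cr3) {
				native_write_cr3(cache->entry[idx].host_cr3);
				return;
			}
		}
		kvm_hypercall1(KVM_HYPERCALL_SET_CR3, guest_cr3);
	}
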
v1->v2:
- handle the race that occurs when the host clears the cache while the
  guest is in the middle of kvm_write_cr3(), by injecting a #GP and
  trapping on it to fall back to the hypercall variant (suggested by
  Avi); see the sketch below.
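
The host half of that fallback is the handle_cr() change in
arch/x86/kvm/vmx.c below. Condensed into a stand-alone sketch (the function
name here is made up; the body mirrors the patch hunk): once a cr3 cache is
registered, a mov-to-cr3 that still traps means the hardware CR3-target
match failed, i.e. the host invalidated the entry after the guest's
comparison, so KVM injects #GP instead of emulating the load, and the
guest's exception fixup retries via KVM_HYPERCALL_SET_CR3.

	/* Sketch: condensed from the mov-to-cr3 case of handle_cr() below. */
	static int handle_mov_to_cr3_sketch(struct kvm_vcpu *vcpu, int reg)
	{
		if (!vcpu->arch.cr3_cache) {
			/* No cache registered: emulate the load as before. */
			vcpu_load_rsp_rip(vcpu);
			set_cr3(vcpu, vcpu->arch.regs[reg]);
			skip_emulated_instruction(vcpu);
		} else {
			/* Cache in use: push the guest onto its slow path. */
			kvm_inject_gp(vcpu, 0);
		}
		return 1;
	}
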
Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
Cc: Anthony Liguori <[EMAIL PROTECTED]>
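
A note on where "currently 4" comes from: VMX lets the host list a small
number of CR3-target values in the VMCS, and a guest mov-to-cr3 that
matches one of them does not cause a VM exit. The setup side below
(vmcs_setup_cr3_cache() in vmx.c) reads the CPU's CR3-target count from
MSR_IA32_VMX_MISC and caps it at KVM_CR3_CACHE_SIZE (4). Roughly, as a
sketch with an invented name that mirrors the patch (using the 9-bit field
width the patch's own comment refers to):

	static void setup_cr3_targets_sketch(struct kvm_vcpu *vcpu)
	{
		unsigned int count, i;
		u64 vmx_misc;

		/* Bits 24:16 of IA32_VMX_MISC: number of CR3-target values. */
		rdmsrl(MSR_IA32_VMX_MISC, vmx_misc);
		count = (vmx_misc >> 16) & 0x1ff;
		if (count > KVM_CR3_CACHE_SIZE)
			count = KVM_CR3_CACHE_SIZE;

		vcpu->arch.cr3_cache_idx = 0;
		vcpu->arch.cr3_cache_limit = count;

		/* Start with every slot invalid; vmx_set_cr3() fills them in. */
		for (i = 0; i < count; i++)
			vmcs_writel(CR3_TARGET_VALUE0 + i * 2, -1UL);
		vmcs_write32(CR3_TARGET_COUNT, count);
	}
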
Index: kvm.paravirt2/arch/x86/kernel/kvm.c
===================================================================
--- kvm.paravirt2.orig/arch/x86/kernel/kvm.c
+++ kvm.paravirt2/arch/x86/kernel/kvm.c
@@ -26,14 +26,17 @@
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
+#include <asm/tlbflush.h>
+#include <asm/asm.h>
#define MAX_MULTICALL_NR (PAGE_SIZE / sizeof(struct kvm_multicall_entry))
struct kvm_para_state {
+ struct kvm_cr3_cache cr3_cache;
struct kvm_multicall_entry queue[MAX_MULTICALL_NR];
int queue_index;
enum paravirt_lazy_mode mode;
-};
+} __attribute__ ((aligned(PAGE_SIZE)));
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
@@ -104,6 +107,116 @@ static void kvm_io_delay(void)
{
}
+static void kvm_new_cr3(unsigned long cr3)
+{
+ kvm_hypercall1(KVM_HYPERCALL_SET_CR3, cr3);
+}
+
+static unsigned long __force_order;
+
+/*
+ * Special, register-to-cr3 instruction-based hypercall API
+ * variant to the KVM host. This utilizes the cr3 filter capability
+ * of the hardware: if it works out, no VM exit happens; if a VM
+ * exit does happen, KVM gets the virtual address too.
+ */
+static void kvm_write_cr3(unsigned long guest_cr3)
+{
+ struct kvm_para_state *para_state = &get_cpu_var(para_state);
+ struct kvm_cr3_cache *cache = &para_state->cr3_cache;
+ int idx;
+
+ /*
+ * Check the cache (maintained by the host) for a matching
+ * guest_cr3 => host_cr3 mapping. Use it if found:
+ */
+ for (idx = 0; idx < cache->max_idx; idx++) {
+ if (cache->entry[idx].guest_cr3 == guest_cr3) {
+ unsigned long trap;
+
+ /*
+ * Cache-hit: we load the cached host-CR3 value.
+ * Fall back to the hypercall variant if it raced with
+ * the host clearing the cache after guest_cr3
+ * comparison.
+ */
+ __asm__ __volatile__ (
+ " mov %2, %0\n"
+ "0: mov %3, %%cr3\n"
+ "1:\n"
+ ".section .fixup,\"ax\"\n"
+ "2: mov %1, %0\n"
+ " jmp 1b\n"
+ ".previous\n"
+ _ASM_EXTABLE(0b, 2b)
+ : "=&r" (trap)
+ : "n" (1UL), "n" (0UL),
+ "b" (cache->entry[idx].host_cr3),
+ "m" (__force_order));
+ if (!trap)
+ goto out;
+ break;
+ }
+ }
+
+ /*
+ * Cache-miss. Tell the host the new cr3 via hypercall (to avoid
+ * aliasing problems with a cached host_cr3 == guest_cr3).
+ */
+ kvm_new_cr3(guest_cr3);
+out:
+ put_cpu_var(para_state);
+}
+
+/*
+ * Avoid the VM exit upon cr3 load by using the cached
+ * ->active_mm->pgd value:
+ */
+static void kvm_flush_tlb_user(void)
+{
+ kvm_write_cr3(__pa(current->active_mm->pgd));
+}
+
+/*
+ * Disable global pages, do a flush, then enable global pages:
+ */
+static void kvm_flush_tlb_kernel(void)
+{
+ unsigned long orig_cr4 = read_cr4();
+
+ write_cr4(orig_cr4 & ~X86_CR4_PGE);
+ kvm_flush_tlb_user();
+ write_cr4(orig_cr4);
+}
+
+static void register_cr3_cache(void *cache)
+{
+ struct kvm_para_state *state;
+
+ state = &per_cpu(para_state, raw_smp_processor_id());
+ wrmsrl(KVM_MSR_SET_CR3_CACHE, __pa(&state->cr3_cache));
+}
+
+static unsigned __init kvm_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+ switch (type) {
+ case PARAVIRT_PATCH(pv_mmu_ops.write_cr3):
+ return paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ default:
+ return native_patch(type, clobbers, ibuf, addr, len);
+ }
+}
+
+static void __init setup_guest_cr3_cache(void)
+{
+ on_each_cpu(register_cr3_cache, NULL, 0, 1);
+
+ pv_mmu_ops.write_cr3 = kvm_write_cr3;
+ pv_mmu_ops.flush_tlb_user = kvm_flush_tlb_user;
+ pv_mmu_ops.flush_tlb_kernel = kvm_flush_tlb_kernel;
+}
+
static void kvm_mmu_write(void *dest, const void *src, size_t size)
{
const uint8_t *p = src;
@@ -120,6 +233,28 @@ static void kvm_mmu_write(void *dest, co
}
/*
+ * CR3 cache initialization uses on_each_cpu(), so it can't
+ * happen at kvm_guest_init time.
+ */
+int __init kvm_cr3_cache_init(void)
+{
+ unsigned long flags;
+
+ if (!kvm_para_available())
+ return -ENOSYS;
+
+ if (kvm_para_has_feature(KVM_FEATURE_CR3_CACHE)) {
+ setup_guest_cr3_cache();
+ local_irq_save(flags);
+ apply_paravirt(__parainstructions, __parainstructions_end);
+ local_irq_restore(flags);
+ }
+
+ return 0;
+}
+module_init(kvm_cr3_cache_init);
+
+/*
* We only need to hook operations that are MMU writes. We hook these so that
* we can use lazy MMU mode to batch these operations. We could probably
* improve the performance of the host code if we used some of the information
@@ -239,6 +374,9 @@ static void paravirt_ops_setup(void)
pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
}
+
+ if (kvm_para_has_feature(KVM_FEATURE_CR3_CACHE))
+ pv_init_ops.patch = kvm_patch;
}
void __init kvm_guest_init(void)
Index: kvm.paravirt2/arch/x86/kvm/mmu.c
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/mmu.c
+++ kvm.paravirt2/arch/x86/kvm/mmu.c
@@ -258,6 +258,16 @@ static int mmu_topup_memory_cache(struct
}
return 0;
}
+static void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cr3_cache *cache;
+
+ if (!vcpu->arch.cr3_cache)
+ return;
+ cache = vcpu->arch.cr3_cache;
+ memset(cache->entry, 0, sizeof(cache->entry));
+}
+
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
@@ -979,7 +989,7 @@ static void nonpaging_new_cr3(struct kvm
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
gfn_t gfn, struct page *page, int level)
{
- hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+ hpa_t table_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
int pt_write = 0;
for (; ; level--) {
@@ -1059,53 +1069,75 @@ static void nonpaging_prefetch_page(stru
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
- int i;
+ int i, j;
struct kvm_mmu_page *sp;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
+ /*
+ * Skip to the next cr3 filter entry and free it (if it's occupied).
+ */
+ vcpu->arch.cr3_cache_idx++;
+ if (unlikely(vcpu->arch.cr3_cache_idx >= vcpu->arch.cr3_cache_limit))
+ vcpu->arch.cr3_cache_idx = 0;
+
+ j = vcpu->arch.cr3_cache_idx;
+ /*
+ * Clear the guest-visible entry.
+ */
+ if (vcpu->arch.cr3_cache) {
+ vcpu->arch.cr3_cache->entry[j].guest_cr3 = 0;
+ vcpu->arch.cr3_cache->entry[j].host_cr3 = 0;
+ }
spin_lock(&vcpu->kvm->mmu_lock);
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ hpa_t root = vcpu->arch.mmu.root_hpa[j];
+
+ if (!VALID_PAGE(root)) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
kvm_mmu_zap_page(vcpu->kvm, sp);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
}
#endif
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ ASSERT(vcpu->arch.mmu.pae_root[j]);
+ if (VALID_PAGE(vcpu->arch.mmu.pae_root[j][0])) {
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[j][i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ --sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ }
+ vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
}
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
}
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
- int i;
+ int i, j;
gfn_t root_gfn;
struct kvm_mmu_page *sp;
int metaphysical = 0;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+ j = vcpu->arch.cr3_cache_idx;
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ hpa_t root = vcpu->arch.mmu.root_hpa[j];
ASSERT(!VALID_PAGE(root));
if (tdp_enabled)
@@ -1115,7 +1147,7 @@ static void mmu_alloc_roots(struct kvm_v
ACC_ALL, NULL, NULL);
root = __pa(sp->spt);
++sp->root_count;
- vcpu->arch.mmu.root_hpa = root;
+ vcpu->arch.mmu.root_hpa[j] = root;
return;
}
#endif
@@ -1123,7 +1155,7 @@ static void mmu_alloc_roots(struct kvm_v
if (tdp_enabled)
metaphysical = 1;
for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
+ hpa_t root = vcpu->arch.mmu.pae_root[j][i];
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
@@ -1139,9 +1171,9 @@ static void mmu_alloc_roots(struct kvm_v
ACC_ALL, NULL, NULL);
root = __pa(sp->spt);
++sp->root_count;
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ vcpu->arch.mmu.pae_root[j][i] = root | PT_PRESENT_MASK;
}
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -1161,7 +1193,7 @@ static int nonpaging_page_fault(struct k
return r;
ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
gfn = gva >> PAGE_SHIFT;
@@ -1201,12 +1233,19 @@ static int tdp_page_fault(struct kvm_vcp
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
- mmu_free_roots(vcpu);
+ int j;
+
+ /*
+ * This will cycle through all existing roots and free them.
+ */
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ mmu_free_roots(vcpu);
}
static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
@@ -1215,7 +1254,8 @@ static int nonpaging_init_context(struct
context->prefetch_page = nonpaging_prefetch_page;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1254,6 +1294,7 @@ static void paging_free(struct kvm_vcpu
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
@@ -1263,7 +1304,8 @@ static int paging64_init_context_common(
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1275,6 +1317,7 @@ static int paging64_init_context(struct
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
@@ -1283,7 +1326,8 @@ static int paging32_init_context(struct
context->prefetch_page = paging32_prefetch_page;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1295,13 +1339,15 @@ static int paging32E_init_context(struct
static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
context->shadow_root_level = TDP_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
if (!is_paging(vcpu)) {
context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -1323,7 +1369,7 @@ static int init_kvm_tdp_mmu(struct kvm_v
static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
if (!is_paging(vcpu))
return nonpaging_init_context(vcpu);
@@ -1345,11 +1391,14 @@ static int init_kvm_mmu(struct kvm_vcpu
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
+ int j;
ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
- vcpu->arch.mmu.free(vcpu);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- }
+
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa[j])) {
+ vcpu->arch.mmu.free(vcpu);
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
+ }
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -1362,6 +1411,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_reset_context)
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
+ int j = vcpu->arch.cr3_cache_idx;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -1370,8 +1420,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
- kvm_mmu_flush_tlb(vcpu);
+ /* setting CR3 will flush the TLB */
+ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa[j]);
out:
return r;
}
@@ -1379,7 +1429,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- mmu_free_roots(vcpu);
+ int j;
+
+ kvm_cr3_cache_clear(vcpu);
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ mmu_free_roots(vcpu);
}
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
@@ -1551,6 +1605,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_cr3_cache_clear(vcpu);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -1612,6 +1667,8 @@ int kvm_mmu_unprotect_page_virt(struct k
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (r)
+ kvm_cr3_cache_clear(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
}
@@ -1624,6 +1681,7 @@ void __kvm_mmu_free_some_pages(struct kv
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_cr3_cache_clear(vcpu);
++vcpu->kvm->stat.mmu_recycled;
}
}
@@ -1674,19 +1732,24 @@ EXPORT_SYMBOL_GPL(kvm_enable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
+ int j;
while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
}
- free_page((unsigned long)vcpu->arch.mmu.pae_root);
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+ ASSERT(vcpu->arch.mmu.pae_root[j]);
+ free_page((unsigned long)vcpu->arch.mmu.pae_root[j]);
+ vcpu->arch.mmu.pae_root[j] = NULL;
+ }
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
struct page *page;
- int i;
+ int i, j;
ASSERT(vcpu);
@@ -1696,17 +1759,23 @@ static int alloc_mmu_pages(struct kvm_vc
else
vcpu->kvm->arch.n_free_mmu_pages =
vcpu->kvm->arch.n_alloc_mmu_pages;
- /*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
- * Therefore we need to allocate shadow page tables in the first
- * 4GB of memory, which happens to fit the DMA32 zone.
- */
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
- if (!page)
- goto error_1;
- vcpu->arch.mmu.pae_root = page_address(page);
- for (i = 0; i < 4; ++i)
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+ /*
+ * When emulating 32-bit mode, cr3 is only 32 bits even on
+ * x86_64. Therefore we need to allocate shadow page tables
+ * in the first 4GB of memory, which happens to fit the DMA32
+ * zone.
+ */
+ page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ if (!page)
+ goto error_1;
+
+ ASSERT(!vcpu->arch.mmu.pae_root[j]);
+ vcpu->arch.mmu.pae_root[j] = page_address(page);
+ for (i = 0; i < 4; ++i)
+ vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
+ }
return 0;
@@ -1718,7 +1787,7 @@ error_1:
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
return alloc_mmu_pages(vcpu);
}
@@ -1726,7 +1795,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
return init_kvm_mmu(vcpu);
}
@@ -1886,15 +1955,17 @@ static void audit_mappings(struct kvm_vc
{
- unsigned i;
+ unsigned i, j;
- if (vcpu->arch.mmu.root_level == 4)
+ if (vcpu->arch.mmu.root_level == 4) {
audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
+ return;
+ }
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+ if (vcpu->arch.mmu.pae_root[j][i] & PT_PRESENT_MASK)
audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
+ vcpu->arch.mmu.pae_root[j][i],
+ i << 30, 2);
+ }
}
static int count_rmaps(struct kvm_vcpu *vcpu)
Index: kvm.paravirt2/arch/x86/kvm/mmu.h
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/mmu.h
+++ kvm.paravirt2/arch/x86/kvm/mmu.h
@@ -17,7 +17,8 @@ static inline void kvm_mmu_free_some_pag
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
- if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ int idx = vcpu->arch.cr3_cache_idx;
+ if (likely(vcpu->arch.mmu.root_hpa[idx] != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
Index: kvm.paravirt2/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.paravirt2/arch/x86/kvm/paging_tmpl.h
@@ -283,10 +283,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
if (!is_present_pte(walker->ptes[walker->level - 1]))
return NULL;
- shadow_addr = vcpu->arch.mmu.root_hpa;
+ shadow_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
level = vcpu->arch.mmu.shadow_root_level;
if (level == PT32E_ROOT_LEVEL) {
- shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ shadow_addr =
+ vcpu->arch.mmu.pae_root[vcpu->arch.cr3_cache_idx][(addr >> 30) & 3];
shadow_addr &= PT64_BASE_ADDR_MASK;
--level;
}
Index: kvm.paravirt2/arch/x86/kvm/vmx.c
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/vmx.c
+++ kvm.paravirt2/arch/x86/kvm/vmx.c
@@ -216,6 +216,10 @@ static inline int cpu_has_vmx_vpid(void)
return (vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_VPID);
}
+static inline bool cpu_has_cr3_cache(void)
+{
+ return true;
+}
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
@@ -785,6 +789,30 @@ static int vmx_get_msr(struct kvm_vcpu *
return 0;
}
+int vmx_cr3_cache_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct page *page;
+ hva_t cr3_cache_hva;
+
+ if (data != PAGE_ALIGN(data) || vcpu->arch.cr3_cache)
+ return -EINVAL;
+
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
+
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ return -EINVAL;
+ }
+
+ cr3_cache_hva = (hva_t)__va(page_to_phys(page));
+ vcpu->arch.cr3_cache = (void *)cr3_cache_hva;
+ vcpu->arch.cr3_cache->max_idx = vcpu->arch.cr3_cache_limit;
+
+ return 0;
+}
+
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -824,6 +852,9 @@ static int vmx_set_msr(struct kvm_vcpu *
case MSR_IA32_TIME_STAMP_COUNTER:
guest_write_tsc(data);
break;
+ case KVM_MSR_SET_CR3_CACHE:
+ ret = vmx_cr3_cache_msr(vcpu, data);
+ break;
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -1322,10 +1353,23 @@ static void vmx_set_cr0(struct kvm_vcpu
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ struct kvm_cr3_cache *cache;
+ int idx;
+
vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, cr3);
if (vcpu->arch.cr0 & X86_CR0_PE)
vmx_fpu_deactivate(vcpu);
+
+ if (!vcpu->arch.cr3_cache)
+ return;
+
+ idx = vcpu->arch.cr3_cache_idx;
+ cache = vcpu->arch.cr3_cache;
+
+ cache->entry[idx].host_cr3 = cr3;
+ cache->entry[idx].guest_cr3 = vcpu->arch.cr3;
+ vmcs_writel(CR3_TARGET_VALUE0 + idx*2, cr3);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1503,6 +1547,39 @@ out:
up_read(&current->mm->mmap_sem);
return ret;
}
+/*
+ * Set up the cr3 validity hardware cache.
+ */
+static void vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu)
+{
+ unsigned int cr3_target_values, i;
+ u64 msr_val;
+
+ rdmsrl(MSR_IA32_VMX_MISC, msr_val);
+
+ printk("MSR_IA32_VMX_MISC: %016Lx\n", msr_val);
+
+ /*
+ * 9 bits of "CR3 target values":
+ */
+ cr3_target_values = (msr_val >> 16) & ((1 << 9) - 1);
+ printk(" cr3 target values: %d\n", cr3_target_values);
+ if (cr3_target_values > KVM_CR3_CACHE_SIZE) {
+ printk("KVM: limiting cr3 cache size from %d to %d\n",
+ cr3_target_values, KVM_CR3_CACHE_SIZE);
+ cr3_target_values = KVM_CR3_CACHE_SIZE;
+ }
+
+ vcpu->arch.cr3_cache_idx = 0;
+ vcpu->arch.cr3_cache_limit = cr3_target_values;
+ /*
+ * Initialize. TODO: set this to guest physical memory.
+ */
+ for (i = 0; i < cr3_target_values; i++)
+ vmcs_writel(CR3_TARGET_VALUE0 + i*2, -1UL);
+
+ vmcs_write32(CR3_TARGET_COUNT, cr3_target_values);
+}
static void seg_setup(int seg)
{
@@ -1599,7 +1676,7 @@ static int vmx_vcpu_setup(struct vcpu_vm
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
- vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+ vmcs_setup_cr3_cache(&vmx->vcpu);
vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
@@ -2030,9 +2107,12 @@ static int handle_cr(struct kvm_vcpu *vc
skip_emulated_instruction(vcpu);
return 1;
case 3:
- vcpu_load_rsp_rip(vcpu);
- set_cr3(vcpu, vcpu->arch.regs[reg]);
- skip_emulated_instruction(vcpu);
+ if (!vcpu->arch.cr3_cache) {
+ vcpu_load_rsp_rip(vcpu);
+ set_cr3(vcpu, vcpu->arch.regs[reg]);
+ skip_emulated_instruction(vcpu);
+ } else
+ kvm_inject_gp(vcpu, 0);
return 1;
case 4:
vcpu_load_rsp_rip(vcpu);
@@ -2393,6 +2473,56 @@ static void fixup_rmode_irq(struct vcpu_
| vmx->rmode.irq.vector;
}
+static void kvm_cr3_cache_sync(struct kvm_vcpu *vcpu)
+{
+ void *guest_cr3_hva;
+ hpa_t guest_cr3_hpa;
+ struct kvm_cr3_cache *cache;
+ int j;
+ int idx = vcpu->arch.cr3_cache_idx;
+
+ if (!vcpu->arch.cr3_cache)
+ return;
+
+ guest_cr3_hpa = vmcs_readl(GUEST_CR3);
+ /*
+ * Are they in sync already?
+ */
+ if (guest_cr3_hpa == vcpu->arch.mmu.root_hpa[idx])
+ return;
+
+ cache = vcpu->arch.cr3_cache;
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.mmu.shadow_root_level == 4) {
+ for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+ hpa_t root = cache->entry[j].host_cr3;
+ if (root != guest_cr3_hpa)
+ continue;
+ vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+ vcpu->arch.cr3_cache_idx = j;
+ vcpu->arch.mmu.root_hpa[j] = cache->entry[j].host_cr3;
+ ++vcpu->stat.cr3_cache_synced;
+ return;
+ }
+ WARN_ON(j == KVM_CR3_CACHE_SIZE);
+ }
+#endif
+
+ guest_cr3_hva = __va(guest_cr3_hpa);
+ for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+ u64 *root = vcpu->arch.mmu.pae_root[j];
+ WARN_ON(!root);
+ if (root != guest_cr3_hva)
+ continue;
+ vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+ vcpu->arch.cr3_cache_idx = j;
+ vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
+ ++vcpu->stat.cr3_cache_synced;
+ return;
+ }
+ WARN_ON(j == KVM_CR3_CACHE_SIZE);
+}
+
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2403,6 +2533,8 @@ static void vmx_vcpu_run(struct kvm_vcpu
*/
vmcs_writel(HOST_CR0, read_cr0());
+ WARN_ON(vmcs_readl(GUEST_CR3) !=
+ vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
+
asm(
/* Store host registers */
#ifdef CONFIG_X86_64
@@ -2517,6 +2649,12 @@ static void vmx_vcpu_run(struct kvm_vcpu
, "ebx", "edi", "rsi"
#endif
);
+ /*
+ * Figure out whether vcpu->cr3 needs updating because
+ * the guest made use of the cr3 cache.
+ */
+ kvm_cr3_cache_sync(vcpu);
+ WARN_ON(vmcs_readl(GUEST_CR3) !=
+ vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
@@ -2549,11 +2687,16 @@ static void vmx_free_vmcs(struct kvm_vcp
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *page = NULL;
spin_lock(&vmx_vpid_lock);
if (vmx->vpid != 0)
__clear_bit(vmx->vpid, vmx_vpid_bitmap);
spin_unlock(&vmx_vpid_lock);
+ if (vcpu->arch.cr3_cache) {
+ page = virt_to_page(vcpu->arch.cr3_cache);
+ kvm_release_page_dirty(page);
+ }
vmx_free_vmcs(vcpu);
kfree(vmx->host_msrs);
kfree(vmx->guest_msrs);
@@ -2641,6 +2784,7 @@ static struct kvm_x86_ops vmx_x86_ops =
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+ .cpu_has_cr3_cache = cpu_has_cr3_cache,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
Index: kvm.paravirt2/arch/x86/kvm/x86.c
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/x86.c
+++ kvm.paravirt2/arch/x86/kvm/x86.c
@@ -81,6 +81,7 @@ struct kvm_stats_debugfs_item debugfs_en
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "multicall", VCPU_STAT(multicall) },
{ "multicall_nr", VCPU_STAT(multicall_nr) },
+ { "cr3_cache_synced", VCPU_STAT(cr3_cache_synced) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -858,10 +859,13 @@ long kvm_arch_dev_ioctl(struct file *fil
}
case KVM_GET_PARA_FEATURES: {
__u32 para_features = KVM_PARA_FEATURES;
+
if (tdp_enabled) {
para_features &= ~(1UL << KVM_FEATURE_MMU_WRITE);
para_features &= ~(1UL << KVM_FEATURE_MULTICALL);
}
+ if (!kvm_x86_ops->cpu_has_cr3_cache())
+ para_features &= ~(1UL << KVM_FEATURE_CR3_CACHE);
r = -EFAULT;
if (copy_to_user(argp, &para_features, sizeof para_features))
@@ -2416,6 +2420,12 @@ static int kvm_hypercall_release_pt(stru
return 0;
}
+static int kvm_hypercall_set_cr3(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+ set_cr3(vcpu, cr3);
+ return 0;
+}
+
static int dispatch_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
unsigned long a0, unsigned long a1,
unsigned long a2, unsigned long a3)
@@ -2429,6 +2439,8 @@ static int dispatch_hypercall(struct kvm
return kvm_hypercall_flush_tlb(vcpu);
case KVM_HYPERCALL_RELEASE_PT:
return kvm_hypercall_release_pt(vcpu, a0);
+ case KVM_HYPERCALL_SET_CR3:
+ return kvm_hypercall_set_cr3(vcpu, a0);
}
return -KVM_ENOSYS;
@@ -3361,12 +3373,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *
{
struct page *page;
struct kvm *kvm;
- int r;
+ int r, i;
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ vcpu->arch.mmu.root_hpa[i] = INVALID_PAGE;
if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
else
Index: kvm.paravirt2/include/asm-x86/kvm_host.h
===================================================================
--- kvm.paravirt2.orig/include/asm-x86/kvm_host.h
+++ kvm.paravirt2/include/asm-x86/kvm_host.h
@@ -181,11 +181,11 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
- hpa_t root_hpa;
+ hpa_t root_hpa[KVM_CR3_CACHE_SIZE];
int root_level;
int shadow_root_level;
- u64 *pae_root;
+ u64 *pae_root[KVM_CR3_CACHE_SIZE];
};
struct kvm_vcpu_arch {
@@ -199,6 +199,9 @@ struct kvm_vcpu_arch {
unsigned long cr0;
unsigned long cr2;
unsigned long cr3;
+ struct kvm_cr3_cache *cr3_cache;
+ unsigned int cr3_cache_idx;
+ unsigned int cr3_cache_limit;
unsigned long cr4;
unsigned long cr8;
u64 pdptrs[4]; /* pae */
@@ -330,6 +333,7 @@ struct kvm_vcpu_stat {
u32 insn_emulation_fail;
u32 multicall;
u32 multicall_nr;
+ u32 cr3_cache_synced;
};
struct descriptor_table {
@@ -346,6 +350,7 @@ struct kvm_x86_ops {
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
+ bool (*cpu_has_cr3_cache)(void);
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
Index: kvm.paravirt2/include/asm-x86/kvm_para.h
===================================================================
--- kvm.paravirt2.orig/include/asm-x86/kvm_para.h
+++ kvm.paravirt2/include/asm-x86/kvm_para.h
@@ -14,6 +14,7 @@
#define KVM_FEATURE_NOP_IO_DELAY 1
#define KVM_FEATURE_MMU_WRITE 2
#define KVM_FEATURE_MULTICALL 3
+#define KVM_FEATURE_CR3_CACHE 4
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
@@ -53,7 +54,10 @@ extern void kvmclock_init(void);
#define KVM_PARA_FEATURES ((1UL << KVM_FEATURE_NOP_IO_DELAY) | \
(1UL << KVM_FEATURE_CLOCKSOURCE) | \
(1UL << KVM_FEATURE_MMU_WRITE) | \
- (1UL << KVM_FEATURE_MULTICALL))
+ (1UL << KVM_FEATURE_MULTICALL) | \
+ (1UL << KVM_FEATURE_CR3_CACHE))
+
+#define KVM_MSR_SET_CR3_CACHE 0x87655678
/* This instruction is vmcall. On non-VT architectures, it will generate a
* trap that we will then rewrite to the appropriate instruction.
@@ -143,4 +147,16 @@ static inline unsigned int kvm_arch_para
#endif
+#define KVM_CR3_CACHE_SIZE 4
+
+struct kvm_cr3_cache_entry {
+ __u64 guest_cr3;
+ __u64 host_cr3;
+};
+
+struct kvm_cr3_cache {
+ struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
+ __u32 max_idx;
+};
+
#endif
Index: kvm.paravirt2/include/linux/kvm_para.h
===================================================================
--- kvm.paravirt2.orig/include/linux/kvm_para.h
+++ kvm.paravirt2/include/linux/kvm_para.h
@@ -20,6 +20,7 @@
#define KVM_HYPERCALL_FLUSH_TLB 3
#define KVM_HYPERCALL_RELEASE_PT 4
#define KVM_HYPERCALL_MULTICALL 5
+#define KVM_HYPERCALL_SET_CR3 6
/*
* hypercalls use architecture specific
Index: kvm.paravirt2/arch/x86/kvm/svm.c
===================================================================
--- kvm.paravirt2.orig/arch/x86/kvm/svm.c
+++ kvm.paravirt2/arch/x86/kvm/svm.c
@@ -1801,6 +1801,11 @@ static bool svm_cpu_has_accelerated_tpr(
return false;
}
+static bool cpu_has_cr3_cache(void)
+{
+ return false;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -1810,6 +1815,7 @@ static struct kvm_x86_ops svm_x86_ops =
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+ .cpu_has_cr3_cache = cpu_has_cr3_cache,
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,