From: Marcelo Tosatti <[EMAIL PROTECTED]>
Add support for the cr3 cache feature on Intel VMX CPUs. This avoids
vmexits on context switch if the new cr3 value is cached in one of the
cache entries (currently 4 are present).
This is especially important for Xenner, where each guest syscall
involves a cr3 switch.
v1->v2:
- handle the race which happens when the guest has the cache cleared
in the middle of kvm_write_cr3 by injecting a #GP and trapping it to
fall back to the hypercall variant (suggested by Avi).
v2->v3:
- one ioctl per paravirt feature
v3->v4:
- disable if tdp enabled
Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
Signed-off-by: Avi Kivity <[EMAIL PROTECTED]>
---
arch/x86/kvm/mmu.c | 196 +++++++++++++++++++++++++++++++-------------
arch/x86/kvm/mmu.h | 3 +-
arch/x86/kvm/paging_tmpl.h | 4 +-
arch/x86/kvm/svm.c | 6 ++
arch/x86/kvm/vmx.c | 152 +++++++++++++++++++++++++++++++++-
arch/x86/kvm/x86.c | 9 ++-
include/asm-x86/kvm_host.h | 9 ++-
include/asm-x86/kvm_para.h | 21 +++++
include/linux/kvm.h | 1 +
9 files changed, 332 insertions(+), 69 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 14de7dc..11bca62 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -272,6 +272,16 @@ static int mmu_topup_memory_cache(struct
kvm_mmu_memory_cache *cache,
return 0;
}
+/*
+ * Zero every entry of the cr3 cache attached to @vcpu.  Does nothing when
+ * no cache has been attached (vcpu->arch.cr3_cache is NULL).
+ */
+static void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cr3_cache *shared = vcpu->arch.cr3_cache;
+
+	if (shared)
+		memset(shared->entry, 0, sizeof(shared->entry));
+}
+
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
while (mc->nobjs)
@@ -1127,7 +1137,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v,
int write,
int largepage, gfn_t gfn, struct page *page,
int level)
{
- hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+ hpa_t table_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
int pt_write = 0;
for (; ; level--) {
@@ -1219,53 +1229,75 @@ static void nonpaging_prefetch_page(struct kvm_vcpu
*vcpu,
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
- int i;
+ int i, j;
struct kvm_mmu_page *sp;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
+ /*
+ * Skip to the next cr3 filter entry and free it (if it's occupied).
+ */
+ vcpu->arch.cr3_cache_idx++;
+ if (unlikely(vcpu->arch.cr3_cache_idx >= vcpu->arch.cr3_cache_limit))
+ vcpu->arch.cr3_cache_idx = 0;
+
+ j = vcpu->arch.cr3_cache_idx;
+ /*
+ * Clear the guest-visible entry.
+ */
+ if (vcpu->arch.cr3_cache) {
+ vcpu->arch.cr3_cache->entry[j].guest_cr3 = 0;
+ vcpu->arch.cr3_cache->entry[j].host_cr3 = 0;
+ }
spin_lock(&vcpu->kvm->mmu_lock);
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ hpa_t root = vcpu->arch.mmu.root_hpa[j];
+
+ if (!VALID_PAGE(root)) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
kvm_mmu_zap_page(vcpu->kvm, sp);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
}
#endif
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ ASSERT(vcpu->arch.mmu.pae_root[j]);
+ if (VALID_PAGE(vcpu->arch.mmu.pae_root[j][0])) {
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[j][i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ --sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ }
+ vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
}
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
}
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
- int i;
+ int i, j;
gfn_t root_gfn;
struct kvm_mmu_page *sp;
int metaphysical = 0;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+ j = vcpu->arch.cr3_cache_idx;
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ hpa_t root = vcpu->arch.mmu.root_hpa[j];
ASSERT(!VALID_PAGE(root));
if (tdp_enabled)
@@ -1275,7 +1307,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
- vcpu->arch.mmu.root_hpa = root;
+ vcpu->arch.mmu.root_hpa[j] = root;
return;
}
#endif
@@ -1283,7 +1315,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
if (tdp_enabled)
metaphysical = 1;
for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
+ hpa_t root = vcpu->arch.mmu.pae_root[j][i];
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
@@ -1299,9 +1331,9 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ vcpu->arch.mmu.pae_root[j][i] = root | PT_PRESENT_MASK;
}
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -1321,7 +1353,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
gva_t gva,
return r;
ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa[j]));
gfn = gva >> PAGE_SHIFT;
@@ -1367,12 +1399,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t
gpa,
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
- mmu_free_roots(vcpu);
+ int j;
+
+ /*
+ * This will cycle through all existing roots and free them.
+ */
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ mmu_free_roots(vcpu);
}
static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
@@ -1381,7 +1420,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
context->prefetch_page = nonpaging_prefetch_page;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1420,6 +1460,7 @@ static void paging_free(struct kvm_vcpu *vcpu)
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
@@ -1429,7 +1470,8 @@ static int paging64_init_context_common(struct kvm_vcpu
*vcpu, int level)
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1441,6 +1483,7 @@ static int paging64_init_context(struct kvm_vcpu *vcpu)
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
@@ -1449,7 +1492,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
context->prefetch_page = paging32_prefetch_page;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1461,13 +1505,15 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
context->shadow_root_level = TDP_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
if (!is_paging(vcpu)) {
context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -1489,7 +1535,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
if (!is_paging(vcpu))
return nonpaging_init_context(vcpu);
@@ -1511,11 +1557,14 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
+ int j;
ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
- vcpu->arch.mmu.free(vcpu);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- }
+
+ for(j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa[j])) {
+ vcpu->arch.mmu.free(vcpu);
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
+ }
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -1528,6 +1577,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
+ int j = vcpu->arch.cr3_cache_idx;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -1536,8 +1586,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
- kvm_mmu_flush_tlb(vcpu);
+ /* setting CR3 will flush the TLB */
+ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa[j]);
out:
return r;
}
@@ -1545,7 +1595,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- mmu_free_roots(vcpu);
+ int j;
+
+ kvm_cr3_cache_clear(vcpu);
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ mmu_free_roots(vcpu);
}
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
@@ -1727,6 +1781,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_cr3_cache_clear(vcpu);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -1788,6 +1843,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu,
gva_t gva)
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (r)
+ kvm_cr3_cache_clear(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
}
@@ -1800,6 +1857,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_cr3_cache_clear(vcpu);
++vcpu->kvm->stat.mmu_recycled;
}
}
@@ -1850,19 +1908,24 @@ EXPORT_SYMBOL_GPL(kvm_enable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
+ int j;
while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
}
- free_page((unsigned long)vcpu->arch.mmu.pae_root);
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+ ASSERT(vcpu->arch.mmu.pae_root[j]);
+ free_page((unsigned long)vcpu->arch.mmu.pae_root[j]);
+ vcpu->arch.mmu.pae_root[j] = NULL;
+ }
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
struct page *page;
- int i;
+ int i, j;
ASSERT(vcpu);
@@ -1872,17 +1935,23 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
else
vcpu->kvm->arch.n_free_mmu_pages =
vcpu->kvm->arch.n_alloc_mmu_pages;
- /*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
- * Therefore we need to allocate shadow page tables in the first
- * 4GB of memory, which happens to fit the DMA32 zone.
- */
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
- if (!page)
- goto error_1;
- vcpu->arch.mmu.pae_root = page_address(page);
- for (i = 0; i < 4; ++i)
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+ /*
+ * When emulating 32-bit mode, cr3 is only 32 bits even on
+ * x86_64. Therefore we need to allocate shadow page tables
+ * in the first 4GB of memory, which happens to fit the DMA32
+ * zone.
+ */
+ page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ if (!page)
+ goto error_1;
+
+ ASSERT(!vcpu->arch.mmu.pae_root[j]);
+ vcpu->arch.mmu.pae_root[j] = page_address(page);
+ for (i = 0; i < 4; ++i)
+ vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
+ }
return 0;
@@ -1894,7 +1963,7 @@ error_1:
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
return alloc_mmu_pages(vcpu);
}
@@ -1902,7 +1971,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
return init_kvm_mmu(vcpu);
}
@@ -2091,6 +2160,15 @@ static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
return 0;
return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
}
+ case KVM_MMU_OP_SET_CR3: {
+ struct kvm_mmu_op_set_cr3 *scr3;
+
+ scr3 = pv_mmu_read_buffer(buffer, sizeof *scr3);
+ if (!scr3)
+ return 0;
+ kvm_set_cr3(vcpu, scr3->cr3);
+ return 1;
+ }
default: return 0;
}
}
@@ -2188,15 +2266,17 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
{
unsigned i;
- if (vcpu->arch.mmu.root_level == 4)
+ if (vcpu->arch.mmu.root_level == 4) {
audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
+ return;
+ }
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+ if (vcpu->arch.mmu.pae_root[j][i] & PT_PRESENT_MASK)
audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
+ vcpu->arch.mmu.pae_root[j][i],
+ i << 30, 2);
+ }
}
static int count_rmaps(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e64e9f5..77f6882 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -17,7 +17,8 @@ static inline void kvm_mmu_free_some_pages(struct kvm_vcpu
*vcpu)
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
- if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ int idx = vcpu->arch.cr3_cache_idx;
+ if (likely(vcpu->arch.mmu.root_hpa[idx] != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 17f9d16..3163c31 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -285,10 +285,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t
addr,
if (!is_present_pte(walker->ptes[walker->level - 1]))
return NULL;
- shadow_addr = vcpu->arch.mmu.root_hpa;
+ shadow_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
level = vcpu->arch.mmu.shadow_root_level;
if (level == PT32E_ROOT_LEVEL) {
- shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ shadow_addr =
vcpu->arch.mmu.pae_root[vcpu->arch.cr3_cache_idx][(addr >> 30) & 3];
shadow_addr &= PT64_BASE_ADDR_MASK;
--level;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 28ad3c4..7b774b0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1801,6 +1801,11 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
}
+static int cpu_has_cr3_cache(void) /* SVM hook for kvm_x86_ops->cpu_has_cr3_cache */
+{
+ return 0; /* this backend never reports cr3 cache support */
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -1810,6 +1815,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+ .cpu_has_cr3_cache = cpu_has_cr3_cache,
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 46e0e58..44b1ae0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -215,6 +215,10 @@ static inline int cpu_has_vmx_vpid(void)
return (vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_VPID);
}
+static inline int cpu_has_cr3_cache(void) /* VMX hook for kvm_x86_ops->cpu_has_cr3_cache */
+{
+ return 1; /* unconditional; NOTE(review): does not consult the CR3 target count in MSR_IA32_VMX_MISC - confirm */
+}
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
@@ -785,6 +789,30 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32
msr_index, u64 *pdata)
return 0;
}
+/*
+ * Handle a guest write of @data to KVM_MSR_SET_CR3_CACHE: attach the guest
+ * page at guest-physical address @data as this vcpu's cr3 cache and publish
+ * the number of usable entries in its max_idx field.
+ *
+ * Returns 0 on success, or -EINVAL if @data is not page aligned, a cache is
+ * already attached, or the guest page cannot be looked up.
+ *
+ * The page reference taken by gfn_to_page() is kept on success; it is
+ * dropped in vmx_free_vcpu().
+ */
+int vmx_cr3_cache_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct page *page;
+	hva_t cr3_cache_hva;
+
+	if (data != PAGE_ALIGN(data) || vcpu->arch.cr3_cache)
+		return -EINVAL;
+
+	down_read(&current->mm->mmap_sem);
+	page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		return -EINVAL;
+	}
+
+	/* NOTE(review): __va() assumes the page is in the kernel direct map. */
+	cr3_cache_hva = (hva_t)__va(page_to_phys(page));
+	vcpu->arch.cr3_cache = (void *)cr3_cache_hva;
+	vcpu->arch.cr3_cache->max_idx = vcpu->arch.cr3_cache_limit;
+
+	return 0;
+}
+
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -824,6 +852,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32
msr_index, u64 data)
case MSR_IA32_TIME_STAMP_COUNTER:
guest_write_tsc(data);
break;
+ case KVM_MSR_SET_CR3_CACHE:
+ ret = vmx_cr3_cache_msr(vcpu, data);
+ break;
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -1322,10 +1353,23 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned
long cr0)
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ struct kvm_cr3_cache *cache;
+ int idx;
+
vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, cr3);
if (vcpu->arch.cr0 & X86_CR0_PE)
vmx_fpu_deactivate(vcpu);
+
+ if (!vcpu->arch.cr3_cache)
+ return;
+
+ idx = vcpu->arch.cr3_cache_idx;
+ cache = vcpu->arch.cr3_cache;
+
+ cache->entry[idx].host_cr3 = cr3;
+ cache->entry[idx].guest_cr3 = vcpu->arch.cr3;
+ vmcs_writel(CR3_TARGET_VALUE0 + idx*2, cr3);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1505,6 +1549,39 @@ out:
up_read(&current->mm->mmap_sem);
return ret;
}
+/*
+ * Set up the cr3 validity hardware cache.
+ *
+ * Reads the number of CR3-target values the CPU supports from
+ * MSR_IA32_VMX_MISC, clamps it to KVM_CR3_CACHE_SIZE, records it as the
+ * vcpu's cr3_cache_limit (resetting cr3_cache_idx to 0) and programs the
+ * VMCS: every used CR3_TARGET_VALUE slot is set to -1UL and
+ * CR3_TARGET_COUNT to the clamped count.
+ */
+static void vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu)
+{
+	unsigned int cr3_target_values, i;
+	u64 msr_val;
+
+	rdmsrl(MSR_IA32_VMX_MISC, msr_val);
+
+	printk("MSR_IA32_VMX_MISC: %016Lx\n", msr_val);
+
+	/*
+	 * 9 bits of "CR3 target values" (bits 24:16).  The mask must be
+	 * exactly 9 bits wide so that bit 25 is not counted.
+	 */
+	cr3_target_values = (msr_val >> 16) & ((1 << 9) - 1);
+	printk(" cr3 target values: %d\n", cr3_target_values);
+	if (cr3_target_values > KVM_CR3_CACHE_SIZE) {
+		printk("KVM: limiting cr3 cache size from %d to %d\n",
+			cr3_target_values, KVM_CR3_CACHE_SIZE);
+		cr3_target_values = KVM_CR3_CACHE_SIZE;
+	}
+
+	vcpu->arch.cr3_cache_idx = 0;
+	vcpu->arch.cr3_cache_limit = cr3_target_values;
+	/*
+	 * Initialize. TODO: set this to guest physical memory.
+	 */
+	for (i = 0; i < cr3_target_values; i++)
+		vmcs_writel(CR3_TARGET_VALUE0 + i*2, -1UL);
+
+	vmcs_write32(CR3_TARGET_COUNT, cr3_target_values);
+}
static void seg_setup(int seg)
{
@@ -1601,7 +1678,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
- vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+ vmcs_setup_cr3_cache(&vmx->vcpu);
vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
@@ -2032,9 +2109,12 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct
kvm_run *kvm_run)
skip_emulated_instruction(vcpu);
return 1;
case 3:
- vcpu_load_rsp_rip(vcpu);
- kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
- skip_emulated_instruction(vcpu);
+ if (!vcpu->arch.cr3_cache) {
+ vcpu_load_rsp_rip(vcpu);
+ kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
+ skip_emulated_instruction(vcpu);
+ } else
+ kvm_inject_gp(vcpu, 0);
return 1;
case 4:
vcpu_load_rsp_rip(vcpu);
@@ -2395,6 +2475,56 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
| vmx->rmode.irq.vector;
}
+/*
+ * Resynchronise the vcpu's software cr3 state after a guest run.
+ *
+ * If a cr3 cache is attached and GUEST_CR3 in the VMCS no longer matches
+ * the root of the current cache slot, look for the slot that GUEST_CR3
+ * belongs to, then update vcpu->arch.cr3, cr3_cache_idx and root_hpa[] for
+ * that slot and bump the cr3_cache_synced statistic.  Warns if no slot
+ * matches.
+ *
+ * NOTE(review): guest_cr3/host_cr3 are read from the cache page, which
+ * vmx_cr3_cache_msr() maps from a guest-supplied address, so the guest can
+ * presumably modify them -- confirm they are safe to trust here.
+ */
+static void kvm_cr3_cache_sync(struct kvm_vcpu *vcpu)
+{
+	void *guest_cr3_hva;
+	hpa_t guest_cr3_hpa;
+	struct kvm_cr3_cache *cache;
+	int j;
+	int idx = vcpu->arch.cr3_cache_idx;
+
+	if (!vcpu->arch.cr3_cache)
+		return;
+
+	guest_cr3_hpa = vmcs_readl(GUEST_CR3);
+	/*
+	 * Are they in sync already?
+	 */
+	if (guest_cr3_hpa == vcpu->arch.mmu.root_hpa[idx])
+		return;
+
+	cache = vcpu->arch.cr3_cache;
+#ifdef CONFIG_X86_64
+	if (vcpu->arch.mmu.shadow_root_level == 4) {
+		for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+			hpa_t root = cache->entry[j].host_cr3;
+			if (root != guest_cr3_hpa)
+				continue;
+			vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+			vcpu->arch.cr3_cache_idx = j;
+			vcpu->arch.mmu.root_hpa[j] = cache->entry[j].host_cr3;
+			++vcpu->stat.cr3_cache_synced;
+			return;
+		}
+		/*
+		 * No slot matched.  The PAE lookup below is not meant for
+		 * 4-level roots, so warn and stop instead of falling through.
+		 */
+		WARN_ON(1);
+		return;
+	}
+#endif
+
+	guest_cr3_hva = __va(guest_cr3_hpa);
+	for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+		u64 *root = vcpu->arch.mmu.pae_root[j];
+		WARN_ON(!root);
+		if (root != guest_cr3_hva)
+			continue;
+		vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+		vcpu->arch.cr3_cache_idx = j;
+		vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
+		++vcpu->stat.cr3_cache_synced;
+		return;
+	}
+	/* Leaving the loop means no slot matched, whatever the limit is. */
+	WARN_ON(1);
+}
+
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2405,6 +2535,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct
kvm_run *kvm_run)
*/
vmcs_writel(HOST_CR0, read_cr0());
+ WARN_ON(vmcs_readl(GUEST_CR3) !=
vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
+
asm(
/* Store host registers */
#ifdef CONFIG_X86_64
@@ -2519,6 +2651,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct
kvm_run *kvm_run)
, "ebx", "edi", "rsi"
#endif
);
+ /*
+ * Figure out whether vcpu->cr3 needs updating because
+ * the guest made use of the cr3 cache.
+ */
+ kvm_cr3_cache_sync(vcpu);
+ WARN_ON(vmcs_readl(GUEST_CR3) !=
vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
@@ -2551,11 +2689,16 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *page = NULL;
spin_lock(&vmx_vpid_lock);
if (vmx->vpid != 0)
__clear_bit(vmx->vpid, vmx_vpid_bitmap);
spin_unlock(&vmx_vpid_lock);
+ if (vcpu->arch.cr3_cache) {
+ page = virt_to_page(vcpu->arch.cr3_cache);
+ kvm_release_page_dirty(page);
+ }
vmx_free_vmcs(vcpu);
kfree(vmx->host_msrs);
kfree(vmx->guest_msrs);
@@ -2643,6 +2786,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+ .cpu_has_cr3_cache = cpu_has_cr3_cache,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 92a51d3..19cceb2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -80,6 +80,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "fpu_reload", VCPU_STAT(fpu_reload) },
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+ { "cr3_cached_synced", VCPU_STAT(cr3_cache_synced) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -820,6 +821,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PV_MMU:
r = !tdp_enabled;
break;
+ case KVM_CAP_CR3_CACHE:
+ r = !tdp_enabled && kvm_x86_ops->cpu_has_cr3_cache();
+ break;
default:
r = 0;
break;
@@ -3298,12 +3302,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
struct kvm *kvm;
- int r;
+ int r, i;
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ vcpu->arch.mmu.root_hpa[i] = INVALID_PAGE;
if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
else
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index d20cabc..f3ca4f6 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -188,11 +188,11 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
- hpa_t root_hpa;
+ hpa_t root_hpa[KVM_CR3_CACHE_SIZE];
int root_level;
int shadow_root_level;
- u64 *pae_root;
+ u64 *pae_root[KVM_CR3_CACHE_SIZE];
};
struct kvm_vcpu_arch {
@@ -206,6 +206,9 @@ struct kvm_vcpu_arch {
unsigned long cr0;
unsigned long cr2;
unsigned long cr3;
+ struct kvm_cr3_cache *cr3_cache;
+ unsigned int cr3_cache_idx;
+ unsigned int cr3_cache_limit;
unsigned long cr4;
unsigned long cr8;
u64 pdptrs[4]; /* pae */
@@ -338,6 +341,7 @@ struct kvm_vcpu_stat {
u32 insn_emulation;
u32 insn_emulation_fail;
u32 hypercalls;
+ u32 cr3_cache_synced;
};
struct descriptor_table {
@@ -354,6 +358,7 @@ struct kvm_x86_ops {
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
+ int (*cpu_has_cr3_cache)(void);
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index 5098459..67f2ad2 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -13,9 +13,12 @@
#define KVM_FEATURE_CLOCKSOURCE 0
#define KVM_FEATURE_NOP_IO_DELAY 1
#define KVM_FEATURE_MMU_OP 2
+#define KVM_FEATURE_MMU_WRITE 2
+#define KVM_FEATURE_CR3_CACHE 3
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
+#define KVM_MSR_SET_CR3_CACHE 0x13
#define KVM_MAX_MMU_OP_BATCH 32
@@ -23,6 +26,7 @@
#define KVM_MMU_OP_WRITE_PTE 1
#define KVM_MMU_OP_FLUSH_TLB 2
#define KVM_MMU_OP_RELEASE_PT 3
+#define KVM_MMU_OP_SET_CR3 4
/* Payload for KVM_HC_MMU_OP */
struct kvm_mmu_op_header {
@@ -45,6 +49,11 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
};
+struct kvm_mmu_op_set_cr3 { /* payload of the KVM_MMU_OP_SET_CR3 mmu op */
+ struct kvm_mmu_op_header header;
+ __u64 cr3; /* value kvm_pv_mmu_op_one() passes to kvm_set_cr3() */
+};
+
#ifdef __KERNEL__
#include <asm/processor.h>
@@ -157,4 +166,16 @@ static inline unsigned int kvm_arch_para_features(void)
#endif
+#define KVM_CR3_CACHE_SIZE 4
+
+struct kvm_cr3_cache_entry { /* one slot of the cr3 cache */
+ __u64 guest_cr3; /* vmx_set_cr3() stores vcpu->arch.cr3 here */
+ __u64 host_cr3; /* vmx_set_cr3() stores the cr3 value it writes to GUEST_CR3 here */
+};
+
+struct kvm_cr3_cache { /* layout of the cache page; KVM_CR3_CACHE_SIZE slots */
+ struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
+ __u32 max_idx; /* set to vcpu->arch.cr3_cache_limit when the cache is attached */
+};
+
#endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 074a107..2aebd29 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -238,6 +238,7 @@ struct kvm_vapic_addr {
#define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */
#define KVM_CAP_NOP_IO_DELAY 11
#define KVM_CAP_PV_MMU 12
+#define KVM_CAP_CR3_CACHE 13
/*
* ioctls for VM fds
--
1.5.4.2
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
kvm-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/kvm-devel