On Tue, Dec 18, 2007 at 05:49:51PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> >Hi,
> >
> >The following is an improvement on top of an earlier patch by Izik. It
> >increases pagefault scalability for SMP guests by allowing concurrent
> >guest walking, allocation and instruction emulation on the fault path.
> >
> >The test being used is pft, which starts a number of threads
> >allocating and writing malloc()'ed memory. pft.c can be found at
> >http://lkml.org/lkml/2004/8/15/58
> >
> >The script being used is:
> >
> >bytes=$((400*1024*1024))
> >./pft -t -b$bytes -r10 -f1
> >./pft -b$bytes -r10 -f2
> >./pft -b$bytes -r10 -f3
> >./pft -b$bytes -r10 -f4
> >./pft -b$bytes -r10 -f8
> >
> >This is a 4-way guest.
> >
> >One important detail from the results is that there is no difference
> >for the two threads case, but beyond that we see a clear improvement.
> >follow_page() is showing up high in profiling, so I believe this
> >is partly due to the fact that it does follow_page() twice while
> >holding the lock, once in mmu_set_spte() from walk_addr() and again in
> >mmu_set_spte() in fetch() - I'm looking into removing those duplicated
> >calls for the same gfn.
> >
> >The patch still lacks the copy_from_user_inatomic() change in
> >prefetch_page() to avoid a potential sleep in case the page is swapped
> >out.
> >
> >Another issue is that now fetch() will re-read the pte's after
> >instantiating a shadow page, but in theory they could be swapped out. I
> >believe that is safe since walk_addr() just touched the pte's bringing
> >them in from swap.
> >
>
> We need to convert that to kvm_read_guest_atomic() to avoid even that
> theoretical race. If the read fails, we can simply return and let the
> guest retry the faulting instruction.
Updated patch, now feature complete. Changes from last version:
- Use __gfn_to_page() in cmpxchg_gpte() to avoid a potential deadlock
- Add kvm_read_guest_inatomic() and use it in fetch()
- Make prefetch_page() use copy_from_user_inatomic()
- Pass the grabbed page down to mmu_set_spte() to avoid a potential schedule
with mmu_lock held (this could happen even without the page being
swapped out, because get_user_pages() calls cond_resched()).
- Convert a few remaining mutex lock users to mmap_sem.
- Grab the mutex lock around the kvm_iodevice_{read,write} calls.
Please review.
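For reference, the locking discipline on the shadow page fault path after
this patch is roughly the following (just a sketch of the intent, not code
copied from the patch; error paths omitted):

	down_read(&current->mm->mmap_sem);	/* memslots change under mmap_sem
						   held for write, so concurrent
						   faults are allowed */
	walk_addr(&walker, ...);		/* guest walk; __gfn_to_page() may
						   sleep here and grabs walker->page */
	spin_lock(&vcpu->kvm->mmu_lock);	/* protects the shadow page tables */
	fetch(vcpu, ...);			/* must not sleep: uses
						   kvm_read_guest_inatomic() and the
						   page grabbed by walk_addr() */
	spin_unlock(&vcpu->kvm->mmu_lock);
	up_read(&current->mm->mmap_sem);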
Tests on 4-way guest:
KVM stock:
Gb  Rep  Threads    User     System     Wall    flt/cpu/s   fault/wsec
 0   10        1   0.368s    5.440s   6.017s   176297.521   165958.112
 0   10        2   0.520s    7.144s   4.023s   133603.358   241902.916
 0   10        3   0.576s   11.292s   4.061s    86277.053   221972.262
 0   10        4   0.596s   14.996s   4.058s    65670.603   223380.197
 0   10        8   0.916s   14.772s   4.063s    65268.743   220801.490
KVM + scale-2.patch:
Gb  Rep  Threads    User     System     Wall    flt/cpu/s   fault/wsec
 0   10        1   0.296s    4.976s   6.006s   194221.567   168951.621
 0   10        2   0.408s    6.208s   3.084s   154766.639   266578.709
 0   10        3   0.528s    6.736s   2.093s   140960.353   348877.073
 0   10        4   0.548s    7.988s   2.059s   119955.022   394976.087
 0   10        8   1.596s    7.896s   3.016s   107873.592   323434.429
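Rough numbers from the runs above: at 4 threads the wall-clock fault rate
goes from ~223k to ~395k faults/sec (about 1.77x) and system time drops from
~15.0s to ~8.0s, while the single-threaded case is essentially unchanged
(~166k vs ~169k faults/sec).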
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 401eb7c..1b375ba 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -810,7 +810,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
* number of actived pages , we must to free some mmu pages before we
* change the value
*/
-
+ spin_lock(&kvm->mmu_lock);
if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
kvm_nr_mmu_pages) {
int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
@@ -831,6 +831,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
- kvm->arch.n_alloc_mmu_pages;
kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+ spin_unlock(&kvm->mmu_lock);
}
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -879,13 +880,13 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
if (gpa == UNMAPPED_GVA)
return NULL;
- return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ return __gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
}
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, gfn_t gfn)
+ int *ptwrite, gfn_t gfn, struct page *userpage)
{
u64 spte;
int was_rmapped = is_rmap_pte(*shadow_pte);
@@ -907,7 +908,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
if (!(pte_access & ACC_EXEC_MASK))
spte |= PT64_NX_MASK;
- page = gfn_to_page(vcpu->kvm, gfn);
+ if (userpage) {
+ page = userpage;
+ get_page(page);
+ } else
+ page = __gfn_to_page(vcpu->kvm, gfn);
spte |= PT_PRESENT_MASK;
if (pte_access & ACC_USER_MASK)
@@ -984,7 +989,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (level == 1) {
mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, gfn);
+ 0, write, 1, &pt_write, gfn, NULL);
return pt_write || is_io_pte(table[index]);
}
@@ -1026,6 +1031,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
+ spin_lock(&vcpu->kvm->mmu_lock);
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -1033,6 +1039,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ spin_unlock(&vcpu->kvm->mmu_lock);
return;
}
#endif
@@ -1047,6 +1054,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ spin_unlock(&vcpu->kvm->mmu_lock);
}
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
@@ -1129,6 +1137,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->pte_to_page = NULL;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
context->root_level = 0;
@@ -1177,6 +1186,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
context->new_cr3 = paging_new_cr3;
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
+ context->pte_to_page = paging64_pte_to_page;
context->prefetch_page = paging64_prefetch_page;
context->free = paging_free;
context->root_level = level;
@@ -1197,6 +1207,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
+ context->pte_to_page = paging32_pte_to_page;
context->free = paging_free;
context->prefetch_page = paging32_prefetch_page;
context->root_level = PT32_ROOT_LEVEL;
@@ -1245,15 +1256,16 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
- mutex_lock(&vcpu->kvm->lock);
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
mmu_alloc_roots(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
out:
- mutex_unlock(&vcpu->kvm->lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);
@@ -1286,7 +1298,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *spte,
const void *new, int bytes,
- int offset_in_pte)
+ int offset_in_pte,
+ struct page *userpage)
{
if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
++vcpu->kvm->stat.mmu_pde_zapped;
@@ -1295,9 +1308,11 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
++vcpu->kvm->stat.mmu_pte_updated;
if (sp->role.glevels == PT32_ROOT_LEVEL)
- paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+ paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte,
+ userpage);
else
- paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+ paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte,
+ userpage);
}
static bool need_remote_flush(u64 old, u64 new)
@@ -1329,7 +1344,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+ const u8 *new, int bytes, struct page *userpage)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *sp;
@@ -1410,7 +1425,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
- page_offset & (pte_size - 1));
+ page_offset & (pte_size - 1),
+ userpage);
mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
@@ -1420,13 +1436,22 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+ gpa_t gpa;
+ int r;
+
+ down_read(&current->mm->mmap_sem);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+ up_read(&current->mm->mmap_sem);
- return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ spin_lock(&vcpu->kvm->mmu_lock);
+ r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return r;
}
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
+ spin_lock(&vcpu->kvm->mmu_lock);
while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
struct kvm_mmu_page *sp;
@@ -1435,6 +1460,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
kvm_mmu_zap_page(vcpu->kvm, sp);
++vcpu->kvm->stat.mmu_recycled;
}
+ spin_unlock(&vcpu->kvm->mmu_lock);
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -1442,7 +1468,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
int r;
enum emulation_result er;
- mutex_lock(&vcpu->kvm->lock);
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
if (r < 0)
goto out;
@@ -1457,7 +1482,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
goto out;
er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
- mutex_unlock(&vcpu->kvm->lock);
switch (er) {
case EMULATE_DONE:
@@ -1472,7 +1496,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
BUG();
}
out:
- mutex_unlock(&vcpu->kvm->lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -1569,8 +1592,10 @@ void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
+ spin_lock(&kvm->mmu_lock);
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
kvm_mmu_zap_page(kvm, sp);
+ spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 56b88f7..b02be2e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -68,6 +68,7 @@ struct guest_walker {
pt_element_t ptes[PT_MAX_FULL_LEVELS];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
unsigned pt_access;
+ struct page *page;
unsigned pte_access;
gfn_t gfn;
u32 error_code;
@@ -91,7 +92,7 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
pt_element_t *table;
struct page *page;
- page = gfn_to_page(kvm, table_gfn);
+ page = __gfn_to_page(kvm, table_gfn);
table = kmap_atomic(page, KM_USER0);
ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -186,6 +187,7 @@ walk:
if (walker->level == PT_PAGE_TABLE_LEVEL) {
walker->gfn = gpte_to_gfn(pte);
+ walker->page = __gfn_to_page(vcpu->kvm, walker->gfn);
break;
}
@@ -196,6 +198,7 @@ walk:
walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
if (PTTYPE == 32 && is_cpuid_PSE36())
walker->gfn += pse36_gfn_delta(pte);
+ walker->page = __gfn_to_page(vcpu->kvm, walker->gfn);
break;
}
@@ -209,10 +212,15 @@ walk:
mark_page_dirty(vcpu->kvm, table_gfn);
ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
pte|PT_DIRTY_MASK);
- if (ret)
+ if (ret) {
+ kvm_release_page_clean(walker->page);
goto walk;
+ }
pte |= PT_DIRTY_MASK;
- kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte),
+ walker->page);
+ spin_unlock(&vcpu->kvm->mmu_lock);
walker->ptes[walker->level - 1] = pte;
}
@@ -241,7 +249,7 @@ err:
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
u64 *spte, const void *pte, int bytes,
- int offset_in_pte)
+ int offset_in_pte, struct page *userpage)
{
pt_element_t gpte;
unsigned pte_access;
@@ -257,7 +265,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
+ gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), userpage);
}
/*
@@ -316,11 +324,16 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
metaphysical, access,
shadow_ent, &new_page);
if (new_page && !metaphysical) {
+ int r;
pt_element_t curr_pte;
- kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (curr_pte != walker->ptes[level - 2])
- return NULL;
+ r = kvm_read_guest_inatomic(vcpu->kvm,
+ walker->pte_gpa[level - 2],
+ &curr_pte,
+ sizeof(curr_pte));
+ if (r || curr_pte != walker->ptes[level - 2]) {
+ shadow_ent = NULL;
+ goto out;
+ }
}
shadow_addr = __pa(shadow_page->spt);
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
@@ -331,8 +344,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
user_fault, write_fault,
walker->ptes[walker->level-1] & PT_DIRTY_MASK,
- ptwrite, walker->gfn);
-
+ ptwrite, walker->gfn, walker->page);
+out:
+ kvm_release_page_clean(walker->page);
return shadow_ent;
}
@@ -371,6 +385,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
/*
* Look up the shadow pte for the faulting address.
*/
+ down_read(&current->mm->mmap_sem);
r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
fetch_fault);
@@ -378,12 +393,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
+ up_read(&current->mm->mmap_sem);
pgprintk("%s: guest page fault\n", __FUNCTION__);
inject_page_fault(vcpu, addr, walker.error_code);
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
return 0;
}
-
+ spin_lock(&vcpu->kvm->mmu_lock);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
&write_pt);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
@@ -395,15 +411,32 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
/*
* mmio: emulate if accessible, otherwise its a guest fault.
*/
- if (shadow_pte && is_io_pte(*shadow_pte))
+ if (shadow_pte && is_io_pte(*shadow_pte)) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ up_read(&current->mm->mmap_sem);
return 1;
+ }
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, "post page fault (fixed)");
-
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ up_read(&current->mm->mmap_sem);
return write_pt;
}
+static struct page *FNAME(pte_to_page)(struct kvm_vcpu *vcpu, const void *pte,
+ int bytes)
+{
+ pt_element_t gpte = *(const pt_element_t *)pte;
+
+ if (bytes < sizeof(pt_element_t))
+ return NULL;
+ if (!is_present_pte(gpte))
+ return NULL;
+
+ return __gfn_to_page(vcpu->kvm, gpte_to_gfn(gpte));
+}
+
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
{
struct guest_walker walker;
@@ -415,6 +448,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
+ kvm_release_page_clean(walker.page);
}
return gpa;
@@ -423,27 +457,36 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
- int i, offset = 0;
+ int i, r, offset = 0;
pt_element_t *gpt;
- struct page *page;
-
+ void __user *src = (void __user *)gfn_to_hva(vcpu->kvm, sp->gfn);
+ void *dest = (void *)vcpu->kvm->prefetch_tmp_area;
+
if (sp->role.metaphysical
|| (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
nonpaging_prefetch_page(vcpu, sp);
return;
}
+ pagefault_disable();
+ r = __copy_from_user_inatomic(dest, src, PAGE_SIZE);
+ pagefault_enable();
+
+ if (r) {
+ nonpaging_prefetch_page(vcpu, sp);
+ return;
+ }
+
+ gpt = (pt_element_t *)dest;
+
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
- page = gfn_to_page(vcpu->kvm, sp->gfn);
- gpt = kmap_atomic(page, KM_USER0);
+
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
if (is_present_pte(gpt[offset + i]))
sp->spt[i] = shadow_trap_nonpresent_pte;
else
sp->spt[i] = shadow_notrap_nonpresent_pte;
- kunmap_atomic(gpt, KM_USER0);
- kvm_release_page_clean(page);
}
#undef pt_element_t
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 20c0f5e..e5a40dc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1431,27 +1431,34 @@ static int init_rmode_tss(struct kvm *kvm)
{
gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
u16 data = 0;
+ int ret = 0;
int r;
+ down_read(&current->mm->mmap_sem);
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- return 0;
+ if (r < 0)
+ goto out;
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
if (r < 0)
- return 0;
+ goto out;
r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
if (r < 0)
- return 0;
+ goto out;
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
- return 0;
+ goto out;
data = ~0;
- r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
- sizeof(u8));
+ r = kvm_write_guest_page(kvm, fn, &data,
+ RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+ sizeof(u8));
if (r < 0)
- return 0;
- return 1;
+ goto out;
+
+ ret = 1;
+out:
+ up_read(&current->mm->mmap_sem);
+ return ret;
}
static void seg_setup(int seg)
@@ -1468,8 +1475,8 @@ static int alloc_apic_access_page(struct kvm *kvm)
{
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
-
- mutex_lock(&kvm->lock);
+
+ down_write(&current->mm->mmap_sem);
if (kvm->arch.apic_access_page)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1481,7 +1488,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
goto out;
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
out:
- mutex_unlock(&kvm->lock);
+ up_write(&current->mm->mmap_sem);
return r;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b26270..24d8344 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -180,7 +180,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
int ret;
u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
- mutex_lock(&vcpu->kvm->lock);
+ down_read(&current->mm->mmap_sem);
ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
offset * sizeof(u64), sizeof(pdpte));
if (ret < 0) {
@@ -197,7 +197,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
out:
- mutex_unlock(&vcpu->kvm->lock);
+ up_read(&current->mm->mmap_sem);
return ret;
}
@@ -211,13 +211,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
- mutex_lock(&vcpu->kvm->lock);
+ down_read(&current->mm->mmap_sem);
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte,
sizeof(pdpte));
if (r < 0)
goto out;
changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
- mutex_unlock(&vcpu->kvm->lock);
+ up_read(&current->mm->mmap_sem);
return changed;
}
@@ -277,9 +277,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops->set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
- mutex_lock(&vcpu->kvm->lock);
kvm_mmu_reset_context(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
return;
}
EXPORT_SYMBOL_GPL(set_cr0);
@@ -319,9 +317,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->arch.cr4 = cr4;
- mutex_lock(&vcpu->kvm->lock);
kvm_mmu_reset_context(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr4);
@@ -359,7 +355,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
*/
}
- mutex_lock(&vcpu->kvm->lock);
+ down_read(&current->mm->mmap_sem);
/*
* Does the new cr3 value map to physical memory? (Note, we
* catch an invalid cr3 even in real-mode, because it would
@@ -375,7 +371,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vcpu->arch.cr3 = cr3;
vcpu->arch.mmu.new_cr3(vcpu);
}
- mutex_unlock(&vcpu->kvm->lock);
+ up_read(&current->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(set_cr3);
@@ -1170,12 +1166,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
- mutex_lock(&kvm->lock);
+ down_write(&current->mm->mmap_sem);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
- mutex_unlock(&kvm->lock);
+ up_write(&current->mm->mmap_sem);
return 0;
}
@@ -1224,7 +1220,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
< alias->target_phys_addr)
goto out;
- mutex_lock(&kvm->lock);
+ down_write(&current->mm->mmap_sem);
p = &kvm->arch.aliases[alias->slot];
p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1238,7 +1234,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
kvm_mmu_zap_all(kvm);
- mutex_unlock(&kvm->lock);
+ up_write(&current->mm->mmap_sem);
return 0;
@@ -1314,7 +1310,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int is_dirty = 0;
- mutex_lock(&kvm->lock);
+ down_write(&current->mm->mmap_sem);
r = kvm_get_dirty_log(kvm, log, &is_dirty);
if (r)
@@ -1330,7 +1326,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
}
r = 0;
out:
- mutex_unlock(&kvm->lock);
+ up_write(&current->mm->mmap_sem);
return r;
}
@@ -1524,25 +1520,32 @@ int emulator_read_std(unsigned long addr,
struct kvm_vcpu *vcpu)
{
void *data = val;
+ int r = X86EMUL_CONTINUE;
+ down_read(&current->mm->mmap_sem);
while (bytes) {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
+ if (gpa == UNMAPPED_GVA) {
+ r = X86EMUL_PROPAGATE_FAULT;
+ goto out;
+ }
ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
- if (ret < 0)
- return X86EMUL_UNHANDLEABLE;
+ if (ret < 0) {
+ r = X86EMUL_UNHANDLEABLE;
+ goto out;
+ }
bytes -= tocopy;
data += tocopy;
addr += tocopy;
}
-
- return X86EMUL_CONTINUE;
+out:
+ up_read(&current->mm->mmap_sem);
+ return r;
}
EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1560,7 +1563,9 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
+ down_read(&current->mm->mmap_sem);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ up_read(&current->mm->mmap_sem);
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1576,11 +1581,14 @@ mmio:
/*
* Is this MMIO handled locally?
*/
+ mutex_lock(&vcpu->kvm->lock);
mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
if (mmio_dev) {
kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+ mutex_unlock(&vcpu->kvm->lock);
return X86EMUL_CONTINUE;
}
+ mutex_unlock(&vcpu->kvm->lock);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
@@ -1594,11 +1602,21 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes)
{
int ret;
+ struct page *page;
+ down_read(&current->mm->mmap_sem);
ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
- if (ret < 0)
+ if (ret < 0) {
+ up_read(&current->mm->mmap_sem);
return 0;
- kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+ }
+ page = vcpu->arch.mmu.pte_to_page(vcpu, val, bytes);
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_pte_write(vcpu, gpa, val, bytes, page);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ up_read(&current->mm->mmap_sem);
+ if (page)
+ kvm_release_page_clean(page);
return 1;
}
@@ -1608,7 +1626,11 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_vcpu *vcpu)
{
struct kvm_io_device *mmio_dev;
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa;
+
+ down_read(&current->mm->mmap_sem);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ up_read(&current->mm->mmap_sem);
if (gpa == UNMAPPED_GVA) {
kvm_inject_page_fault(vcpu, addr, 2);
@@ -1626,11 +1648,14 @@ mmio:
/*
* Is this MMIO handled locally?
*/
+ mutex_lock(&vcpu->kvm->lock);
mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
if (mmio_dev) {
kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+ mutex_unlock(&vcpu->kvm->lock);
return X86EMUL_CONTINUE;
}
+ mutex_unlock(&vcpu->kvm->lock);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
@@ -1677,11 +1702,15 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
#ifndef CONFIG_X86_64
/* guests cmpxchg8b have to be emulated atomically */
if (bytes == 8) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa;
struct page *page;
char *addr;
u64 *val;
+ down_read(&current->mm->mmap_sem);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ up_read(&current->mm->mmap_sem);
+
if (gpa == UNMAPPED_GVA ||
(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto emul_write;
@@ -2077,10 +2106,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
kvm_x86_ops->skip_emulated_instruction(vcpu);
for (i = 0; i < nr_pages; ++i) {
- mutex_lock(&vcpu->kvm->lock);
+ down_read(&current->mm->mmap_sem);
page = gva_to_page(vcpu, address + i * PAGE_SIZE);
vcpu->arch.pio.guest_pages[i] = page;
- mutex_unlock(&vcpu->kvm->lock);
+ up_read(&current->mm->mmap_sem);
if (!page) {
kvm_inject_gp(vcpu, 0);
free_pio_guest_pages(vcpu);
@@ -2203,7 +2232,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
char instruction[3];
int ret = 0;
- mutex_lock(&vcpu->kvm->lock);
/*
* Blow out the MMU to ensure that no other VCPU has an active mapping
@@ -2218,8 +2246,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
!= X86EMUL_CONTINUE)
ret = -EFAULT;
- mutex_unlock(&vcpu->kvm->lock);
-
return ret;
}
@@ -2827,13 +2853,13 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
vcpu_load(vcpu);
- mutex_lock(&vcpu->kvm->lock);
+ down_read(&current->mm->mmap_sem);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
+ up_read(&current->mm->mmap_sem);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
tr->usermode = 0;
- mutex_unlock(&vcpu->kvm->lock);
vcpu_put(vcpu);
return 0;
@@ -3102,13 +3128,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
*/
if (!user_alloc) {
if (npages && !old.rmap) {
- down_write(&current->mm->mmap_sem);
memslot->userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS,
0);
- up_write(&current->mm->mmap_sem);
if (IS_ERR((void *)memslot->userspace_addr))
return PTR_ERR((void *)memslot->userspace_addr);
@@ -3116,10 +3140,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (!old.user_alloc && old.rmap) {
int ret;
- down_write(&current->mm->mmap_sem);
ret = do_munmap(current->mm, old.userspace_addr,
old.npages * PAGE_SIZE);
- up_write(&current->mm->mmap_sem);
if (ret < 0)
printk(KERN_WARNING
"kvm_vm_ioctl_set_memory_region: "
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 28940e1..fd06723 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -177,6 +177,8 @@ struct kvm_mmu {
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+ struct page *(*pte_to_page)(struct kvm_vcpu *vcpu, const void *pte,
+ int bytes);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
hpa_t root_hpa;
@@ -468,7 +470,7 @@ unsigned long segment_base(u16 selector);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes);
+ const u8 *new, int bytes, struct page *userpage);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 953b50a..6ca0bdb 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -105,11 +105,13 @@ struct kvm_memory_slot {
struct kvm {
struct mutex lock; /* protects everything except vcpus */
+ spinlock_t mmu_lock;
struct mm_struct *mm; /* userspace tied to this vm */
int nmemslots;
struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
KVM_PRIVATE_MEM_SLOTS];
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+ unsigned long prefetch_tmp_area;
struct list_head vm_list;
struct file *filp;
struct kvm_io_bus mmio_bus;
@@ -163,11 +165,18 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
int user_alloc);
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
void kvm_release_page_dirty(struct page *page);
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int len);
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+
+int kvm_read_guest_page_inatomic(struct kvm *kvm, gfn_t gfn, void *data,
+ int offset, int len);
+int kvm_read_guest_inatomic(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len);
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
int offset, int len);
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 845beb2..afdb767 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -165,12 +165,14 @@ static struct kvm *kvm_create_vm(void)
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
+ spin_lock_init(&kvm->mmu_lock);
kvm_io_bus_init(&kvm->pio_bus);
mutex_init(&kvm->lock);
kvm_io_bus_init(&kvm->mmio_bus);
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
+ kvm->prefetch_tmp_area = get_zeroed_page(GFP_KERNEL);
out:
return kvm;
}
@@ -211,6 +213,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_io_bus_destroy(&kvm->mmio_bus);
kvm_arch_destroy_vm(kvm);
mmdrop(mm);
+ free_page(kvm->prefetch_tmp_area);
}
static int kvm_vm_release(struct inode *inode, struct file *filp)
@@ -227,7 +230,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
*
* Discontiguous memory is allowed, mostly for framebuffers.
*
- * Must be called holding kvm->lock.
+ * Must be called holding mmap_sem for write.
*/
int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
@@ -338,9 +341,9 @@ int kvm_set_memory_region(struct kvm *kvm,
{
int r;
- mutex_lock(&kvm->lock);
+ down_write(&current->mm->mmap_sem);
r = __kvm_set_memory_region(kvm, mem, user_alloc);
- mutex_unlock(&kvm->lock);
+ up_write(&current->mm->mmap_sem);
return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
@@ -442,7 +445,7 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
-static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
@@ -452,11 +455,12 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
return bad_hva();
return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
+EXPORT_SYMBOL_GPL(gfn_to_hva);
/*
* Requires current->mm->mmap_sem to be held
*/
-static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
+struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
struct page *page[1];
unsigned long addr;
@@ -480,6 +484,7 @@ static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
return page[0];
}
+EXPORT_SYMBOL(__gfn_to_page);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
@@ -552,6 +557,46 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
}
EXPORT_SYMBOL_GPL(kvm_read_guest);
+int kvm_read_guest_page_inatomic(struct kvm *kvm, gfn_t gfn, void *data,
+ int offset, int len)
+{
+ int r;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ pagefault_disable();
+ r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+ pagefault_enable();
+ if (r)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page_inatomic);
+
+int kvm_read_guest_inatomic(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_read_guest_page_inatomic(kvm, gfn, data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ ++gfn;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_inatomic);
+
+
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
int offset, int len)
{