Hi, 

The following is an improvement on top of an earlier patch by Izik. It
increases page fault scalability for SMP guests by allowing concurrent
guest page table walking, allocation and instruction emulation on the
fault path.
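
Roughly, the fault path now takes current->mm->mmap_sem for read plus
the new kvm->mmu_lock spinlock instead of the global kvm->lock mutex.
A simplified sketch of the resulting locking discipline (the helper
names are placeholders; the real code is FNAME(page_fault) in the
patch below):

/*
 * Simplified sketch of the new fault-path locking.  walk_guest_ptes()
 * and build_shadow_ptes() are placeholders for FNAME(walk_addr) and
 * FNAME(fetch) respectively.
 */
static int example_page_fault(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code)
{
        int r;

        down_read(&current->mm->mmap_sem);      /* guest walk may touch user memory */
        r = walk_guest_ptes(vcpu, addr);
        if (!r) {
                up_read(&current->mm->mmap_sem);
                return 0;                       /* guest fault, reflect it back */
        }

        spin_lock(&vcpu->kvm->mmu_lock);        /* protects shadow MMU state only */
        r = build_shadow_ptes(vcpu, addr);
        spin_unlock(&vcpu->kvm->mmu_lock);

        up_read(&current->mm->mmap_sem);
        return r;
}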

The test being used is pft, which starts a number of threads
allocating and writing malloc()'ed memory. pft.c can be found at
http://lkml.org/lkml/2004/8/15/58

The script being used is:

bytes=$((400*1024*1024))
./pft -t -b$bytes -r10 -f1
./pft -b$bytes -r10 -f2
./pft -b$bytes -r10 -f3
./pft -b$bytes -r10 -f4
./pft -b$bytes -r10 -f8

This is a 4-way guest.

One important detail from the results is that there is no difference
in the two-thread case, but beyond that we see a clear improvement.
follow_page() shows up high in the profiles, and I believe this is
partly because we go through it twice for the same gfn while holding
the lock: once via __gfn_to_page() in walk_addr() and again via
__gfn_to_page() in mmu_set_spte(), called from fetch(). I'm looking
into removing that duplicated lookup.
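
One possible way to do that (a hypothetical sketch, not part of this
patch) is to hand the page that walk_addr() already pinned in
walker->page down to the spte setup, instead of calling
__gfn_to_page() again under the lock:

/*
 * Hypothetical sketch, not part of this patch: reuse the page pinned
 * by walk_addr() (walker->page) so the spte setup does not need a
 * second __gfn_to_page()/follow_page() for the same gfn.  Writable,
 * dirty and rmap handling are omitted for brevity.
 */
static void FNAME(set_spte_cached)(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                                   struct guest_walker *walker,
                                   unsigned pte_access)
{
        struct page *page = walker->page;       /* refcount already held */
        u64 spte = PT_PRESENT_MASK | PT_ACCESSED_MASK;

        if (pte_access & ACC_USER_MASK)
                spte |= PT_USER_MASK;
        spte |= page_to_phys(page);             /* no second follow_page() */
        *shadow_pte = spte;
}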

The patch still lacks the copy_from_user_inatomic() change in
prefetch_page(), needed to avoid a potential sleep under mmu_lock in
case the guest page table page is swapped out.
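
For reference, a minimal sketch of what that could look like (a
hypothetical helper, not part of this patch): do the copy with page
faults disabled so a non-resident guest page makes the copy fail
instead of sleeping, and let the caller drop the lock and fall back to
the normal path:

/*
 * Hypothetical helper, not part of this patch: copy a guest pte
 * without risking a sleep under mmu_lock.  With page faults disabled
 * a swapped-out page makes the copy fail instead of blocking.
 */
static int FNAME(read_pte_atomic)(pt_element_t __user *uaddr, pt_element_t *pte)
{
        unsigned long left;

        pagefault_disable();
        left = __copy_from_user_inatomic(pte, uaddr, sizeof(*pte));
        pagefault_enable();

        return left ? -EFAULT : 0;      /* -EFAULT: page not resident */
}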

Another issue is that fetch() now re-reads the guest ptes after
instantiating a shadow page, and in theory they could have been swapped
out again by then. I believe that is safe, since walk_addr() has just
touched the ptes and brought them in from swap.

Switching the shadow lock from a mutex to a spinlock also cuts the
page fault latency roughly in half in the >2 CPU cases.


stock KVM:
 Gb Rep Threads   User      System     Wall   flt/cpu/s  fault/wsec
  0  10    1    0.292s      5.744s   6.006s  169638.179  168754.614
  0  10    2    0.476s      7.340s   4.032s  131005.143  236904.678
  0  10    3    0.620s     10.972s   4.058s   88331.275  223355.103
  0  10    4    0.588s     14.784s   4.062s   66610.464  221352.545
  0  10    8    0.616s     15.100s   4.074s   65152.462  215861.148

KVM + kvm-scale.patch:
 Gb Rep Threads   User      System     Wall   flt/cpu/s  fault/wsec
  0  10    1    0.328s      6.668s   7.002s  146360.233  145746.022
  0  10    2    0.472s      7.708s   4.036s  125175.570  234513.121
  0  10    3    0.500s      8.480s   3.038s  114024.057  302227.769
  0  10    4    0.508s      9.156s   3.012s  105953.654  328156.474
  0  10    8    1.144s      9.260s   3.028s   98417.544  311395.745


diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 401eb7c..c630f59 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -810,7 +810,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
         * number of actived pages , we must to free some mmu pages before we
         * change the value
         */
-
+       spin_lock(&kvm->mmu_lock);
        if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
            kvm_nr_mmu_pages) {
                int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
@@ -831,6 +831,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
                                         - kvm->arch.n_alloc_mmu_pages;
 
        kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+       spin_unlock(&kvm->mmu_lock);
 }
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -907,7 +908,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
        if (!(pte_access & ACC_EXEC_MASK))
                spte |= PT64_NX_MASK;
 
-       page = gfn_to_page(vcpu->kvm, gfn);
+       page = __gfn_to_page(vcpu->kvm, gfn);
 
        spte |= PT_PRESENT_MASK;
        if (pte_access & ACC_USER_MASK)
@@ -1245,15 +1246,16 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
        int r;
 
-       mutex_lock(&vcpu->kvm->lock);
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                goto out;
+
+       spin_lock(&vcpu->kvm->mmu_lock);
        mmu_alloc_roots(vcpu);
+       spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
        kvm_mmu_flush_tlb(vcpu);
 out:
-       mutex_unlock(&vcpu->kvm->lock);
        return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_load);
@@ -1420,13 +1422,22 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 {
-       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+       gpa_t gpa;
+       int r;
+
+       down_read(&current->mm->mmap_sem);
+       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+       up_read(&current->mm->mmap_sem);
 
-       return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       spin_lock(&vcpu->kvm->mmu_lock);
+       r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       return r;
 }
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
+       spin_lock(&vcpu->kvm->mmu_lock);
        while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
                struct kvm_mmu_page *sp;
 
@@ -1435,6 +1446,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
                kvm_mmu_zap_page(vcpu->kvm, sp);
                ++vcpu->kvm->stat.mmu_recycled;
        }
+       spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -1442,7 +1454,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
        int r;
        enum emulation_result er;
 
-       mutex_lock(&vcpu->kvm->lock);
        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
        if (r < 0)
                goto out;
@@ -1457,7 +1468,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
                goto out;
 
        er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
-       mutex_unlock(&vcpu->kvm->lock);
 
        switch (er) {
        case EMULATE_DONE:
@@ -1472,7 +1482,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
                BUG();
        }
 out:
-       mutex_unlock(&vcpu->kvm->lock);
        return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -1569,8 +1578,10 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 {
        struct kvm_mmu_page *sp, *node;
 
+       spin_lock(&kvm->mmu_lock);
        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
                kvm_mmu_zap_page(kvm, sp);
+       spin_unlock(&kvm->mmu_lock);
 
        kvm_flush_remote_tlbs(kvm);
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 56b88f7..32902c4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -68,6 +68,7 @@ struct guest_walker {
        pt_element_t ptes[PT_MAX_FULL_LEVELS];
        gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
        unsigned pt_access;
+       struct page *page;
        unsigned pte_access;
        gfn_t gfn;
        u32 error_code;
@@ -186,6 +187,7 @@ walk:
 
                if (walker->level == PT_PAGE_TABLE_LEVEL) {
                        walker->gfn = gpte_to_gfn(pte);
+                       walker->page = __gfn_to_page(vcpu->kvm, walker->gfn);
                        break;
                }
 
@@ -196,6 +198,7 @@ walk:
                        walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
                        if (PTTYPE == 32 && is_cpuid_PSE36())
                                walker->gfn += pse36_gfn_delta(pte);
+                       walker->page = __gfn_to_page(vcpu->kvm, walker->gfn);
                        break;
                }
 
@@ -212,7 +215,9 @@ walk:
                if (ret)
                        goto walk;
                pte |= PT_DIRTY_MASK;
+               spin_lock(&vcpu->kvm->mmu_lock);
                kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+               spin_unlock(&vcpu->kvm->mmu_lock);
                walker->ptes[walker->level - 1] = pte;
        }
 
@@ -319,20 +324,22 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                        pt_element_t curr_pte;
                        kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
                                       &curr_pte, sizeof(curr_pte));
-                       if (curr_pte != walker->ptes[level - 2])
-                               return NULL;
+                       if (curr_pte != walker->ptes[level - 2]) {
+                               shadow_ent = NULL;
+                               goto out;
+                       }
                }
                shadow_addr = __pa(shadow_page->spt);
                shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
                        | PT_WRITABLE_MASK | PT_USER_MASK;
                *shadow_ent = shadow_pte;
        }
-
        mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
                     user_fault, write_fault,
                     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
                     ptwrite, walker->gfn);
-
+out:
+       kvm_release_page_clean(walker->page);
        return shadow_ent;
 }
 
@@ -371,6 +378,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        /*
         * Look up the shadow pte for the faulting address.
         */
+       down_read(&current->mm->mmap_sem);
        r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
                             fetch_fault);
 
@@ -378,12 +386,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
         * The page is not mapped by the guest.  Let the guest handle it.
         */
        if (!r) {
+               up_read(&current->mm->mmap_sem);
                pgprintk("%s: guest page fault\n", __FUNCTION__);
                inject_page_fault(vcpu, addr, walker.error_code);
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
                return 0;
        }
-
+       spin_lock(&vcpu->kvm->mmu_lock);
        shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
                                  &write_pt);
        pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
@@ -395,12 +404,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        /*
         * mmio: emulate if accessible, otherwise its a guest fault.
         */
-       if (shadow_pte && is_io_pte(*shadow_pte))
+       if (shadow_pte && is_io_pte(*shadow_pte)) {
+               spin_unlock(&vcpu->kvm->mmu_lock);
+               up_read(&current->mm->mmap_sem);
                return 1;
+       }
 
        ++vcpu->stat.pf_fixed;
        kvm_mmu_audit(vcpu, "post page fault (fixed)");
-
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       up_read(&current->mm->mmap_sem);
        return write_pt;
 }
 
@@ -415,6 +428,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
        if (r) {
                gpa = gfn_to_gpa(walker.gfn);
                gpa |= vaddr & ~PAGE_MASK;
+               kvm_release_page_clean(walker.page);
        }
 
        return gpa;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 20c0f5e..e5a40dc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1431,27 +1431,34 @@ static int init_rmode_tss(struct kvm *kvm)
 {
        gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
        u16 data = 0;
+       int ret = 0;
        int r;
 
+       down_read(&current->mm->mmap_sem);
        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
-       if (r < 0)
-               return 0;
+       if (r < 0) 
+               goto out;
        data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
        r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
        if (r < 0)
-               return 0;
+               goto out;
        r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
        if (r < 0)
-               return 0;
+               goto out;
        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
        if (r < 0)
-               return 0;
+               goto out;
        data = ~0;
-       r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
-                       sizeof(u8));
+       r = kvm_write_guest_page(kvm, fn, &data,
+                                RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+                                sizeof(u8));
        if (r < 0)
-               return 0;
-       return 1;
+               goto out;
+
+       ret = 1;
+out:
+       up_read(&current->mm->mmap_sem);
+       return ret;
 }
 
 static void seg_setup(int seg)
@@ -1468,8 +1475,8 @@ static int alloc_apic_access_page(struct kvm *kvm)
 {
        struct kvm_userspace_memory_region kvm_userspace_mem;
        int r = 0;
-
-       mutex_lock(&kvm->lock);
+       
+       down_write(&current->mm->mmap_sem);
        if (kvm->arch.apic_access_page)
                goto out;
        kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1481,7 +1488,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
                goto out;
        kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
 out:
-       mutex_unlock(&kvm->lock);
+       up_write(&current->mm->mmap_sem);
        return r;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b26270..f0d3edd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -180,7 +180,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
        int ret;
        u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 
-       mutex_lock(&vcpu->kvm->lock);
+       down_read(&current->mm->mmap_sem);
        ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
                                  offset * sizeof(u64), sizeof(pdpte));
        if (ret < 0) {
@@ -197,7 +197,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 
        memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 out:
-       mutex_unlock(&vcpu->kvm->lock);
+       up_read(&current->mm->mmap_sem);
 
        return ret;
 }
@@ -211,13 +211,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
        if (is_long_mode(vcpu) || !is_pae(vcpu))
                return false;
 
-       mutex_lock(&vcpu->kvm->lock);
+       down_read(&current->mm->mmap_sem);
        r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
        if (r < 0)
                goto out;
        changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 out:
-       mutex_unlock(&vcpu->kvm->lock);
+       up_read(&current->mm->mmap_sem);
 
        return changed;
 }
@@ -1224,7 +1224,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
            < alias->target_phys_addr)
                goto out;
 
-       mutex_lock(&kvm->lock);
+       down_write(&current->mm->mmap_sem);
 
        p = &kvm->arch.aliases[alias->slot];
        p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1238,7 +1238,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 
        kvm_mmu_zap_all(kvm);
 
-       mutex_unlock(&kvm->lock);
+       up_write(&current->mm->mmap_sem);
 
        return 0;
 
@@ -1314,6 +1314,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        struct kvm_memory_slot *memslot;
        int is_dirty = 0;
 
+       down_write(&current->mm->mmap_sem);
        mutex_lock(&kvm->lock);
 
        r = kvm_get_dirty_log(kvm, log, &is_dirty);
@@ -1331,6 +1332,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        r = 0;
 out:
        mutex_unlock(&kvm->lock);
+       up_write(&current->mm->mmap_sem);
        return r;
 }
 
@@ -1524,25 +1526,32 @@ int emulator_read_std(unsigned long addr,
                             struct kvm_vcpu *vcpu)
 {
        void *data = val;
+       int r = X86EMUL_CONTINUE;
 
+       down_read(&current->mm->mmap_sem);
        while (bytes) {
                gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
                unsigned offset = addr & (PAGE_SIZE-1);
                unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
                int ret;
 
-               if (gpa == UNMAPPED_GVA)
-                       return X86EMUL_PROPAGATE_FAULT;
+               if (gpa == UNMAPPED_GVA) { 
+                       r = X86EMUL_PROPAGATE_FAULT;
+                       goto out;
+               }
                ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
-               if (ret < 0)
-                       return X86EMUL_UNHANDLEABLE;
+               if (ret < 0) {
+                       r = X86EMUL_UNHANDLEABLE;
+                       goto out;
+               }
 
                bytes -= tocopy;
                data += tocopy;
                addr += tocopy;
        }
-
-       return X86EMUL_CONTINUE;
+out:
+       up_read(&current->mm->mmap_sem);
+       return r;
 }
 EXPORT_SYMBOL_GPL(emulator_read_std);
 
@@ -1560,7 +1569,9 @@ static int emulator_read_emulated(unsigned long addr,
                return X86EMUL_CONTINUE;
        }
 
+       down_read(&current->mm->mmap_sem);
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+       up_read(&current->mm->mmap_sem);
 
        /* For APIC access vmexit */
        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1594,11 +1605,20 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
                               const void *val, int bytes)
 {
        int ret;
+       struct page *page;
 
+       down_read(&current->mm->mmap_sem);
        ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
-       if (ret < 0)
+       if (ret < 0) {
+               up_read(&current->mm->mmap_sem);
                return 0;
+       }
+       page = __gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       spin_lock(&vcpu->kvm->mmu_lock);
        kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       up_read(&current->mm->mmap_sem);
+       kvm_release_page_clean(page);
        return 1;
 }
 
@@ -1608,7 +1628,11 @@ static int emulator_write_emulated_onepage(unsigned long addr,
                                           struct kvm_vcpu *vcpu)
 {
        struct kvm_io_device *mmio_dev;
-       gpa_t                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+       gpa_t                 gpa;
+
+       down_read(&current->mm->mmap_sem);
+       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+       up_read(&current->mm->mmap_sem);
 
        if (gpa == UNMAPPED_GVA) {
                kvm_inject_page_fault(vcpu, addr, 2);
@@ -1677,11 +1701,15 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 #ifndef CONFIG_X86_64
        /* guests cmpxchg8b have to be emulated atomically */
        if (bytes == 8) {
-               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+               gpa_t gpa;
                struct page *page;
                char *addr;
                u64 *val;
 
+               down_read(&current->mm->mmap_sem);
+               gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+               up_read(&current->mm->mmap_sem);
+
                if (gpa == UNMAPPED_GVA ||
                   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
                        goto emul_write;
@@ -1690,7 +1718,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
                        goto emul_write;
 
                val = (u64 *)new;
-               page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+               page = __gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
                addr = kmap_atomic(page, KM_USER0);
                set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
                kunmap_atomic(addr, KM_USER0);
@@ -2077,10 +2105,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                kvm_x86_ops->skip_emulated_instruction(vcpu);
 
        for (i = 0; i < nr_pages; ++i) {
-               mutex_lock(&vcpu->kvm->lock);
+               down_read(&current->mm->mmap_sem);
                page = gva_to_page(vcpu, address + i * PAGE_SIZE);
                vcpu->arch.pio.guest_pages[i] = page;
-               mutex_unlock(&vcpu->kvm->lock);
+               up_read(&current->mm->mmap_sem);
                if (!page) {
                        kvm_inject_gp(vcpu, 0);
                        free_pio_guest_pages(vcpu);
@@ -2203,7 +2231,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
        char instruction[3];
        int ret = 0;
 
-       mutex_lock(&vcpu->kvm->lock);
 
        /*
         * Blow out the MMU to ensure that no other VCPU has an active mapping
@@ -2218,8 +2245,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
            != X86EMUL_CONTINUE)
                ret = -EFAULT;
 
-       mutex_unlock(&vcpu->kvm->lock);
-
        return ret;
 }
 
@@ -3102,13 +3127,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
         */
        if (!user_alloc) {
                if (npages && !old.rmap) {
-                       down_write(&current->mm->mmap_sem);
                        memslot->userspace_addr = do_mmap(NULL, 0,
                                                     npages * PAGE_SIZE,
                                                     PROT_READ | PROT_WRITE,
                                                     MAP_SHARED | MAP_ANONYMOUS,
                                                     0);
-                       up_write(&current->mm->mmap_sem);
 
                        if (IS_ERR((void *)memslot->userspace_addr))
                                return PTR_ERR((void *)memslot->userspace_addr);
@@ -3116,10 +3139,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
                        if (!old.user_alloc && old.rmap) {
                                int ret;
 
-                               down_write(&current->mm->mmap_sem);
                                ret = do_munmap(current->mm, old.userspace_addr,
                                                old.npages * PAGE_SIZE);
-                               up_write(&current->mm->mmap_sem);
                                if (ret < 0)
                                        printk(KERN_WARNING
                                       "kvm_vm_ioctl_set_memory_region: "
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 953b50a..73dfb0b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -105,11 +105,13 @@ struct kvm_memory_slot {
 
 struct kvm {
        struct mutex lock; /* protects everything except vcpus */
+       spinlock_t mmu_lock;
        struct mm_struct *mm; /* userspace tied to this vm */
        int nmemslots;
        struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
                                        KVM_PRIVATE_MEM_SLOTS];
        struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+       unsigned long prefetch_tmp_area;
        struct list_head vm_list;
        struct file *filp;
        struct kvm_io_bus mmio_bus;
@@ -163,6 +165,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
                                int user_alloc);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 845beb2..9a2a6e9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -165,12 +165,14 @@ static struct kvm *kvm_create_vm(void)
 
        kvm->mm = current->mm;
        atomic_inc(&kvm->mm->mm_count);
+       spin_lock_init(&kvm->mmu_lock);
        kvm_io_bus_init(&kvm->pio_bus);
        mutex_init(&kvm->lock);
        kvm_io_bus_init(&kvm->mmio_bus);
        spin_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
        spin_unlock(&kvm_lock);
+       kvm->prefetch_tmp_area = get_zeroed_page(GFP_KERNEL);
 out:
        return kvm;
 }
@@ -211,6 +213,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
        kvm_io_bus_destroy(&kvm->mmio_bus);
        kvm_arch_destroy_vm(kvm);
        mmdrop(mm);
+       free_page(kvm->prefetch_tmp_area);
 }
 
 static int kvm_vm_release(struct inode *inode, struct file *filp)
@@ -227,7 +230,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
  *
  * Discontiguous memory is allowed, mostly for framebuffers.
  *
- * Must be called holding kvm->lock.
+ * Must be called holding mmap_sem for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
                            struct kvm_userspace_memory_region *mem,
@@ -338,9 +341,9 @@ int kvm_set_memory_region(struct kvm *kvm,
 {
        int r;
 
-       mutex_lock(&kvm->lock);
+       down_write(&current->mm->mmap_sem);
        r = __kvm_set_memory_region(kvm, mem, user_alloc);
-       mutex_unlock(&kvm->lock);
+       up_write(&current->mm->mmap_sem);
        return r;
 }
 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
@@ -442,7 +445,7 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
        struct kvm_memory_slot *slot;
 
@@ -452,11 +455,12 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
                return bad_hva();
        return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
+EXPORT_SYMBOL_GPL(gfn_to_hva);
 
 /*
  * Requires current->mm->mmap_sem to be held
  */
-static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
+struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
        struct page *page[1];
        unsigned long addr;
@@ -480,6 +484,7 @@ static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
 
        return page[0];
 }
+EXPORT_SYMBOL(__gfn_to_page);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {




