On Sun, Dec 23, 2007 at 01:01:25PM +0200, Avi Kivity wrote:
> Andrew Morton wrote:
> >On Sun, 23 Dec 2007 12:35:30 +0200 Avi Kivity <[EMAIL PROTECTED]> wrote:
> >
> >>Andrew Morton wrote:
> >>    
> >>>On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <[EMAIL PROTECTED]> wrote:
> >>>
> >>>>Avi Kivity wrote:
> >>>>
> >>>>>Avi Kivity wrote:
> >>>>>
> >>>>>>Exactly.  But it is better to be explicit about it and pass the page
> >>>>>>directly like you did before.  I hate to make you go back-and-forth,
> >>>>>>but I did not understand the issue completely before.
> >>>>>>
> >>>>>btw, the call to gfn_to_page() can happen in page_fault() instead of
> >>>>>walk_addr(); that will reduce the amount of error handling, and will
> >>>>>simplify the callers to walk_addr() that don't need the page.
> >>>>>
> >>>>Note further that all this doesn't obviate the need for follow_page()
> >>>>(or get_user_pages_inatomic()); we still need something in update_pte()
> >>>>for the demand paging case.
> >>>>
> >>>Please review -mm's mm/pagewalk.c for suitability.
> >>>
> >>>If it is unsuitable but repairable then please cc Matt Mackall
> >>><[EMAIL PROTECTED]> on the review.
> >>>
> >>The "no locks are taken" comment is very worrying.  We need accurate 
> >>results.
> >>    
> >
> >take down_read(mm->mmap_sem) before calling it..
> >
> >You have to do that anyway for its results to be meaningful in the caller. 
> >Ditto get_user_pages().
> >
> 
> Yes, but what about the page table locks?
> 
> follow_page() is much more thorough.

It can acquire the page table lock in the callback handler. But then
vm_normal_page() must also be exported.

Are you guys OK with this?
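
Roughly, the locking convention would look like the sketch below (an
illustration only, not part of the patches that follow; lookup_pte and
lookup_one_page are made-up names): the caller holds mmap_sem for
reading, as Andrew points out, and the pmd_entry callback takes the
page table lock itself before dereferencing the pte and calling
vm_normal_page().

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/sched.h>

static int lookup_pte(pmd_t *pmd, unsigned long addr, unsigned long end,
                      void *private)
{
        struct page **pagep = private;
        struct vm_area_struct *vma = find_vma(current->mm, addr);
        spinlock_t *ptl;
        pte_t *ptep;

        if (!vma)
                return -EFAULT;

        /* the page table lock is taken here, inside the callback */
        ptep = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
        if (pte_present(*ptep)) {
                *pagep = vm_normal_page(vma, addr, *ptep);
                if (*pagep)
                        get_page(*pagep);       /* pin while the lock is held */
        }
        pte_unmap_unlock(ptep, ptl);
        return *pagep ? 0 : -EFAULT;
}

static struct mm_walk lookup_walk = { .pmd_entry = lookup_pte };

static struct page *lookup_one_page(unsigned long addr)
{
        struct page *page = NULL;

        down_read(&current->mm->mmap_sem);      /* caller-side serialization */
        walk_page_range(current->mm, addr, addr + PAGE_SIZE, &lookup_walk,
                        &page);
        up_read(&current->mm->mmap_sem);
        return page;
}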


Modular KVM needs walk_page_range(), and also vm_normal_page(), which is
used in the pagewalk callback.

Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>

Index: kvm.quilt/mm/pagewalk.c
===================================================================
--- kvm.quilt.orig/mm/pagewalk.c
+++ kvm.quilt/mm/pagewalk.c
@@ -1,6 +1,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/sched.h>
+#include <linux/module.h>
 
 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          const struct mm_walk *walk, void *private)
@@ -129,3 +130,4 @@ int walk_page_range(const struct mm_stru
 
        return err;
 }
+EXPORT_SYMBOL(walk_page_range);
Index: kvm.quilt/mm/memory.c
===================================================================
--- kvm.quilt.orig/mm/memory.c
+++ kvm.quilt/mm/memory.c
@@ -412,6 +412,7 @@ struct page *vm_normal_page(struct vm_ar
         */
        return pfn_to_page(pfn);
 }
+EXPORT_SYMBOL(vm_normal_page);
 
 /*
  * copy one vm_area from one task to the other. Assumes the page tables




[PATCH] KVM: add kvm_find_page()

In preparation for an mmu spinlock, avoid scheduling in mmu_set_spte()
by using walk_page_range() instead of get_user_pages().

The fault handling done by get_user_pages() now happens outside the lock.

Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
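
To make the intent concrete, the call pattern this prepares for would
look roughly like this (a sketch only: "mmu_lock" is the planned
spinlock and does not exist in this patch, and example_map_gfn is a
made-up name mirroring the nonpaging_map() hunk below).

static void example_map_gfn(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                            gfn_t gfn, int write_fault, int *pt_write)
{
        /*
         * gfn_to_page() goes through get_user_pages() and may sleep,
         * so the host page is resolved before any spinlock is taken.
         */
        struct page *page = gfn_to_page(vcpu->kvm, gfn);

        spin_lock(&vcpu->kvm->mmu_lock);        /* future lock, not in this patch */
        mmu_set_spte(vcpu, shadow_pte, ACC_ALL, ACC_ALL,
                     0, write_fault, 1, pt_write, gfn, page);
        /* the extra page reference is dropped by mmu_set_spte() itself */
        spin_unlock(&vcpu->kvm->mmu_lock);
}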

Index: kvm.quilt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/mmu.c
+++ kvm.quilt/arch/x86/kvm/mmu.c
@@ -882,14 +882,18 @@ struct page *gva_to_page(struct kvm_vcpu
        return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 }
 
+/*
+ * mmu_set_spte() must be called with an additional reference held on
+ * the page; it releases that reference itself if necessary, so the
+ * callers need not worry about it.
+ */
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                         unsigned pt_access, unsigned pte_access,
                         int user_fault, int write_fault, int dirty,
-                        int *ptwrite, gfn_t gfn)
+                        int *ptwrite, gfn_t gfn, struct page *page)
 {
        u64 spte;
        int was_rmapped = is_rmap_pte(*shadow_pte);
-       struct page *page;
 
        pgprintk("%s: spte %llx access %x write_fault %d"
                 " user_fault %d gfn %lx\n",
@@ -907,8 +911,6 @@ static void mmu_set_spte(struct kvm_vcpu
        if (!(pte_access & ACC_EXEC_MASK))
                spte |= PT64_NX_MASK;
 
-       page = gfn_to_page(vcpu->kvm, gfn);
-
        spte |= PT_PRESENT_MASK;
        if (pte_access & ACC_USER_MASK)
                spte |= PT_USER_MASK;
@@ -983,8 +985,10 @@ static int nonpaging_map(struct kvm_vcpu
                table = __va(table_addr);
 
                if (level == 1) {
+                       struct page *page;
+                       page = gfn_to_page(vcpu->kvm, gfn);
                        mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-                                    0, write, 1, &pt_write, gfn);
+                                    0, write, 1, &pt_write, gfn, page);
                        return pt_write || is_io_pte(table[index]);
                }
 
Index: kvm.quilt/include/linux/kvm_host.h
===================================================================
--- kvm.quilt.orig/include/linux/kvm_host.h
+++ kvm.quilt/include/linux/kvm_host.h
@@ -163,6 +163,7 @@ int kvm_arch_set_memory_region(struct kv
                                int user_alloc);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+struct page *kvm_find_page(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
Index: kvm.quilt/virt/kvm/kvm_main.c
===================================================================
--- kvm.quilt.orig/virt/kvm/kvm_main.c
+++ kvm.quilt/virt/kvm/kvm_main.c
@@ -453,6 +453,54 @@ static unsigned long gfn_to_hva(struct k
        return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
 
+static int kvm_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+                          void *private)
+{
+       struct page **page = private;
+       struct vm_area_struct *vma;
+       pte_t *ptep, pte;
+       spinlock_t *ptl;
+       int err = -EFAULT;
+
+       vma = find_vma(current->mm, addr);
+       if (!vma)
+               return err;
+
+       ptep = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
+       pte = *ptep;
+       if (!pte_present(pte))
+               goto unlock;
+
+       *page = vm_normal_page(vma, addr, pte);
+       if (!*page)
+               goto unlock;
+
+       get_page(*page);
+       err = 0;
+unlock:
+       pte_unmap_unlock(ptep, ptl);
+       return err;
+}
+
+static struct mm_walk kvm_mm_walk = { .pmd_entry = kvm_pte_range };
+
+struct page *kvm_find_page(struct kvm *kvm, gfn_t gfn)
+{
+       unsigned long addr;
+       struct page *page = NULL;
+
+       addr = gfn_to_hva(kvm, gfn);
+       /* MMIO access */
+       if (kvm_is_error_hva(addr)) {
+               get_page(bad_page);
+               return bad_page;
+       }
+
+       walk_page_range(current->mm, addr, addr+PAGE_SIZE, &kvm_mm_walk,
+                       &page);
+       return page;
+}
+
 /*
  * Requires current->mm->mmap_sem to be held
  */
Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
@@ -67,6 +67,7 @@ struct guest_walker {
        gfn_t table_gfn[PT_MAX_FULL_LEVELS];
        pt_element_t ptes[PT_MAX_FULL_LEVELS];
        gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+       struct page *page;
        unsigned pt_access;
        unsigned pte_access;
        gfn_t gfn;
@@ -203,14 +204,18 @@ walk:
                --walker->level;
        }
 
+       walker->page = gfn_to_page(vcpu->kvm, walker->gfn);
+
        if (write_fault && !is_dirty_pte(pte)) {
                bool ret;
 
                mark_page_dirty(vcpu->kvm, table_gfn);
                ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
                            pte|PT_DIRTY_MASK);
-               if (ret)
+               if (ret) {
+                       kvm_release_page_clean(walker->page);
                        goto walk;
+               }
                pte |= PT_DIRTY_MASK;
                mutex_lock(&vcpu->kvm->lock);
                kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
@@ -241,12 +246,13 @@ err:
        return 0;
 }
 
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *spage,
                              u64 *spte, const void *pte, int bytes,
                              int offset_in_pte)
 {
        pt_element_t gpte;
        unsigned pte_access;
+       struct page *page;
 
        gpte = *(const pt_element_t *)pte;
        if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
@@ -256,10 +262,15 @@ static void FNAME(update_pte)(struct kvm
        }
        if (bytes < sizeof(pt_element_t))
                return;
+
+       page = kvm_find_page(vcpu->kvm, gpte_to_gfn(gpte));
+       if (!page)
+               return;
+
        pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
-       pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
-       mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-                    gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
+       pte_access = spage->role.access & FNAME(gpte_access)(vcpu, gpte);
+       mmu_set_spte(vcpu, spte, spage->role.access, pte_access, 0, 0,
+                    gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), page);
 }
 
 /*
@@ -323,8 +334,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
                        r = kvm_read_guest_atomic(vcpu->kvm,
                                                  walker->pte_gpa[level - 2],
                                                  &curr_pte, sizeof(curr_pte));
-                       if (r || curr_pte != walker->ptes[level - 2])
+                       if (r || curr_pte != walker->ptes[level - 2]) {
+                               kvm_release_page_clean(walker->page);
                                return NULL;
+                       }
                }
                shadow_addr = __pa(shadow_page->spt);
                shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
@@ -335,7 +348,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
        mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
                     user_fault, write_fault,
                     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-                    ptwrite, walker->gfn);
+                    ptwrite, walker->gfn, walker->page);
 
        return shadow_ent;
 }
@@ -425,6 +438,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
        if (r) {
                gpa = gfn_to_gpa(walker.gfn);
                gpa |= vaddr & ~PAGE_MASK;
+               kvm_release_page_clean(walker.page);
        }
 
        return gpa;
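
For reference, the caller-side contract of kvm_find_page(), as a sketch
(not part of the patch; fixup_gpte is a made-up name): the caller must
hold current->mm->mmap_sem, same as gfn_to_hva(), and a NULL return
just means the gfn is not currently mapped in the host, so the update
is skipped, as FNAME(update_pte) does above.

static void fixup_gpte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
                       unsigned access)
{
        /* current->mm->mmap_sem is held by the caller, as gfn_to_hva() needs */
        struct page *page = kvm_find_page(vcpu->kvm, gfn);

        if (!page)      /* not faulted in yet or swapped out: skip, the
                         * regular fault path brings the page in later */
                return;

        /* page (bad_page for MMIO) carries a reference, which
         * mmu_set_spte() drops, per the comment added to mmu.c */
        mmu_set_spte(vcpu, sptep, access, access, 0, 0, 0, NULL, gfn, page);
}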
