On Sun, Dec 23, 2007 at 01:01:25PM +0200, Avi Kivity wrote:
> Andrew Morton wrote:
> >On Sun, 23 Dec 2007 12:35:30 +0200 Avi Kivity <[EMAIL PROTECTED]> wrote:
> >
> >>Andrew Morton wrote:
> >>
> >>>On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <[EMAIL PROTECTED]> wrote:
> >>>
> >>>>Avi Kivity wrote:
> >>>>
> >>>>>Avi Kivity wrote:
> >>>>>
> >>>>>>Exactly. But it is better to be explicit about it and pass the page
> >>>>>>directly like you did before. I hate to make you go back-and-forth,
> >>>>>>but I did not understand the issue completely before.
> >>>>>
> >>>>>btw, the call to gfn_to_page() can happen in page_fault() instead of
> >>>>>walk_addr(); that will reduce the amount of error handling, and will
> >>>>>simplify the callers to walk_addr() that don't need the page.
> >>>>
> >>>>Note further that all this doesn't obviate the need for follow_page()
> >>>>(or get_user_pages_inatomic()); we still need something in update_pte()
> >>>>for the demand paging case.
> >>>
> >>>Please review -mm's mm/pagewalk.c for suitability.
> >>>
> >>>If it is unsuitable but repairable then please cc Matt Mackall
> >>><[EMAIL PROTECTED]> on the review.
> >>
> >>The "no locks are taken" comment is very worrying. We need accurate
> >>results.
> >
> >take down_read(mm->mmap_sem) before calling it..
> >
> >You have to do that anyway for its results to be meaningful in the caller.
> >Ditto get_user_pages().
>
> Yes, but what about the page table locks?
>
> follow_page() is much more thorough.

It can acquire the page table lock in the callback handler. But then,
vm_normal_page() must also be exported. Are you guys OK with this?

Modular KVM needs walk_page_range(), and also vm_normal_page(), to be used
in the pagewalk callback.

Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>

Index: kvm.quilt/mm/pagewalk.c
===================================================================
--- kvm.quilt.orig/mm/pagewalk.c
+++ kvm.quilt/mm/pagewalk.c
@@ -1,6 +1,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/sched.h>
+#include <linux/module.h>
 
 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			  const struct mm_walk *walk, void *private)
@@ -129,3 +130,4 @@ int walk_page_range(const struct mm_stru
 
 	return err;
 }
+EXPORT_SYMBOL(walk_page_range);
Index: kvm.quilt/mm/memory.c
===================================================================
--- kvm.quilt.orig/mm/memory.c
+++ kvm.quilt/mm/memory.c
@@ -412,6 +412,7 @@ struct page *vm_normal_page(struct vm_ar
 	 */
 	return pfn_to_page(pfn);
 }
+EXPORT_SYMBOL(vm_normal_page);
 
 /*
  * copy one vm_area from one task to the other. Assumes the page tables


[PATCH] KVM: add kvm_follow_page()

In preparation for an mmu spinlock, avoid schedules in mmu_set_spte() by
using walk_page_range() instead of get_user_pages(). The fault handling
work done by get_user_pages() now happens outside the lock.

Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>

Index: kvm.quilt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/mmu.c
+++ kvm.quilt/arch/x86/kvm/mmu.c
@@ -882,14 +882,18 @@ struct page *gva_to_page(struct kvm_vcpu
 	return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 }
 
+/*
+ * mmu_set_spte must be called with an additional reference on
+ * struct page *page, and it will release that if necessary (so
+ * the callers should not worry about it).
+ */
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, gfn_t gfn)
+			 int *ptwrite, gfn_t gfn, struct page *page)
 {
 	u64 spte;
 	int was_rmapped = is_rmap_pte(*shadow_pte);
-	struct page *page;
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
 		 " user_fault %d gfn %lx\n",
@@ -907,8 +911,6 @@ static void mmu_set_spte(struct kvm_vcpu
 	if (!(pte_access & ACC_EXEC_MASK))
 		spte |= PT64_NX_MASK;
 
-	page = gfn_to_page(vcpu->kvm, gfn);
-
 	spte |= PT_PRESENT_MASK;
 	if (pte_access & ACC_USER_MASK)
 		spte |= PT_USER_MASK;
@@ -983,8 +985,10 @@ static int nonpaging_map(struct kvm_vcpu
 		table = __va(table_addr);
 
 		if (level == 1) {
+			struct page *page;
+			page = gfn_to_page(vcpu->kvm, gfn);
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, gfn);
+				     0, write, 1, &pt_write, gfn, page);
 			return pt_write || is_io_pte(table[index]);
 		}
 
Index: kvm.quilt/include/linux/kvm_host.h
===================================================================
--- kvm.quilt.orig/include/linux/kvm_host.h
+++ kvm.quilt/include/linux/kvm_host.h
@@ -163,6 +163,7 @@ int kvm_arch_set_memory_region(struct kv
 				int user_alloc);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+struct page *kvm_find_page(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
Index: kvm.quilt/virt/kvm/kvm_main.c
===================================================================
--- kvm.quilt.orig/virt/kvm/kvm_main.c
+++ kvm.quilt/virt/kvm/kvm_main.c
@@ -453,6 +453,54 @@ static unsigned long gfn_to_hva(struct k
 	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
 
+static int kvm_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			 void *private)
+{
+	struct page **page = private;
+	struct vm_area_struct *vma;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	int err = -EFAULT;
+
+	vma = find_vma(current->mm, addr);
+	if (!vma)
+		return err;
+
+	ptep = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
+	pte = *ptep;
+	if (!pte_present(pte))
+		goto unlock;
+
+	*page = vm_normal_page(vma, addr, pte);
+	if (!*page)
+		goto unlock;
+
+	get_page(*page);
+	err = 0;
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+	return err;
+}
+
+static struct mm_walk kvm_mm_walk = { .pmd_entry = kvm_pte_range };
+
+struct page *kvm_find_page(struct kvm *kvm, gfn_t gfn)
+{
+	unsigned long addr;
+	struct page *page = NULL;
+
+	addr = gfn_to_hva(kvm, gfn);
+	/* MMIO access */
+	if (kvm_is_error_hva(addr)) {
+		get_page(bad_page);
+		return bad_page;
+	}
+
+	walk_page_range(current->mm, addr, addr+PAGE_SIZE, &kvm_mm_walk,
+			&page);
+	return page;
+}
+
 /*
  * Requires current->mm->mmap_sem to be held
  */
Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
@@ -67,6 +67,7 @@ struct guest_walker {
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
 	pt_element_t ptes[PT_MAX_FULL_LEVELS];
 	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+	struct page *page;
 	unsigned pt_access;
 	unsigned pte_access;
 	gfn_t gfn;
@@ -203,14 +204,18 @@ walk:
 		--walker->level;
 	}
 
+	walker->page = gfn_to_page(vcpu->kvm, walker->gfn);
+
 	if (write_fault && !is_dirty_pte(pte)) {
 		bool ret;
 
 		mark_page_dirty(vcpu->kvm, table_gfn);
 		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
 			    index, pte, pte|PT_DIRTY_MASK);
-		if (ret)
+		if (ret) {
+			kvm_release_page_clean(walker->page);
 			goto walk;
+		}
 		pte |= PT_DIRTY_MASK;
 		mutex_lock(&vcpu->kvm->lock);
 		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
@@ -241,12 +246,13 @@ err:
 	return 0;
 }
 
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *spage,
 			      u64 *spte, const void *pte, int bytes,
 			      int offset_in_pte)
 {
 	pt_element_t gpte;
 	unsigned pte_access;
+	struct page *page;
 
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
@@ -256,10 +262,15 @@ static void FNAME(update_pte)(struct kvm
 	}
 	if (bytes < sizeof(pt_element_t))
 		return;
+
+	page = kvm_find_page(vcpu->kvm, gpte_to_gfn(gpte));
+	if (!page)
+		return;
+
 	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
-	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
-	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
+	pte_access = spage->role.access & FNAME(gpte_access)(vcpu, gpte);
+	mmu_set_spte(vcpu, spte, spage->role.access, pte_access, 0, 0,
+		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), page);
 }
 
 /*
@@ -323,8 +334,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 			r = kvm_read_guest_atomic(vcpu->kvm,
 						  walker->pte_gpa[level - 2],
 						  &curr_pte, sizeof(curr_pte));
-			if (r || curr_pte != walker->ptes[level - 2])
+			if (r || curr_pte != walker->ptes[level - 2]) {
+				kvm_release_page_clean(walker->page);
 				return NULL;
+			}
 		}
 		shadow_addr = __pa(shadow_page->spt);
 		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
@@ -335,7 +348,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, walker->gfn);
+		     ptwrite, walker->gfn, walker->page);
 
 	return shadow_ent;
 }
@@ -425,6 +438,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
 		gpa |= vaddr & ~PAGE_MASK;
+		kvm_release_page_clean(walker.page);
 	}
 
 	return gpa;
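
For reference, a minimal sketch of how a caller would be expected to use
kvm_find_page(). The function example_get_guest_page() and its error handling
are illustrative only and are not part of the patch; the point is that
current->mm->mmap_sem must be held for read around the walk, and that the page
reference taken by kvm_pte_range() must later be dropped with
kvm_release_page_clean()/kvm_release_page_dirty():

#include <linux/kvm_host.h>
#include <linux/mm.h>
#include <linux/sched.h>

/* Illustrative caller -- not part of the patch. */
static struct page *example_get_guest_page(struct kvm *kvm, gfn_t gfn)
{
	struct page *page;

	/*
	 * walk_page_range() walks current->mm, so mmap_sem must be held
	 * for read.  In the real KVM paths it may already be held by the
	 * caller; it is taken here only to make the requirement explicit.
	 */
	down_read(&current->mm->mmap_sem);
	page = kvm_find_page(kvm, gfn);
	up_read(&current->mm->mmap_sem);

	/*
	 * NULL means the guest page is not mapped (the pte was not
	 * present); the fault then has to be handled outside the mmu
	 * lock, e.g. via get_user_pages(), and kvm_find_page() retried.
	 */
	if (!page)
		return NULL;

	/*
	 * kvm_pte_range() took a reference with get_page(), so the caller
	 * owns the page and must release it with kvm_release_page_clean()
	 * or kvm_release_page_dirty() when done.  MMIO accesses return
	 * bad_page, also with a reference held.
	 */
	return page;
}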