> Date: Mon, 19 Oct 2020 10:18:51 +0200
> From: Martin Pieuchot <[email protected]>
>
> uvm_fault() is one of the most contended "entry point" of the kernel.
> To reduce this contention I'm carefully refactoring this code to be able
> to push the KERNEL_LOCK() inside the fault handler.
>
> The first aim of this project would be to get the upper layer faults
> (cases 1A and 1B) out of ze big lock. As these faults do not involve
> `uobj' the scope of this project should be limited to serializing amap
> changes without the KERNEL_LOCK().
>
> The diff below moves the first part of uvm_fault() into its own
> function: uvm_fault_check(). It is inspired by/imitates the current
> code structure of NetBSD's fault handler.
>
> This diff should not have any functional change.
>
> I hope it helps build better understanding of this area.
>
> Comments? Oks?
ok kettenis@
One nit below that you may want to fix before committing this.
>
> Index: uvm/uvm_fault.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> retrieving revision 1.102
> diff -u -p -r1.102 uvm_fault.c
> --- uvm/uvm_fault.c 29 Sep 2020 11:47:41 -0000 1.102
> +++ uvm/uvm_fault.c 12 Oct 2020 09:01:05 -0000
> @@ -472,114 +472,101 @@ uvmfault_update_stats(struct uvm_faultin
> }
> }
>
> -/*
> - * F A U L T - m a i n e n t r y p o i n t
> - */
> +struct uvm_faultctx {
> + /*
> + * the following members are set up by uvm_fault_check() and
> + * read-only after that.
> + */
> + vm_prot_t enter_prot;
> + vaddr_t startva;
> + int npages;
> + int centeridx;
> + boolean_t narrow;
> + boolean_t wired;
> + paddr_t pa_flags;
> +};
>
> /*
> - * uvm_fault: page fault handler
> + * uvm_fault_check: check prot, handle needs-copy, etc.
> *
> - * => called from MD code to resolve a page fault
> - * => VM data structures usually should be unlocked. however, it is
> - * possible to call here with the main map locked if the caller
> - * gets a write lock, sets it recursive, and then calls us (c.f.
> - * uvm_map_pageable). this should be avoided because it keeps
> - * the map locked off during I/O.
> + * 1. lookup entry.
> + * 2. check protection.
> + * 3. adjust fault condition (mainly for simulated fault).
> + * 4. handle needs-copy (lazy amap copy).
> + * 5. establish range of interest for neighbor fault (aka pre-fault).
> + * 6. look up anons (if amap exists).
> + * 7. flush pages (if MADV_SEQUENTIAL)
> + *
> + * => called with nothing locked.
> + * => if we fail (result != 0) we unlock everything.
> + * => initialize/adjust many members of flt.
> */
> -#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
> - ~PROT_WRITE : PROT_MASK)
> int
> -uvm_fault(vm_map_t orig_map, vaddr_t vaddr, vm_fault_t fault_type,
> - vm_prot_t access_type)
> +uvm_fault_check(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
> + struct vm_anon ***ranons, vm_prot_t access_type)
> {
> - struct uvm_faultinfo ufi;
> - vm_prot_t enter_prot;
> - boolean_t wired, narrow, promote, locked, shadowed;
> - int npages, nback, nforw, centeridx, result, lcv, gotpages, ret;
> - vaddr_t startva, currva;
> - voff_t uoff;
> - paddr_t pa, pa_flags;
> struct vm_amap *amap;
> struct uvm_object *uobj;
> - struct vm_anon *anons_store[UVM_MAXRANGE], **anons, *anon, *oanon;
> - struct vm_page *pages[UVM_MAXRANGE], *pg, *uobjpage;
> + int nback, nforw;
>
> - anon = NULL;
> - pg = NULL;
> -
> - uvmexp.faults++; /* XXX: locking? */
> - TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL);
> -
> - /* init the IN parameters in the ufi */
> - ufi.orig_map = orig_map;
> - ufi.orig_rvaddr = trunc_page(vaddr);
> - ufi.orig_size = PAGE_SIZE; /* can't get any smaller than this */
> - if (fault_type == VM_FAULT_WIRE)
> - narrow = TRUE; /* don't look for neighborhood
> - * pages on wire */
> - else
> - narrow = FALSE; /* normal fault */
> -
> - /* "goto ReFault" means restart the page fault from ground zero. */
> -ReFault:
> /* lookup and lock the maps */
> - if (uvmfault_lookup(&ufi, FALSE) == FALSE) {
> + if (uvmfault_lookup(ufi, FALSE) == FALSE) {
> return (EFAULT);
> }
>
> #ifdef DIAGNOSTIC
> - if ((ufi.map->flags & VM_MAP_PAGEABLE) == 0)
> + if ((ufi->map->flags & VM_MAP_PAGEABLE) == 0)
> panic("uvm_fault: fault on non-pageable map (%p, 0x%lx)",
> - ufi.map, vaddr);
> + ufi->map, ufi->orig_rvaddr);
> #endif
>
> /* check protection */
> - if ((ufi.entry->protection & access_type) != access_type) {
> - uvmfault_unlockmaps(&ufi, FALSE);
> + if ((ufi->entry->protection & access_type) != access_type) {
> + uvmfault_unlockmaps(ufi, FALSE);
> return (EACCES);
> }
>
> /*
> * "enter_prot" is the protection we want to enter the page in at.
> * for certain pages (e.g. copy-on-write pages) this protection can
> - * be more strict than ufi.entry->protection. "wired" means either
> + * be more strict than ufi->entry->protection. "wired" means either
> * the entry is wired or we are fault-wiring the pg.
> */
>
> - enter_prot = ufi.entry->protection;
> - pa_flags = UVM_ET_ISWC(ufi.entry) ? PMAP_WC : 0;
> - wired = VM_MAPENT_ISWIRED(ufi.entry) || (fault_type == VM_FAULT_WIRE);
> - if (wired)
> - access_type = enter_prot; /* full access for wired */
> + flt->enter_prot = ufi->entry->protection;
> + flt->pa_flags = UVM_ET_ISWC(ufi->entry) ? PMAP_WC : 0;
> + flt->wired = VM_MAPENT_ISWIRED(ufi->entry) || (flt->narrow == TRUE);
> + if (flt->wired)
> + access_type = flt->enter_prot; /* full access for wired */
>
> /* handle "needs_copy" case. */
> - if (UVM_ET_ISNEEDSCOPY(ufi.entry)) {
> + if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
> if ((access_type & PROT_WRITE) ||
> - (ufi.entry->object.uvm_obj == NULL)) {
> + (ufi->entry->object.uvm_obj == NULL)) {
> /* need to clear */
> - uvmfault_unlockmaps(&ufi, FALSE);
> - uvmfault_amapcopy(&ufi);
> + uvmfault_unlockmaps(ufi, FALSE);
> + uvmfault_amapcopy(ufi);
> uvmexp.fltamcopy++;
> - goto ReFault;
> + return (ERESTART);
> } else {
> /*
> * ensure that we pmap_enter page R/O since
> * needs_copy is still true
> */
> - enter_prot &= ~PROT_WRITE;
> + flt->enter_prot &= ~PROT_WRITE;
> }
> }
>
> /* identify the players */
> - amap = ufi.entry->aref.ar_amap; /* top layer */
> - uobj = ufi.entry->object.uvm_obj; /* bottom layer */
> + amap = ufi->entry->aref.ar_amap; /* top layer */
> + uobj = ufi->entry->object.uvm_obj; /* bottom layer */
>
> /*
> * check for a case 0 fault. if nothing backing the entry then
> * error now.
> */
> if (amap == NULL && uobj == NULL) {
> - uvmfault_unlockmaps(&ufi, FALSE);
> + uvmfault_unlockmaps(ufi, FALSE);
> return (EFAULT);
> }
>
> @@ -589,77 +576,148 @@ ReFault:
> * to do this the first time through the fault. if we
> * ReFault we will disable this by setting "narrow" to true.
> */
> - if (narrow == FALSE) {
> + if (flt->narrow == FALSE) {
>
> /* wide fault (!narrow) */
> - nback = min(uvmadvice[ufi.entry->advice].nback,
> - (ufi.orig_rvaddr - ufi.entry->start) >> PAGE_SHIFT);
> - startva = ufi.orig_rvaddr - ((vsize_t)nback << PAGE_SHIFT);
> - nforw = min(uvmadvice[ufi.entry->advice].nforw,
> - ((ufi.entry->end - ufi.orig_rvaddr) >>
> + nback = min(uvmadvice[ufi->entry->advice].nback,
> + (ufi->orig_rvaddr - ufi->entry->start) >>
> PAGE_SHIFT);
If you indent the line above according to style(9) and how NetBSD does
it (four spaces) this doesn't wrap.
> + flt->startva = ufi->orig_rvaddr - ((vsize_t)nback <<
> PAGE_SHIFT);
> + nforw = min(uvmadvice[ufi->entry->advice].nforw,
> + ((ufi->entry->end - ufi->orig_rvaddr) >>
> PAGE_SHIFT) - 1);
> /*
> * note: "-1" because we don't want to count the
> * faulting page as forw
> */
> - npages = nback + nforw + 1;
> - centeridx = nback;
> + flt->npages = nback + nforw + 1;
> + flt->centeridx = nback;
>
> - narrow = TRUE; /* ensure only once per-fault */
> + flt->narrow = TRUE; /* ensure only once per-fault */
> } else {
> /* narrow fault! */
> nback = nforw = 0;
> - startva = ufi.orig_rvaddr;
> - npages = 1;
> - centeridx = 0;
> + flt->startva = ufi->orig_rvaddr;
> + flt->npages = 1;
> + flt->centeridx = 0;
> }
>
> /* if we've got an amap, extract current anons. */
> if (amap) {
> - anons = anons_store;
> - amap_lookups(&ufi.entry->aref, startva - ufi.entry->start,
> - anons, npages);
> + amap_lookups(&ufi->entry->aref,
> + flt->startva - ufi->entry->start, *ranons, flt->npages);
> } else {
> - anons = NULL; /* to be safe */
> + *ranons = NULL; /* to be safe */
> }
>
> /*
> * for MADV_SEQUENTIAL mappings we want to deactivate the back pages
> * now and then forget about them (for the rest of the fault).
> */
> - if (ufi.entry->advice == MADV_SEQUENTIAL && nback != 0) {
> + if (ufi->entry->advice == MADV_SEQUENTIAL && nback != 0) {
> /* flush back-page anons? */
> if (amap)
> - uvmfault_anonflush(anons, nback);
> + uvmfault_anonflush(*ranons, nback);
>
> /* flush object? */
> if (uobj) {
> - uoff = (startva - ufi.entry->start) + ufi.entry->offset;
> + voff_t uoff;
> +
> + uoff = (flt->startva - ufi->entry->start) +
> ufi->entry->offset;
> (void) uobj->pgops->pgo_flush(uobj, uoff, uoff +
> ((vsize_t)nback << PAGE_SHIFT), PGO_DEACTIVATE);
> }
>
> /* now forget about the backpages */
> if (amap)
> - anons += nback;
> - startva += ((vsize_t)nback << PAGE_SHIFT);
> - npages -= nback;
> - centeridx = 0;
> + *ranons += nback;
> + flt->startva += ((vsize_t)nback << PAGE_SHIFT);
> + flt->npages -= nback;
> + flt->centeridx = 0;
> }
>
> + return 0;
> +}
> +
> +/*
> + * F A U L T - m a i n e n t r y p o i n t
> + */
> +
> +/*
> + * uvm_fault: page fault handler
> + *
> + * => called from MD code to resolve a page fault
> + * => VM data structures usually should be unlocked. however, it is
> + * possible to call here with the main map locked if the caller
> + * gets a write lock, sets it recursive, and then calls us (c.f.
> + * uvm_map_pageable). this should be avoided because it keeps
> + * the map locked off during I/O.
> + */
> +#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
> + ~PROT_WRITE : PROT_MASK)
> +int
> +uvm_fault(vm_map_t orig_map, vaddr_t vaddr, vm_fault_t fault_type,
> + vm_prot_t access_type)
> +{
> + struct uvm_faultinfo ufi;
> + struct uvm_faultctx flt;
> + boolean_t promote, locked, shadowed;
> + int result, lcv, gotpages, ret;
> + vaddr_t currva;
> + voff_t uoff;
> + paddr_t pa;
> + struct vm_amap *amap;
> + struct uvm_object *uobj;
> + struct vm_anon *anons_store[UVM_MAXRANGE], **anons, *anon, *oanon;
> + struct vm_page *pages[UVM_MAXRANGE], *pg, *uobjpage;
> + int error;
> +
> + anon = NULL;
> + pg = NULL;
> +
> + uvmexp.faults++; /* XXX: locking? */
> + TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL);
> +
> + /* init the IN parameters in the ufi */
> + ufi.orig_map = orig_map;
> + ufi.orig_rvaddr = trunc_page(vaddr);
> + ufi.orig_size = PAGE_SIZE; /* can't get any smaller than this */
> + if (fault_type == VM_FAULT_WIRE)
> + flt.narrow = TRUE; /* don't look for neighborhood
> + * pages on wire */
> + else
> + flt.narrow = FALSE; /* normal fault */
> +
> +
> + /* "goto ReFault" means restart the page fault from ground zero. */
> +ReFault:
> + anons = anons_store;
> +
> + error = uvm_fault_check(&ufi, &flt, &anons, access_type);
> + switch (error) {
> + case 0:
> + break;
> + case ERESTART:
> + goto ReFault;
> + default:
> + return error;
> + }
> +
> + amap = ufi.entry->aref.ar_amap;
> + uobj = ufi.entry->object.uvm_obj;
> +
> /*
> * map in the backpages and frontpages we found in the amap in hopes
> * of preventing future faults. we also init the pages[] array as
> * we go.
> */
> - currva = startva;
> + currva = flt.startva;
> shadowed = FALSE;
> - for (lcv = 0 ; lcv < npages ; lcv++, currva += PAGE_SIZE) {
> + for (lcv = 0 ; lcv < flt.npages ; lcv++, currva += PAGE_SIZE) {
> /*
> * dont play with VAs that are already mapped
> * except for center)
> */
> - if (lcv != centeridx &&
> + if (lcv != flt.centeridx &&
> pmap_extract(ufi.orig_map->pmap, currva, &pa)) {
> pages[lcv] = PGO_DONTCARE;
> continue;
> @@ -673,7 +731,7 @@ ReFault:
>
> /* check for present page and map if possible. re-activate
> it. */
> pages[lcv] = PGO_DONTCARE;
> - if (lcv == centeridx) { /* save center for later! */
> + if (lcv == flt.centeridx) { /* save center for later! */
> shadowed = TRUE;
> continue;
> }
> @@ -691,14 +749,14 @@ ReFault:
> * that we enter these right now.
> */
> (void) pmap_enter(ufi.orig_map->pmap, currva,
> - VM_PAGE_TO_PHYS(anon->an_page) | pa_flags,
> - (anon->an_ref > 1) ? (enter_prot & ~PROT_WRITE) :
> - enter_prot,
> + VM_PAGE_TO_PHYS(anon->an_page) | flt.pa_flags,
> + (anon->an_ref > 1) ?
> + (flt.enter_prot & ~PROT_WRITE) : flt.enter_prot,
> PMAP_CANFAIL |
> (VM_MAPENT_ISWIRED(ufi.entry) ? PMAP_WIRED : 0));
> }
> }
> - if (npages > 1)
> + if (flt.npages > 1)
> pmap_update(ufi.orig_map->pmap);
>
> /* (shadowed == TRUE) if there is an anon at the faulting address */
> @@ -710,9 +768,9 @@ ReFault:
> * providing a pgo_fault routine.
> */
> if (uobj && shadowed == FALSE && uobj->pgops->pgo_fault != NULL) {
> - result = uobj->pgops->pgo_fault(&ufi, startva, pages, npages,
> - centeridx, fault_type, access_type,
> - PGO_LOCKED);
> + result = uobj->pgops->pgo_fault(&ufi, flt.startva, pages,
> + flt.npages, flt.centeridx, fault_type, access_type,
> + PGO_LOCKED);
>
> if (result == VM_PAGER_OK)
> return (0); /* pgo_fault did pmap enter */
> @@ -733,18 +791,18 @@ ReFault:
> */
> if (uobj && shadowed == FALSE) {
> uvmexp.fltlget++;
> - gotpages = npages;
> + gotpages = flt.npages;
> (void) uobj->pgops->pgo_get(uobj, ufi.entry->offset +
> - (startva - ufi.entry->start),
> - pages, &gotpages, centeridx,
> + (flt.startva - ufi.entry->start),
> + pages, &gotpages, flt.centeridx,
> access_type & MASK(ufi.entry),
> ufi.entry->advice, PGO_LOCKED);
>
> /* check for pages to map, if we got any */
> uobjpage = NULL;
> if (gotpages) {
> - currva = startva;
> - for (lcv = 0 ; lcv < npages ;
> + currva = flt.startva;
> + for (lcv = 0 ; lcv < flt.npages ;
> lcv++, currva += PAGE_SIZE) {
> if (pages[lcv] == NULL ||
> pages[lcv] == PGO_DONTCARE)
> @@ -759,7 +817,7 @@ ReFault:
> * remember this page as "uobjpage."
> * (for later use).
> */
> - if (lcv == centeridx) {
> + if (lcv == flt.centeridx) {
> uobjpage = pages[lcv];
> continue;
> }
> @@ -785,10 +843,10 @@ ReFault:
> * enter these right now.
> */
> (void) pmap_enter(ufi.orig_map->pmap, currva,
> - VM_PAGE_TO_PHYS(pages[lcv]) | pa_flags,
> - enter_prot & MASK(ufi.entry),
> + VM_PAGE_TO_PHYS(pages[lcv]) | flt.pa_flags,
> + flt.enter_prot & MASK(ufi.entry),
> PMAP_CANFAIL |
> - (wired ? PMAP_WIRED : 0));
> + (flt.wired ? PMAP_WIRED : 0));
>
> /*
> * NOTE: page can't be PG_WANTED because
> @@ -824,7 +882,7 @@ ReFault:
> goto Case2;
>
> /* handle case 1: fault on an anon in our amap */
> - anon = anons[centeridx];
> + anon = anons[flt.centeridx];
>
> /*
> * no matter if we have case 1A or case 1B we are going to need to
> @@ -921,7 +979,7 @@ ReFault:
> oanon = anon;
> pg = anon->an_page;
> if (anon->an_ref > 1) /* disallow writes to ref > 1 anons */
> - enter_prot = enter_prot & ~PROT_WRITE;
> + flt.enter_prot = flt.enter_prot & ~PROT_WRITE;
> }
>
> /*
> @@ -931,8 +989,8 @@ ReFault:
> * under us between the unlock and the pmap_enter.
> */
> if (pmap_enter(ufi.orig_map->pmap, ufi.orig_rvaddr,
> - VM_PAGE_TO_PHYS(pg) | pa_flags, enter_prot,
> - access_type | PMAP_CANFAIL | (wired ? PMAP_WIRED : 0)) != 0) {
> + VM_PAGE_TO_PHYS(pg) | flt.pa_flags, flt.enter_prot,
> + access_type | PMAP_CANFAIL | (flt.wired ? PMAP_WIRED : 0)) != 0) {
> /*
> * No need to undo what we did; we can simply think of
> * this as the pmap throwing away the mapping information.
> @@ -1086,7 +1144,7 @@ Case2:
> */
> uvmexp.flt_obj++;
> if (UVM_ET_ISCOPYONWRITE(ufi.entry))
> - enter_prot &= ~PROT_WRITE;
> + flt.enter_prot &= ~PROT_WRITE;
> pg = uobjpage; /* map in the actual object */
>
> /* assert(uobjpage != PGO_DONTCARE) */
> @@ -1206,8 +1264,8 @@ Case2:
> * resources.
> */
> if (pmap_enter(ufi.orig_map->pmap, ufi.orig_rvaddr,
> - VM_PAGE_TO_PHYS(pg) | pa_flags, enter_prot,
> - access_type | PMAP_CANFAIL | (wired ? PMAP_WIRED : 0)) != 0) {
> + VM_PAGE_TO_PHYS(pg) | flt.pa_flags, flt.enter_prot,
> + access_type | PMAP_CANFAIL | (flt.wired ? PMAP_WIRED : 0)) != 0) {
> /*
> * No need to undo what we did; we can simply think of
> * this as the pmap throwing away the mapping information.
>
>