uvm_fault() is one of the most contended entry points of the kernel.
To reduce this contention, I'm carefully refactoring this code so that
the KERNEL_LOCK() can be pushed inside the fault handler.
The first aim of this project is to get the upper layer faults
(cases 1A and 1B) out of ze big lock. As these faults do not involve
`uobj', the scope of this work should be limited to serializing amap
changes without the KERNEL_LOCK().
The diff below moves the first part of uvm_fault() into its own
function, uvm_fault_check(). It is modeled on the current structure
of NetBSD's fault handler.
This diff should not introduce any functional change. I hope it helps
build a better understanding of this area.
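
For reference, here is a minimal sketch (not part of the diff) of the
loop-based driver this refactoring is heading towards, roughly how
NetBSD structures its handler. The loop body stands in for the case 1
and case 2 handling that is still inline below, and the error protocol
is the one the diff introduces: 0 to proceed, ERESTART to redo the
fault from scratch, any other errno to fail.

    int
    uvm_fault(vm_map_t orig_map, vaddr_t vaddr, vm_fault_t fault_type,
        vm_prot_t access_type)
    {
            struct uvm_faultinfo ufi;
            struct uvm_faultctx flt;
            struct vm_anon *anons_store[UVM_MAXRANGE], **anons;
            int error;

            /* ... initialize ufi and flt.narrow as in the diff below ... */

            error = ERESTART;
            while (error == ERESTART) {
                    anons = anons_store;
                    error = uvm_fault_check(&ufi, &flt, &anons, access_type);
                    if (error != 0)
                            continue; /* ERESTART retries, errnos bail out */

                    /*
                     * ... neighbor pre-fault plus the case 1 (anon) and
                     * case 2 (uobj) code, still inline in the diff below;
                     * each piece would likewise return 0, ERESTART or an
                     * errno once split out ...
                     */
            }

            return (error);
    }

The diff itself keeps the ReFault label and simply switches on the
value returned by uvm_fault_check(); converting the label into such a
loop can come later, once the remaining chunks also return errors.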
Comments? Oks?
Index: uvm/uvm_fault.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.102
diff -u -p -r1.102 uvm_fault.c
--- uvm/uvm_fault.c 29 Sep 2020 11:47:41 -0000 1.102
+++ uvm/uvm_fault.c 12 Oct 2020 09:01:05 -0000
@@ -472,114 +472,101 @@ uvmfault_update_stats(struct uvm_faultin
}
}
-/*
- * F A U L T - m a i n e n t r y p o i n t
- */
+struct uvm_faultctx {
+ /*
+ * the following members are set up by uvm_fault_check() and
+ * read-only after that.
+ */
+ vm_prot_t enter_prot;
+ vaddr_t startva;
+ int npages;
+ int centeridx;
+ boolean_t narrow;
+ boolean_t wired;
+ paddr_t pa_flags;
+};
/*
- * uvm_fault: page fault handler
+ * uvm_fault_check: check prot, handle needs-copy, etc.
*
- * => called from MD code to resolve a page fault
- * => VM data structures usually should be unlocked. however, it is
- * possible to call here with the main map locked if the caller
- * gets a write lock, sets it recursive, and then calls us (c.f.
- * uvm_map_pageable). this should be avoided because it keeps
- * the map locked off during I/O.
+ * 1. lookup entry.
+ * 2. check protection.
+ * 3. adjust fault condition (mainly for simulated fault).
+ * 4. handle needs-copy (lazy amap copy).
+ * 5. establish range of interest for neighbor fault (aka pre-fault).
+ * 6. look up anons (if amap exists).
+ * 7. flush pages (if MADV_SEQUENTIAL)
+ *
+ * => called with nothing locked.
+ * => if we fail (result != 0) we unlock everything.
+ * => initialize/adjust many members of flt.
*/
-#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
- ~PROT_WRITE : PROT_MASK)
int
-uvm_fault(vm_map_t orig_map, vaddr_t vaddr, vm_fault_t fault_type,
- vm_prot_t access_type)
+uvm_fault_check(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
+ struct vm_anon ***ranons, vm_prot_t access_type)
{
- struct uvm_faultinfo ufi;
- vm_prot_t enter_prot;
- boolean_t wired, narrow, promote, locked, shadowed;
- int npages, nback, nforw, centeridx, result, lcv, gotpages, ret;
- vaddr_t startva, currva;
- voff_t uoff;
- paddr_t pa, pa_flags;
struct vm_amap *amap;
struct uvm_object *uobj;
- struct vm_anon *anons_store[UVM_MAXRANGE], **anons, *anon, *oanon;
- struct vm_page *pages[UVM_MAXRANGE], *pg, *uobjpage;
+ int nback, nforw;
- anon = NULL;
- pg = NULL;
-
- uvmexp.faults++; /* XXX: locking? */
- TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL);
-
- /* init the IN parameters in the ufi */
- ufi.orig_map = orig_map;
- ufi.orig_rvaddr = trunc_page(vaddr);
- ufi.orig_size = PAGE_SIZE; /* can't get any smaller than this */
- if (fault_type == VM_FAULT_WIRE)
- narrow = TRUE; /* don't look for neighborhood
- * pages on wire */
- else
- narrow = FALSE; /* normal fault */
-
- /* "goto ReFault" means restart the page fault from ground zero. */
-ReFault:
/* lookup and lock the maps */
- if (uvmfault_lookup(&ufi, FALSE) == FALSE) {
+ if (uvmfault_lookup(ufi, FALSE) == FALSE) {
return (EFAULT);
}
#ifdef DIAGNOSTIC
- if ((ufi.map->flags & VM_MAP_PAGEABLE) == 0)
+ if ((ufi->map->flags & VM_MAP_PAGEABLE) == 0)
panic("uvm_fault: fault on non-pageable map (%p, 0x%lx)",
- ufi.map, vaddr);
+ ufi->map, ufi->orig_rvaddr);
#endif
/* check protection */
- if ((ufi.entry->protection & access_type) != access_type) {
- uvmfault_unlockmaps(&ufi, FALSE);
+ if ((ufi->entry->protection & access_type) != access_type) {
+ uvmfault_unlockmaps(ufi, FALSE);
return (EACCES);
}
/*
* "enter_prot" is the protection we want to enter the page in at.
* for certain pages (e.g. copy-on-write pages) this protection can
- * be more strict than ufi.entry->protection. "wired" means either
+ * be more strict than ufi->entry->protection. "wired" means either
* the entry is wired or we are fault-wiring the pg.
*/
- enter_prot = ufi.entry->protection;
- pa_flags = UVM_ET_ISWC(ufi.entry) ? PMAP_WC : 0;
- wired = VM_MAPENT_ISWIRED(ufi.entry) || (fault_type == VM_FAULT_WIRE);
- if (wired)
- access_type = enter_prot; /* full access for wired */
+ flt->enter_prot = ufi->entry->protection;
+ flt->pa_flags = UVM_ET_ISWC(ufi->entry) ? PMAP_WC : 0;
+ flt->wired = VM_MAPENT_ISWIRED(ufi->entry) || (flt->narrow == TRUE);
+ if (flt->wired)
+ access_type = flt->enter_prot; /* full access for wired */
/* handle "needs_copy" case. */
- if (UVM_ET_ISNEEDSCOPY(ufi.entry)) {
+ if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
if ((access_type & PROT_WRITE) ||
- (ufi.entry->object.uvm_obj == NULL)) {
+ (ufi->entry->object.uvm_obj == NULL)) {
/* need to clear */
- uvmfault_unlockmaps(&ufi, FALSE);
- uvmfault_amapcopy(&ufi);
+ uvmfault_unlockmaps(ufi, FALSE);
+ uvmfault_amapcopy(ufi);
uvmexp.fltamcopy++;
- goto ReFault;
+ return (ERESTART);
} else {
/*
* ensure that we pmap_enter page R/O since
* needs_copy is still true
*/
- enter_prot &= ~PROT_WRITE;
+ flt->enter_prot &= ~PROT_WRITE;
}
}
/* identify the players */
- amap = ufi.entry->aref.ar_amap; /* top layer */
- uobj = ufi.entry->object.uvm_obj; /* bottom layer */
+ amap = ufi->entry->aref.ar_amap; /* top layer */
+ uobj = ufi->entry->object.uvm_obj; /* bottom layer */
/*
* check for a case 0 fault. if nothing backing the entry then
* error now.
*/
if (amap == NULL && uobj == NULL) {
- uvmfault_unlockmaps(&ufi, FALSE);
+ uvmfault_unlockmaps(ufi, FALSE);
return (EFAULT);
}
@@ -589,77 +576,148 @@ ReFault:
* to do this the first time through the fault. if we
* ReFault we will disable this by setting "narrow" to true.
*/
- if (narrow == FALSE) {
+ if (flt->narrow == FALSE) {
/* wide fault (!narrow) */
- nback = min(uvmadvice[ufi.entry->advice].nback,
-     (ufi.orig_rvaddr - ufi.entry->start) >> PAGE_SHIFT);
- startva = ufi.orig_rvaddr - ((vsize_t)nback << PAGE_SHIFT);
- nforw = min(uvmadvice[ufi.entry->advice].nforw,
-     ((ufi.entry->end - ufi.orig_rvaddr) >>
+ nback = min(uvmadvice[ufi->entry->advice].nback,
+     (ufi->orig_rvaddr - ufi->entry->start) >>
+     PAGE_SHIFT);
+ flt->startva = ufi->orig_rvaddr - ((vsize_t)nback <<
+     PAGE_SHIFT);
+ nforw = min(uvmadvice[ufi->entry->advice].nforw,
+     ((ufi->entry->end - ufi->orig_rvaddr) >>
      PAGE_SHIFT) - 1);
/*
* note: "-1" because we don't want to count the
* faulting page as forw
*/
- npages = nback + nforw + 1;
- centeridx = nback;
+ flt->npages = nback + nforw + 1;
+ flt->centeridx = nback;
- narrow = TRUE; /* ensure only once per-fault */
+ flt->narrow = TRUE; /* ensure only once per-fault */
} else {
/* narrow fault! */
nback = nforw = 0;
- startva = ufi.orig_rvaddr;
- npages = 1;
- centeridx = 0;
+ flt->startva = ufi->orig_rvaddr;
+ flt->npages = 1;
+ flt->centeridx = 0;
}
/* if we've got an amap, extract current anons. */
if (amap) {
- anons = anons_store;
- amap_lookups(&ufi.entry->aref, startva - ufi.entry->start,
- anons, npages);
+ amap_lookups(&ufi->entry->aref,
+ flt->startva - ufi->entry->start, *ranons, flt->npages);
} else {
- anons = NULL; /* to be safe */
+ *ranons = NULL; /* to be safe */
}
/*
* for MADV_SEQUENTIAL mappings we want to deactivate the back pages
* now and then forget about them (for the rest of the fault).
*/
- if (ufi.entry->advice == MADV_SEQUENTIAL && nback != 0) {
+ if (ufi->entry->advice == MADV_SEQUENTIAL && nback != 0) {
/* flush back-page anons? */
if (amap)
- uvmfault_anonflush(anons, nback);
+ uvmfault_anonflush(*ranons, nback);
/* flush object? */
if (uobj) {
- uoff = (startva - ufi.entry->start) + ufi.entry->offset;
+ voff_t uoff;
+
+ uoff = (flt->startva - ufi->entry->start) +
+     ufi->entry->offset;
(void) uobj->pgops->pgo_flush(uobj, uoff, uoff +
((vsize_t)nback << PAGE_SHIFT), PGO_DEACTIVATE);
}
/* now forget about the backpages */
if (amap)
- anons += nback;
- startva += ((vsize_t)nback << PAGE_SHIFT);
- npages -= nback;
- centeridx = 0;
+ *ranons += nback;
+ flt->startva += ((vsize_t)nback << PAGE_SHIFT);
+ flt->npages -= nback;
+ flt->centeridx = 0;
}
+ return 0;
+}
+
+/*
+ * F A U L T - m a i n e n t r y p o i n t
+ */
+
+/*
+ * uvm_fault: page fault handler
+ *
+ * => called from MD code to resolve a page fault
+ * => VM data structures usually should be unlocked. however, it is
+ * possible to call here with the main map locked if the caller
+ * gets a write lock, sets it recursive, and then calls us (c.f.
+ * uvm_map_pageable). this should be avoided because it keeps
+ * the map locked off during I/O.
+ */
+#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
+ ~PROT_WRITE : PROT_MASK)
+int
+uvm_fault(vm_map_t orig_map, vaddr_t vaddr, vm_fault_t fault_type,
+ vm_prot_t access_type)
+{
+ struct uvm_faultinfo ufi;
+ struct uvm_faultctx flt;
+ boolean_t promote, locked, shadowed;
+ int result, lcv, gotpages, ret;
+ vaddr_t currva;
+ voff_t uoff;
+ paddr_t pa;
+ struct vm_amap *amap;
+ struct uvm_object *uobj;
+ struct vm_anon *anons_store[UVM_MAXRANGE], **anons, *anon, *oanon;
+ struct vm_page *pages[UVM_MAXRANGE], *pg, *uobjpage;
+ int error;
+
+ anon = NULL;
+ pg = NULL;
+
+ uvmexp.faults++; /* XXX: locking? */
+ TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL);
+
+ /* init the IN parameters in the ufi */
+ ufi.orig_map = orig_map;
+ ufi.orig_rvaddr = trunc_page(vaddr);
+ ufi.orig_size = PAGE_SIZE; /* can't get any smaller than this */
+ if (fault_type == VM_FAULT_WIRE)
+ flt.narrow = TRUE; /* don't look for neighborhood
+ * pages on wire */
+ else
+ flt.narrow = FALSE; /* normal fault */
+
+ /* "goto ReFault" means restart the page fault from ground zero. */
+ReFault:
+ anons = anons_store;
+
+ error = uvm_fault_check(&ufi, &flt, &anons, access_type);
+ switch (error) {
+ case 0:
+ break;
+ case ERESTART:
+ goto ReFault;
+ default:
+ return error;
+ }
+
+ amap = ufi.entry->aref.ar_amap;
+ uobj = ufi.entry->object.uvm_obj;
+
/*
* map in the backpages and frontpages we found in the amap in hopes
* of preventing future faults. we also init the pages[] array as
* we go.
*/
- currva = startva;
+ currva = flt.startva;
shadowed = FALSE;
- for (lcv = 0 ; lcv < npages ; lcv++, currva += PAGE_SIZE) {
+ for (lcv = 0 ; lcv < flt.npages ; lcv++, currva += PAGE_SIZE) {
/*
* dont play with VAs that are already mapped
* except for center)
*/
- if (lcv != centeridx &&
+ if (lcv != flt.centeridx &&
pmap_extract(ufi.orig_map->pmap, currva, &pa)) {
pages[lcv] = PGO_DONTCARE;
continue;
@@ -673,7 +731,7 @@ ReFault:
/* check for present page and map if possible. re-activate it. */
pages[lcv] = PGO_DONTCARE;
- if (lcv == centeridx) { /* save center for later! */
+ if (lcv == flt.centeridx) { /* save center for later! */
shadowed = TRUE;
continue;
}
@@ -691,14 +749,14 @@ ReFault:
* that we enter these right now.
*/
(void) pmap_enter(ufi.orig_map->pmap, currva,
- VM_PAGE_TO_PHYS(anon->an_page) | pa_flags,
- (anon->an_ref > 1) ? (enter_prot & ~PROT_WRITE) :
- enter_prot,
+ VM_PAGE_TO_PHYS(anon->an_page) | flt.pa_flags,
+ (anon->an_ref > 1) ?
+ (flt.enter_prot & ~PROT_WRITE) : flt.enter_prot,
PMAP_CANFAIL |
(VM_MAPENT_ISWIRED(ufi.entry) ? PMAP_WIRED : 0));
}
}
- if (npages > 1)
+ if (flt.npages > 1)
pmap_update(ufi.orig_map->pmap);
/* (shadowed == TRUE) if there is an anon at the faulting address */
@@ -710,9 +768,9 @@ ReFault:
* providing a pgo_fault routine.
*/
if (uobj && shadowed == FALSE && uobj->pgops->pgo_fault != NULL) {
- result = uobj->pgops->pgo_fault(&ufi, startva, pages, npages,
- centeridx, fault_type, access_type,
- PGO_LOCKED);
+ result = uobj->pgops->pgo_fault(&ufi, flt.startva, pages,
+ flt.npages, flt.centeridx, fault_type, access_type,
+ PGO_LOCKED);
if (result == VM_PAGER_OK)
return (0); /* pgo_fault did pmap enter */
@@ -733,18 +791,18 @@ ReFault:
*/
if (uobj && shadowed == FALSE) {
uvmexp.fltlget++;
- gotpages = npages;
+ gotpages = flt.npages;
(void) uobj->pgops->pgo_get(uobj, ufi.entry->offset +
- (startva - ufi.entry->start),
- pages, &gotpages, centeridx,
+ (flt.startva - ufi.entry->start),
+ pages, &gotpages, flt.centeridx,
access_type & MASK(ufi.entry),
ufi.entry->advice, PGO_LOCKED);
/* check for pages to map, if we got any */
uobjpage = NULL;
if (gotpages) {
- currva = startva;
- for (lcv = 0 ; lcv < npages ;
+ currva = flt.startva;
+ for (lcv = 0 ; lcv < flt.npages ;
lcv++, currva += PAGE_SIZE) {
if (pages[lcv] == NULL ||
pages[lcv] == PGO_DONTCARE)
@@ -759,7 +817,7 @@ ReFault:
* remember this page as "uobjpage."
* (for later use).
*/
- if (lcv == centeridx) {
+ if (lcv == flt.centeridx) {
uobjpage = pages[lcv];
continue;
}
@@ -785,10 +843,10 @@ ReFault:
* enter these right now.
*/
(void) pmap_enter(ufi.orig_map->pmap, currva,
- VM_PAGE_TO_PHYS(pages[lcv]) | pa_flags,
- enter_prot & MASK(ufi.entry),
+ VM_PAGE_TO_PHYS(pages[lcv]) | flt.pa_flags,
+ flt.enter_prot & MASK(ufi.entry),
PMAP_CANFAIL |
- (wired ? PMAP_WIRED : 0));
+ (flt.wired ? PMAP_WIRED : 0));
/*
* NOTE: page can't be PG_WANTED because
@@ -824,7 +882,7 @@ ReFault:
goto Case2;
/* handle case 1: fault on an anon in our amap */
- anon = anons[centeridx];
+ anon = anons[flt.centeridx];
/*
* no matter if we have case 1A or case 1B we are going to need to
@@ -921,7 +979,7 @@ ReFault:
oanon = anon;
pg = anon->an_page;
if (anon->an_ref > 1) /* disallow writes to ref > 1 anons */
- enter_prot = enter_prot & ~PROT_WRITE;
+ flt.enter_prot = flt.enter_prot & ~PROT_WRITE;
}
/*
@@ -931,8 +989,8 @@ ReFault:
* under us between the unlock and the pmap_enter.
*/
if (pmap_enter(ufi.orig_map->pmap, ufi.orig_rvaddr,
- VM_PAGE_TO_PHYS(pg) | pa_flags, enter_prot,
- access_type | PMAP_CANFAIL | (wired ? PMAP_WIRED : 0)) != 0) {
+ VM_PAGE_TO_PHYS(pg) | flt.pa_flags, flt.enter_prot,
+ access_type | PMAP_CANFAIL | (flt.wired ? PMAP_WIRED : 0)) != 0) {
/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.
@@ -1086,7 +1144,7 @@ Case2:
*/
uvmexp.flt_obj++;
if (UVM_ET_ISCOPYONWRITE(ufi.entry))
- enter_prot &= ~PROT_WRITE;
+ flt.enter_prot &= ~PROT_WRITE;
pg = uobjpage; /* map in the actual object */
/* assert(uobjpage != PGO_DONTCARE) */
@@ -1206,8 +1264,8 @@ Case2:
* resources.
*/
if (pmap_enter(ufi.orig_map->pmap, ufi.orig_rvaddr,
- VM_PAGE_TO_PHYS(pg) | pa_flags, enter_prot,
- access_type | PMAP_CANFAIL | (wired ? PMAP_WIRED : 0)) != 0) {
+ VM_PAGE_TO_PHYS(pg) | flt.pa_flags, flt.enter_prot,
+ access_type | PMAP_CANFAIL | (flt.wired ? PMAP_WIRED : 0)) != 0) {
/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.