> Date: Wed, 23 Dec 2020 10:58:16 -0300 > From: Martin Pieuchot <m...@openbsd.org> > > On 22/12/20(Tue) 23:43, Mark Kettenis wrote: > > > Date: Mon, 21 Dec 2020 16:46:32 -0300 > > > From: Martin Pieuchot <m...@openbsd.org> > > > > > > During a page fault multiples counters are updated. They fall into two > > > categories "fault counters" and "global statistics" both of which are > > > currently represented by int-sized fields inside a global: `uvmexp'. > > > > > > Diff below makes use of the per-CPU counters_inc(9) API to make sure no > > > update is lost with an unlocked fault handler. I only converted the > > > fields touched by uvm_fault() to have a working solution and start a > > > discussion. > > > > > > - Should we keep a single enum for all fields inside `uvmexp' or do we > > > want to separate "statistics counters" which are mostly used sys/arch > > > from "fault counters" which are only used in uvm/uvm_fault.c?
I don't know. Keep it simple for now, so maybe stick with what you have. > > > - The counter_add(9) API deals with uint64_t and currently uvmexp uses > > > int. Should we truncate or change the size of uvmexp fields or do > > > something else? Truncating should be good enough. > > > Comments? > > > > I think this breaks "show uvmexp" in ddb. > > Updated diff below fixes that. Any comment on the issues raised above? > > > You fear that using atomic operations for these counters would lead to > > too much bus contention on systems with a large number of CPUs? > > I don't know. I don't see the point of using atomic operations for > "real" counters that are not used to make any decision. Atomic > operations have a high cost, bus contention might be one of them. Sure. If dlg@ thinks this is an appropriate use of the per-CPU counters, this is ok with me. > Index: kern/init_main.c > =================================================================== > RCS file: /cvs/src/sys/kern/init_main.c,v > retrieving revision 1.302 > diff -u -p -r1.302 init_main.c > --- kern/init_main.c 7 Dec 2020 16:55:28 -0000 1.302 > +++ kern/init_main.c 21 Dec 2020 19:37:13 -0000 > @@ -432,6 +432,7 @@ main(void *framep) > #endif > > mbcpuinit(); /* enable per cpu mbuf data */ > + uvm_init_percpu(); > > /* init exec and emul */ > init_exec(); > Index: uvm/uvm_extern.h > =================================================================== > RCS file: /cvs/src/sys/uvm/uvm_extern.h,v > retrieving revision 1.155 > diff -u -p -r1.155 uvm_extern.h > --- uvm/uvm_extern.h 1 Dec 2020 13:56:22 -0000 1.155 > +++ uvm/uvm_extern.h 21 Dec 2020 19:37:13 -0000 > @@ -289,6 +289,7 @@ void uvm_vsunlock_device(struct proc > * > void *); > void uvm_pause(void); > void uvm_init(void); > +void uvm_init_percpu(void); > int uvm_io(vm_map_t, struct uio *, int); > > #define UVM_IO_FIXPROT 0x01 > Index: uvm/uvm_fault.c > =================================================================== > RCS file: /cvs/src/sys/uvm/uvm_fault.c,v > retrieving revision 1.109 > diff -u -p -r1.109 uvm_fault.c > --- uvm/uvm_fault.c 8 Dec 2020 12:26:31 -0000 1.109 > +++ uvm/uvm_fault.c 21 Dec 2020 19:37:13 -0000 > @@ -35,6 +35,7 @@ > #include <sys/param.h> > #include <sys/systm.h> > #include <sys/kernel.h> > +#include <sys/percpu.h> > #include <sys/proc.h> > #include <sys/malloc.h> > #include <sys/mman.h> > @@ -271,7 +272,7 @@ uvmfault_anonget(struct uvm_faultinfo *u > int result; > > result = 0; /* XXX shut up gcc */ > - uvmexp.fltanget++; > + counters_inc(uvmexp_counters, flt_anget); > /* bump rusage counters */ > if (anon->an_page) > curproc->p_ru.ru_minflt++; > @@ -295,7 +296,7 @@ uvmfault_anonget(struct uvm_faultinfo *u > if ((pg->pg_flags & (PG_BUSY|PG_RELEASED)) == 0) > return (VM_PAGER_OK); > atomic_setbits_int(&pg->pg_flags, PG_WANTED); > - uvmexp.fltpgwait++; > + counters_inc(uvmexp_counters, flt_pgwait); > > /* > * the last unlock must be an atomic unlock+wait on > @@ -310,7 +311,7 @@ uvmfault_anonget(struct uvm_faultinfo *u > > if (pg == NULL) { /* out of RAM. */ > uvmfault_unlockall(ufi, amap, NULL); > - uvmexp.fltnoram++; > + counters_inc(uvmexp_counters, flt_noram); > uvm_wait("flt_noram1"); > /* ready to relock and try again */ > } else { > @@ -325,7 +326,7 @@ uvmfault_anonget(struct uvm_faultinfo *u > * it is ok to read an_swslot here because > * we hold PG_BUSY on the page. > */ > - uvmexp.pageins++; > + counters_inc(uvmexp_counters, pageins); > result = uvm_swap_get(pg, anon->an_swslot, > PGO_SYNCIO); > > @@ -369,7 +370,7 @@ uvmfault_anonget(struct uvm_faultinfo *u > uvm_anfree(anon); /* frees page for us */ > if (locked) > uvmfault_unlockall(ufi, amap, NULL); > - uvmexp.fltpgrele++; > + counters_inc(uvmexp_counters, flt_pgrele); > return (VM_PAGER_REFAULT); /* refault! */ > } > > @@ -426,7 +427,7 @@ uvmfault_anonget(struct uvm_faultinfo *u > } > > /* try it again! */ > - uvmexp.fltanretry++; > + counters_inc(uvmexp_counters, flt_anretry); > continue; > > } /* while (1) */ > @@ -547,7 +548,7 @@ uvm_fault_check(struct uvm_faultinfo *uf > /* need to clear */ > uvmfault_unlockmaps(ufi, FALSE); > uvmfault_amapcopy(ufi); > - uvmexp.fltamcopy++; > + counters_inc(uvmexp_counters, flt_amcopy); > return (ERESTART); > } else { > /* > @@ -699,7 +700,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf > */ > > if ((access_type & PROT_WRITE) != 0 && anon->an_ref > 1) { > - uvmexp.flt_acow++; > + counters_inc(uvmexp_counters, flt_acow); > oanon = anon; /* oanon = old */ > anon = uvm_analloc(); > if (anon) { > @@ -710,10 +711,10 @@ uvm_fault_upper(struct uvm_faultinfo *uf > if (anon == NULL || pg == NULL) { > uvmfault_unlockall(ufi, amap, NULL); > if (anon == NULL) > - uvmexp.fltnoanon++; > + counters_inc(uvmexp_counters, flt_noanon); > else { > uvm_anfree(anon); > - uvmexp.fltnoram++; > + counters_inc(uvmexp_counters, flt_noram); > } > > if (uvm_swapisfull()) > @@ -745,7 +746,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf > * thus, no one can get at it until we are done with it. > */ > } else { > - uvmexp.flt_anon++; > + counters_inc(uvmexp_counters, flt_anon); > oanon = anon; > pg = anon->an_page; > if (anon->an_ref > 1) /* disallow writes to ref > 1 anons */ > @@ -861,7 +862,7 @@ uvm_fault_upper_lookup(struct uvm_faulti > uvm_lock_pageq(); > uvm_pageactivate(anon->an_page); /* reactivate */ > uvm_unlock_pageq(); > - uvmexp.fltnamap++; > + counters_inc(uvmexp_counters, flt_namap); > > /* > * Since this isn't the page that's actually faulting, > @@ -909,7 +910,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad > struct vm_page *pages[UVM_MAXRANGE]; > int error = ERESTART; > > - uvmexp.faults++; /* XXX: locking? */ > + counters_inc(uvmexp_counters, faults); > TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL); > > /* init the IN parameters in the ufi */ > @@ -994,7 +995,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf > * ("get" has the option of doing a pmap_enter for us) > */ > if (uobj != NULL) { > - uvmexp.fltlget++; > + counters_inc(uvmexp_counters, flt_lget); > gotpages = flt->npages; > (void) uobj->pgops->pgo_get(uobj, ufi->entry->offset + > (flt->startva - ufi->entry->start), > @@ -1038,7 +1039,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf > uvm_lock_pageq(); > uvm_pageactivate(pages[lcv]); /* reactivate */ > uvm_unlock_pageq(); > - uvmexp.fltnomap++; > + counters_inc(uvmexp_counters, flt_nomap); > > /* > * Since this page isn't the page that's > @@ -1109,7 +1110,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf > > uvmfault_unlockall(ufi, amap, NULL); > > - uvmexp.fltget++; > + counters_inc(uvmexp_counters, flt_get); > gotpages = 1; > uoff = (ufi->orig_rvaddr - ufi->entry->start) + > ufi->entry->offset; > result = uobj->pgops->pgo_get(uobj, uoff, &uobjpage, &gotpages, > @@ -1183,7 +1184,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf > * > * set "pg" to the page we want to map in (uobjpage, usually) > */ > - uvmexp.flt_obj++; > + counters_inc(uvmexp_counters, flt_obj); > if (UVM_ET_ISCOPYONWRITE(ufi->entry)) > flt->enter_prot &= ~PROT_WRITE; > pg = uobjpage; /* map in the actual object */ > @@ -1235,10 +1236,10 @@ uvm_fault_lower(struct uvm_faultinfo *uf > /* unlock and fail ... */ > uvmfault_unlockall(ufi, amap, uobj); > if (anon == NULL) > - uvmexp.fltnoanon++; > + counters_inc(uvmexp_counters, flt_noanon); > else { > uvm_anfree(anon); > - uvmexp.fltnoram++; > + counters_inc(uvmexp_counters, flt_noram); > } > > if (uvm_swapisfull()) > @@ -1254,7 +1255,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf > > /* fill in the data */ > if (uobjpage != PGO_DONTCARE) { > - uvmexp.flt_prcopy++; > + counters_inc(uvmexp_counters, flt_prcopy); > /* copy page [pg now dirty] */ > uvm_pagecopy(uobjpage, pg); > > @@ -1277,7 +1278,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf > uvm_unlock_pageq(); > uobj = NULL; > } else { > - uvmexp.flt_przero++; > + counters_inc(uvmexp_counters, flt_przero); > /* > * Page is zero'd and marked dirty by uvm_pagealloc() > * above. > @@ -1288,7 +1289,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf > ufi->orig_rvaddr - ufi->entry->start, anon, 0)) { > uvmfault_unlockall(ufi, amap, NULL); > uvm_anfree(anon); > - uvmexp.fltnoamap++; > + counters_inc(uvmexp_counters, flt_noamap); > > if (uvm_swapisfull()) > return (ENOMEM); > @@ -1580,7 +1581,7 @@ uvmfault_relock(struct uvm_faultinfo *uf > return TRUE; > } > > - uvmexp.fltrelck++; > + counters_inc(uvmexp_counters, flt_relck); > > /* > * relock map. fail if version mismatch (in which case nothing > @@ -1592,6 +1593,6 @@ uvmfault_relock(struct uvm_faultinfo *uf > return(FALSE); > } > > - uvmexp.fltrelckok++; > + counters_inc(uvmexp_counters, flt_relckok); > return(TRUE); /* got it! */ > } > Index: uvm/uvm_init.c > =================================================================== > RCS file: /cvs/src/sys/uvm/uvm_init.c,v > retrieving revision 1.40 > diff -u -p -r1.40 uvm_init.c > --- uvm/uvm_init.c 11 May 2017 00:42:05 -0000 1.40 > +++ uvm/uvm_init.c 21 Dec 2020 19:37:13 -0000 > @@ -35,6 +35,7 @@ > #include <sys/param.h> > #include <sys/systm.h> > #include <sys/filedesc.h> > +#include <sys/percpu.h> > #include <sys/resourcevar.h> > #include <sys/mman.h> > #include <sys/malloc.h> > @@ -52,6 +53,9 @@ > struct uvm uvm; /* decl */ > struct uvmexp uvmexp; /* decl */ > > +COUNTERS_BOOT_MEMORY(uvmexp_countersboot, exp_ncounters); > +struct cpumem *uvmexp_counters = > COUNTERS_BOOT_INITIALIZER(uvmexp_countersboot); > + > #if defined(VM_MIN_KERNEL_ADDRESS) > vaddr_t vm_min_kernel_address = VM_MIN_KERNEL_ADDRESS; > #else > @@ -184,4 +188,10 @@ uvm_init(void) > uaddr_bestfit_create(vm_map_min(kmem_map), > vm_map_max(kmem_map))); > #endif /* !SMALL_KERNEL */ > +} > + > +void > +uvm_init_percpu(void) > +{ > + uvmexp_counters = counters_alloc_ncpus(uvmexp_counters, exp_ncounters); > } > Index: uvm/uvm_meter.c > =================================================================== > RCS file: /cvs/src/sys/uvm/uvm_meter.c,v > retrieving revision 1.41 > diff -u -p -r1.41 uvm_meter.c > --- uvm/uvm_meter.c 24 Jun 2020 22:03:45 -0000 1.41 > +++ uvm/uvm_meter.c 23 Dec 2020 13:26:40 -0000 > @@ -39,6 +39,7 @@ > #include <sys/param.h> > #include <sys/systm.h> > #include <sys/kernel.h> > +#include <sys/percpu.h> > #include <sys/proc.h> > #include <sys/sysctl.h> > #include <sys/vmmeter.h> > @@ -78,6 +79,7 @@ static fixpt_t cexp[3] = { > > static void uvm_loadav(struct loadavg *); > void uvm_total(struct vmtotal *); > +void uvmexp_read(struct uvmexp *); > > /* > * uvm_meter: calculate load average and wake up the swapper (if needed) > @@ -151,6 +153,7 @@ uvm_sysctl(int *name, u_int namelen, voi > { > struct process *pr = p->p_p; > struct vmtotal vmtotals; > + struct uvmexp uexp; > int rv, t; > > switch (name[0]) { > @@ -179,8 +182,9 @@ uvm_sysctl(int *name, u_int namelen, voi > sizeof(vmtotals))); > > case VM_UVMEXP: > - return (sysctl_rdstruct(oldp, oldlenp, newp, &uvmexp, > - sizeof(uvmexp))); > + uvmexp_read(&uexp); > + return (sysctl_rdstruct(oldp, oldlenp, newp, &uexp, > + sizeof(uexp))); > > case VM_NKMEMPAGES: > return (sysctl_rdint(oldp, oldlenp, newp, nkmempages)); > @@ -315,6 +319,41 @@ uvm_total(struct vmtotal *totalp) > totalp->t_armshr = 0; /* XXX */ > } > > +void > +uvmexp_read(struct uvmexp *uexp) > +{ > + uint64_t counters[exp_ncounters]; > + > + memcpy(uexp, &uvmexp, sizeof(*uexp)); > + > + counters_read(uvmexp_counters, counters, exp_ncounters); > + > + /* stat counters */ > + uexp->faults = (int)counters[faults]; > + uexp->pageins = (int)counters[pageins]; > + > + /* fault subcounters */ > + uexp->fltnoram = (int)counters[flt_noram]; > + uexp->fltnoanon = (int)counters[flt_noanon]; > + uexp->fltnoamap = (int)counters[flt_noamap]; > + uexp->fltpgwait = (int)counters[flt_pgwait]; > + uexp->fltpgrele = (int)counters[flt_pgrele]; > + uexp->fltrelck = (int)counters[flt_relck]; > + uexp->fltrelckok = (int)counters[flt_relckok]; > + uexp->fltanget = (int)counters[flt_anget]; > + uexp->fltanretry = (int)counters[flt_anretry]; > + uexp->fltamcopy = (int)counters[flt_amcopy]; > + uexp->fltnamap = (int)counters[flt_namap]; > + uexp->fltnomap = (int)counters[flt_nomap]; > + uexp->fltlget = (int)counters[flt_lget]; > + uexp->fltget = (int)counters[flt_get]; > + uexp->flt_anon = (int)counters[flt_anon]; > + uexp->flt_acow = (int)counters[flt_acow]; > + uexp->flt_obj = (int)counters[flt_obj]; > + uexp->flt_prcopy = (int)counters[flt_prcopy]; > + uexp->flt_przero = (int)counters[flt_przero]; > +} > + > #ifdef DDB > > /* > @@ -323,51 +362,54 @@ uvm_total(struct vmtotal *totalp) > void > uvmexp_print(int (*pr)(const char *, ...)) > { > + struct uvmexp uexp; > + > + uvmexp_read(&uexp); > > (*pr)("Current UVM status:\n"); > (*pr)(" pagesize=%d (0x%x), pagemask=0x%x, pageshift=%d\n", > - uvmexp.pagesize, uvmexp.pagesize, uvmexp.pagemask, > - uvmexp.pageshift); > + uexp.pagesize, uexp.pagesize, uexp.pagemask, > + uexp.pageshift); > (*pr)(" %d VM pages: %d active, %d inactive, %d wired, %d free (%d > zero)\n", > - uvmexp.npages, uvmexp.active, uvmexp.inactive, uvmexp.wired, > - uvmexp.free, uvmexp.zeropages); > + uexp.npages, uexp.active, uexp.inactive, uexp.wired, > + uexp.free, uexp.zeropages); > (*pr)(" min %d%% (%d) anon, %d%% (%d) vnode, %d%% (%d) vtext\n", > - uvmexp.anonminpct, uvmexp.anonmin, uvmexp.vnodeminpct, > - uvmexp.vnodemin, uvmexp.vtextminpct, uvmexp.vtextmin); > + uexp.anonminpct, uexp.anonmin, uexp.vnodeminpct, > + uexp.vnodemin, uexp.vtextminpct, uexp.vtextmin); > (*pr)(" freemin=%d, free-target=%d, inactive-target=%d, " > - "wired-max=%d\n", uvmexp.freemin, uvmexp.freetarg, uvmexp.inactarg, > - uvmexp.wiredmax); > + "wired-max=%d\n", uexp.freemin, uexp.freetarg, uexp.inactarg, > + uexp.wiredmax); > (*pr)(" faults=%d, traps=%d, intrs=%d, ctxswitch=%d fpuswitch=%d\n", > - uvmexp.faults, uvmexp.traps, uvmexp.intrs, uvmexp.swtch, > - uvmexp.fpswtch); > + uexp.faults, uexp.traps, uexp.intrs, uexp.swtch, > + uexp.fpswtch); > (*pr)(" softint=%d, syscalls=%d, kmapent=%d\n", > - uvmexp.softs, uvmexp.syscalls, uvmexp.kmapent); > + uexp.softs, uexp.syscalls, uexp.kmapent); > > (*pr)(" fault counts:\n"); > (*pr)(" noram=%d, noanon=%d, noamap=%d, pgwait=%d, pgrele=%d\n", > - uvmexp.fltnoram, uvmexp.fltnoanon, uvmexp.fltnoamap, > - uvmexp.fltpgwait, uvmexp.fltpgrele); > + uexp.fltnoram, uexp.fltnoanon, uexp.fltnoamap, > + uexp.fltpgwait, uexp.fltpgrele); > (*pr)(" ok relocks(total)=%d(%d), anget(retries)=%d(%d), " > - "amapcopy=%d\n", uvmexp.fltrelckok, uvmexp.fltrelck, > - uvmexp.fltanget, uvmexp.fltanretry, uvmexp.fltamcopy); > + "amapcopy=%d\n", uexp.fltrelckok, uexp.fltrelck, > + uexp.fltanget, uexp.fltanretry, uexp.fltamcopy); > (*pr)(" neighbor anon/obj pg=%d/%d, gets(lock/unlock)=%d/%d\n", > - uvmexp.fltnamap, uvmexp.fltnomap, uvmexp.fltlget, uvmexp.fltget); > + uexp.fltnamap, uexp.fltnomap, uexp.fltlget, uexp.fltget); > (*pr)(" cases: anon=%d, anoncow=%d, obj=%d, prcopy=%d, przero=%d\n", > - uvmexp.flt_anon, uvmexp.flt_acow, uvmexp.flt_obj, uvmexp.flt_prcopy, > - uvmexp.flt_przero); > + uexp.flt_anon, uexp.flt_acow, uexp.flt_obj, uexp.flt_prcopy, > + uexp.flt_przero); > > (*pr)(" daemon and swap counts:\n"); > (*pr)(" woke=%d, revs=%d, scans=%d, obscans=%d, anscans=%d\n", > - uvmexp.pdwoke, uvmexp.pdrevs, uvmexp.pdscans, uvmexp.pdobscan, > - uvmexp.pdanscan); > + uexp.pdwoke, uexp.pdrevs, uexp.pdscans, uexp.pdobscan, > + uexp.pdanscan); > (*pr)(" busy=%d, freed=%d, reactivate=%d, deactivate=%d\n", > - uvmexp.pdbusy, uvmexp.pdfreed, uvmexp.pdreact, uvmexp.pddeact); > - (*pr)(" pageouts=%d, pending=%d, nswget=%d\n", uvmexp.pdpageouts, > - uvmexp.pdpending, uvmexp.nswget); > + uexp.pdbusy, uexp.pdfreed, uexp.pdreact, uexp.pddeact); > + (*pr)(" pageouts=%d, pending=%d, nswget=%d\n", uexp.pdpageouts, > + uexp.pdpending, uexp.nswget); > (*pr)(" nswapdev=%d\n", > - uvmexp.nswapdev); > + uexp.nswapdev); > (*pr)(" swpages=%d, swpginuse=%d, swpgonly=%d paging=%d\n", > - uvmexp.swpages, uvmexp.swpginuse, uvmexp.swpgonly, uvmexp.paging); > + uexp.swpages, uexp.swpginuse, uexp.swpgonly, uexp.paging); > > (*pr)(" kernel pointers:\n"); > (*pr)(" objs(kern)=%p\n", uvm.kernel_object); > Index: uvm/uvmexp.h > =================================================================== > RCS file: /cvs/src/sys/uvm/uvmexp.h,v > retrieving revision 1.7 > diff -u -p -r1.7 uvmexp.h > --- uvm/uvmexp.h 14 Dec 2020 13:29:18 -0000 1.7 > +++ uvm/uvmexp.h 21 Dec 2020 19:37:13 -0000 > @@ -156,4 +156,41 @@ struct _ps_strings { > void *val; > }; > > +#ifdef _KERNEL > + > +/* > + * Per-cpu UVM counters. > + */ > +extern struct cpumem *uvmexp_counters; > + > +enum uvm_exp_counters { > + /* stat counters */ > + faults, /* page fault count */ > + pageins, /* pagein operation count */ > + > + /* fault subcounters */ > + flt_noram, /* number of times fault was out of ram */ > + flt_noanon, /* number of times fault was out of anons */ > + flt_noamap, /* number of times fault was out of amap chunks */ > + flt_pgwait, /* number of times fault had to wait on a page */ > + flt_pgrele, /* number of times fault found a released page */ > + flt_relck, /* number of times fault relock called */ > + flt_relckok, /* number of times fault relock is a success */ > + flt_anget, /* number of times fault gets anon page */ > + flt_anretry, /* number of times fault retrys an anon get */ > + flt_amcopy, /* number of times fault clears "needs copy" */ > + flt_namap, /* number of times fault maps a neighbor anon page */ > + flt_nomap, /* number of times fault maps a neighbor obj page */ > + flt_lget, /* number of times fault does a locked pgo_get */ > + flt_get, /* number of times fault does an unlocked get */ > + flt_anon, /* number of times fault anon (case 1a) */ > + flt_acow, /* number of times fault anon cow (case 1b) */ > + flt_obj, /* number of times fault is on object page (2a) */ > + flt_prcopy, /* number of times fault promotes with copy (2b) */ > + flt_przero, /* number of times fault promotes with zerofill (2b) */ > + > + exp_ncounters > +}; > + > +#endif /* _KERNEL */ > #endif /*_UVM_UVMEXP_ */ >