On 22/12/20(Tue) 23:43, Mark Kettenis wrote: > > Date: Mon, 21 Dec 2020 16:46:32 -0300 > > From: Martin Pieuchot <m...@openbsd.org> > > > > During a page fault multiple counters are updated. They fall into two > > categories "fault counters" and "global statistics" both of which are > > currently represented by int-sized fields inside a global: `uvmexp'. > > > > Diff below makes use of the per-CPU counters_inc(9) API to make sure no > > update is lost with an unlocked fault handler. I only converted the > > fields touched by uvm_fault() to have a working solution and start a > > discussion. > > > > - Should we keep a single enum for all fields inside `uvmexp' or do we > > want to separate "statistics counters" which are mostly used in sys/arch > > from "fault counters" which are only used in uvm/uvm_fault.c? > > > > - The counter_add(9) API deals with uint64_t and currently uvmexp uses > > int. Should we truncate or change the size of uvmexp fields or do > > something else? > > > > Comments? > > I think this breaks "show uvmexp" in ddb.
Updated diff below fixes that. Any comment on the issues raised above? > You fear that using atomic operations for these counters would lead to > too much bus contention on systems with a large number of CPUs? I don't know. I don't see the point of using atomic operations for "real" counters that are not used to make any decision. Atomic operations have a high cost, bus contention might be one of them. Index: kern/init_main.c =================================================================== RCS file: /cvs/src/sys/kern/init_main.c,v retrieving revision 1.302 diff -u -p -r1.302 init_main.c --- kern/init_main.c 7 Dec 2020 16:55:28 -0000 1.302 +++ kern/init_main.c 21 Dec 2020 19:37:13 -0000 @@ -432,6 +432,7 @@ main(void *framep) #endif mbcpuinit(); /* enable per cpu mbuf data */ + uvm_init_percpu(); /* init exec and emul */ init_exec(); Index: uvm/uvm_extern.h =================================================================== RCS file: /cvs/src/sys/uvm/uvm_extern.h,v retrieving revision 1.155 diff -u -p -r1.155 uvm_extern.h --- uvm/uvm_extern.h 1 Dec 2020 13:56:22 -0000 1.155 +++ uvm/uvm_extern.h 21 Dec 2020 19:37:13 -0000 @@ -289,6 +289,7 @@ void uvm_vsunlock_device(struct proc * void *); void uvm_pause(void); void uvm_init(void); +void uvm_init_percpu(void); int uvm_io(vm_map_t, struct uio *, int); #define UVM_IO_FIXPROT 0x01 Index: uvm/uvm_fault.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_fault.c,v retrieving revision 1.109 diff -u -p -r1.109 uvm_fault.c --- uvm/uvm_fault.c 8 Dec 2020 12:26:31 -0000 1.109 +++ uvm/uvm_fault.c 21 Dec 2020 19:37:13 -0000 @@ -35,6 +35,7 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/percpu.h> #include <sys/proc.h> #include <sys/malloc.h> #include <sys/mman.h> @@ -271,7 +272,7 @@ uvmfault_anonget(struct uvm_faultinfo *u int result; result = 0; /* XXX shut up gcc */ - uvmexp.fltanget++; + counters_inc(uvmexp_counters, flt_anget); /* bump 
rusage counters */ if (anon->an_page) curproc->p_ru.ru_minflt++; @@ -295,7 +296,7 @@ uvmfault_anonget(struct uvm_faultinfo *u if ((pg->pg_flags & (PG_BUSY|PG_RELEASED)) == 0) return (VM_PAGER_OK); atomic_setbits_int(&pg->pg_flags, PG_WANTED); - uvmexp.fltpgwait++; + counters_inc(uvmexp_counters, flt_pgwait); /* * the last unlock must be an atomic unlock+wait on @@ -310,7 +311,7 @@ uvmfault_anonget(struct uvm_faultinfo *u if (pg == NULL) { /* out of RAM. */ uvmfault_unlockall(ufi, amap, NULL); - uvmexp.fltnoram++; + counters_inc(uvmexp_counters, flt_noram); uvm_wait("flt_noram1"); /* ready to relock and try again */ } else { @@ -325,7 +326,7 @@ uvmfault_anonget(struct uvm_faultinfo *u * it is ok to read an_swslot here because * we hold PG_BUSY on the page. */ - uvmexp.pageins++; + counters_inc(uvmexp_counters, pageins); result = uvm_swap_get(pg, anon->an_swslot, PGO_SYNCIO); @@ -369,7 +370,7 @@ uvmfault_anonget(struct uvm_faultinfo *u uvm_anfree(anon); /* frees page for us */ if (locked) uvmfault_unlockall(ufi, amap, NULL); - uvmexp.fltpgrele++; + counters_inc(uvmexp_counters, flt_pgrele); return (VM_PAGER_REFAULT); /* refault! */ } @@ -426,7 +427,7 @@ uvmfault_anonget(struct uvm_faultinfo *u } /* try it again! 
*/ - uvmexp.fltanretry++; + counters_inc(uvmexp_counters, flt_anretry); continue; } /* while (1) */ @@ -547,7 +548,7 @@ uvm_fault_check(struct uvm_faultinfo *uf /* need to clear */ uvmfault_unlockmaps(ufi, FALSE); uvmfault_amapcopy(ufi); - uvmexp.fltamcopy++; + counters_inc(uvmexp_counters, flt_amcopy); return (ERESTART); } else { /* @@ -699,7 +700,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf */ if ((access_type & PROT_WRITE) != 0 && anon->an_ref > 1) { - uvmexp.flt_acow++; + counters_inc(uvmexp_counters, flt_acow); oanon = anon; /* oanon = old */ anon = uvm_analloc(); if (anon) { @@ -710,10 +711,10 @@ uvm_fault_upper(struct uvm_faultinfo *uf if (anon == NULL || pg == NULL) { uvmfault_unlockall(ufi, amap, NULL); if (anon == NULL) - uvmexp.fltnoanon++; + counters_inc(uvmexp_counters, flt_noanon); else { uvm_anfree(anon); - uvmexp.fltnoram++; + counters_inc(uvmexp_counters, flt_noram); } if (uvm_swapisfull()) @@ -745,7 +746,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf * thus, no one can get at it until we are done with it. */ } else { - uvmexp.flt_anon++; + counters_inc(uvmexp_counters, flt_anon); oanon = anon; pg = anon->an_page; if (anon->an_ref > 1) /* disallow writes to ref > 1 anons */ @@ -861,7 +862,7 @@ uvm_fault_upper_lookup(struct uvm_faulti uvm_lock_pageq(); uvm_pageactivate(anon->an_page); /* reactivate */ uvm_unlock_pageq(); - uvmexp.fltnamap++; + counters_inc(uvmexp_counters, flt_namap); /* * Since this isn't the page that's actually faulting, @@ -909,7 +910,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad struct vm_page *pages[UVM_MAXRANGE]; int error = ERESTART; - uvmexp.faults++; /* XXX: locking? 
*/ + counters_inc(uvmexp_counters, faults); TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL); /* init the IN parameters in the ufi */ @@ -994,7 +995,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf * ("get" has the option of doing a pmap_enter for us) */ if (uobj != NULL) { - uvmexp.fltlget++; + counters_inc(uvmexp_counters, flt_lget); gotpages = flt->npages; (void) uobj->pgops->pgo_get(uobj, ufi->entry->offset + (flt->startva - ufi->entry->start), @@ -1038,7 +1039,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf uvm_lock_pageq(); uvm_pageactivate(pages[lcv]); /* reactivate */ uvm_unlock_pageq(); - uvmexp.fltnomap++; + counters_inc(uvmexp_counters, flt_nomap); /* * Since this page isn't the page that's @@ -1109,7 +1110,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf uvmfault_unlockall(ufi, amap, NULL); - uvmexp.fltget++; + counters_inc(uvmexp_counters, flt_get); gotpages = 1; uoff = (ufi->orig_rvaddr - ufi->entry->start) + ufi->entry->offset; result = uobj->pgops->pgo_get(uobj, uoff, &uobjpage, &gotpages, @@ -1183,7 +1184,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf * * set "pg" to the page we want to map in (uobjpage, usually) */ - uvmexp.flt_obj++; + counters_inc(uvmexp_counters, flt_obj); if (UVM_ET_ISCOPYONWRITE(ufi->entry)) flt->enter_prot &= ~PROT_WRITE; pg = uobjpage; /* map in the actual object */ @@ -1235,10 +1236,10 @@ uvm_fault_lower(struct uvm_faultinfo *uf /* unlock and fail ... 
*/ uvmfault_unlockall(ufi, amap, uobj); if (anon == NULL) - uvmexp.fltnoanon++; + counters_inc(uvmexp_counters, flt_noanon); else { uvm_anfree(anon); - uvmexp.fltnoram++; + counters_inc(uvmexp_counters, flt_noram); } if (uvm_swapisfull()) @@ -1254,7 +1255,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf /* fill in the data */ if (uobjpage != PGO_DONTCARE) { - uvmexp.flt_prcopy++; + counters_inc(uvmexp_counters, flt_prcopy); /* copy page [pg now dirty] */ uvm_pagecopy(uobjpage, pg); @@ -1277,7 +1278,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf uvm_unlock_pageq(); uobj = NULL; } else { - uvmexp.flt_przero++; + counters_inc(uvmexp_counters, flt_przero); /* * Page is zero'd and marked dirty by uvm_pagealloc() * above. @@ -1288,7 +1289,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf ufi->orig_rvaddr - ufi->entry->start, anon, 0)) { uvmfault_unlockall(ufi, amap, NULL); uvm_anfree(anon); - uvmexp.fltnoamap++; + counters_inc(uvmexp_counters, flt_noamap); if (uvm_swapisfull()) return (ENOMEM); @@ -1580,7 +1581,7 @@ uvmfault_relock(struct uvm_faultinfo *uf return TRUE; } - uvmexp.fltrelck++; + counters_inc(uvmexp_counters, flt_relck); /* * relock map. fail if version mismatch (in which case nothing @@ -1592,6 +1593,6 @@ uvmfault_relock(struct uvm_faultinfo *uf return(FALSE); } - uvmexp.fltrelckok++; + counters_inc(uvmexp_counters, flt_relckok); return(TRUE); /* got it! 
*/ } Index: uvm/uvm_init.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_init.c,v retrieving revision 1.40 diff -u -p -r1.40 uvm_init.c --- uvm/uvm_init.c 11 May 2017 00:42:05 -0000 1.40 +++ uvm/uvm_init.c 21 Dec 2020 19:37:13 -0000 @@ -35,6 +35,7 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> +#include <sys/percpu.h> #include <sys/resourcevar.h> #include <sys/mman.h> #include <sys/malloc.h> @@ -52,6 +53,9 @@ struct uvm uvm; /* decl */ struct uvmexp uvmexp; /* decl */ +COUNTERS_BOOT_MEMORY(uvmexp_countersboot, exp_ncounters); +struct cpumem *uvmexp_counters = COUNTERS_BOOT_INITIALIZER(uvmexp_countersboot); + #if defined(VM_MIN_KERNEL_ADDRESS) vaddr_t vm_min_kernel_address = VM_MIN_KERNEL_ADDRESS; #else @@ -184,4 +188,10 @@ uvm_init(void) uaddr_bestfit_create(vm_map_min(kmem_map), vm_map_max(kmem_map))); #endif /* !SMALL_KERNEL */ +} + +void +uvm_init_percpu(void) +{ + uvmexp_counters = counters_alloc_ncpus(uvmexp_counters, exp_ncounters); } Index: uvm/uvm_meter.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_meter.c,v retrieving revision 1.41 diff -u -p -r1.41 uvm_meter.c --- uvm/uvm_meter.c 24 Jun 2020 22:03:45 -0000 1.41 +++ uvm/uvm_meter.c 23 Dec 2020 13:26:40 -0000 @@ -39,6 +39,7 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/percpu.h> #include <sys/proc.h> #include <sys/sysctl.h> #include <sys/vmmeter.h> @@ -78,6 +79,7 @@ static fixpt_t cexp[3] = { static void uvm_loadav(struct loadavg *); void uvm_total(struct vmtotal *); +void uvmexp_read(struct uvmexp *); /* * uvm_meter: calculate load average and wake up the swapper (if needed) @@ -151,6 +153,7 @@ uvm_sysctl(int *name, u_int namelen, voi { struct process *pr = p->p_p; struct vmtotal vmtotals; + struct uvmexp uexp; int rv, t; switch (name[0]) { @@ -179,8 +182,9 @@ uvm_sysctl(int *name, u_int namelen, voi sizeof(vmtotals))); case 
VM_UVMEXP: - return (sysctl_rdstruct(oldp, oldlenp, newp, &uvmexp, - sizeof(uvmexp))); + uvmexp_read(&uexp); + return (sysctl_rdstruct(oldp, oldlenp, newp, &uexp, + sizeof(uexp))); case VM_NKMEMPAGES: return (sysctl_rdint(oldp, oldlenp, newp, nkmempages)); @@ -315,6 +319,41 @@ uvm_total(struct vmtotal *totalp) totalp->t_armshr = 0; /* XXX */ } +void +uvmexp_read(struct uvmexp *uexp) +{ + uint64_t counters[exp_ncounters]; + + memcpy(uexp, &uvmexp, sizeof(*uexp)); + + counters_read(uvmexp_counters, counters, exp_ncounters); + + /* stat counters */ + uexp->faults = (int)counters[faults]; + uexp->pageins = (int)counters[pageins]; + + /* fault subcounters */ + uexp->fltnoram = (int)counters[flt_noram]; + uexp->fltnoanon = (int)counters[flt_noanon]; + uexp->fltnoamap = (int)counters[flt_noamap]; + uexp->fltpgwait = (int)counters[flt_pgwait]; + uexp->fltpgrele = (int)counters[flt_pgrele]; + uexp->fltrelck = (int)counters[flt_relck]; + uexp->fltrelckok = (int)counters[flt_relckok]; + uexp->fltanget = (int)counters[flt_anget]; + uexp->fltanretry = (int)counters[flt_anretry]; + uexp->fltamcopy = (int)counters[flt_amcopy]; + uexp->fltnamap = (int)counters[flt_namap]; + uexp->fltnomap = (int)counters[flt_nomap]; + uexp->fltlget = (int)counters[flt_lget]; + uexp->fltget = (int)counters[flt_get]; + uexp->flt_anon = (int)counters[flt_anon]; + uexp->flt_acow = (int)counters[flt_acow]; + uexp->flt_obj = (int)counters[flt_obj]; + uexp->flt_prcopy = (int)counters[flt_prcopy]; + uexp->flt_przero = (int)counters[flt_przero]; +} + #ifdef DDB /* @@ -323,51 +362,54 @@ uvm_total(struct vmtotal *totalp) void uvmexp_print(int (*pr)(const char *, ...)) { + struct uvmexp uexp; + + uvmexp_read(&uexp); (*pr)("Current UVM status:\n"); (*pr)(" pagesize=%d (0x%x), pagemask=0x%x, pageshift=%d\n", - uvmexp.pagesize, uvmexp.pagesize, uvmexp.pagemask, - uvmexp.pageshift); + uexp.pagesize, uexp.pagesize, uexp.pagemask, + uexp.pageshift); (*pr)(" %d VM pages: %d active, %d inactive, %d wired, %d free (%d 
zero)\n", - uvmexp.npages, uvmexp.active, uvmexp.inactive, uvmexp.wired, - uvmexp.free, uvmexp.zeropages); + uexp.npages, uexp.active, uexp.inactive, uexp.wired, + uexp.free, uexp.zeropages); (*pr)(" min %d%% (%d) anon, %d%% (%d) vnode, %d%% (%d) vtext\n", - uvmexp.anonminpct, uvmexp.anonmin, uvmexp.vnodeminpct, - uvmexp.vnodemin, uvmexp.vtextminpct, uvmexp.vtextmin); + uexp.anonminpct, uexp.anonmin, uexp.vnodeminpct, + uexp.vnodemin, uexp.vtextminpct, uexp.vtextmin); (*pr)(" freemin=%d, free-target=%d, inactive-target=%d, " - "wired-max=%d\n", uvmexp.freemin, uvmexp.freetarg, uvmexp.inactarg, - uvmexp.wiredmax); + "wired-max=%d\n", uexp.freemin, uexp.freetarg, uexp.inactarg, + uexp.wiredmax); (*pr)(" faults=%d, traps=%d, intrs=%d, ctxswitch=%d fpuswitch=%d\n", - uvmexp.faults, uvmexp.traps, uvmexp.intrs, uvmexp.swtch, - uvmexp.fpswtch); + uexp.faults, uexp.traps, uexp.intrs, uexp.swtch, + uexp.fpswtch); (*pr)(" softint=%d, syscalls=%d, kmapent=%d\n", - uvmexp.softs, uvmexp.syscalls, uvmexp.kmapent); + uexp.softs, uexp.syscalls, uexp.kmapent); (*pr)(" fault counts:\n"); (*pr)(" noram=%d, noanon=%d, noamap=%d, pgwait=%d, pgrele=%d\n", - uvmexp.fltnoram, uvmexp.fltnoanon, uvmexp.fltnoamap, - uvmexp.fltpgwait, uvmexp.fltpgrele); + uexp.fltnoram, uexp.fltnoanon, uexp.fltnoamap, + uexp.fltpgwait, uexp.fltpgrele); (*pr)(" ok relocks(total)=%d(%d), anget(retries)=%d(%d), " - "amapcopy=%d\n", uvmexp.fltrelckok, uvmexp.fltrelck, - uvmexp.fltanget, uvmexp.fltanretry, uvmexp.fltamcopy); + "amapcopy=%d\n", uexp.fltrelckok, uexp.fltrelck, + uexp.fltanget, uexp.fltanretry, uexp.fltamcopy); (*pr)(" neighbor anon/obj pg=%d/%d, gets(lock/unlock)=%d/%d\n", - uvmexp.fltnamap, uvmexp.fltnomap, uvmexp.fltlget, uvmexp.fltget); + uexp.fltnamap, uexp.fltnomap, uexp.fltlget, uexp.fltget); (*pr)(" cases: anon=%d, anoncow=%d, obj=%d, prcopy=%d, przero=%d\n", - uvmexp.flt_anon, uvmexp.flt_acow, uvmexp.flt_obj, uvmexp.flt_prcopy, - uvmexp.flt_przero); + uexp.flt_anon, uexp.flt_acow, 
uexp.flt_obj, uexp.flt_prcopy, + uexp.flt_przero); (*pr)(" daemon and swap counts:\n"); (*pr)(" woke=%d, revs=%d, scans=%d, obscans=%d, anscans=%d\n", - uvmexp.pdwoke, uvmexp.pdrevs, uvmexp.pdscans, uvmexp.pdobscan, - uvmexp.pdanscan); + uexp.pdwoke, uexp.pdrevs, uexp.pdscans, uexp.pdobscan, + uexp.pdanscan); (*pr)(" busy=%d, freed=%d, reactivate=%d, deactivate=%d\n", - uvmexp.pdbusy, uvmexp.pdfreed, uvmexp.pdreact, uvmexp.pddeact); - (*pr)(" pageouts=%d, pending=%d, nswget=%d\n", uvmexp.pdpageouts, - uvmexp.pdpending, uvmexp.nswget); + uexp.pdbusy, uexp.pdfreed, uexp.pdreact, uexp.pddeact); + (*pr)(" pageouts=%d, pending=%d, nswget=%d\n", uexp.pdpageouts, + uexp.pdpending, uexp.nswget); (*pr)(" nswapdev=%d\n", - uvmexp.nswapdev); + uexp.nswapdev); (*pr)(" swpages=%d, swpginuse=%d, swpgonly=%d paging=%d\n", - uvmexp.swpages, uvmexp.swpginuse, uvmexp.swpgonly, uvmexp.paging); + uexp.swpages, uexp.swpginuse, uexp.swpgonly, uexp.paging); (*pr)(" kernel pointers:\n"); (*pr)(" objs(kern)=%p\n", uvm.kernel_object); Index: uvm/uvmexp.h =================================================================== RCS file: /cvs/src/sys/uvm/uvmexp.h,v retrieving revision 1.7 diff -u -p -r1.7 uvmexp.h --- uvm/uvmexp.h 14 Dec 2020 13:29:18 -0000 1.7 +++ uvm/uvmexp.h 21 Dec 2020 19:37:13 -0000 @@ -156,4 +156,41 @@ struct _ps_strings { void *val; }; +#ifdef _KERNEL + +/* + * Per-cpu UVM counters. 
+ */ +extern struct cpumem *uvmexp_counters; + +enum uvm_exp_counters { + /* stat counters */ + faults, /* page fault count */ + pageins, /* pagein operation count */ + + /* fault subcounters */ + flt_noram, /* number of times fault was out of ram */ + flt_noanon, /* number of times fault was out of anons */ + flt_noamap, /* number of times fault was out of amap chunks */ + flt_pgwait, /* number of times fault had to wait on a page */ + flt_pgrele, /* number of times fault found a released page */ + flt_relck, /* number of times fault relock called */ + flt_relckok, /* number of times fault relock is a success */ + flt_anget, /* number of times fault gets anon page */ + flt_anretry, /* number of times fault retrys an anon get */ + flt_amcopy, /* number of times fault clears "needs copy" */ + flt_namap, /* number of times fault maps a neighbor anon page */ + flt_nomap, /* number of times fault maps a neighbor obj page */ + flt_lget, /* number of times fault does a locked pgo_get */ + flt_get, /* number of times fault does an unlocked get */ + flt_anon, /* number of times fault anon (case 1a) */ + flt_acow, /* number of times fault anon cow (case 1b) */ + flt_obj, /* number of times fault is on object page (2a) */ + flt_prcopy, /* number of times fault promotes with copy (2b) */ + flt_przero, /* number of times fault promotes with zerofill (2b) */ + + exp_ncounters +}; + +#endif /* _KERNEL */ #endif /*_UVM_UVMEXP_ */