On 16/01/22(Sun) 15:35, Martin Pieuchot wrote: > On 30/12/21(Thu) 23:38, Theo Buehler wrote: > > The diff below does two things: it adds a uvm_swap_data_lock mutex and > > trades it for the KERNEL_LOCK in uvm_swapisfull() and uvm_swap_markbad() > Why is it enough? Which fields is the lock protecting in these > functions? Is it `uvmexp.swpages', could that be documented?
It is documented in the diff below. > > What about `nswapdev'? Why is the rwlock grabbed before reading it in > sys_swapctl()? Because it is always modified with the lock, I added some documentation. > What about `swpginuse'? This is still under KERNEL_LOCK(), documented below. > If the mutex/rwlock are used to protect the global `swap_priority' could > that be also documented? Once this is documented it should be trivial to > see that some places are missing some locking. Is it intentional? > > > The uvm_swap_data_lock protects all swap data structures, so needs to be > > grabbed a few times, many of them already documented in the comments. > > > > For review, I suggest comparing to what NetBSD did and also going > > through the consumers (swaplist_insert, swaplist_find, swaplist_trim) > > and check that they are properly locked when called, or that there is > > the KERNEL_LOCK() in place when swap data structures are manipulated. > > I'd suggest using the KASSERT(rw_write_held()) idiom to further reduce > the differences with NetBSD. Done. > > In swapmount() I introduced locking since that's needed to be able to > > assert that the proper locks are held in swaplist_{insert,find,trim}. > > Could the KERNEL_LOCK() in uvm_swap_get() be pushed a bit further down? > What about `uvmexp.nswget' and `uvmexp.swpgonly' in there? This has been done as part of another change. This diff uses an atomic operation to increase `nswget' in case multiple threads fault on a page in swap at the same time. Updated diff below, ok? 
Index: uvm/uvm_swap.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_swap.c,v retrieving revision 1.163 diff -u -p -r1.163 uvm_swap.c --- uvm/uvm_swap.c 6 Aug 2022 13:44:04 -0000 1.163 +++ uvm/uvm_swap.c 17 Aug 2022 11:46:20 -0000 @@ -45,6 +45,7 @@ #include <sys/extent.h> #include <sys/blist.h> #include <sys/mount.h> +#include <sys/mutex.h> #include <sys/pool.h> #include <sys/syscallargs.h> #include <sys/swap.h> @@ -84,13 +85,16 @@ * the system maintains a global data structure describing all swap * partitions/files. there is a sorted LIST of "swappri" structures * which describe "swapdev"'s at that priority. this LIST is headed - * by the "swap_priority" global var. each "swappri" contains a + * by the "swap_priority" global var. each "swappri" contains a * TAILQ of "swapdev" structures at that priority. * * locking: * - swap_syscall_lock (sleep lock): this lock serializes the swapctl * system call and prevents the swap priority list from changing * while we are in the middle of a system call (e.g. SWAP_STATS). + * - uvm_swap_data_lock (mutex): this lock protects all swap data + * structures including the priority list, the swapdev structures, + * and the swapmap arena. * * each swap device has the following info: * - swap device in use (could be disabled, preventing future use) @@ -106,7 +110,7 @@ * userland controls and configures swap with the swapctl(2) system call. * the sys_swapctl performs the following operations: * [1] SWAP_NSWAP: returns the number of swap devices currently configured - * [2] SWAP_STATS: given a pointer to an array of swapent structures + * [2] SWAP_STATS: given a pointer to an array of swapent structures * (passed in via "arg") of a size passed in via "misc" ... we load * the current swap config into the array. 
* [3] SWAP_ON: given a pathname in arg (could be device or file) and a @@ -208,9 +212,10 @@ struct extent *swapmap; /* controls the /* list of all active swap devices [by priority] */ LIST_HEAD(swap_priority, swappri); -struct swap_priority swap_priority; +struct swap_priority swap_priority; /* [S] */ /* locks */ +struct mutex uvm_swap_data_lock = MUTEX_INITIALIZER(IPL_NONE); struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk"); struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM); @@ -224,7 +229,7 @@ void swapdrum_add(struct swapdev *, in struct swapdev *swapdrum_getsdp(int); struct swapdev *swaplist_find(struct vnode *, int); -void swaplist_insert(struct swapdev *, +void swaplist_insert(struct swapdev *, struct swappri *, int); void swaplist_trim(void); @@ -472,16 +477,19 @@ uvm_swap_finicrypt_all(void) /* * swaplist_insert: insert swap device "sdp" into the global list * - * => caller must hold both swap_syscall_lock and uvm.swap_data_lock - * => caller must provide a newly malloc'd swappri structure (we will - * FREE it if we don't need it... this it to prevent malloc blocking - * here while adding swap) + * => caller must hold both swap_syscall_lock and uvm_swap_data_lock + * => caller must provide a newly allocated swappri structure (we will + * FREE it if we don't need it... this it to prevent allocation + * blocking here while adding swap) */ void swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) { struct swappri *spp, *pspp; + KASSERT(rw_write_held(&swap_syscall_lock)); + MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock); + /* * find entry at or after which to insert the new device. */ @@ -523,7 +531,7 @@ swaplist_insert(struct swapdev *sdp, str * swaplist_find: find and optionally remove a swap device from the * global list. 
* - * => caller must hold both swap_syscall_lock and uvm.swap_data_lock + * => caller must hold both swap_syscall_lock and uvm_swap_data_lock * => we return the swapdev we found (and removed) */ struct swapdev * @@ -532,6 +540,9 @@ swaplist_find(struct vnode *vp, boolean_ struct swapdev *sdp; struct swappri *spp; + KASSERT(rw_write_held(&swap_syscall_lock)); + MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock); + /* * search the lists for the requested vp */ @@ -554,13 +565,16 @@ swaplist_find(struct vnode *vp, boolean_ * swaplist_trim: scan priority list for empty priority entries and kill * them. * - * => caller must hold both swap_syscall_lock and uvm.swap_data_lock + * => caller must hold both swap_syscall_lock and uvm_swap_data_lock */ void swaplist_trim(void) { struct swappri *spp, *nextspp; + KASSERT(rw_write_held(&swap_syscall_lock)); + MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock); + LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) { if (!TAILQ_EMPTY(&spp->spi_swapdev)) continue; @@ -573,7 +587,7 @@ swaplist_trim(void) * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area. * * => caller must hold swap_syscall_lock - * => uvm.swap_data_lock should be unlocked (we may sleep) + * => uvm_swap_data_lock should be unlocked (we may sleep) */ void swapdrum_add(struct swapdev *sdp, int npages) @@ -593,7 +607,7 @@ swapdrum_add(struct swapdev *sdp, int np * to the "swapdev" that maps that section of the drum. 
* * => each swapdev takes one big contig chunk of the drum - * => caller must hold uvm.swap_data_lock + * => caller must hold uvm_swap_data_lock */ struct swapdev * swapdrum_getsdp(int pgno) @@ -601,6 +615,8 @@ swapdrum_getsdp(int pgno) struct swapdev *sdp; struct swappri *spp; + MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock); + LIST_FOREACH(spp, &swap_priority, spi_swappri) { TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { if (pgno >= sdp->swd_drumoffset && @@ -662,7 +678,7 @@ sys_swapctl(struct proc *p, void *v, reg * * note that the swap_priority list can't change as long * as we are holding the swap_syscall_lock. we don't want - * to grab the uvm.swap_data_lock because we may fault&sleep during + * to grab the uvm_swap_data_lock because we may fault&sleep during * copyout() and we don't want to be holding that lock then! */ if (SCARG(uap, cmd) == SWAP_STATS) { @@ -734,12 +750,14 @@ sys_swapctl(struct proc *p, void *v, reg */ priority = SCARG(uap, misc); spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK); + mtx_enter(&uvm_swap_data_lock); if ((sdp = swaplist_find(vp, 1)) == NULL) { error = ENOENT; } else { swaplist_insert(sdp, spp, priority); swaplist_trim(); } + mtx_leave(&uvm_swap_data_lock); if (error) free(spp, M_VMSWAP, sizeof(*spp)); break; @@ -760,11 +778,8 @@ sys_swapctl(struct proc *p, void *v, reg * trying to enable this device while we are working on * it. 
*/ + priority = SCARG(uap, misc); - if ((sdp = swaplist_find(vp, 0)) != NULL) { - error = EBUSY; - break; - } sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO); spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK); sdp->swd_flags = SWF_FAKE; /* placeholder only */ @@ -778,7 +793,19 @@ sys_swapctl(struct proc *p, void *v, reg sdp->swd_cred = crdup(p->p_ucred); } + mtx_enter(&uvm_swap_data_lock); + if (swaplist_find(vp, 0) != NULL) { + error = EBUSY; + mtx_leave(&uvm_swap_data_lock); + if (vp->v_type == VREG) { + crfree(sdp->swd_cred); + } + free(sdp, M_VMSWAP, sizeof *sdp); + free(spp, M_VMSWAP, sizeof *spp); + break; + } swaplist_insert(sdp, spp, priority); + mtx_leave(&uvm_swap_data_lock); sdp->swd_pathlen = len; sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK); @@ -792,8 +819,10 @@ sys_swapctl(struct proc *p, void *v, reg */ if ((error = swap_on(p, sdp)) != 0) { + mtx_enter(&uvm_swap_data_lock); (void) swaplist_find(vp, 1); /* kill fake entry */ swaplist_trim(); + mtx_leave(&uvm_swap_data_lock); if (vp->v_type == VREG) { crfree(sdp->swd_cred); } @@ -803,7 +832,9 @@ sys_swapctl(struct proc *p, void *v, reg } break; case SWAP_OFF: + mtx_enter(&uvm_swap_data_lock); if ((sdp = swaplist_find(vp, 0)) == NULL) { + mtx_leave(&uvm_swap_data_lock); error = ENXIO; break; } @@ -813,6 +844,7 @@ sys_swapctl(struct proc *p, void *v, reg * can't stop swapping from it (again). */ if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { + mtx_leave(&uvm_swap_data_lock); error = EBUSY; break; } @@ -841,7 +873,7 @@ out: * SWF_FAKE). * * => we avoid the start of the disk (to protect disk labels) - * => caller should leave uvm.swap_data_lock unlocked, we may lock it + * => caller should leave uvm_swap_data_lock unlocked, we may lock it * if needed. */ int @@ -991,13 +1023,18 @@ swap_on(struct proc *p, struct swapdev * /* now add the new swapdev to the drum and enable. 
*/ swapdrum_add(sdp, npages); sdp->swd_npages = size; + mtx_enter(&uvm_swap_data_lock); sdp->swd_flags &= ~SWF_FAKE; /* going live */ sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); uvmexp.swpages += size; + mtx_leave(&uvm_swap_data_lock); return (0); + /* + * failure: clean up and return error. + */ + bad: - /* failure: close device if necessary and return error. */ if (vp != rootvp) (void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p); return (error); @@ -1011,10 +1048,15 @@ bad: int swap_off(struct proc *p, struct swapdev *sdp) { + int npages = sdp->swd_npages; int error = 0; + KASSERT(rw_write_held(&swap_syscall_lock)); + MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock); + /* disable the swap area being removed */ sdp->swd_flags &= ~SWF_ENABLE; + mtx_leave(&uvm_swap_data_lock); /* * the idea is to find all the pages that are paged out to this @@ -1027,15 +1069,16 @@ swap_off(struct proc *p, struct swapdev sdp->swd_drumoffset + sdp->swd_drumsize) || amap_swap_off(sdp->swd_drumoffset, sdp->swd_drumoffset + sdp->swd_drumsize)) { - error = ENOMEM; } else if (sdp->swd_npginuse > sdp->swd_npgbad) { error = EBUSY; } if (error) { + mtx_enter(&uvm_swap_data_lock); sdp->swd_flags |= SWF_ENABLE; - return (error); + mtx_leave(&uvm_swap_data_lock); + return error; } /* @@ -1051,11 +1094,13 @@ swap_off(struct proc *p, struct swapdev (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p); } - uvmexp.swpages -= sdp->swd_npages; + mtx_enter(&uvm_swap_data_lock); + uvmexp.swpages -= npages; if (swaplist_find(sdp->swd_vp, 1) == NULL) panic("swap_off: swapdev not in list"); swaplist_trim(); + mtx_leave(&uvm_swap_data_lock); /* * free all resources! @@ -1089,7 +1134,9 @@ swstrategy(struct buf *bp) * in it (i.e. the blocks we are doing I/O on). 
*/ pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT; + mtx_enter(&uvm_swap_data_lock); sdp = swapdrum_getsdp(pageno); + mtx_leave(&uvm_swap_data_lock); if (sdp == NULL) { bp->b_error = EINVAL; bp->b_flags |= B_ERROR; @@ -1406,7 +1453,7 @@ sw_reg_iodone_internal(void *xvbp) * allocate in a priority we "rotate" the tail queue. * => space can be freed with uvm_swap_free * => we return the page slot number in /dev/drum (0 == invalid slot) - * => we lock uvm.swap_data_lock + * => we lock uvm_swap_data_lock * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM */ int @@ -1425,6 +1472,8 @@ uvm_swap_alloc(int *nslots, boolean_t le * lock data lock, convert slots into blocks, and enter loop */ KERNEL_ASSERT_LOCKED(); + mtx_enter(&uvm_swap_data_lock); + ReTry: /* XXXMRG */ LIST_FOREACH(spp, &swap_priority, spi_swappri) { TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { @@ -1448,6 +1497,7 @@ ReTry: /* XXXMRG */ TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); sdp->swd_npginuse += *nslots; uvmexp.swpginuse += *nslots; + mtx_leave(&uvm_swap_data_lock); /* done! 
return drum slot number */ return result + sdp->swd_drumoffset; } @@ -1461,6 +1511,7 @@ ReTry: /* XXXMRG */ } /* XXXMRG: END HACK */ + mtx_leave(&uvm_swap_data_lock); return 0; /* failed */ } @@ -1473,10 +1524,10 @@ uvm_swapisfull(void) { int result; - KERNEL_LOCK(); + mtx_enter(&uvm_swap_data_lock); KASSERT(uvmexp.swpgonly <= uvmexp.swpages); result = (uvmexp.swpgonly == uvmexp.swpages); - KERNEL_UNLOCK(); + mtx_leave(&uvm_swap_data_lock); return result; } @@ -1484,14 +1535,14 @@ uvm_swapisfull(void) /* * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors * - * => we lock uvm.swap_data_lock + * => we lock uvm_swap_data_lock */ void uvm_swap_markbad(int startslot, int nslots) { struct swapdev *sdp; - KERNEL_LOCK(); + mtx_enter(&uvm_swap_data_lock); sdp = swapdrum_getsdp(startslot); if (sdp != NULL) { /* @@ -1502,14 +1553,14 @@ uvm_swap_markbad(int startslot, int nslo */ sdp->swd_npgbad += nslots; } - KERNEL_UNLOCK(); + mtx_leave(&uvm_swap_data_lock); } /* * uvm_swap_free: free swap slots * * => this can be all or part of an allocation made by uvm_swap_alloc - * => we lock uvm.swap_data_lock + * => we lock uvm_swap_data_lock */ void uvm_swap_free(int startslot, int nslots) @@ -1530,6 +1581,7 @@ uvm_swap_free(int startslot, int nslots) * lookup and access the extent. 
*/ KERNEL_LOCK(); + mtx_enter(&uvm_swap_data_lock); sdp = swapdrum_getsdp(startslot); KASSERT(uvmexp.nswapdev >= 1); KASSERT(sdp != NULL); @@ -1537,6 +1589,8 @@ uvm_swap_free(int startslot, int nslots) blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); sdp->swd_npginuse -= nslots; uvmexp.swpginuse -= nslots; + mtx_leave(&uvm_swap_data_lock); + #ifdef UVM_SWAP_ENCRYPT { int i; @@ -1585,7 +1639,7 @@ uvm_swap_get(struct vm_page *page, int s { int result; - uvmexp.nswget++; + atomic_inc_int(&uvmexp.nswget); KASSERT(flags & PGO_SYNCIO); if (swslot == SWSLOT_BAD) { return VM_PAGER_ERROR; @@ -1672,7 +1726,9 @@ uvm_swap_io(struct vm_page **pps, int st * XXX - does this information stay the same over the whole * execution of this function? */ + mtx_enter(&uvm_swap_data_lock); sdp = swapdrum_getsdp(startslot); + mtx_leave(&uvm_swap_data_lock); } /* @@ -1891,13 +1947,11 @@ swapmount(void) char *nam; char path[MNAMELEN + 1]; - /* - * No locking here since we happen to know that we will just be called - * once before any other process has forked. 
- */ if (swap_dev == NODEV) return; + rw_enter_write(&swap_syscall_lock); + #if defined(NFSCLIENT) if (swap_dev == NETDEV) { extern struct nfs_diskless nfs_diskless; @@ -1934,16 +1988,22 @@ gotit: sdp->swd_vp = vp; + mtx_enter(&uvm_swap_data_lock); swaplist_insert(sdp, spp, 0); + mtx_leave(&uvm_swap_data_lock); if (swap_on(curproc, sdp)) { + mtx_enter(&uvm_swap_data_lock); swaplist_find(vp, 1); swaplist_trim(); vput(sdp->swd_vp); + mtx_leave(&uvm_swap_data_lock); + rw_exit_write(&swap_syscall_lock); free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen); free(sdp, M_VMSWAP, sizeof(*sdp)); return; } + rw_exit_write(&swap_syscall_lock); } #ifdef HIBERNATE Index: uvm/uvmexp.h =================================================================== RCS file: /cvs/src/sys/uvm/uvmexp.h,v retrieving revision 1.9 diff -u -p -r1.9 uvmexp.h --- uvm/uvmexp.h 4 Mar 2021 09:00:03 -0000 1.9 +++ uvm/uvmexp.h 17 Aug 2022 11:48:17 -0000 @@ -45,6 +45,7 @@ * I immutable after creation * K kernel lock * F uvm_lock_fpageq + * S uvm_swap_data_lock */ struct uvmexp { /* vm_page constants */ @@ -80,11 +81,11 @@ struct uvmexp { int vnodeminpct;/* min percent vnode pages */ /* swap */ - int nswapdev; /* number of configured swap devices in system */ - int swpages; /* [K] number of PAGE_SIZE'ed swap pages */ - int swpginuse; /* number of swap pages in use */ + int nswapdev; /* [S] number of configured swap devices in system */ + int swpages; /* [S] number of PAGE_SIZE'ed swap pages */ + int swpginuse; /* [K] number of swap pages in use */ int swpgonly; /* [a] number of swap pages in use, not also in RAM */ - int nswget; /* number of swap pages moved from disk to RAM */ + int nswget; /* [a] number of swap pages moved from disk to RAM */ int nanon; /* XXX number total of anon's in system */ int unused05; /* formerly nanonneeded */ int unused06; /* formerly nfreeanon */