On 16/01/22(Sun) 15:35, Martin Pieuchot wrote:
> On 30/12/21(Thu) 23:38, Theo Buehler wrote:
> > The diff below does two things: it adds a uvm_swap_data_lock mutex and
> > trades it for the KERNEL_LOCK in uvm_swapisfull() and uvm_swap_markbad()
> 
> Why is it enough?  Which fields is the lock protecting in these
> functions?  Is it `uvmexp.swpages', could that be documented?

It is documented in the diff below.

> 
> What about `nswapdev'?  Why is the rwlock grabbed before reading it in
> sys_swapctl()?

Because it is always modified with the lock, I added some documentation.

> What about `swpginuse'?

This is still under KERNEL_LOCK(), documented below.

> If the mutex/rwlock are used to protect the global `swap_priority' could
> that be also documented?  Once this is documented it should be trivial to
> see that some places are missing some locking.  Is it intentional?
> 
> > The uvm_swap_data_lock protects all swap data structures, so needs to be
> > grabbed a few times, many of them already documented in the comments.
> > 
> > For review, I suggest comparing to what NetBSD did and also going
> > through the consumers (swaplist_insert, swaplist_find, swaplist_trim)
> > and check that they are properly locked when called, or that there is
> > the KERNEL_LOCK() in place when swap data structures are manipulated.
> 
> I'd suggest using the KASSERT(rw_write_held()) idiom to further reduce
> the differences with NetBSD.

Done.

> > In swapmount() I introduced locking since that's needed to be able to
> > assert that the proper locks are held in swaplist_{insert,find,trim}.
> 
> Could the KERNEL_LOCK() in uvm_swap_get() be pushed a bit further down?
> What about `uvmexp.nswget' and `uvmexp.swpgonly' in there?

This has been done as part of another change.  This diff uses an atomic
operation to increase `nswget' in case multiple threads fault on a page
in swap at the same time.

Updated diff below, ok?

Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.163
diff -u -p -r1.163 uvm_swap.c
--- uvm/uvm_swap.c      6 Aug 2022 13:44:04 -0000       1.163
+++ uvm/uvm_swap.c      17 Aug 2022 11:46:20 -0000
@@ -45,6 +45,7 @@
 #include <sys/extent.h>
 #include <sys/blist.h>
 #include <sys/mount.h>
+#include <sys/mutex.h>
 #include <sys/pool.h>
 #include <sys/syscallargs.h>
 #include <sys/swap.h>
@@ -84,13 +85,16 @@
  * the system maintains a global data structure describing all swap
  * partitions/files.   there is a sorted LIST of "swappri" structures
  * which describe "swapdev"'s at that priority.   this LIST is headed
- * by the "swap_priority" global var.    each "swappri" contains a 
+ * by the "swap_priority" global var.    each "swappri" contains a
  * TAILQ of "swapdev" structures at that priority.
  *
  * locking:
  *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
  *    system call and prevents the swap priority list from changing
  *    while we are in the middle of a system call (e.g. SWAP_STATS).
+ *  - uvm_swap_data_lock (mutex): this lock protects all swap data
+ *    structures including the priority list, the swapdev structures,
+ *    and the swapmap arena.
  *
  * each swap device has the following info:
  *  - swap device in use (could be disabled, preventing future use)
@@ -106,7 +110,7 @@
  * userland controls and configures swap with the swapctl(2) system call.
  * the sys_swapctl performs the following operations:
  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
- *  [2] SWAP_STATS: given a pointer to an array of swapent structures 
+ *  [2] SWAP_STATS: given a pointer to an array of swapent structures
  *     (passed in via "arg") of a size passed in via "misc" ... we load
  *     the current swap config into the array.
  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
@@ -208,9 +212,10 @@ struct extent *swapmap;            /* controls the
 
 /* list of all active swap devices [by priority] */
 LIST_HEAD(swap_priority, swappri);
-struct swap_priority swap_priority;
+struct swap_priority swap_priority;    /* [S] */
 
 /* locks */
+struct mutex uvm_swap_data_lock = MUTEX_INITIALIZER(IPL_NONE);
 struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
 
 struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM);
@@ -224,7 +229,7 @@ void                 swapdrum_add(struct swapdev *, in
 struct swapdev *swapdrum_getsdp(int);
 
 struct swapdev *swaplist_find(struct vnode *, int);
-void            swaplist_insert(struct swapdev *, 
+void            swaplist_insert(struct swapdev *,
                                     struct swappri *, int);
 void            swaplist_trim(void);
 
@@ -472,16 +477,19 @@ uvm_swap_finicrypt_all(void)
 /*
  * swaplist_insert: insert swap device "sdp" into the global list
  *
- * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
- * => caller must provide a newly malloc'd swappri structure (we will
- *     FREE it if we don't need it... this it to prevent malloc blocking
- *     here while adding swap)
+ * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
+ * => caller must provide a newly allocated swappri structure (we will
+ *     FREE it if we don't need it... this it to prevent allocation
+ *     blocking here while adding swap)
  */
 void
 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
 {
        struct swappri *spp, *pspp;
 
+       KASSERT(rw_write_held(&swap_syscall_lock));
+       MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
+
        /*
         * find entry at or after which to insert the new device.
         */
@@ -523,7 +531,7 @@ swaplist_insert(struct swapdev *sdp, str
  * swaplist_find: find and optionally remove a swap device from the
  *     global list.
  *
- * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
+ * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
  * => we return the swapdev we found (and removed)
  */
 struct swapdev *
@@ -532,6 +540,9 @@ swaplist_find(struct vnode *vp, boolean_
        struct swapdev *sdp;
        struct swappri *spp;
 
+       KASSERT(rw_write_held(&swap_syscall_lock));
+       MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
+
        /*
         * search the lists for the requested vp
         */
@@ -554,13 +565,16 @@ swaplist_find(struct vnode *vp, boolean_
  * swaplist_trim: scan priority list for empty priority entries and kill
  *     them.
  *
- * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
+ * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
  */
 void
 swaplist_trim(void)
 {
        struct swappri *spp, *nextspp;
 
+       KASSERT(rw_write_held(&swap_syscall_lock));
+       MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
+
        LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
                if (!TAILQ_EMPTY(&spp->spi_swapdev))
                        continue;
@@ -573,7 +587,7 @@ swaplist_trim(void)
  * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
  *
  * => caller must hold swap_syscall_lock
- * => uvm.swap_data_lock should be unlocked (we may sleep)
+ * => uvm_swap_data_lock should be unlocked (we may sleep)
  */
 void
 swapdrum_add(struct swapdev *sdp, int npages)
@@ -593,7 +607,7 @@ swapdrum_add(struct swapdev *sdp, int np
  *     to the "swapdev" that maps that section of the drum.
  *
  * => each swapdev takes one big contig chunk of the drum
- * => caller must hold uvm.swap_data_lock
+ * => caller must hold uvm_swap_data_lock
  */
 struct swapdev *
 swapdrum_getsdp(int pgno)
@@ -601,6 +615,8 @@ swapdrum_getsdp(int pgno)
        struct swapdev *sdp;
        struct swappri *spp;
 
+       MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
+
        LIST_FOREACH(spp, &swap_priority, spi_swappri) {
                TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
                        if (pgno >= sdp->swd_drumoffset &&
@@ -662,7 +678,7 @@ sys_swapctl(struct proc *p, void *v, reg
         *
         * note that the swap_priority list can't change as long
         * as we are holding the swap_syscall_lock.  we don't want
-        * to grab the uvm.swap_data_lock because we may fault&sleep during
+        * to grab the uvm_swap_data_lock because we may fault&sleep during
         * copyout() and we don't want to be holding that lock then!
         */
        if (SCARG(uap, cmd) == SWAP_STATS) {
@@ -734,12 +750,14 @@ sys_swapctl(struct proc *p, void *v, reg
                 */
                priority = SCARG(uap, misc);
                spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
+               mtx_enter(&uvm_swap_data_lock);
                if ((sdp = swaplist_find(vp, 1)) == NULL) {
                        error = ENOENT;
                } else {
                        swaplist_insert(sdp, spp, priority);
                        swaplist_trim();
                }
+               mtx_leave(&uvm_swap_data_lock);
                if (error)
                        free(spp, M_VMSWAP, sizeof(*spp));
                break;
@@ -760,11 +778,8 @@ sys_swapctl(struct proc *p, void *v, reg
                 * trying to enable this device while we are working on
                 * it.
                 */
+
                priority = SCARG(uap, misc);
-               if ((sdp = swaplist_find(vp, 0)) != NULL) {
-                       error = EBUSY;
-                       break;
-               }
                sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO);
                spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
                sdp->swd_flags = SWF_FAKE;      /* placeholder only */
@@ -778,7 +793,19 @@ sys_swapctl(struct proc *p, void *v, reg
                        sdp->swd_cred = crdup(p->p_ucred);
                }
 
+               mtx_enter(&uvm_swap_data_lock);
+               if (swaplist_find(vp, 0) != NULL) {
+                       error = EBUSY;
+                       mtx_leave(&uvm_swap_data_lock);
+                       if (vp->v_type == VREG) {
+                               crfree(sdp->swd_cred);
+                       }
+                       free(sdp, M_VMSWAP, sizeof *sdp);
+                       free(spp, M_VMSWAP, sizeof *spp);
+                       break;
+               }
                swaplist_insert(sdp, spp, priority);
+               mtx_leave(&uvm_swap_data_lock);
 
                sdp->swd_pathlen = len;
                sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
@@ -792,8 +819,10 @@ sys_swapctl(struct proc *p, void *v, reg
                 */
 
                if ((error = swap_on(p, sdp)) != 0) {
+                       mtx_enter(&uvm_swap_data_lock);
                        (void) swaplist_find(vp, 1);  /* kill fake entry */
                        swaplist_trim();
+                       mtx_leave(&uvm_swap_data_lock);
                        if (vp->v_type == VREG) {
                                crfree(sdp->swd_cred);
                        }
@@ -803,7 +832,9 @@ sys_swapctl(struct proc *p, void *v, reg
                }
                break;
        case SWAP_OFF:
+               mtx_enter(&uvm_swap_data_lock);
                if ((sdp = swaplist_find(vp, 0)) == NULL) {
+                       mtx_leave(&uvm_swap_data_lock);
                        error = ENXIO;
                        break;
                }
@@ -813,6 +844,7 @@ sys_swapctl(struct proc *p, void *v, reg
                 * can't stop swapping from it (again).
                 */
                if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
+                       mtx_leave(&uvm_swap_data_lock);
                        error = EBUSY;
                        break;
                }
@@ -841,7 +873,7 @@ out:
  *     SWF_FAKE).
  *
  * => we avoid the start of the disk (to protect disk labels)
- * => caller should leave uvm.swap_data_lock unlocked, we may lock it
+ * => caller should leave uvm_swap_data_lock unlocked, we may lock it
  *     if needed.
  */
 int
@@ -991,13 +1023,18 @@ swap_on(struct proc *p, struct swapdev *
        /* now add the new swapdev to the drum and enable. */
        swapdrum_add(sdp, npages);
        sdp->swd_npages = size;
+       mtx_enter(&uvm_swap_data_lock);
        sdp->swd_flags &= ~SWF_FAKE;    /* going live */
        sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
        uvmexp.swpages += size;
+       mtx_leave(&uvm_swap_data_lock);
        return (0);
 
+       /*
+        * failure: clean up and return error.
+        */
+
 bad:
-       /* failure: close device if necessary and return error. */
        if (vp != rootvp)
                (void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
        return (error);
@@ -1011,10 +1048,15 @@ bad:
 int
 swap_off(struct proc *p, struct swapdev *sdp)
 {
+       int npages = sdp->swd_npages;
        int error = 0;
 
+       KASSERT(rw_write_held(&swap_syscall_lock));
+       MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
+
        /* disable the swap area being removed */
        sdp->swd_flags &= ~SWF_ENABLE;
+       mtx_leave(&uvm_swap_data_lock);
 
        /*
         * the idea is to find all the pages that are paged out to this
@@ -1027,15 +1069,16 @@ swap_off(struct proc *p, struct swapdev 
                         sdp->swd_drumoffset + sdp->swd_drumsize) ||
            amap_swap_off(sdp->swd_drumoffset,
                          sdp->swd_drumoffset + sdp->swd_drumsize)) {
-
                error = ENOMEM;
        } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
                error = EBUSY;
        }
 
        if (error) {
+               mtx_enter(&uvm_swap_data_lock);
                sdp->swd_flags |= SWF_ENABLE;
-               return (error);
+               mtx_leave(&uvm_swap_data_lock);
+               return error;
        }
 
        /*
@@ -1051,11 +1094,13 @@ swap_off(struct proc *p, struct swapdev 
                (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
        }
 
-       uvmexp.swpages -= sdp->swd_npages;
+       mtx_enter(&uvm_swap_data_lock);
+       uvmexp.swpages -= npages;
 
        if (swaplist_find(sdp->swd_vp, 1) == NULL)
                panic("swap_off: swapdev not in list");
        swaplist_trim();
+       mtx_leave(&uvm_swap_data_lock);
 
        /*
         * free all resources!
@@ -1089,7 +1134,9 @@ swstrategy(struct buf *bp)
         * in it (i.e. the blocks we are doing I/O on).
         */
        pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT;
+       mtx_enter(&uvm_swap_data_lock);
        sdp = swapdrum_getsdp(pageno);
+       mtx_leave(&uvm_swap_data_lock);
        if (sdp == NULL) {
                bp->b_error = EINVAL;
                bp->b_flags |= B_ERROR;
@@ -1406,7 +1453,7 @@ sw_reg_iodone_internal(void *xvbp)
  *     allocate in a priority we "rotate" the tail queue.
  * => space can be freed with uvm_swap_free
  * => we return the page slot number in /dev/drum (0 == invalid slot)
- * => we lock uvm.swap_data_lock
+ * => we lock uvm_swap_data_lock
  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
  */
 int
@@ -1425,6 +1472,8 @@ uvm_swap_alloc(int *nslots, boolean_t le
         * lock data lock, convert slots into blocks, and enter loop
         */
        KERNEL_ASSERT_LOCKED();
+       mtx_enter(&uvm_swap_data_lock);
+
 ReTry: /* XXXMRG */
        LIST_FOREACH(spp, &swap_priority, spi_swappri) {
                TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
@@ -1448,6 +1497,7 @@ ReTry:    /* XXXMRG */
                        TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
                        sdp->swd_npginuse += *nslots;
                        uvmexp.swpginuse += *nslots;
+                       mtx_leave(&uvm_swap_data_lock);
                        /* done!  return drum slot number */
                        return result + sdp->swd_drumoffset;
                }
@@ -1461,6 +1511,7 @@ ReTry:    /* XXXMRG */
        }
        /* XXXMRG: END HACK */
 
+       mtx_leave(&uvm_swap_data_lock);
        return 0;               /* failed */
 }
 
@@ -1473,10 +1524,10 @@ uvm_swapisfull(void)
 {
        int result;
 
-       KERNEL_LOCK();
+       mtx_enter(&uvm_swap_data_lock);
        KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
        result = (uvmexp.swpgonly == uvmexp.swpages);
-       KERNEL_UNLOCK();
+       mtx_leave(&uvm_swap_data_lock);
 
        return result;
 }
@@ -1484,14 +1535,14 @@ uvm_swapisfull(void)
 /*
  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
  *
- * => we lock uvm.swap_data_lock
+ * => we lock uvm_swap_data_lock
  */
 void
 uvm_swap_markbad(int startslot, int nslots)
 {
        struct swapdev *sdp;
 
-       KERNEL_LOCK();
+       mtx_enter(&uvm_swap_data_lock);
        sdp = swapdrum_getsdp(startslot);
        if (sdp != NULL) {
                /*
@@ -1502,14 +1553,14 @@ uvm_swap_markbad(int startslot, int nslo
                 */
                sdp->swd_npgbad += nslots;
        }
-       KERNEL_UNLOCK();
+       mtx_leave(&uvm_swap_data_lock);
 }
 
 /*
  * uvm_swap_free: free swap slots
  *
  * => this can be all or part of an allocation made by uvm_swap_alloc
- * => we lock uvm.swap_data_lock
+ * => we lock uvm_swap_data_lock
  */
 void
 uvm_swap_free(int startslot, int nslots)
@@ -1530,6 +1581,7 @@ uvm_swap_free(int startslot, int nslots)
         * lookup and access the extent.
         */
        KERNEL_LOCK();
+       mtx_enter(&uvm_swap_data_lock);
        sdp = swapdrum_getsdp(startslot);
        KASSERT(uvmexp.nswapdev >= 1);
        KASSERT(sdp != NULL);
@@ -1537,6 +1589,8 @@ uvm_swap_free(int startslot, int nslots)
        blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
        sdp->swd_npginuse -= nslots;
        uvmexp.swpginuse -= nslots;
+       mtx_leave(&uvm_swap_data_lock);
+
 #ifdef UVM_SWAP_ENCRYPT
        {
                int i;
@@ -1585,7 +1639,7 @@ uvm_swap_get(struct vm_page *page, int s
 {
        int     result;
 
-       uvmexp.nswget++;
+       atomic_inc_int(&uvmexp.nswget);
        KASSERT(flags & PGO_SYNCIO);
        if (swslot == SWSLOT_BAD) {
                return VM_PAGER_ERROR;
@@ -1672,7 +1726,9 @@ uvm_swap_io(struct vm_page **pps, int st
                 * XXX - does this information stay the same over the whole
                 * execution of this function?
                 */
+               mtx_enter(&uvm_swap_data_lock);
                sdp = swapdrum_getsdp(startslot);
+               mtx_leave(&uvm_swap_data_lock);
        }
 
        /*
@@ -1891,13 +1947,11 @@ swapmount(void)
        char *nam;
        char path[MNAMELEN + 1];
 
-       /*
-        * No locking here since we happen to know that we will just be called
-        * once before any other process has forked.
-        */
        if (swap_dev == NODEV)
                return;
 
+       rw_enter_write(&swap_syscall_lock);
+
 #if defined(NFSCLIENT)
        if (swap_dev == NETDEV) {
                extern struct nfs_diskless nfs_diskless;
@@ -1934,16 +1988,22 @@ gotit:
 
        sdp->swd_vp = vp;
 
+       mtx_enter(&uvm_swap_data_lock);
        swaplist_insert(sdp, spp, 0);
+       mtx_leave(&uvm_swap_data_lock);
 
        if (swap_on(curproc, sdp)) {
+               mtx_enter(&uvm_swap_data_lock);
                swaplist_find(vp, 1);
                swaplist_trim();
                vput(sdp->swd_vp);
+               mtx_leave(&uvm_swap_data_lock);
+               rw_exit_write(&swap_syscall_lock);
                free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
                free(sdp, M_VMSWAP, sizeof(*sdp));
                return;
        }
+       rw_exit_write(&swap_syscall_lock);
 }
 
 #ifdef HIBERNATE
Index: uvm/uvmexp.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvmexp.h,v
retrieving revision 1.9
diff -u -p -r1.9 uvmexp.h
--- uvm/uvmexp.h        4 Mar 2021 09:00:03 -0000       1.9
+++ uvm/uvmexp.h        17 Aug 2022 11:48:17 -0000
@@ -45,6 +45,7 @@
  *     I       immutable after creation
  *     K       kernel lock
  *     F       uvm_lock_fpageq
+ *     S       uvm_swap_data_lock
  */
 struct uvmexp {
        /* vm_page constants */
@@ -80,11 +81,11 @@ struct uvmexp {
        int vnodeminpct;/* min percent vnode pages */
 
        /* swap */
-       int nswapdev;   /* number of configured swap devices in system */
-       int swpages;    /* [K] number of PAGE_SIZE'ed swap pages */
-       int swpginuse;  /* number of swap pages in use */
+       int nswapdev;   /* [S] number of configured swap devices in system */
+       int swpages;    /* [S] number of PAGE_SIZE'ed swap pages */
+       int swpginuse;  /* [K] number of swap pages in use */
        int swpgonly;   /* [a] number of swap pages in use, not also in RAM */
-       int nswget;     /* number of swap pages moved from disk to RAM */
+       int nswget;     /* [a] number of swap pages moved from disk to RAM */
        int nanon;      /* XXX number total of anon's in system */
        int unused05;   /* formerly nanonneeded */
        int unused06;   /* formerly nfreeanon */

Reply via email to