On Tue, Jun 19, 2018 at 03:58:51PM +0200, Mark Kettenis wrote:
> > Date: Tue, 19 Jun 2018 15:38:01 +0200
> > From: Martin Pieuchot <m...@openbsd.org>
> > 
> > On 19/06/18(Tue) 14:55, Mark Kettenis wrote:
> > > > To avoid races with another thread that might be clearing our pointer
> > > > in `fd_ofiles', we need more than atomic operations.  For that we need
> > > > to serialize the threads.  The most simple way to do so is with a mutex
> > > > on a different data structure.  Either global, like in my diff, or as
> > > > visa@ suggested in the corresponding `fdp'.
> > > 
> > > Right.  Another case of trying to take a reference without holding one
> > > already.  The trick here is to use the fact that as long as there is a
> > > file descriptor allocated for this open file the reference count is at
> > > least 1.  So in that case taking a reference is safe.  Your global
> > > mutex does indeed do the trick.  But why aren't you using the file
> > > descriptor table rwlock for this purpose?  Is that because there is a
> > > lock ordering problem between the kernel lock and that rwlock?
> > 
> > I have two reasons.  First I don't want to introduce new sleeping points,
> > secondly I want this mutex to disappear.  IMHO extending the scope of a
> > lock is going in the wrong direction because then we'll want to split it.
> > That's why I'm happy that our discussion made visa@ look at improving this.
> 
> I wouldn't call this extending the scope of the lock.  But yes, it
> might cause unnecessary sleeps if a write lock is held for the purpose
> of opening a file.  The mutex that visa@ proposes trades that in for
> potential contion in the multiple-readers case.

Below is a revised version of the diff. Like mpi@'s diff, it uses
a `fhdlk' mutex for serializing access to the file list. However, that
lock is not used for anything else. Each file descriptor table has a
dedicated mutex that allows fd_getfile() acquire a file reference
safely.

fd_iterfile() uses a compare-and-swap sequence for reference
acquisition. The sequence ensures that once `f_count' has become zero
the value is never raised.

This diff does not unlock any system calls.


Index: kern/kern_descrip.c
===================================================================
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.166
diff -u -p -r1.166 kern_descrip.c
--- kern/kern_descrip.c 18 Jun 2018 09:15:05 -0000      1.166
+++ kern/kern_descrip.c 19 Jun 2018 15:13:05 -0000
@@ -66,7 +66,11 @@
 
 /*
  * Descriptor management.
+ *
+ * We need to block interrupts as long as `fhdlk' is being taken
+ * with and without the KERNEL_LOCK().
  */
+struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR);
 struct filelist filehead;      /* head of list of open files */
 int numfiles;                  /* actual number of open files */
 
@@ -194,17 +198,25 @@ struct file *
 fd_iterfile(struct file *fp, struct proc *p)
 {
        struct file *nfp;
+       unsigned int count;
 
+       mtx_enter(&fhdlk);
        if (fp == NULL)
                nfp = LIST_FIRST(&filehead);
        else
                nfp = LIST_NEXT(fp, f_list);
 
        /* don't FREF when f_count == 0 to avoid race in fdrop() */
-       while (nfp != NULL && nfp->f_count == 0)
-               nfp = LIST_NEXT(nfp, f_list);
-       if (nfp != NULL)
-               FREF(nfp);
+       while (nfp != NULL) {
+               count = nfp->f_count;
+               if (count == 0) {
+                       nfp = LIST_NEXT(nfp, f_list);
+                       continue;
+               }
+               if (atomic_cas_uint(&nfp->f_count, count, count + 1) == count)
+                       break;
+       }
+       mtx_leave(&fhdlk);
 
        if (fp != NULL)
                FRELE(fp, p);
@@ -217,10 +229,17 @@ fd_getfile(struct filedesc *fdp, int fd)
 {
        struct file *fp;
 
-       if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
+       vfs_stall_barrier();
+
+       if ((u_int)fd >= fdp->fd_nfiles)
                return (NULL);
 
-       FREF(fp);
+       mtx_enter(&fdp->fd_fplock);
+       fp = fdp->fd_ofiles[fd];
+       if (fp != NULL)
+               atomic_inc_int(&fp->f_count);
+       mtx_leave(&fdp->fd_fplock);
+
        return (fp);
 }
 
@@ -639,7 +658,7 @@ finishdup(struct proc *p, struct file *f
        fdpassertlocked(fdp);
        KASSERT(fp->f_iflags & FIF_INSERTED);
 
-       if (fp->f_count == LONG_MAX-2) {
+       if (fp->f_count >= FDUP_MAX_COUNT) {
                FRELE(fp, p);
                return (EDEADLK);
        }
@@ -653,8 +672,10 @@ finishdup(struct proc *p, struct file *f
                fd_used(fdp, new);
        }
 
+       mtx_enter(&fdp->fd_fplock);
        fdp->fd_ofiles[new] = fp;
        fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
+       mtx_leave(&fdp->fd_fplock);
        *retval = new;
 
        if (oldfp != NULL) {
@@ -671,22 +692,34 @@ fdinsert(struct filedesc *fdp, int fd, i
        struct file *fq;
 
        fdpassertlocked(fdp);
+
+       mtx_enter(&fhdlk);
+       fp->f_iflags |= FIF_INSERTED;
        if ((fq = fdp->fd_ofiles[0]) != NULL) {
                LIST_INSERT_AFTER(fq, fp, f_list);
        } else {
                LIST_INSERT_HEAD(&filehead, fp, f_list);
        }
+       mtx_leave(&fhdlk);
+
        KASSERT(fdp->fd_ofiles[fd] == NULL);
        fdp->fd_ofiles[fd] = fp;
        fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
-       fp->f_iflags |= FIF_INSERTED;
 }
 
 void
 fdremove(struct filedesc *fdp, int fd)
 {
        fdpassertlocked(fdp);
+
+       /*
+        * Use `fd_fplock' to synchronize with fd_getfile() so that
+        * the function no longer creates a new reference to the file.
+        */
+       mtx_enter(&fdp->fd_fplock);
        fdp->fd_ofiles[fd] = NULL;
+       mtx_leave(&fdp->fd_fplock);
+
        fdp->fd_ofileflags[fd] = 0;
        fd_unused(fdp, fd);
 }
@@ -817,6 +850,8 @@ fdalloc(struct proc *p, int want, int *r
        int lim, last, i;
        u_int new, off;
 
+       fdpassertlocked(fdp);
+
        /*
         * Search for a free descriptor starting at the higher
         * of want or fd_freefile.  If that fails, consider
@@ -864,14 +899,17 @@ void
 fdexpand(struct proc *p)
 {
        struct filedesc *fdp = p->p_fd;
-       int nfiles;
+       int nfiles, onfiles;
        size_t copylen;
-       struct file **newofile;
+       struct file **newofile, **oldofile;
        char *newofileflags;
        u_int *newhimap, *newlomap;
 
        fdpassertlocked(fdp);
 
+       onfiles = fdp->fd_nfiles;
+       oldofile = fdp->fd_ofiles;
+
        /*
         * No space in current array.
         */
@@ -905,9 +943,6 @@ fdexpand(struct proc *p)
        memcpy(newofileflags, fdp->fd_ofileflags, copylen);
        memset(newofileflags + copylen, 0, nfiles * sizeof(char) - copylen);
 
-       if (fdp->fd_nfiles > NDFILE)
-               free(fdp->fd_ofiles, M_FILEDESC, fdp->fd_nfiles * OFILESIZE);
-
        if (NDHISLOTS(nfiles) > NDHISLOTS(fdp->fd_nfiles)) {
                copylen = NDHISLOTS(fdp->fd_nfiles) * sizeof(u_int);
                memcpy(newhimap, fdp->fd_himap, copylen);
@@ -928,9 +963,17 @@ fdexpand(struct proc *p)
                fdp->fd_himap = newhimap;
                fdp->fd_lomap = newlomap;
        }
+
+       mtx_enter(&fdp->fd_fplock);
        fdp->fd_ofiles = newofile;
        fdp->fd_ofileflags = newofileflags;
-       fdp->fd_nfiles = nfiles;        
+       /* Ensure the new arrays are in use when the size is updated. */
+       membar_producer();
+       fdp->fd_nfiles = nfiles;
+       mtx_leave(&fdp->fd_fplock);
+
+       if (onfiles > NDFILE)
+               free(oldofile, M_FILEDESC, onfiles * OFILESIZE);
 }
 
 /*
@@ -966,7 +1009,7 @@ restart:
         * of open files at that point, otherwise put it at the front of
         * the list of open files.
         */
-       numfiles++;
+       atomic_inc_int(&numfiles);
        fp = pool_get(&file_pool, PR_WAITOK|PR_ZERO);
        /*
         * We need to block interrupts as long as `f_mtx' is being taken
@@ -992,6 +1035,11 @@ fdinit(void)
 
        newfdp = pool_get(&fdesc_pool, PR_WAITOK|PR_ZERO);
        rw_init(&newfdp->fd_fd.fd_lock, "fdlock");
+       /*
+        * We need to block interrupts as long as `fd_fplock' is being taken
+        * with and without the KERNEL_LOCK().
+        */
+       mtx_init(&newfdp->fd_fd.fd_fplock, IPL_MPFLOOR);
 
        /* Create the file descriptor table. */
        newfdp->fd_fd.fd_refcnt = 1;
@@ -1081,7 +1129,7 @@ fdcopy(struct process *pr)
                         * tied to the process that opened them to enforce
                         * their internal consistency, so close them here.
                         */
-                       if (fp->f_count == LONG_MAX-2 ||
+                       if (fp->f_count >= FDUP_MAX_COUNT ||
                            fp->f_type == DTYPE_KQUEUE)
                                continue;
 
@@ -1151,9 +1199,9 @@ closef(struct file *fp, struct proc *p)
 
 #ifdef DIAGNOSTIC
        if (fp->f_count < 2)
-               panic("closef: count (%ld) < 2", fp->f_count);
+               panic("closef: count (%u) < 2", fp->f_count);
 #endif
-       fp->f_count--;
+       atomic_dec_int(&fp->f_count);
 
        /*
         * POSIX record locking dictates that any close releases ALL
@@ -1187,7 +1235,7 @@ fdrop(struct file *fp, struct proc *p)
 
 #ifdef DIAGNOSTIC
        if (fp->f_count != 0)
-               panic("fdrop: count (%ld) != 0", fp->f_count);
+               panic("fdrop: count (%u) != 0", fp->f_count);
 #endif
 
        if (fp->f_ops)
@@ -1195,10 +1243,13 @@ fdrop(struct file *fp, struct proc *p)
        else
                error = 0;
 
+       mtx_enter(&fhdlk);
        if (fp->f_iflags & FIF_INSERTED)
                LIST_REMOVE(fp, f_list);
+       mtx_leave(&fhdlk);
+
        crfree(fp->f_cred);
-       numfiles--;
+       atomic_dec_int(&numfiles);
        pool_put(&file_pool, fp);
 
        return (error);
@@ -1324,7 +1375,7 @@ dupfdopen(struct proc *p, int indx, int 
                FRELE(wfp, p);
                return (EACCES);
        }
-       if (wfp->f_count == LONG_MAX-2) {
+       if (wfp->f_count >= FDUP_MAX_COUNT) {
                FRELE(wfp, p);
                return (EDEADLK);
        }
Index: kern/uipc_usrreq.c
===================================================================
RCS file: src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.129
diff -u -p -r1.129 uipc_usrreq.c
--- kern/uipc_usrreq.c  11 Jun 2018 08:57:35 -0000      1.129
+++ kern/uipc_usrreq.c  19 Jun 2018 15:13:06 -0000
@@ -845,7 +845,7 @@ morespace:
                        error = EBADF;
                        goto fail;
                }
-               if (fp->f_count == LONG_MAX-2) {
+               if (fp->f_count >= FDUP_MAX_COUNT) {
                        error = EDEADLK;
                        goto fail;
                }
Index: sys/file.h
===================================================================
RCS file: src/sys/sys/file.h,v
retrieving revision 1.48
diff -u -p -r1.48 file.h
--- sys/file.h  18 Jun 2018 09:15:05 -0000      1.48
+++ sys/file.h  19 Jun 2018 15:13:06 -0000
@@ -77,7 +77,7 @@ struct file {
 #define        DTYPE_PIPE      3       /* pipe */
 #define        DTYPE_KQUEUE    4       /* event queue */
        short   f_type;         /* [I] descriptor type */
-       long    f_count;        /* [k] reference count */
+       u_int   f_count;        /*     reference count */
        struct  ucred *f_cred;  /* [I] credentials associated with descriptor */
        struct  fileops *f_ops; /* [I] file operation pointers */
        off_t   f_offset;       /* [k] */
@@ -94,12 +94,11 @@ struct file {
 #define FIF_INSERTED           0x80    /* present in `filehead' */
 
 #define FREF(fp) \
-       do { \
-               extern void vfs_stall_barrier(void); \
-               vfs_stall_barrier(); \
-               (fp)->f_count++; \
-       } while (0)
-#define FRELE(fp,p)    (--(fp)->f_count == 0 ? fdrop(fp, p) : 0)
+       atomic_inc_int(&(fp)->f_count)
+#define FRELE(fp,p) \
+       (atomic_dec_int_nv(&fp->f_count) == 0 ? fdrop(fp, p) : 0)
+
+#define FDUP_MAX_COUNT         (UINT_MAX - 2 * MAXCPUS)
 
 int    fdrop(struct file *, struct proc *);
 
Index: sys/filedesc.h
===================================================================
RCS file: src/sys/sys/filedesc.h,v
retrieving revision 1.39
diff -u -p -r1.39 filedesc.h
--- sys/filedesc.h      18 Jun 2018 09:15:05 -0000      1.39
+++ sys/filedesc.h      19 Jun 2018 15:13:06 -0000
@@ -32,6 +32,7 @@
  *     @(#)filedesc.h  8.1 (Berkeley) 6/2/93
  */
 
+#include <sys/mutex.h>
 #include <sys/rwlock.h>
 /*
  * This structure is used for the management of descriptors.  It may be
@@ -72,6 +73,8 @@ struct filedesc {
        struct rwlock fd_lock;          /* lock for the file descs; must be */
                                        /* held when writing to fd_ofiles, */
                                        /* fd_ofileflags, or fd_{hi,lo}map */
+       struct mutex fd_fplock;         /* lock for reading fd_ofiles without
+                                        * fd_lock */
 
        int fd_flags;                   /* flags on the file descriptor table */
 };

Reply via email to