On Tue, Jun 19, 2018 at 03:58:51PM +0200, Mark Kettenis wrote: > > Date: Tue, 19 Jun 2018 15:38:01 +0200 > > From: Martin Pieuchot <m...@openbsd.org> > > > > On 19/06/18(Tue) 14:55, Mark Kettenis wrote: > > > > To avoid races with another thread that might be clearing our pointer > > > > in `fd_ofiles', we need more than atomic operations. For that we need > > > > to serialize the threads. The most simple way to do so is with a mutex > > > > on a different data structure. Either global, like in my diff, or as > > > > visa@ suggested in the corresponding `fdp'. > > > > > > Right. Another case of trying to take a reference without holding one > > > already. The trick here is to use the fact that as long as there is a > > > file descriptor allocated for this open file the reference count is at > > > least 1. So in that case taking a reference is safe. Your global > > > mutex does indeed do the trick. But why aren't you using the file > > > descriptor table rwlock for this purpose? Is that because there is a > > > lock ordering problem between the kernel lock and that rwlock? > > > > I have two reasons. First I don't want to introduce new sleeping points, > > secondly I want this mutex to disappear. IMHO extending the scope of a > > lock is going in the wrong direction because then we'll want to split it. > > That's why I'm happy that our discussion made visa@ look at improving this. > > I wouldn't call this extending the scope of the lock. But yes, it > might cause unnecessary sleeps if a write lock is held for the purpose > of opening a file. The mutex that visa@ proposes trades that in for > potential contion in the multiple-readers case.
Below is a revised version of the diff. Like mpi@'s diff, it uses a `fhdlk' mutex for serializing access to the file list. However, that lock is not used for anything else. Each file descriptor table has a dedicated mutex that allows fd_getfile() acquire a file reference safely. fd_iterfile() uses a compare-and-swap sequence for reference acquisition. The sequence ensures that once `f_count' has become zero the value is never raised. This diff does not unlock any system calls. Index: kern/kern_descrip.c =================================================================== RCS file: src/sys/kern/kern_descrip.c,v retrieving revision 1.166 diff -u -p -r1.166 kern_descrip.c --- kern/kern_descrip.c 18 Jun 2018 09:15:05 -0000 1.166 +++ kern/kern_descrip.c 19 Jun 2018 15:13:05 -0000 @@ -66,7 +66,11 @@ /* * Descriptor management. + * + * We need to block interrupts as long as `fhdlk' is being taken + * with and without the KERNEL_LOCK(). */ +struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR); struct filelist filehead; /* head of list of open files */ int numfiles; /* actual number of open files */ @@ -194,17 +198,25 @@ struct file * fd_iterfile(struct file *fp, struct proc *p) { struct file *nfp; + unsigned int count; + mtx_enter(&fhdlk); if (fp == NULL) nfp = LIST_FIRST(&filehead); else nfp = LIST_NEXT(fp, f_list); /* don't FREF when f_count == 0 to avoid race in fdrop() */ - while (nfp != NULL && nfp->f_count == 0) - nfp = LIST_NEXT(nfp, f_list); - if (nfp != NULL) - FREF(nfp); + while (nfp != NULL) { + count = nfp->f_count; + if (count == 0) { + nfp = LIST_NEXT(nfp, f_list); + continue; + } + if (atomic_cas_uint(&nfp->f_count, count, count + 1) == count) + break; + } + mtx_leave(&fhdlk); if (fp != NULL) FRELE(fp, p); @@ -217,10 +229,17 @@ fd_getfile(struct filedesc *fdp, int fd) { struct file *fp; - if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) + vfs_stall_barrier(); + + if ((u_int)fd >= fdp->fd_nfiles) return (NULL); - FREF(fp); + mtx_enter(&fdp->fd_fplock); + fp = fdp->fd_ofiles[fd]; + if (fp != NULL) + atomic_inc_int(&fp->f_count); + mtx_leave(&fdp->fd_fplock); + return (fp); } @@ -639,7 +658,7 @@ finishdup(struct proc *p, struct file *f fdpassertlocked(fdp); KASSERT(fp->f_iflags & FIF_INSERTED); - if (fp->f_count == LONG_MAX-2) { + if (fp->f_count >= FDUP_MAX_COUNT) { FRELE(fp, p); return (EDEADLK); } @@ -653,8 +672,10 @@ finishdup(struct proc *p, struct file *f fd_used(fdp, new); } + mtx_enter(&fdp->fd_fplock); fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE; + mtx_leave(&fdp->fd_fplock); *retval = new; if (oldfp != NULL) { @@ -671,22 +692,34 @@ fdinsert(struct filedesc *fdp, int fd, i struct file *fq; fdpassertlocked(fdp); + + mtx_enter(&fhdlk); + fp->f_iflags |= FIF_INSERTED; if ((fq = fdp->fd_ofiles[0]) != NULL) { LIST_INSERT_AFTER(fq, fp, f_list); } else { LIST_INSERT_HEAD(&filehead, fp, f_list); } + mtx_leave(&fhdlk); + KASSERT(fdp->fd_ofiles[fd] == NULL); fdp->fd_ofiles[fd] = fp; fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE); - fp->f_iflags |= FIF_INSERTED; } void fdremove(struct filedesc *fdp, int fd) { fdpassertlocked(fdp); + + /* + * Use `fd_fplock' to synchronize with fd_getfile() so that + * the function no longer creates a new reference to the file. + */ + mtx_enter(&fdp->fd_fplock); fdp->fd_ofiles[fd] = NULL; + mtx_leave(&fdp->fd_fplock); + fdp->fd_ofileflags[fd] = 0; fd_unused(fdp, fd); } @@ -817,6 +850,8 @@ fdalloc(struct proc *p, int want, int *r int lim, last, i; u_int new, off; + fdpassertlocked(fdp); + /* * Search for a free descriptor starting at the higher * of want or fd_freefile. If that fails, consider @@ -864,14 +899,17 @@ void fdexpand(struct proc *p) { struct filedesc *fdp = p->p_fd; - int nfiles; + int nfiles, onfiles; size_t copylen; - struct file **newofile; + struct file **newofile, **oldofile; char *newofileflags; u_int *newhimap, *newlomap; fdpassertlocked(fdp); + onfiles = fdp->fd_nfiles; + oldofile = fdp->fd_ofiles; + /* * No space in current array. */ @@ -905,9 +943,6 @@ fdexpand(struct proc *p) memcpy(newofileflags, fdp->fd_ofileflags, copylen); memset(newofileflags + copylen, 0, nfiles * sizeof(char) - copylen); - if (fdp->fd_nfiles > NDFILE) - free(fdp->fd_ofiles, M_FILEDESC, fdp->fd_nfiles * OFILESIZE); - if (NDHISLOTS(nfiles) > NDHISLOTS(fdp->fd_nfiles)) { copylen = NDHISLOTS(fdp->fd_nfiles) * sizeof(u_int); memcpy(newhimap, fdp->fd_himap, copylen); @@ -928,9 +963,17 @@ fdexpand(struct proc *p) fdp->fd_himap = newhimap; fdp->fd_lomap = newlomap; } + + mtx_enter(&fdp->fd_fplock); fdp->fd_ofiles = newofile; fdp->fd_ofileflags = newofileflags; - fdp->fd_nfiles = nfiles; + /* Ensure the new arrays are in use when the size is updated. */ + membar_producer(); + fdp->fd_nfiles = nfiles; + mtx_leave(&fdp->fd_fplock); + + if (onfiles > NDFILE) + free(oldofile, M_FILEDESC, onfiles * OFILESIZE); } /* @@ -966,7 +1009,7 @@ restart: * of open files at that point, otherwise put it at the front of * the list of open files. */ - numfiles++; + atomic_inc_int(&numfiles); fp = pool_get(&file_pool, PR_WAITOK|PR_ZERO); /* * We need to block interrupts as long as `f_mtx' is being taken @@ -992,6 +1035,11 @@ fdinit(void) newfdp = pool_get(&fdesc_pool, PR_WAITOK|PR_ZERO); rw_init(&newfdp->fd_fd.fd_lock, "fdlock"); + /* + * We need to block interrupts as long as `fd_fplock' is being taken + * with and without the KERNEL_LOCK(). + */ + mtx_init(&newfdp->fd_fd.fd_fplock, IPL_MPFLOOR); /* Create the file descriptor table. */ newfdp->fd_fd.fd_refcnt = 1; @@ -1081,7 +1129,7 @@ fdcopy(struct process *pr) * tied to the process that opened them to enforce * their internal consistency, so close them here. */ - if (fp->f_count == LONG_MAX-2 || + if (fp->f_count >= FDUP_MAX_COUNT || fp->f_type == DTYPE_KQUEUE) continue; @@ -1151,9 +1199,9 @@ closef(struct file *fp, struct proc *p) #ifdef DIAGNOSTIC if (fp->f_count < 2) - panic("closef: count (%ld) < 2", fp->f_count); + panic("closef: count (%u) < 2", fp->f_count); #endif - fp->f_count--; + atomic_dec_int(&fp->f_count); /* * POSIX record locking dictates that any close releases ALL @@ -1187,7 +1235,7 @@ fdrop(struct file *fp, struct proc *p) #ifdef DIAGNOSTIC if (fp->f_count != 0) - panic("fdrop: count (%ld) != 0", fp->f_count); + panic("fdrop: count (%u) != 0", fp->f_count); #endif if (fp->f_ops) @@ -1195,10 +1243,13 @@ fdrop(struct file *fp, struct proc *p) else error = 0; + mtx_enter(&fhdlk); if (fp->f_iflags & FIF_INSERTED) LIST_REMOVE(fp, f_list); + mtx_leave(&fhdlk); + crfree(fp->f_cred); - numfiles--; + atomic_dec_int(&numfiles); pool_put(&file_pool, fp); return (error); @@ -1324,7 +1375,7 @@ dupfdopen(struct proc *p, int indx, int FRELE(wfp, p); return (EACCES); } - if (wfp->f_count == LONG_MAX-2) { + if (wfp->f_count >= FDUP_MAX_COUNT) { FRELE(wfp, p); return (EDEADLK); } Index: kern/uipc_usrreq.c =================================================================== RCS file: src/sys/kern/uipc_usrreq.c,v retrieving revision 1.129 diff -u -p -r1.129 uipc_usrreq.c --- kern/uipc_usrreq.c 11 Jun 2018 08:57:35 -0000 1.129 +++ kern/uipc_usrreq.c 19 Jun 2018 15:13:06 -0000 @@ -845,7 +845,7 @@ morespace: error = EBADF; goto fail; } - if (fp->f_count == LONG_MAX-2) { + if (fp->f_count >= FDUP_MAX_COUNT) { error = EDEADLK; goto fail; } Index: sys/file.h =================================================================== RCS file: src/sys/sys/file.h,v retrieving revision 1.48 diff -u -p -r1.48 file.h --- sys/file.h 18 Jun 2018 09:15:05 -0000 1.48 +++ sys/file.h 19 Jun 2018 15:13:06 -0000 @@ -77,7 +77,7 @@ struct file { #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_KQUEUE 4 /* event queue */ short f_type; /* [I] descriptor type */ - long f_count; /* [k] reference count */ + u_int f_count; /* reference count */ struct ucred *f_cred; /* [I] credentials associated with descriptor */ struct fileops *f_ops; /* [I] file operation pointers */ off_t f_offset; /* [k] */ @@ -94,12 +94,11 @@ struct file { #define FIF_INSERTED 0x80 /* present in `filehead' */ #define FREF(fp) \ - do { \ - extern void vfs_stall_barrier(void); \ - vfs_stall_barrier(); \ - (fp)->f_count++; \ - } while (0) -#define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0) + atomic_inc_int(&(fp)->f_count) +#define FRELE(fp,p) \ + (atomic_dec_int_nv(&fp->f_count) == 0 ? fdrop(fp, p) : 0) + +#define FDUP_MAX_COUNT (UINT_MAX - 2 * MAXCPUS) int fdrop(struct file *, struct proc *); Index: sys/filedesc.h =================================================================== RCS file: src/sys/sys/filedesc.h,v retrieving revision 1.39 diff -u -p -r1.39 filedesc.h --- sys/filedesc.h 18 Jun 2018 09:15:05 -0000 1.39 +++ sys/filedesc.h 19 Jun 2018 15:13:06 -0000 @@ -32,6 +32,7 @@ * @(#)filedesc.h 8.1 (Berkeley) 6/2/93 */ +#include <sys/mutex.h> #include <sys/rwlock.h> /* * This structure is used for the management of descriptors. It may be @@ -72,6 +73,8 @@ struct filedesc { struct rwlock fd_lock; /* lock for the file descs; must be */ /* held when writing to fd_ofiles, */ /* fd_ofileflags, or fd_{hi,lo}map */ + struct mutex fd_fplock; /* lock for reading fd_ofiles without + * fd_lock */ int fd_flags; /* flags on the file descriptor table */ };