On Tue, Jun 19, 2018 at 03:58:51PM +0200, Mark Kettenis wrote:
> > Date: Tue, 19 Jun 2018 15:38:01 +0200
> > From: Martin Pieuchot <[email protected]>
> >
> > On 19/06/18(Tue) 14:55, Mark Kettenis wrote:
> > > > To avoid races with another thread that might be clearing our pointer
> > > > in `fd_ofiles', we need more than atomic operations. For that we need
> > > > to serialize the threads. The most simple way to do so is with a mutex
> > > > on a different data structure. Either global, like in my diff, or as
> > > > visa@ suggested in the corresponding `fdp'.
> > >
> > > Right. Another case of trying to take a reference without holding one
> > > already. The trick here is to use the fact that as long as there is a
> > > file descriptor allocated for this open file the reference count is at
> > > least 1. So in that case taking a reference is safe. Your global
> > > mutex does indeed do the trick. But why aren't you using the file
> > > descriptor table rwlock for this purpose? Is that because there is a
> > > lock ordering problem between the kernel lock and that rwlock?
> >
> > I have two reasons. First I don't want to introduce new sleeping points,
> > secondly I want this mutex to disappear. IMHO extending the scope of a
> > lock is going in the wrong direction because then we'll want to split it.
> > That's why I'm happy that our discussion made visa@ look at improving this.
>
> I wouldn't call this extending the scope of the lock. But yes, it
> might cause unnecessary sleeps if a write lock is held for the purpose
> of opening a file. The mutex that visa@ proposes trades that in for
> potential contion in the multiple-readers case.
Below is a revised version of the diff. Like mpi@'s diff, it uses
a `fhdlk' mutex for serializing access to the file list. However, that
lock is not used for anything else. Each file descriptor table has a
dedicated mutex that allows fd_getfile() acquire a file reference
safely.
fd_iterfile() uses a compare-and-swap sequence for reference
acquisition. The sequence ensures that once `f_count' has become zero
the value is never raised.
This diff does not unlock any system calls.
Index: kern/kern_descrip.c
===================================================================
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.166
diff -u -p -r1.166 kern_descrip.c
--- kern/kern_descrip.c 18 Jun 2018 09:15:05 -0000 1.166
+++ kern/kern_descrip.c 19 Jun 2018 15:13:05 -0000
@@ -66,7 +66,11 @@
/*
* Descriptor management.
+ *
+ * We need to block interrupts as long as `fhdlk' is being taken
+ * with and without the KERNEL_LOCK().
*/
+struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR);
struct filelist filehead; /* head of list of open files */
int numfiles; /* actual number of open files */
@@ -194,17 +198,25 @@ struct file *
fd_iterfile(struct file *fp, struct proc *p)
{
struct file *nfp;
+ unsigned int count;
+ mtx_enter(&fhdlk);
if (fp == NULL)
nfp = LIST_FIRST(&filehead);
else
nfp = LIST_NEXT(fp, f_list);
/* don't FREF when f_count == 0 to avoid race in fdrop() */
- while (nfp != NULL && nfp->f_count == 0)
- nfp = LIST_NEXT(nfp, f_list);
- if (nfp != NULL)
- FREF(nfp);
+ while (nfp != NULL) {
+ count = nfp->f_count;
+ if (count == 0) {
+ nfp = LIST_NEXT(nfp, f_list);
+ continue;
+ }
+ if (atomic_cas_uint(&nfp->f_count, count, count + 1) == count)
+ break;
+ }
+ mtx_leave(&fhdlk);
if (fp != NULL)
FRELE(fp, p);
@@ -217,10 +229,17 @@ fd_getfile(struct filedesc *fdp, int fd)
{
struct file *fp;
- if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
+ vfs_stall_barrier();
+
+ if ((u_int)fd >= fdp->fd_nfiles)
return (NULL);
- FREF(fp);
+ mtx_enter(&fdp->fd_fplock);
+ fp = fdp->fd_ofiles[fd];
+ if (fp != NULL)
+ atomic_inc_int(&fp->f_count);
+ mtx_leave(&fdp->fd_fplock);
+
return (fp);
}
@@ -639,7 +658,7 @@ finishdup(struct proc *p, struct file *f
fdpassertlocked(fdp);
KASSERT(fp->f_iflags & FIF_INSERTED);
- if (fp->f_count == LONG_MAX-2) {
+ if (fp->f_count >= FDUP_MAX_COUNT) {
FRELE(fp, p);
return (EDEADLK);
}
@@ -653,8 +672,10 @@ finishdup(struct proc *p, struct file *f
fd_used(fdp, new);
}
+ mtx_enter(&fdp->fd_fplock);
fdp->fd_ofiles[new] = fp;
fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
+ mtx_leave(&fdp->fd_fplock);
*retval = new;
if (oldfp != NULL) {
@@ -671,22 +692,34 @@ fdinsert(struct filedesc *fdp, int fd, i
struct file *fq;
fdpassertlocked(fdp);
+
+ mtx_enter(&fhdlk);
+ fp->f_iflags |= FIF_INSERTED;
if ((fq = fdp->fd_ofiles[0]) != NULL) {
LIST_INSERT_AFTER(fq, fp, f_list);
} else {
LIST_INSERT_HEAD(&filehead, fp, f_list);
}
+ mtx_leave(&fhdlk);
+
KASSERT(fdp->fd_ofiles[fd] == NULL);
fdp->fd_ofiles[fd] = fp;
fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
- fp->f_iflags |= FIF_INSERTED;
}
void
fdremove(struct filedesc *fdp, int fd)
{
fdpassertlocked(fdp);
+
+ /*
+ * Use `fd_fplock' to synchronize with fd_getfile() so that
+ * the function no longer creates a new reference to the file.
+ */
+ mtx_enter(&fdp->fd_fplock);
fdp->fd_ofiles[fd] = NULL;
+ mtx_leave(&fdp->fd_fplock);
+
fdp->fd_ofileflags[fd] = 0;
fd_unused(fdp, fd);
}
@@ -817,6 +850,8 @@ fdalloc(struct proc *p, int want, int *r
int lim, last, i;
u_int new, off;
+ fdpassertlocked(fdp);
+
/*
* Search for a free descriptor starting at the higher
* of want or fd_freefile. If that fails, consider
@@ -864,14 +899,17 @@ void
fdexpand(struct proc *p)
{
struct filedesc *fdp = p->p_fd;
- int nfiles;
+ int nfiles, onfiles;
size_t copylen;
- struct file **newofile;
+ struct file **newofile, **oldofile;
char *newofileflags;
u_int *newhimap, *newlomap;
fdpassertlocked(fdp);
+ onfiles = fdp->fd_nfiles;
+ oldofile = fdp->fd_ofiles;
+
/*
* No space in current array.
*/
@@ -905,9 +943,6 @@ fdexpand(struct proc *p)
memcpy(newofileflags, fdp->fd_ofileflags, copylen);
memset(newofileflags + copylen, 0, nfiles * sizeof(char) - copylen);
- if (fdp->fd_nfiles > NDFILE)
- free(fdp->fd_ofiles, M_FILEDESC, fdp->fd_nfiles * OFILESIZE);
-
if (NDHISLOTS(nfiles) > NDHISLOTS(fdp->fd_nfiles)) {
copylen = NDHISLOTS(fdp->fd_nfiles) * sizeof(u_int);
memcpy(newhimap, fdp->fd_himap, copylen);
@@ -928,9 +963,17 @@ fdexpand(struct proc *p)
fdp->fd_himap = newhimap;
fdp->fd_lomap = newlomap;
}
+
+ mtx_enter(&fdp->fd_fplock);
fdp->fd_ofiles = newofile;
fdp->fd_ofileflags = newofileflags;
- fdp->fd_nfiles = nfiles;
+ /* Ensure the new arrays are in use when the size is updated. */
+ membar_producer();
+ fdp->fd_nfiles = nfiles;
+ mtx_leave(&fdp->fd_fplock);
+
+ if (onfiles > NDFILE)
+ free(oldofile, M_FILEDESC, onfiles * OFILESIZE);
}
/*
@@ -966,7 +1009,7 @@ restart:
* of open files at that point, otherwise put it at the front of
* the list of open files.
*/
- numfiles++;
+ atomic_inc_int(&numfiles);
fp = pool_get(&file_pool, PR_WAITOK|PR_ZERO);
/*
* We need to block interrupts as long as `f_mtx' is being taken
@@ -992,6 +1035,11 @@ fdinit(void)
newfdp = pool_get(&fdesc_pool, PR_WAITOK|PR_ZERO);
rw_init(&newfdp->fd_fd.fd_lock, "fdlock");
+ /*
+ * We need to block interrupts as long as `fd_fplock' is being taken
+ * with and without the KERNEL_LOCK().
+ */
+ mtx_init(&newfdp->fd_fd.fd_fplock, IPL_MPFLOOR);
/* Create the file descriptor table. */
newfdp->fd_fd.fd_refcnt = 1;
@@ -1081,7 +1129,7 @@ fdcopy(struct process *pr)
* tied to the process that opened them to enforce
* their internal consistency, so close them here.
*/
- if (fp->f_count == LONG_MAX-2 ||
+ if (fp->f_count >= FDUP_MAX_COUNT ||
fp->f_type == DTYPE_KQUEUE)
continue;
@@ -1151,9 +1199,9 @@ closef(struct file *fp, struct proc *p)
#ifdef DIAGNOSTIC
if (fp->f_count < 2)
- panic("closef: count (%ld) < 2", fp->f_count);
+ panic("closef: count (%u) < 2", fp->f_count);
#endif
- fp->f_count--;
+ atomic_dec_int(&fp->f_count);
/*
* POSIX record locking dictates that any close releases ALL
@@ -1187,7 +1235,7 @@ fdrop(struct file *fp, struct proc *p)
#ifdef DIAGNOSTIC
if (fp->f_count != 0)
- panic("fdrop: count (%ld) != 0", fp->f_count);
+ panic("fdrop: count (%u) != 0", fp->f_count);
#endif
if (fp->f_ops)
@@ -1195,10 +1243,13 @@ fdrop(struct file *fp, struct proc *p)
else
error = 0;
+ mtx_enter(&fhdlk);
if (fp->f_iflags & FIF_INSERTED)
LIST_REMOVE(fp, f_list);
+ mtx_leave(&fhdlk);
+
crfree(fp->f_cred);
- numfiles--;
+ atomic_dec_int(&numfiles);
pool_put(&file_pool, fp);
return (error);
@@ -1324,7 +1375,7 @@ dupfdopen(struct proc *p, int indx, int
FRELE(wfp, p);
return (EACCES);
}
- if (wfp->f_count == LONG_MAX-2) {
+ if (wfp->f_count >= FDUP_MAX_COUNT) {
FRELE(wfp, p);
return (EDEADLK);
}
Index: kern/uipc_usrreq.c
===================================================================
RCS file: src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.129
diff -u -p -r1.129 uipc_usrreq.c
--- kern/uipc_usrreq.c 11 Jun 2018 08:57:35 -0000 1.129
+++ kern/uipc_usrreq.c 19 Jun 2018 15:13:06 -0000
@@ -845,7 +845,7 @@ morespace:
error = EBADF;
goto fail;
}
- if (fp->f_count == LONG_MAX-2) {
+ if (fp->f_count >= FDUP_MAX_COUNT) {
error = EDEADLK;
goto fail;
}
Index: sys/file.h
===================================================================
RCS file: src/sys/sys/file.h,v
retrieving revision 1.48
diff -u -p -r1.48 file.h
--- sys/file.h 18 Jun 2018 09:15:05 -0000 1.48
+++ sys/file.h 19 Jun 2018 15:13:06 -0000
@@ -77,7 +77,7 @@ struct file {
#define DTYPE_PIPE 3 /* pipe */
#define DTYPE_KQUEUE 4 /* event queue */
short f_type; /* [I] descriptor type */
- long f_count; /* [k] reference count */
+ u_int f_count; /* reference count */
struct ucred *f_cred; /* [I] credentials associated with descriptor */
struct fileops *f_ops; /* [I] file operation pointers */
off_t f_offset; /* [k] */
@@ -94,12 +94,11 @@ struct file {
#define FIF_INSERTED 0x80 /* present in `filehead' */
#define FREF(fp) \
- do { \
- extern void vfs_stall_barrier(void); \
- vfs_stall_barrier(); \
- (fp)->f_count++; \
- } while (0)
-#define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0)
+ atomic_inc_int(&(fp)->f_count)
+#define FRELE(fp,p) \
+ (atomic_dec_int_nv(&fp->f_count) == 0 ? fdrop(fp, p) : 0)
+
+#define FDUP_MAX_COUNT (UINT_MAX - 2 * MAXCPUS)
int fdrop(struct file *, struct proc *);
Index: sys/filedesc.h
===================================================================
RCS file: src/sys/sys/filedesc.h,v
retrieving revision 1.39
diff -u -p -r1.39 filedesc.h
--- sys/filedesc.h 18 Jun 2018 09:15:05 -0000 1.39
+++ sys/filedesc.h 19 Jun 2018 15:13:06 -0000
@@ -32,6 +32,7 @@
* @(#)filedesc.h 8.1 (Berkeley) 6/2/93
*/
+#include <sys/mutex.h>
#include <sys/rwlock.h>
/*
* This structure is used for the management of descriptors. It may be
@@ -72,6 +73,8 @@ struct filedesc {
struct rwlock fd_lock; /* lock for the file descs; must be */
/* held when writing to fd_ofiles, */
/* fd_ofileflags, or fd_{hi,lo}map */
+ struct mutex fd_fplock; /* lock for reading fd_ofiles without
+ * fd_lock */
int fd_flags; /* flags on the file descriptor table */
};