On 18/06/18(Mon) 18:18, Mark Kettenis wrote:
> > Date: Mon, 18 Jun 2018 11:24:00 +0200
> > From: Martin Pieuchot <[email protected]>
> >
> > Diff below is the last of the series to remove the KERNEL_LOCK() from
> > sendto(2) and sendmsg(2) for sockets protected by the NET_LOCK().
> >
> > As explained previously [0] this diff uses a simple non-intrusive
> > approach based on a single mutex. I'd like to get this in before
> > doing any refactoring of this layer to get real test coverage.
> >
> > [0] https://marc.info/?l=openbsd-tech&m=152699644924165&w=2
> >
> > Tests? Comments? Oks?
>
> At first sight this diff seems to mess up the locking quite a bit.
> I'm sure that is intentional, but still I'm not sure this is the right
> direction. In my mind we should at some point in the future be able
> to use atomic operations to manage the reference count on struct file
> (f_count). Using atomic operations has the huge benefit of not having
> lock ordering problems! But it looks as if this diff gets us further
> away from that goal.
>
> You'll still need a lock for the global list of course. It is easy to
> protect insertions and removals. The real problem is indeed
> fd_iterfile(). There you want to take a reference but that isn't
> really safe since you don't necessarily hold one already. That's why
> we have that f_count == 0 check there. And as long as you're holding
> the lock that protects the list you know the struct file is not going
> to disappear. So if f_count > 0 you can safely take a reference. But
> calling FREF while holding a mutex is problematic because of
> vfs_stall_barrier(). You're correct that vfs_stall_barrier() isn't
> required here. All you need to do is atomically increase the
> reference count. So this one is indeed a bit special. And using a
> mutex to protect the reference counts could still work here.
I agree with your analysis. My plan was a bit different. I wanted to
properly track references of elements on the list. That means that
kern_sysctl() could now be freeing a struct file, which shouldn't be
a problem.
> What I don't understand is why you also protect the manipulation of
> the file descriptor tables with the mutex in some places. Those
> tables are already protected by a lock!
That's the most important part of this diff. My goal is to ensure
that a descriptor returned by fd_getfile() is and will stay valid
during the whole execution of an unlocked syscall.
To avoid races with another thread that might be clearing our pointer
in `fd_ofiles', we need more than atomic operations. For that we need
to serialize the threads. The simplest way to do so is with a mutex
on a different data structure. Either global, as in my diff, or, as
visa@ suggested, in the corresponding `fdp'.
> All in all, I think that using atomic operations for the reference
> count would really help.
I agree, I just wanted to keep the logic easy in a first step. It
shouldn't be hard to split the mutex I'm using.
> > Index: kern/kern_descrip.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_descrip.c,v
> > retrieving revision 1.166
> > diff -u -p -r1.166 kern_descrip.c
> > --- kern/kern_descrip.c 18 Jun 2018 09:15:05 -0000 1.166
> > +++ kern/kern_descrip.c 18 Jun 2018 09:16:20 -0000
> > @@ -66,7 +66,11 @@
> >
> > /*
> > * Descriptor management.
> > + *
> > + * We need to block interrupts as long as `fhdlk' is being taken
> > + * with and without the KERNEL_LOCK().
> > */
> > +struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR);
> > struct filelist filehead; /* head of list of open files */
> > int numfiles; /* actual number of open files */
> >
> > @@ -195,16 +199,18 @@ fd_iterfile(struct file *fp, struct proc
> > {
> > struct file *nfp;
> >
> > + mtx_enter(&fhdlk);
> > if (fp == NULL)
> > nfp = LIST_FIRST(&filehead);
> > else
> > nfp = LIST_NEXT(fp, f_list);
> >
> > - /* don't FREF when f_count == 0 to avoid race in fdrop() */
> > + /* don't refcount when f_count == 0 to avoid race in fdrop() */
> > while (nfp != NULL && nfp->f_count == 0)
> > nfp = LIST_NEXT(nfp, f_list);
> > if (nfp != NULL)
> > - FREF(nfp);
> > + nfp->f_count++;
> > + mtx_leave(&fhdlk);
> >
> > if (fp != NULL)
> > FRELE(fp, p);
> > @@ -217,10 +223,17 @@ fd_getfile(struct filedesc *fdp, int fd)
> > {
> > struct file *fp;
> >
> > - if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
> > + vfs_stall_barrier();
> > +
> > + if ((u_int)fd >= fdp->fd_nfiles)
> > return (NULL);
> >
> > - FREF(fp);
> > + mtx_enter(&fhdlk);
> > + fp = fdp->fd_ofiles[fd];
> > + if (fp != NULL)
> > + fp->f_count++;
> > + mtx_leave(&fhdlk);
> > +
> > return (fp);
> > }
> >
> > @@ -671,6 +684,8 @@ fdinsert(struct filedesc *fdp, int fd, i
> > struct file *fq;
> >
> > fdpassertlocked(fdp);
> > +
> > + mtx_enter(&fhdlk);
> > if ((fq = fdp->fd_ofiles[0]) != NULL) {
> > LIST_INSERT_AFTER(fq, fp, f_list);
> > } else {
> > @@ -680,6 +695,7 @@ fdinsert(struct filedesc *fdp, int fd, i
> > fdp->fd_ofiles[fd] = fp;
> > fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
> > fp->f_iflags |= FIF_INSERTED;
> > + mtx_leave(&fhdlk);
> > }
> >
> > void
> > @@ -978,7 +994,11 @@ restart:
> > crhold(fp->f_cred);
> > *resultfp = fp;
> > *resultfd = i;
> > - FREF(fp);
> > +
> > + mtx_enter(&fhdlk);
> > + fp->f_count++;
> > + mtx_leave(&fhdlk);
> > +
> > return (0);
> > }
> >
> > @@ -1069,6 +1089,7 @@ fdcopy(struct process *pr)
> > newfdp->fd_flags = fdp->fd_flags;
> > newfdp->fd_cmask = fdp->fd_cmask;
> >
> > + mtx_enter(&fhdlk);
> > for (i = 0; i <= fdp->fd_lastfile; i++) {
> > struct file *fp = fdp->fd_ofiles[i];
> >
> > @@ -1085,12 +1106,13 @@ fdcopy(struct process *pr)
> > fp->f_type == DTYPE_KQUEUE)
> > continue;
> >
> > - FREF(fp);
> > + fp->f_count++;
> > newfdp->fd_ofiles[i] = fp;
> > newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
> > fd_used(newfdp, i);
> > }
> > }
> > + mtx_leave(&fhdlk);
> > fdpunlock(fdp);
> >
> > return (newfdp);
> > @@ -1112,8 +1134,9 @@ fdfree(struct proc *p)
> > for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) {
> > fp = *fpp;
> > if (fp != NULL) {
> > - FREF(fp);
> > *fpp = NULL;
> > + /* closef() expects a refcount of 2 */
> > + FREF(fp);
> > (void) closef(fp, p);
> > }
> > }
> > @@ -1149,11 +1172,11 @@ closef(struct file *fp, struct proc *p)
> > if (fp == NULL)
> > return (0);
> >
> > -#ifdef DIAGNOSTIC
> > - if (fp->f_count < 2)
> > - panic("closef: count (%ld) < 2", fp->f_count);
> > -#endif
> > + KASSERTMSG(fp->f_count >= 2, "count (%ld) < 2", fp->f_count);
> > +
> > + mtx_enter(&fhdlk);
> > fp->f_count--;
> > + mtx_leave(&fhdlk);
> >
> > /*
> > * POSIX record locking dictates that any close releases ALL
> > @@ -1185,18 +1208,19 @@ fdrop(struct file *fp, struct proc *p)
> > {
> > int error;
> >
> > -#ifdef DIAGNOSTIC
> > - if (fp->f_count != 0)
> > - panic("fdrop: count (%ld) != 0", fp->f_count);
> > -#endif
> > + MUTEX_ASSERT_LOCKED(&fhdlk);
> > +
> > + KASSERTMSG(fp->f_count == 0, "count (%ld) != 0", fp->f_count);
> > +
> > + if (fp->f_iflags & FIF_INSERTED)
> > + LIST_REMOVE(fp, f_list);
> > + mtx_leave(&fhdlk);
> >
> > if (fp->f_ops)
> > error = (*fp->f_ops->fo_close)(fp, p);
> > else
> > error = 0;
> >
> > - if (fp->f_iflags & FIF_INSERTED)
> > - LIST_REMOVE(fp, f_list);
> > crfree(fp->f_cred);
> > numfiles--;
> > pool_put(&file_pool, fp);
> > Index: kern/kern_ktrace.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_ktrace.c,v
> > retrieving revision 1.97
> > diff -u -p -r1.97 kern_ktrace.c
> > --- kern/kern_ktrace.c 27 May 2018 06:02:14 -0000 1.97
> > +++ kern/kern_ktrace.c 18 Jun 2018 09:16:21 -0000
> > @@ -225,7 +225,7 @@ ktrgenio(struct proc *p, int fd, enum ui
> > struct ktr_header kth;
> > struct ktr_genio ktp;
> > caddr_t cp;
> > - int count;
> > + int count, error;
> > int buflen;
> >
> > atomic_setbits_int(&p->p_flag, P_INKTR);
> > @@ -254,7 +254,10 @@ ktrgenio(struct proc *p, int fd, enum ui
> > if (copyin(iov->iov_base, cp, count))
> > break;
> >
> > - if (ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count) != 0)
> > + KERNEL_LOCK();
> > + error = ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count);
> > + KERNEL_UNLOCK();
> > + if (error != 0)
> > break;
> >
> > iov->iov_len -= count;
> > @@ -294,13 +297,14 @@ ktrstruct(struct proc *p, const char *na
> > {
> > struct ktr_header kth;
> >
> > - KERNEL_ASSERT_LOCKED();
> > atomic_setbits_int(&p->p_flag, P_INKTR);
> > ktrinitheader(&kth, p, KTR_STRUCT);
> > -
> > +
> > if (data == NULL)
> > datalen = 0;
> > + KERNEL_LOCK();
> > ktrwrite2(p, &kth, name, strlen(name) + 1, data, datalen);
> > + KERNEL_UNLOCK();
> > atomic_clearbits_int(&p->p_flag, P_INKTR);
> > }
> >
> > @@ -386,7 +390,9 @@ ktrpledge(struct proc *p, int error, uin
> > kp.code = code;
> > kp.syscall = syscall;
> >
> > + KERNEL_LOCK();
> > ktrwrite(p, &kth, &kp, sizeof(kp));
> > + KERNEL_UNLOCK();
> > atomic_clearbits_int(&p->p_flag, P_INKTR);
> > }
> >
> > @@ -622,6 +628,8 @@ ktrwriteraw(struct proc *curp, struct vn
> > struct iovec aiov[3];
> > struct process *pr;
> > int error;
> > +
> > + KERNEL_ASSERT_LOCKED();
> >
> > auio.uio_iov = &aiov[0];
> > auio.uio_offset = 0;
> > Index: kern/kern_pledge.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_pledge.c,v
> > retrieving revision 1.232
> > diff -u -p -r1.232 kern_pledge.c
> > --- kern/kern_pledge.c 16 Jun 2018 15:37:00 -0000 1.232
> > +++ kern/kern_pledge.c 18 Jun 2018 09:16:21 -0000
> > @@ -524,6 +524,7 @@ pledge_fail(struct proc *p, int error, u
> > if (p->p_p->ps_pledge & PLEDGE_ERROR)
> > return (ENOSYS);
> >
> > + KERNEL_LOCK();
> > log(LOG_ERR, "%s[%d]: pledge \"%s\", syscall %d\n",
> > p->p_p->ps_comm, p->p_p->ps_pid, codes, p->p_pledge_syscall);
> > p->p_p->ps_acflag |= APLEDGE;
> > @@ -536,6 +537,7 @@ pledge_fail(struct proc *p, int error, u
> > psignal(p, SIGABRT);
> >
> > p->p_p->ps_pledge = 0; /* Disable all PLEDGE_ flags */
> > + KERNEL_UNLOCK();
> > return (error);
> > }
> >
> > Index: kern/kern_sysctl.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
> > retrieving revision 1.341
> > diff -u -p -r1.341 kern_sysctl.c
> > --- kern/kern_sysctl.c 2 Jun 2018 16:38:21 -0000 1.341
> > +++ kern/kern_sysctl.c 18 Jun 2018 09:16:21 -0000
> > @@ -1073,7 +1073,9 @@ fill_file(struct kinfo_file *kf, struct
> > kf->f_flag = fp->f_flag;
> > kf->f_iflags = fp->f_iflags;
> > kf->f_type = fp->f_type;
> > + mtx_enter(&fhdlk);
> > kf->f_count = fp->f_count;
> > + mtx_leave(&fhdlk);
> > if (show_pointers)
> > kf->f_ucred = PTRTOINT64(fp->f_cred);
> > kf->f_uid = fp->f_cred->cr_uid;
> > Index: kern/syscalls.master
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/syscalls.master,v
> > retrieving revision 1.181
> > diff -u -p -r1.181 syscalls.master
> > --- kern/syscalls.master 28 May 2018 09:17:11 -0000 1.181
> > +++ kern/syscalls.master 18 Jun 2018 09:16:21 -0000
> > @@ -90,7 +90,7 @@
> > #endif
> > 27 STD { ssize_t sys_recvmsg(int s, struct msghdr *msg, \
> > int flags); }
> > -28 STD { ssize_t sys_sendmsg(int s, \
> > +28 STD NOLOCK { ssize_t sys_sendmsg(int s, \
> > const struct msghdr *msg, int flags); }
> > 29 STD { ssize_t sys_recvfrom(int s, void *buf, size_t len, \
> > int flags, struct sockaddr *from, \
> > @@ -261,7 +261,7 @@
> > 130 OBSOL oftruncate
> > 131 STD { int sys_flock(int fd, int how); }
> > 132 STD { int sys_mkfifo(const char *path, mode_t
> > mode); }
> > -133 STD { ssize_t sys_sendto(int s, const void *buf, \
> > +133 STD NOLOCK { ssize_t sys_sendto(int s, const void *buf, \
> > size_t len, int flags, const struct sockaddr *to, \
> > socklen_t tolen); }
> > 134 STD { int sys_shutdown(int s, int how); }
> > Index: kern/uipc_syscalls.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
> > retrieving revision 1.176
> > diff -u -p -r1.176 uipc_syscalls.c
> > --- kern/uipc_syscalls.c 18 Jun 2018 09:15:05 -0000 1.176
> > +++ kern/uipc_syscalls.c 18 Jun 2018 09:16:21 -0000
> > @@ -691,13 +691,16 @@ sendit(struct proc *p, int s, struct msg
> > }
> > #endif
> > len = auio.uio_resid;
> > - error = sosend(fp->f_data, to, &auio, NULL, control, flags);
> > + error = sosend(so, to, &auio, NULL, control, flags);
> > if (error) {
> > if (auio.uio_resid != len && (error == ERESTART ||
> > error == EINTR || error == EWOULDBLOCK))
> > error = 0;
> > - if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0)
> > + if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) {
> > + KERNEL_LOCK();
> > ptsignal(p, SIGPIPE, STHREAD);
> > + KERNEL_UNLOCK();
> > + }
> > }
> > if (error == 0) {
> > *retsize = len - auio.uio_resid;
> > @@ -1176,7 +1179,8 @@ getsock(struct proc *p, int fdes, struct
> > {
> > struct file *fp;
> >
> > - if ((fp = fd_getfile(p->p_fd, fdes)) == NULL)
> > + fp = fd_getfile(p->p_fd, fdes);
> > + if (fp == NULL)
> > return (EBADF);
> > if (fp->f_type != DTYPE_SOCKET) {
> > FRELE(fp, p);
> > Index: kern/uipc_usrreq.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> > retrieving revision 1.129
> > diff -u -p -r1.129 uipc_usrreq.c
> > --- kern/uipc_usrreq.c 11 Jun 2018 08:57:35 -0000 1.129
> > +++ kern/uipc_usrreq.c 18 Jun 2018 09:16:22 -0000
> > @@ -906,6 +906,7 @@ unp_gc(void *arg __unused)
> > fp = defer->ud_fp[i].fp;
> > if (fp == NULL)
> > continue;
> > + /* closef() expects a refcount of 2 */
> > FREF(fp);
> > if ((unp = fptounp(fp)) != NULL)
> > unp->unp_msgcount--;
> > @@ -922,6 +923,8 @@ unp_gc(void *arg __unused)
> > do {
> > nunref = 0;
> > LIST_FOREACH(unp, &unp_head, unp_link) {
> > + mtx_enter(&fhdlk);
> > + fp = unp->unp_file;
> > if (unp->unp_flags & UNP_GCDEFER) {
> > /*
> > * This socket is referenced by another
> > @@ -932,8 +935,9 @@ unp_gc(void *arg __unused)
> > unp_defer--;
> > } else if (unp->unp_flags & UNP_GCMARK) {
> > /* marked as live in previous pass */
> > + mtx_leave(&fhdlk);
> > continue;
> > - } else if ((fp = unp->unp_file) == NULL) {
> > + } else if (fp == NULL) {
> > /* not being passed, so can't be in loop */
> > } else if (fp->f_count == 0) {
> > /*
> > @@ -950,9 +954,11 @@ unp_gc(void *arg __unused)
> > if (fp->f_count == unp->unp_msgcount) {
> > nunref++;
> > unp->unp_flags |= UNP_GCDEAD;
> > + mtx_leave(&fhdlk);
> > continue;
> > }
> > }
> > + mtx_leave(&fhdlk);
> >
> > /*
> > * This is the first time we've seen this socket on
> > Index: sys/file.h
> > ===================================================================
> > RCS file: /cvs/src/sys/sys/file.h,v
> > retrieving revision 1.48
> > diff -u -p -r1.48 file.h
> > --- sys/file.h 18 Jun 2018 09:15:05 -0000 1.48
> > +++ sys/file.h 18 Jun 2018 09:16:22 -0000
> > @@ -65,6 +65,7 @@ struct fileops {
> > *
> > * Locks used to protect struct members in this file:
> > * I immutable after creation
> > + * F global `fhdlk' mutex
> > * f per file `f_mtx'
> > * k kernel lock
> > */
> > @@ -77,7 +78,7 @@ struct file {
> > #define DTYPE_PIPE 3 /* pipe */
> > #define DTYPE_KQUEUE 4 /* event queue */
> > short f_type; /* [I] descriptor type */
> > - long f_count; /* [k] reference count */
> > + long f_count; /* [F] reference count */
> > struct ucred *f_cred; /* [I] credentials associated with descriptor */
> > struct fileops *f_ops; /* [I] file operation pointers */
> > off_t f_offset; /* [k] */
> > @@ -97,12 +98,25 @@ struct file {
> > do { \
> > extern void vfs_stall_barrier(void); \
> > vfs_stall_barrier(); \
> > + mtx_enter(&fhdlk); \
> > (fp)->f_count++; \
> > + mtx_leave(&fhdlk); \
> > } while (0)
> > -#define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0)
> > +
> > +#define FRELE(fp,p) \
> > +({ \
> > + int rv = 0; \
> > + mtx_enter(&fhdlk); \
> > + if (--(fp)->f_count == 0) \
> > + rv = fdrop(fp, p); \
> > + else \
> > + mtx_leave(&fhdlk); \
> > + rv; \
> > +})
> >
> > int fdrop(struct file *, struct proc *);
> >
> > +extern struct mutex fhdlk; /* protects `filehead' and f_count */
> > LIST_HEAD(filelist, file);
> > extern int maxfiles; /* kernel limit on number of
> > open files */
> > extern int numfiles; /* actual number of open files
> > */
> >
> >