On 18/06/18(Mon) 18:18, Mark Kettenis wrote:
> > Date: Mon, 18 Jun 2018 11:24:00 +0200
> > From: Martin Pieuchot <[email protected]>
> > 
> > Diff below is the last of the serie to remove the KERNEL_LOCK() from
> > sendto(2) and sendmsg(2) for sockets protected by the NET_LOCK().
> > 
> > As explained previously [0] this diff uses a simple non-intrusive
> > approached based on a single mutex.  I'd like to get this in before
> > doing any refactoring of this layer to get real test coverages.
> > 
> > [0] https://marc.info/?l=openbsd-tech&m=152699644924165&w=2
> > 
> > Tests?  Comments?  Oks?
> 
> At first sight this diff seems to mess up the locking quite a bit.
> I'm sure that is intentional, but still I'm not sure this is the right
> direction.  In my mind we should at some point in the future be able
> to use atomic operations to manage the reference count on struct file
> (f_count).  Using atomic operations has the huge benefit of not having
> lock ordering problems!  But it looks as if this diff gets us further
> away from that goal.
> 
> You'll still need a lock for the global list of course.  It is easy to
> protect insertions and removals.  The real problem is indeed
> fd_iterfile().  There you want to take a reference but that isn't
> really safe since you don't necessarily hold one already.  That's why
> we have that f_count == 0 check there.  And as long as you're holding
> the lock that protects the list you know the struct file is not going
> to disappear.  So if f_count > 0 you can safely take a reference.  But
> calling FREF while holding a mutex is problematic because of
> vfs_stall_barrier().  You're correct that vfs_stall_barrier() isn't
> required here.  All you need to do is atomically increase the
> reference count.  So this one is indeed a bit special.  And using a
> mutex to protect the reference counts could still work here.

I agree with your analysis.  My plan was a bit different: I wanted to
properly track references to the elements on the list.  That means
kern_sysctl() could now end up freeing a struct file, which shouldn't
be a problem.
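
Concretely, with references taken inside fd_iterfile(), a consumer such
as the sysctl(2) code keeps using the usual pattern, and the FRELE()
done internally on the previous element may be the one that actually
frees it (illustration only, not part of the diff):

	struct file *fp = NULL;

	while ((fp = fd_iterfile(fp, p)) != NULL) {
		/*
		 * We hold a reference on `fp', so it cannot be freed
		 * under us.  When fd_iterfile() advances, it FRELEs the
		 * previous file, and if that was the last reference the
		 * file is freed right there.
		 */
		/* ... inspect `fp' ... */
	}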

> What I don't understand is why you also protect the manipulation of
> the file descriptor tables with the mutex in some places.  Those
> tables are already protected by a lock!

That's the most important part of this diff.  My goal is to ensure
that the file pointer returned by fd_getfile() is valid and stays
valid for the whole execution of an unlocked syscall.

To avoid races with another thread that might be clearing our pointer
in `fd_ofiles', we need more than atomic operations: we need to
serialize the threads.  The simplest way to do so is with a mutex on a
different data structure, either a global one, as in my diff, or, as
visa@ suggested, one embedded in the corresponding `fdp'.
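
For completeness, the per-`fdp' flavour would look something like the
sketch below.  `fd_mtx' is hypothetical, it doesn't exist in the tree,
and since a file can sit in several fd tables the count itself still
needs `fhdlk' (or, later, an atomic op):

struct file *
fd_getfile(struct filedesc *fdp, int fd)
{
	struct file *fp;

	vfs_stall_barrier();

	if ((u_int)fd >= fdp->fd_nfiles)
		return (NULL);

	/* Hypothetical per-fdp mutex serializing `fd_ofiles' updates. */
	mtx_enter(&fdp->fd_mtx);
	fp = fdp->fd_ofiles[fd];
	if (fp != NULL) {
		/* The count is shared between processes, keep `fhdlk'. */
		mtx_enter(&fhdlk);
		fp->f_count++;
		mtx_leave(&fhdlk);
	}
	mtx_leave(&fdp->fd_mtx);

	return (fp);
}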

> All in all, I think that using atomic operations for the reference
> count would really help.

I agree, I just wanted to keep the logic simple as a first step.  It
shouldn't be hard to split the mutex I'm using.
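
To give an idea of where this could go once the mutex is split,
FREF()/FRELE() could end up looking roughly like this.  This assumes
f_count is switched to an unsigned int so the atomic_inc_int(9) and
atomic_dec_int_nv(9) API applies; untested sketch, not part of this
diff:

#define FREF(fp) \
	do { \
		extern void vfs_stall_barrier(void); \
		vfs_stall_barrier(); \
		atomic_inc_int(&(fp)->f_count); \
	} while (0)

/* fdrop() would then take `fhdlk' itself to unlink the file. */
#define FRELE(fp,p) \
	(atomic_dec_int_nv(&(fp)->f_count) == 0 ? fdrop(fp, p) : 0)

fd_iterfile() stays the tricky consumer: it would need an
increment-only-if-nonzero, or fdrop() re-checking the count under
`fhdlk', to keep the race you describe closed.
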

> > Index: kern/kern_descrip.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_descrip.c,v
> > retrieving revision 1.166
> > diff -u -p -r1.166 kern_descrip.c
> > --- kern/kern_descrip.c     18 Jun 2018 09:15:05 -0000      1.166
> > +++ kern/kern_descrip.c     18 Jun 2018 09:16:20 -0000
> > @@ -66,7 +66,11 @@
> >  
> >  /*
> >   * Descriptor management.
> > + *
> > + * We need to block interrupts as long as `fhdlk' is being taken
> > + * with and without the KERNEL_LOCK().
> >   */
> > +struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR);
> >  struct filelist filehead;  /* head of list of open files */
> >  int numfiles;                      /* actual number of open files */
> >  
> > @@ -195,16 +199,18 @@ fd_iterfile(struct file *fp, struct proc
> >  {
> >     struct file *nfp;
> >  
> > +   mtx_enter(&fhdlk);
> >     if (fp == NULL)
> >             nfp = LIST_FIRST(&filehead);
> >     else
> >             nfp = LIST_NEXT(fp, f_list);
> >  
> > -   /* don't FREF when f_count == 0 to avoid race in fdrop() */
> > +   /* don't refcount when f_count == 0 to avoid race in fdrop() */
> >     while (nfp != NULL && nfp->f_count == 0)
> >             nfp = LIST_NEXT(nfp, f_list);
> >     if (nfp != NULL)
> > -           FREF(nfp);
> > +           nfp->f_count++;
> > +   mtx_leave(&fhdlk);
> >  
> >     if (fp != NULL)
> >             FRELE(fp, p);
> > @@ -217,10 +223,17 @@ fd_getfile(struct filedesc *fdp, int fd)
> >  {
> >     struct file *fp;
> >  
> > -   if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
> > +   vfs_stall_barrier();
> > +
> > +   if ((u_int)fd >= fdp->fd_nfiles)
> >             return (NULL);
> >  
> > -   FREF(fp);
> > +   mtx_enter(&fhdlk);
> > +   fp = fdp->fd_ofiles[fd];
> > +   if (fp != NULL)
> > +           fp->f_count++;
> > +   mtx_leave(&fhdlk);
> > +
> >     return (fp);
> >  }
> >  
> > @@ -671,6 +684,8 @@ fdinsert(struct filedesc *fdp, int fd, i
> >     struct file *fq;
> >  
> >     fdpassertlocked(fdp);
> > +
> > +   mtx_enter(&fhdlk);
> >     if ((fq = fdp->fd_ofiles[0]) != NULL) {
> >             LIST_INSERT_AFTER(fq, fp, f_list);
> >     } else {
> > @@ -680,6 +695,7 @@ fdinsert(struct filedesc *fdp, int fd, i
> >     fdp->fd_ofiles[fd] = fp;
> >     fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
> >     fp->f_iflags |= FIF_INSERTED;
> > +   mtx_leave(&fhdlk);
> >  }
> >  
> >  void
> > @@ -978,7 +994,11 @@ restart:
> >     crhold(fp->f_cred);
> >     *resultfp = fp;
> >     *resultfd = i;
> > -   FREF(fp);
> > +
> > +   mtx_enter(&fhdlk);
> > +   fp->f_count++;
> > +   mtx_leave(&fhdlk);
> > +
> >     return (0);
> >  }
> >  
> > @@ -1069,6 +1089,7 @@ fdcopy(struct process *pr)
> >     newfdp->fd_flags = fdp->fd_flags;
> >     newfdp->fd_cmask = fdp->fd_cmask;
> >  
> > +   mtx_enter(&fhdlk);
> >     for (i = 0; i <= fdp->fd_lastfile; i++) {
> >             struct file *fp = fdp->fd_ofiles[i];
> >  
> > @@ -1085,12 +1106,13 @@ fdcopy(struct process *pr)
> >                         fp->f_type == DTYPE_KQUEUE)
> >                             continue;
> >  
> > -                   FREF(fp);
> > +                   fp->f_count++;
> >                     newfdp->fd_ofiles[i] = fp;
> >                     newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
> >                     fd_used(newfdp, i);
> >             }
> >     }
> > +   mtx_leave(&fhdlk);
> >     fdpunlock(fdp);
> >  
> >     return (newfdp);
> > @@ -1112,8 +1134,9 @@ fdfree(struct proc *p)
> >     for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) {
> >             fp = *fpp;
> >             if (fp != NULL) {
> > -                   FREF(fp);
> >                     *fpp = NULL;
> > +                    /* closef() expects a refcount of 2 */
> > +                   FREF(fp);
> >                     (void) closef(fp, p);
> >             }
> >     }
> > @@ -1149,11 +1172,11 @@ closef(struct file *fp, struct proc *p)
> >     if (fp == NULL)
> >             return (0);
> >  
> > -#ifdef DIAGNOSTIC
> > -   if (fp->f_count < 2)
> > -           panic("closef: count (%ld) < 2", fp->f_count);
> > -#endif
> > +   KASSERTMSG(fp->f_count >= 2, "count (%ld) < 2", fp->f_count);
> > +
> > +   mtx_enter(&fhdlk);
> >     fp->f_count--;
> > +   mtx_leave(&fhdlk);
> >  
> >     /*
> >      * POSIX record locking dictates that any close releases ALL
> > @@ -1185,18 +1208,19 @@ fdrop(struct file *fp, struct proc *p)
> >  {
> >     int error;
> >  
> > -#ifdef DIAGNOSTIC
> > -   if (fp->f_count != 0)
> > -           panic("fdrop: count (%ld) != 0", fp->f_count);
> > -#endif
> > +   MUTEX_ASSERT_LOCKED(&fhdlk);
> > +
> > +   KASSERTMSG(fp->f_count == 0, "count (%ld) != 0", fp->f_count);
> > +
> > +   if (fp->f_iflags & FIF_INSERTED)
> > +           LIST_REMOVE(fp, f_list);
> > +   mtx_leave(&fhdlk);
> >  
> >     if (fp->f_ops)
> >             error = (*fp->f_ops->fo_close)(fp, p);
> >     else
> >             error = 0;
> >  
> > -   if (fp->f_iflags & FIF_INSERTED)
> > -           LIST_REMOVE(fp, f_list);
> >     crfree(fp->f_cred);
> >     numfiles--;
> >     pool_put(&file_pool, fp);
> > Index: kern/kern_ktrace.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_ktrace.c,v
> > retrieving revision 1.97
> > diff -u -p -r1.97 kern_ktrace.c
> > --- kern/kern_ktrace.c      27 May 2018 06:02:14 -0000      1.97
> > +++ kern/kern_ktrace.c      18 Jun 2018 09:16:21 -0000
> > @@ -225,7 +225,7 @@ ktrgenio(struct proc *p, int fd, enum ui
> >     struct ktr_header kth;
> >     struct ktr_genio ktp;
> >     caddr_t cp;
> > -   int count;
> > +   int count, error;
> >     int buflen;
> >  
> >     atomic_setbits_int(&p->p_flag, P_INKTR);
> > @@ -254,7 +254,10 @@ ktrgenio(struct proc *p, int fd, enum ui
> >             if (copyin(iov->iov_base, cp, count))
> >                     break;
> >  
> > -           if (ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count) != 0)
> > +           KERNEL_LOCK();
> > +           error = ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count);
> > +           KERNEL_UNLOCK();
> > +           if (error != 0)
> >                     break;
> >  
> >             iov->iov_len -= count;
> > @@ -294,13 +297,14 @@ ktrstruct(struct proc *p, const char *na
> >  {
> >     struct ktr_header kth;
> >  
> > -   KERNEL_ASSERT_LOCKED();
> >     atomic_setbits_int(&p->p_flag, P_INKTR);
> >     ktrinitheader(&kth, p, KTR_STRUCT);
> > -   
> > +
> >     if (data == NULL)
> >             datalen = 0;
> > +   KERNEL_LOCK();
> >     ktrwrite2(p, &kth, name, strlen(name) + 1, data, datalen);
> > +   KERNEL_UNLOCK();
> >     atomic_clearbits_int(&p->p_flag, P_INKTR);
> >  }
> >  
> > @@ -386,7 +390,9 @@ ktrpledge(struct proc *p, int error, uin
> >     kp.code = code;
> >     kp.syscall = syscall;
> >  
> > +   KERNEL_LOCK();
> >     ktrwrite(p, &kth, &kp, sizeof(kp));
> > +   KERNEL_UNLOCK();
> >     atomic_clearbits_int(&p->p_flag, P_INKTR);
> >  }
> >  
> > @@ -622,6 +628,8 @@ ktrwriteraw(struct proc *curp, struct vn
> >     struct iovec aiov[3];
> >     struct process *pr;
> >     int error;
> > +
> > +   KERNEL_ASSERT_LOCKED();
> >  
> >     auio.uio_iov = &aiov[0];
> >     auio.uio_offset = 0;
> > Index: kern/kern_pledge.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_pledge.c,v
> > retrieving revision 1.232
> > diff -u -p -r1.232 kern_pledge.c
> > --- kern/kern_pledge.c      16 Jun 2018 15:37:00 -0000      1.232
> > +++ kern/kern_pledge.c      18 Jun 2018 09:16:21 -0000
> > @@ -524,6 +524,7 @@ pledge_fail(struct proc *p, int error, u
> >     if (p->p_p->ps_pledge & PLEDGE_ERROR)
> >             return (ENOSYS);
> >  
> > +   KERNEL_LOCK();
> >     log(LOG_ERR, "%s[%d]: pledge \"%s\", syscall %d\n",
> >         p->p_p->ps_comm, p->p_p->ps_pid, codes, p->p_pledge_syscall);
> >     p->p_p->ps_acflag |= APLEDGE;
> > @@ -536,6 +537,7 @@ pledge_fail(struct proc *p, int error, u
> >     psignal(p, SIGABRT);
> >  
> >     p->p_p->ps_pledge = 0;          /* Disable all PLEDGE_ flags */
> > +   KERNEL_UNLOCK();
> >     return (error);
> >  }
> >  
> > Index: kern/kern_sysctl.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
> > retrieving revision 1.341
> > diff -u -p -r1.341 kern_sysctl.c
> > --- kern/kern_sysctl.c      2 Jun 2018 16:38:21 -0000       1.341
> > +++ kern/kern_sysctl.c      18 Jun 2018 09:16:21 -0000
> > @@ -1073,7 +1073,9 @@ fill_file(struct kinfo_file *kf, struct 
> >             kf->f_flag = fp->f_flag;
> >             kf->f_iflags = fp->f_iflags;
> >             kf->f_type = fp->f_type;
> > +           mtx_enter(&fhdlk);
> >             kf->f_count = fp->f_count;
> > +           mtx_leave(&fhdlk);
> >             if (show_pointers)
> >                     kf->f_ucred = PTRTOINT64(fp->f_cred);
> >             kf->f_uid = fp->f_cred->cr_uid;
> > Index: kern/syscalls.master
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/syscalls.master,v
> > retrieving revision 1.181
> > diff -u -p -r1.181 syscalls.master
> > --- kern/syscalls.master    28 May 2018 09:17:11 -0000      1.181
> > +++ kern/syscalls.master    18 Jun 2018 09:16:21 -0000
> > @@ -90,7 +90,7 @@
> >  #endif
> >  27 STD             { ssize_t sys_recvmsg(int s, struct msghdr *msg, \
> >                         int flags); }
> > -28 STD             { ssize_t sys_sendmsg(int s, \
> > +28 STD NOLOCK      { ssize_t sys_sendmsg(int s, \
> >                         const struct msghdr *msg, int flags); }
> >  29 STD             { ssize_t sys_recvfrom(int s, void *buf, size_t len, \
> >                         int flags, struct sockaddr *from, \
> > @@ -261,7 +261,7 @@
> >  130        OBSOL           oftruncate
> >  131        STD             { int sys_flock(int fd, int how); }
> >  132        STD             { int sys_mkfifo(const char *path, mode_t mode); }
> > -133        STD             { ssize_t sys_sendto(int s, const void *buf, \
> > +133        STD NOLOCK      { ssize_t sys_sendto(int s, const void *buf, \
> >                         size_t len, int flags, const struct sockaddr *to, \
> >                         socklen_t tolen); }
> >  134        STD             { int sys_shutdown(int s, int how); }
> > Index: kern/uipc_syscalls.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
> > retrieving revision 1.176
> > diff -u -p -r1.176 uipc_syscalls.c
> > --- kern/uipc_syscalls.c    18 Jun 2018 09:15:05 -0000      1.176
> > +++ kern/uipc_syscalls.c    18 Jun 2018 09:16:21 -0000
> > @@ -691,13 +691,16 @@ sendit(struct proc *p, int s, struct msg
> >     }
> >  #endif
> >     len = auio.uio_resid;
> > -   error = sosend(fp->f_data, to, &auio, NULL, control, flags);
> > +   error = sosend(so, to, &auio, NULL, control, flags);
> >     if (error) {
> >             if (auio.uio_resid != len && (error == ERESTART ||
> >                 error == EINTR || error == EWOULDBLOCK))
> >                     error = 0;
> > -           if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0)
> > +           if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) {
> > +                   KERNEL_LOCK();
> >                     ptsignal(p, SIGPIPE, STHREAD);
> > +                   KERNEL_UNLOCK();
> > +           }
> >     }
> >     if (error == 0) {
> >             *retsize = len - auio.uio_resid;
> > @@ -1176,7 +1179,8 @@ getsock(struct proc *p, int fdes, struct
> >  {
> >     struct file *fp;
> >  
> > -   if ((fp = fd_getfile(p->p_fd, fdes)) == NULL)
> > +   fp = fd_getfile(p->p_fd, fdes);
> > +   if (fp == NULL)
> >             return (EBADF);
> >     if (fp->f_type != DTYPE_SOCKET) {
> >             FRELE(fp, p);
> > Index: kern/uipc_usrreq.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> > retrieving revision 1.129
> > diff -u -p -r1.129 uipc_usrreq.c
> > --- kern/uipc_usrreq.c      11 Jun 2018 08:57:35 -0000      1.129
> > +++ kern/uipc_usrreq.c      18 Jun 2018 09:16:22 -0000
> > @@ -906,6 +906,7 @@ unp_gc(void *arg __unused)
> >                     fp = defer->ud_fp[i].fp;
> >                     if (fp == NULL)
> >                             continue;
> > +                    /* closef() expects a refcount of 2 */
> >                     FREF(fp);
> >                     if ((unp = fptounp(fp)) != NULL)
> >                             unp->unp_msgcount--;
> > @@ -922,6 +923,8 @@ unp_gc(void *arg __unused)
> >     do {
> >             nunref = 0;
> >             LIST_FOREACH(unp, &unp_head, unp_link) {
> > +                   mtx_enter(&fhdlk);
> > +                   fp = unp->unp_file;
> >                     if (unp->unp_flags & UNP_GCDEFER) {
> >                             /*
> >                              * This socket is referenced by another
> > @@ -932,8 +935,9 @@ unp_gc(void *arg __unused)
> >                             unp_defer--;
> >                     } else if (unp->unp_flags & UNP_GCMARK) {
> >                             /* marked as live in previous pass */
> > +                           mtx_leave(&fhdlk);
> >                             continue;
> > -                   } else if ((fp = unp->unp_file) == NULL) {
> > +                   } else if (fp == NULL) {
> >                             /* not being passed, so can't be in loop */
> >                     } else if (fp->f_count == 0) {
> >                             /*
> > @@ -950,9 +954,11 @@ unp_gc(void *arg __unused)
> >                             if (fp->f_count == unp->unp_msgcount) {
> >                                     nunref++;
> >                                     unp->unp_flags |= UNP_GCDEAD;
> > +                                   mtx_leave(&fhdlk);
> >                                     continue;
> >                             }
> >                     }
> > +                   mtx_leave(&fhdlk);
> >  
> >                     /*
> >                      * This is the first time we've seen this socket on
> > Index: sys/file.h
> > ===================================================================
> > RCS file: /cvs/src/sys/sys/file.h,v
> > retrieving revision 1.48
> > diff -u -p -r1.48 file.h
> > --- sys/file.h      18 Jun 2018 09:15:05 -0000      1.48
> > +++ sys/file.h      18 Jun 2018 09:16:22 -0000
> > @@ -65,6 +65,7 @@ struct    fileops {
> >   *
> >   *  Locks used to protect struct members in this file:
> >   * I       immutable after creation
> > + * F       global `fhdlk' mutex
> >   * f       per file `f_mtx'
> >   * k       kernel lock
> >   */
> > @@ -77,7 +78,7 @@ struct file {
> >  #define    DTYPE_PIPE      3       /* pipe */
> >  #define    DTYPE_KQUEUE    4       /* event queue */
> >     short   f_type;         /* [I] descriptor type */
> > -   long    f_count;        /* [k] reference count */
> > +   long    f_count;        /* [F] reference count */
> >     struct  ucred *f_cred;  /* [I] credentials associated with descriptor */
> >     struct  fileops *f_ops; /* [I] file operation pointers */
> >     off_t   f_offset;       /* [k] */
> > @@ -97,12 +98,25 @@ struct file {
> >     do { \
> >             extern void vfs_stall_barrier(void); \
> >             vfs_stall_barrier(); \
> > +           mtx_enter(&fhdlk); \
> >             (fp)->f_count++; \
> > +           mtx_leave(&fhdlk); \
> >     } while (0)
> > -#define FRELE(fp,p)        (--(fp)->f_count == 0 ? fdrop(fp, p) : 0)
> > +
> > +#define FRELE(fp,p) \
> > +({ \
> > +   int rv = 0; \
> > +   mtx_enter(&fhdlk); \
> > +   if (--(fp)->f_count == 0) \
> > +           rv = fdrop(fp, p); \
> > +   else \
> > +           mtx_leave(&fhdlk); \
> > +   rv; \
> > +})
> >  
> >  int        fdrop(struct file *, struct proc *);
> >  
> > +extern struct mutex fhdlk;         /* protects `filehead' and f_count */
> >  LIST_HEAD(filelist, file);
> >  extern int maxfiles;                       /* kernel limit on number of open files */
> >  extern int numfiles;                       /* actual number of open files */
> > 
> > 
