Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-11-11 Thread Vitaliy Makkoveev
On Thu, Nov 11, 2021 at 03:21:45PM +0100, Alexander Bluhm wrote:
> On Thu, Nov 11, 2021 at 12:09:41PM +0300, Vitaliy Makkoveev wrote:
> > Can I propose to commit this diff before? It reorders soclose() to
> > destroy PCB before `so_q0' and `so_q' cleanup.
> 
> OK bluhm@
> 
> > Also the tool to test the diff:
> 
> What happens when you run the tool with the current code?  Does the
> kernel crash?  Are there some inconsistencies that userland could
> detect?
> 
> If yes, we could convert it to a regress test.
> 

All fine with the current code. I wrote this to check that nothing
changes with the diff.

> bluhm
> 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > 
> > #include 
> > #include 
> > 
> > #define NTHR (10)
> > #define NCONNECT (10)
> > #define NACCEPT ((NTHR * NCONNECT) / 2)
> > 
> > static void *
> > thr_conn(void *arg)
> > {
> > struct sockaddr_storage *ss = arg;
> > int s[NCONNECT];
> > size_t hd = 0, tl = 0;
> > 
> > while (1) {
> > if ((s[hd] = socket(ss->ss_family, SOCK_STREAM, 0)) < 0)
> > continue;
> > 
> > if (connect(s[hd], (struct sockaddr *)ss, ss->ss_len) < 0) {
> > close(s[hd]);
> > continue;
> > }
> > 
> > if ((hd = (hd + 1) % NCONNECT) == tl) {
> > close(s[tl]);
> > tl = (tl + 1) % NCONNECT;
> > }
> > }
> > 
> > return NULL;
> > }
> > 
> > static void
> > usage(void)
> > {
> > extern char *__progname;
> > 
> > fprintf(stderr, "Usage %s: unix | inet | inet6\n", __progname);
> > exit(1);
> > }
> > 
> > int
> > main(int argc, char *argv[])
> > {
> > struct sockaddr_storage ss;
> > size_t i;
> > int s;
> > 
> > if (argc != 2) {
> > usage();
> > }
> > 
> > memset(&ss, 0, sizeof(ss));
> > 
> > if (strcmp(argv[1], "unix") == 0) {
> > struct sockaddr_un *sun;
> > 
> > sun = (struct sockaddr_un *)&ss;
> > sun->sun_len = sizeof(*sun);
> > sun->sun_family = AF_UNIX;
> > snprintf(sun->sun_path, sizeof(sun->sun_path) - 1,
> > "/tmp/socket%d", getpid());
> > } else if (strcmp(argv[1], "inet") == 0) {
> > struct sockaddr_in *sin;
> > 
> > sin = (struct sockaddr_in *)&ss;
> > sin->sin_len = sizeof(*sin);
> > sin->sin_family = AF_INET;
> > sin->sin_port = htons(1+(getpid()%1));
> > if (inet_pton(AF_INET, "127.0.0.1", &sin->sin_addr) <= 0)
> > errx(1, "inet_pton()");
> > } else if (strcmp(argv[1], "inet6") == 0) {
> > struct sockaddr_in6 *sin6;
> > 
> > sin6 = (struct sockaddr_in6 *)&ss;
> > sin6->sin6_len = sizeof(*sin6);
> > sin6->sin6_family = AF_INET6;
> > sin6->sin6_port = htons(1+(getpid()%1));
> > if (inet_pton(AF_INET6, "::1", &sin6->sin6_addr) <= 0)
> > errx(1, "inet_pton()");
> > } else
> > usage();
> > 
> > for (i = 0; i < NTHR; ++i) {
> > pthread_t thr;
> > int error;
> > 
> > if ((error = pthread_create(&thr, NULL, thr_conn, &ss)))
> > errc(1, error, "pthread_create()");
> > }
> > 
> > while(1) {
> > int conns[NACCEPT];
> > 
> > if ((s = socket(ss.ss_family, SOCK_STREAM, 0)) < 0)
> > err(1, "socket()");
> > 
> > if (ss.ss_family == AF_UNIX) {
> > struct sockaddr_un *sun = (struct sockaddr_un *)&ss;
> > unlink(sun->sun_path);
> > } else {
> > int opt = 1;
> > 
> > if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,
> > &opt, sizeof(opt)) < 0)
> > err(1, "setsockopt()");
> > }
> > 
> > if (bind(s, (struct sockaddr *)&ss, ss.ss_len) < 0)
> > err(1, "bind()");
> > 
> > if (listen(s, 10) < 0)
> > err(1, "listen()");
> > 
> > for(i=0; i < NACCEPT; ++i) {
> > if ((conns[i] = accept(s, NULL, NULL)) < 0)
> > err(1, "accept()");
> > }
> > 
> > close(s);
> > 
> > for(i=0; i < NACCEPT; ++i) {
> > close(conns[i]);
> > }
> > }
> > 
> > return 0;
> > }
> > 
> > Index: sys/kern/uipc_socket.c
> > ===
> > RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> > retrieving revision 1.267
> > diff -u -p -r1.267 uipc_socket.c
> > --- sys/kern/uipc_socket.c  24 Oct 2021 07:02:47 -  1.267
> > +++ sys/kern/uipc_socket.c  11 Nov 2021 08:56:17 -
> > @@ -322,19 +322,9 @@ soclo

Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-11-11 Thread Alexander Bluhm
On Thu, Nov 11, 2021 at 12:42:36AM +0300, Vitaliy Makkoveev wrote:
> What about 'SO_DYING' flag? Anyway I want to remove it with the next
> diff.

The so_options are for user flags.  The so_state is for kernel
transitions.  So a SS_DYING would be better, see sys/socketvar.h.

But as we commit the other soclose() diff first, it does not matter
anymore.

> No problem, but I made "if (vp)" without NULL for the consistency with
> "if (unp->unp_vnode)" in this function. Should they have the same
> notation?

mpi@ and I converted the network stack gradually to the pointer ==
NULL or pointer != NULL idiom.  If you change something, go into
this direction.  But there is no need to make bulk conversions.

> OK for updated diff?

OK bluhm@ without the SO_DYING part.

> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.268
> diff -u -p -r1.268 uipc_socket.c
> --- sys/kern/uipc_socket.c6 Nov 2021 05:26:33 -   1.268
> +++ sys/kern/uipc_socket.c10 Nov 2021 21:33:01 -
> @@ -334,6 +334,8 @@ soclose(struct socket *so, int flags)
>   /* Revoke async IO early. There is a final revocation in sofree(). */
>   sigio_free(&so->so_sigio);
>   if (so->so_options & SO_ACCEPTCONN) {
> + so->so_options |= SO_DYING;
> +
>   while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
>   (void) soqremque(so2, 0);
>   (void) soabort(so2);
> Index: sys/kern/uipc_usrreq.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.154
> diff -u -p -r1.154 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c6 Nov 2021 17:35:14 -   1.154
> +++ sys/kern/uipc_usrreq.c10 Nov 2021 21:33:01 -
> @@ -485,20 +485,30 @@ void
>  unp_detach(struct unpcb *unp)
>  {
>   struct socket *so = unp->unp_socket;
> - struct vnode *vp = NULL;
> + struct vnode *vp = unp->unp_vnode;
>  
>   rw_assert_wrlock(&unp_lock);
>  
>   LIST_REMOVE(unp, unp_link);
> - if (unp->unp_vnode) {
> +
> + if (vp != NULL) {
> + unp->unp_vnode = NULL;
> +
>   /*
> -  * `v_socket' is only read in unp_connect and
> -  * unplock prevents concurrent access.
> +  * Enforce `i_lock' -> `unp_lock' because fifo
> +  * subsystem requires it.
>*/
>  
> - unp->unp_vnode->v_socket = NULL;
> - vp = unp->unp_vnode;
> - unp->unp_vnode = NULL;
> + sounlock(so, SL_LOCKED);
> +
> + VOP_LOCK(vp, LK_EXCLUSIVE);
> + vp->v_socket = NULL;
> +
> + KERNEL_LOCK();
> + vput(vp);
> + KERNEL_UNLOCK();
> +
> + solock(so);
>   }
>  
>   if (unp->unp_conn)
> @@ -511,21 +521,6 @@ unp_detach(struct unpcb *unp)
>   pool_put(&unpcb_pool, unp);
>   if (unp_rights)
>   task_add(systqmp, &unp_gc_task);
> -
> - if (vp != NULL) {
> - /*
> -  * Enforce `i_lock' -> `unplock' because fifo subsystem
> -  * requires it. The socket can't be closed concurrently
> -  * because the file descriptor reference is
> -  * still hold.
> -  */
> -
> - sounlock(so, SL_LOCKED);
> - KERNEL_LOCK();
> - vrele(vp);
> - KERNEL_UNLOCK();
> - solock(so);
> - }
>  }
>  
>  int
> @@ -670,7 +665,8 @@ unp_connect(struct socket *so, struct mb
>   goto put_locked;
>   }
>   if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
> - if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
> + if ((so2->so_options & SO_DYING) != 0 ||
> + (so2->so_options & SO_ACCEPTCONN) == 0 ||
>   (so3 = sonewconn(so2, 0)) == NULL) {
>   error = ECONNREFUSED;
>   goto put_locked;
> Index: sys/sys/socket.h
> ===
> RCS file: /cvs/src/sys/sys/socket.h,v
> retrieving revision 1.101
> diff -u -p -r1.101 socket.h
> --- sys/sys/socket.h  7 Nov 2021 12:05:28 -   1.101
> +++ sys/sys/socket.h  10 Nov 2021 21:33:01 -
> @@ -97,6 +97,7 @@ typedef __sa_family_t   sa_family_t;/* so
>  #define SO_TIMESTAMP 0x0800  /* timestamp received dgram traffic */
>  #define SO_BINDANY   0x1000  /* allow bind to any address */
>  #define SO_ZEROIZE   0x2000  /* zero out all mbufs sent over socket 
> */
> +#define SO_DYING 0x4000  /* socket is dying */
>  
>  /*
>   * Additional options, not kept in so_options.



Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-11-11 Thread Alexander Bluhm
On Thu, Nov 11, 2021 at 12:09:41PM +0300, Vitaliy Makkoveev wrote:
> Can I propose to commit this diff before? It reorders soclose() to
> destroy PCB before `so_q0' and `so_q' cleanup.

OK bluhm@

> Also the tool to test the diff:

What happens when you run the tool with the current code?  Does the
kernel crash?  Are there some inconsistencies that userland could
detect?

If yes, we could convert it to a regress test.

bluhm

> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> 
> #include 
> #include 
> 
> #define NTHR (10)
> #define NCONNECT (10)
> #define NACCEPT ((NTHR * NCONNECT) / 2)
> 
> static void *
> thr_conn(void *arg)
> {
>   struct sockaddr_storage *ss = arg;
>   int s[NCONNECT];
>   size_t hd = 0, tl = 0;
> 
>   while (1) {
>   if ((s[hd] = socket(ss->ss_family, SOCK_STREAM, 0)) < 0)
>   continue;
> 
>   if (connect(s[hd], (struct sockaddr *)ss, ss->ss_len) < 0) {
>   close(s[hd]);
>   continue;
>   }
> 
>   if ((hd = (hd + 1) % NCONNECT) == tl) {
>   close(s[tl]);
>   tl = (tl + 1) % NCONNECT;
>   }
>   }
> 
>   return NULL;
> }
> 
> static void
> usage(void)
> {
>   extern char *__progname;
> 
>   fprintf(stderr, "Usage %s: unix | inet | inet6\n", __progname);
>   exit(1);
> }
> 
> int
> main(int argc, char *argv[])
> {
>   struct sockaddr_storage ss;
>   size_t i;
>   int s;
> 
>   if (argc != 2) {
>   usage();
>   }
> 
>   memset(&ss, 0, sizeof(ss));
> 
>   if (strcmp(argv[1], "unix") == 0) {
>   struct sockaddr_un *sun;
> 
>   sun = (struct sockaddr_un *)&ss;
>   sun->sun_len = sizeof(*sun);
>   sun->sun_family = AF_UNIX;
>   snprintf(sun->sun_path, sizeof(sun->sun_path) - 1,
>   "/tmp/socket%d", getpid());
>   } else if (strcmp(argv[1], "inet") == 0) {
>   struct sockaddr_in *sin;
> 
>   sin = (struct sockaddr_in *)&ss;
>   sin->sin_len = sizeof(*sin);
>   sin->sin_family = AF_INET;
>   sin->sin_port = htons(1+(getpid()%1));
>   if (inet_pton(AF_INET, "127.0.0.1", &sin->sin_addr) <= 0)
>   errx(1, "inet_pton()");
>   } else if (strcmp(argv[1], "inet6") == 0) {
>   struct sockaddr_in6 *sin6;
> 
>   sin6 = (struct sockaddr_in6 *)&ss;
>   sin6->sin6_len = sizeof(*sin6);
>   sin6->sin6_family = AF_INET6;
>   sin6->sin6_port = htons(1+(getpid()%1));
>   if (inet_pton(AF_INET6, "::1", &sin6->sin6_addr) <= 0)
>   errx(1, "inet_pton()");
>   } else
>   usage();
> 
>   for (i = 0; i < NTHR; ++i) {
>   pthread_t thr;
>   int error;
> 
>   if ((error = pthread_create(&thr, NULL, thr_conn, &ss)))
>   errc(1, error, "pthread_create()");
>   }
> 
>   while(1) {
>   int conns[NACCEPT];
> 
>   if ((s = socket(ss.ss_family, SOCK_STREAM, 0)) < 0)
>   err(1, "socket()");
> 
>   if (ss.ss_family == AF_UNIX) {
>   struct sockaddr_un *sun = (struct sockaddr_un *)&ss;
>   unlink(sun->sun_path);
>   } else {
>   int opt = 1;
> 
>   if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,
>   &opt, sizeof(opt)) < 0)
>   err(1, "setsockopt()");
>   }
> 
>   if (bind(s, (struct sockaddr *)&ss, ss.ss_len) < 0)
>   err(1, "bind()");
> 
>   if (listen(s, 10) < 0)
>   err(1, "listen()");
> 
>   for(i=0; i < NACCEPT; ++i) {
>   if ((conns[i] = accept(s, NULL, NULL)) < 0)
>   err(1, "accept()");
>   }
> 
>   close(s);
> 
>   for(i=0; i < NACCEPT; ++i) {
>   close(conns[i]);
>   }
>   }
> 
>   return 0;
> }
> 
> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.267
> diff -u -p -r1.267 uipc_socket.c
> --- sys/kern/uipc_socket.c24 Oct 2021 07:02:47 -  1.267
> +++ sys/kern/uipc_socket.c11 Nov 2021 08:56:17 -
> @@ -322,19 +322,9 @@ soclose(struct socket *so, int flags)
>   s = solock(so);
>   /* Revoke async IO early. There is a final revocation in sofree(). */
>   sigio_free(&so->so_sigio);
> - if (so->so_options & SO_ACCEPTCONN) {
> - while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
> - (void

Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-11-11 Thread Vitaliy Makkoveev
On Thu, Nov 11, 2021 at 12:42:36AM +0300, Vitaliy Makkoveev wrote:
> On Wed, Nov 10, 2021 at 10:03:48PM +0100, Alexander Bluhm wrote:
> > On Tue, Oct 26, 2021 at 02:12:36PM +0300, Vitaliy Makkoveev wrote:
> > > --- sys/kern/uipc_socket.c14 Oct 2021 23:05:10 -  1.265
> > > +++ sys/kern/uipc_socket.c26 Oct 2021 11:05:59 -
> > > @@ -315,6 +315,8 @@ soclose(struct socket *so, int flags)
> > >   /* Revoke async IO early. There is a final revocation in sofree(). */
> > >   sigio_free(&so->so_sigio);
> > >   if (so->so_options & SO_ACCEPTCONN) {
> > > + so->so_options &= ~SO_ACCEPTCONN;
> > > +
> > >   while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
> > >   (void) soqremque(so2, 0);
> > >   (void) soabort(so2);
> > 
> > I don't like this.  A listening socket can never be converted to a
> > non-listening socket.  Whatever this transition means in th TCP
> > layer.  If you want to mark a listening socket as closing to avoid
> > accepting connections, use a approriate flag.  Do not convert it
> > to an non-listening socket that may have races elsewhere.
> > 
> > I would say some so_state bits may be suitable if we really need
> > this feature.
> > 
> 
> This is temporary solution. I want to reorder soclose() to destroy PCB
> before `so_q0' and `so_q' cleanup, so this 'SO_ACCEPTCONN' modification
> will be removed. This is required because the relock dances will
> appeared in socose() and unp_detach() for the per-socket locking.
> 
> We don't release netlock in the inet sockets case, so they are not
> affected.
> 
> What about 'SO_DYING' flag? Anyway I want to remove it with the next
> diff.
> 
> > > - if (unp->unp_vnode) {
> > > +
> > > + if (vp) {
> > 
> > Please use (vp != NULL) for consistency.
> > 
> 
> No problem, but I made "if (vp)" without NULL for the consistency with
> "if (unp->unp_vnode)" in this function. Should they have the same
> notation?
> 
> > > + unp->unp_vnode = NULL;
> > > +
> > >   /*
> > > -  * `v_socket' is only read in unp_connect and
> > > -  * unplock prevents concurrent access.
> > > +  * Enforce `i_lock' -> `unp_lock' because fifo
> > > +  * subsystem requires it.
> > >*/
> > >  
> > > - unp->unp_vnode->v_socket = NULL;
> > > - vp = unp->unp_vnode;
> > > - unp->unp_vnode = NULL;
> > > + sounlock(so, SL_LOCKED);
> > > +
> > > + VOP_LOCK(vp, LK_EXCLUSIVE);
> > > + vp->v_socket = NULL;
> > > +
> > > + KERNEL_LOCK();
> > > + vput(vp);
> > > + KERNEL_UNLOCK();
> > > +
> > > + solock(so);
> > >   }
> > 
> > This might work, we should try it.
> > 
> > > @@ -500,21 +510,6 @@ unp_detach(struct unpcb *unp)
> > >   pool_put(&unpcb_pool, unp);
> > >   if (unp_rights)
> > >   task_add(systqmp, &unp_gc_task);
> > > -
> > > - if (vp != NULL) {
> > > - /*
> > > -  * Enforce `i_lock' -> `unplock' because fifo subsystem
> > > -  * requires it. The socket can't be closed concurrently
> > > -  * because the file descriptor reference is
> > > -  * still hold.
> > > -  */
> > > -
> > > - sounlock(so, SL_LOCKED);
> > > - KERNEL_LOCK();
> > > - vrele(vp);
> > > - KERNEL_UNLOCK();
> > > - solock(so);
> > > - }
> > >  }
> > 
> > Why did you move the lock dance to the beginning of the function?
> > Does it matter?
> > 
> 
> Yes, I want to disconnect dying socket from the file system before start
> it's destruction. This does matter because in some cases we need to
> release the solock() before acquire solock() for two sockets to keep
> lock order between them. With listening socket disconnected from the
> file system we can't have concurrent threads performing connect to dying
> socket.
> 
> Also this is the reason I want to reorder listening socket destruction
> in the soclose().
> 
> OK for updated diff?
> 

Can I propose to commit this diff before? It reorders soclose() to
destroy PCB before `so_q0' and `so_q' cleanup.

The dying socket is already unlinked from the file descriptor layer, but
still accessible from the stack or from the file system layer. When we
release solock() while processing `so_q0' or `so_q' queues or when
performing (*pr_detach)(), concurrent thread could perform connection.
This reorder prevents this. The reorder will be required for the
upcoming fine grained locking diffs.

According the current code we assume the `so_q0' or `so_q' queues
cleanup could be done with destroyed PCB so this diff doesn't modify
existing logic. With this diff committed I don't need to modify
`so_state' in the unp_detach() reorder diff which started this thread.

Also the tool to test the diff:

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#include 
#include 

#define NTHR (10)
#define NCONNECT (10)
#define NACCEPT ((NTHR * NCONNECT) / 2)

static void *
thr

Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-11-10 Thread Vitaliy Makkoveev
On Wed, Nov 10, 2021 at 10:03:48PM +0100, Alexander Bluhm wrote:
> On Tue, Oct 26, 2021 at 02:12:36PM +0300, Vitaliy Makkoveev wrote:
> > --- sys/kern/uipc_socket.c  14 Oct 2021 23:05:10 -  1.265
> > +++ sys/kern/uipc_socket.c  26 Oct 2021 11:05:59 -
> > @@ -315,6 +315,8 @@ soclose(struct socket *so, int flags)
> > /* Revoke async IO early. There is a final revocation in sofree(). */
> > sigio_free(&so->so_sigio);
> > if (so->so_options & SO_ACCEPTCONN) {
> > +   so->so_options &= ~SO_ACCEPTCONN;
> > +
> > while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
> > (void) soqremque(so2, 0);
> > (void) soabort(so2);
> 
> I don't like this.  A listening socket can never be converted to a
> non-listening socket.  Whatever this transition means in th TCP
> layer.  If you want to mark a listening socket as closing to avoid
> accepting connections, use a approriate flag.  Do not convert it
> to an non-listening socket that may have races elsewhere.
> 
> I would say some so_state bits may be suitable if we really need
> this feature.
> 

This is temporary solution. I want to reorder soclose() to destroy PCB
before `so_q0' and `so_q' cleanup, so this 'SO_ACCEPTCONN' modification
will be removed. This is required because the relock dances will
appeared in socose() and unp_detach() for the per-socket locking.

We don't release netlock in the inet sockets case, so they are not
affected.

What about 'SO_DYING' flag? Anyway I want to remove it with the next
diff.

> > -   if (unp->unp_vnode) {
> > +
> > +   if (vp) {
> 
> Please use (vp != NULL) for consistency.
> 

No problem, but I made "if (vp)" without NULL for the consistency with
"if (unp->unp_vnode)" in this function. Should they have the same
notation?

> > +   unp->unp_vnode = NULL;
> > +
> > /*
> > -* `v_socket' is only read in unp_connect and
> > -* unplock prevents concurrent access.
> > +* Enforce `i_lock' -> `unp_lock' because fifo
> > +* subsystem requires it.
> >  */
> >  
> > -   unp->unp_vnode->v_socket = NULL;
> > -   vp = unp->unp_vnode;
> > -   unp->unp_vnode = NULL;
> > +   sounlock(so, SL_LOCKED);
> > +
> > +   VOP_LOCK(vp, LK_EXCLUSIVE);
> > +   vp->v_socket = NULL;
> > +
> > +   KERNEL_LOCK();
> > +   vput(vp);
> > +   KERNEL_UNLOCK();
> > +
> > +   solock(so);
> > }
> 
> This might work, we should try it.
> 
> > @@ -500,21 +510,6 @@ unp_detach(struct unpcb *unp)
> > pool_put(&unpcb_pool, unp);
> > if (unp_rights)
> > task_add(systqmp, &unp_gc_task);
> > -
> > -   if (vp != NULL) {
> > -   /*
> > -* Enforce `i_lock' -> `unplock' because fifo subsystem
> > -* requires it. The socket can't be closed concurrently
> > -* because the file descriptor reference is
> > -* still hold.
> > -*/
> > -
> > -   sounlock(so, SL_LOCKED);
> > -   KERNEL_LOCK();
> > -   vrele(vp);
> > -   KERNEL_UNLOCK();
> > -   solock(so);
> > -   }
> >  }
> 
> Why did you move the lock dance to the beginning of the function?
> Does it matter?
> 

Yes, I want to disconnect dying socket from the file system before start
it's destruction. This does matter because in some cases we need to
release the solock() before acquire solock() for two sockets to keep
lock order between them. With listening socket disconnected from the
file system we can't have concurrent threads performing connect to dying
socket.

Also this is the reason I want to reorder listening socket destruction
in the soclose().

OK for updated diff?

Index: sys/kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.268
diff -u -p -r1.268 uipc_socket.c
--- sys/kern/uipc_socket.c  6 Nov 2021 05:26:33 -   1.268
+++ sys/kern/uipc_socket.c  10 Nov 2021 21:33:01 -
@@ -334,6 +334,8 @@ soclose(struct socket *so, int flags)
/* Revoke async IO early. There is a final revocation in sofree(). */
sigio_free(&so->so_sigio);
if (so->so_options & SO_ACCEPTCONN) {
+   so->so_options |= SO_DYING;
+
while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
(void) soqremque(so2, 0);
(void) soabort(so2);
Index: sys/kern/uipc_usrreq.c
===
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.154
diff -u -p -r1.154 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c  6 Nov 2021 17:35:14 -   1.154
+++ sys/kern/uipc_usrreq.c  10 Nov 2021 21:33:01 -
@@ -485,20 +485,30 @@ void
 unp_detach(struct unpcb *unp)
 {
struct socket *so = unp->unp_socket;
-   struct vnode *vp = NULL;
+  

Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-11-10 Thread Alexander Bluhm
On Tue, Oct 26, 2021 at 02:12:36PM +0300, Vitaliy Makkoveev wrote:
> --- sys/kern/uipc_socket.c14 Oct 2021 23:05:10 -  1.265
> +++ sys/kern/uipc_socket.c26 Oct 2021 11:05:59 -
> @@ -315,6 +315,8 @@ soclose(struct socket *so, int flags)
>   /* Revoke async IO early. There is a final revocation in sofree(). */
>   sigio_free(&so->so_sigio);
>   if (so->so_options & SO_ACCEPTCONN) {
> + so->so_options &= ~SO_ACCEPTCONN;
> +
>   while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
>   (void) soqremque(so2, 0);
>   (void) soabort(so2);

I don't like this.  A listening socket can never be converted to a
non-listening socket.  Whatever this transition means in th TCP
layer.  If you want to mark a listening socket as closing to avoid
accepting connections, use a approriate flag.  Do not convert it
to an non-listening socket that may have races elsewhere.

I would say some so_state bits may be suitable if we really need
this feature.

> - if (unp->unp_vnode) {
> +
> + if (vp) {

Please use (vp != NULL) for consistency.

> + unp->unp_vnode = NULL;
> +
>   /*
> -  * `v_socket' is only read in unp_connect and
> -  * unplock prevents concurrent access.
> +  * Enforce `i_lock' -> `unp_lock' because fifo
> +  * subsystem requires it.
>*/
>  
> - unp->unp_vnode->v_socket = NULL;
> - vp = unp->unp_vnode;
> - unp->unp_vnode = NULL;
> + sounlock(so, SL_LOCKED);
> +
> + VOP_LOCK(vp, LK_EXCLUSIVE);
> + vp->v_socket = NULL;
> +
> + KERNEL_LOCK();
> + vput(vp);
> + KERNEL_UNLOCK();
> +
> + solock(so);
>   }

This might work, we should try it.

> @@ -500,21 +510,6 @@ unp_detach(struct unpcb *unp)
>   pool_put(&unpcb_pool, unp);
>   if (unp_rights)
>   task_add(systqmp, &unp_gc_task);
> -
> - if (vp != NULL) {
> - /*
> -  * Enforce `i_lock' -> `unplock' because fifo subsystem
> -  * requires it. The socket can't be closed concurrently
> -  * because the file descriptor reference is
> -  * still hold.
> -  */
> -
> - sounlock(so, SL_LOCKED);
> - KERNEL_LOCK();
> - vrele(vp);
> - KERNEL_UNLOCK();
> - solock(so);
> - }
>  }

Why did you move the lock dance to the beginning of the function?
Does it matter?

bluhm



Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-11-10 Thread Vitaliy Makkoveev
Ping...

On Thu, Nov 04, 2021 at 05:30:07PM +0300, Vitaliy Makkoveev wrote:
> On Fri, Nov 05, 2021 at 08:31:07AM +0100, Martin Pieuchot wrote:
> > On 26/10/21(Tue) 14:12, Vitaliy Makkoveev wrote:
> > > Another step to make UNIX sockets locking fine grained.
> > > 
> > > The listening socket has the references from file descriptors layer and
> > > from the vnode(9) layer. This means when we close(2)'ing such socket it
> > > still referenced by concurrent thread through connect(2) path.
> > > 
> > > When we bind(2) UNIX socket we link it to vnode(9) by assigning
> > > `v_socket'. When we connect(2)'ing socket to the socket we previously
> > > bind(2)'ed we finding it by namei(9) and obtain it's reference through
> > > `v_socket'. This socket has no extra reference in file descriptor
> > > layer and could be closed by concurrent thread.
> > > 
> > > This time we have `unp_lock' rwlock(9) which protects the whole layer
> > > and the dereference of `v_socket' is safe. But with the fine grained
> > > locking the `v_socket' will not be protected by global lock. When we
> > > obtain the vnode(9) by namei(9) in connect(9) or bind(9) paths it is
> > > already exclusively locked by vlode(9) lock. But in unp_detach() which
> > > is called on the close(2)'ing socket we assume `unp_lock' protects
> > > `v_socket'.
> > > 
> > > I propose to use exclusive vnode(9) lock to protect `v_socket'. With the
> > > fine grained locking, the `v_socket' dereference in unp_bind() or
> > > unp_connect() threads will be safe because unp_detach() thread will wait
> > > the vnode(9) lock release. The vnode referenced by `unp_vnod' has
> > > reference counter bumped so it's dereference is also safe without
> > > `unp_lock' held.
> > 
> > This makes sense to me.  Using the vnode lock here seems the simplest
> > approach.
> > 
> > > The `i_lock' should be take before `unp_lock' and unp_detach() should
> > > release solock(). To prevent connections on this socket the
> > > 'SO_ACCEPTCONN' bit cleared in soclose().
> > 
> > This is done to prevent races when solock() is released inside soabort(),
> > right?  Is it the only one or some more care is needed?
> > Will this stay with per-socket locks or is this only necessary because of
> > the global `unp_lock'?
> > 
> 
> The only "binded" sockets has the associated vnode(9). We link them
> when we perform unp_bind(). I used the "binded" instead of "listening"
> because datagram sockets could also be binded but they are not
> connection oriented.
> 
> soabort() kills the not accept(2)ed peers of connection oriented sockets
> when we close "listening" socket. I used "listening" because the socket
> was bind(2)ed and then marked as "assepting connections" by listen(2).
> 
> Since the unaccepted peers we kills with soabort() has no associated
> vnode(9) we don't release `unp_lock' because we don't need to call
> vrele(9).
> 
> This diff solves lock dances in unp_connect(). Our thread own `so' we
> passed to unp_connect(). It's file descriptor has refcouter bumped and
> the `so' can't be killed by concurrent thread. But we get `so2' the
> "binded" socket from vnode(9).
> 
> This time the global `unp_lock' protects both sockets, but with the
> per-socket locking the only `so' will be protected. So the `so2'
> dereference will be unsafe because concurrent thread could kill it.
> 
> Also we could release `unp_lock' within unp_attach(). So unp_attach()
> called by sonewconn() within unp_connect() could release `unp_lock' too.
> The upcoming diff which moves unp_gc() stuff from `unp_lock' will
> release `unp_lock' in unp_detach() to enforce lock order `unp_gc_lock'
> -> `unp_lock'.
> 
> We keep vnode(9) locked when we link the socket with vnode(9) in
> unp_bind(). We also keep vnode(9) locked when we connect sockets in
> unp_connect(). So unp_detach() should also lock the vnode(9) because
> vnode(9) lock protects `v_socket'.
> 
> This will be actual with the upcoming fine grained locks implementation.
> This time this diff mostly makes `v_socket' protection consistent.
> 
> Could I commit this diff?
> 
> > > Index: sys/kern/uipc_socket.c
> > > ===
> > > RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> > > retrieving revision 1.265
> > > diff -u -p -r1.265 uipc_socket.c
> > > --- sys/kern/uipc_socket.c14 Oct 2021 23:05:10 -  1.265
> > > +++ sys/kern/uipc_socket.c26 Oct 2021 11:05:59 -
> > > @@ -315,6 +315,8 @@ soclose(struct socket *so, int flags)
> > >   /* Revoke async IO early. There is a final revocation in sofree(). */
> > >   sigio_free(&so->so_sigio);
> > >   if (so->so_options & SO_ACCEPTCONN) {
> > > + so->so_options &= ~SO_ACCEPTCONN;
> > > +
> > >   while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
> > >   (void) soqremque(so2, 0);
> > >   (void) soabort(so2);
> > > Index: sys/kern/uipc_usrreq.c
> > > 

Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-11-05 Thread Vitaliy Makkoveev
On Fri, Nov 05, 2021 at 08:31:07AM +0100, Martin Pieuchot wrote:
> On 26/10/21(Tue) 14:12, Vitaliy Makkoveev wrote:
> > Another step to make UNIX sockets locking fine grained.
> > 
> > The listening socket has the references from file descriptors layer and
> > from the vnode(9) layer. This means when we close(2)'ing such socket it
> > still referenced by concurrent thread through connect(2) path.
> > 
> > When we bind(2) UNIX socket we link it to vnode(9) by assigning
> > `v_socket'. When we connect(2)'ing socket to the socket we previously
> > bind(2)'ed we finding it by namei(9) and obtain it's reference through
> > `v_socket'. This socket has no extra reference in file descriptor
> > layer and could be closed by concurrent thread.
> > 
> > This time we have `unp_lock' rwlock(9) which protects the whole layer
> > and the dereference of `v_socket' is safe. But with the fine grained
> > locking the `v_socket' will not be protected by global lock. When we
> > obtain the vnode(9) by namei(9) in connect(9) or bind(9) paths it is
> > already exclusively locked by vlode(9) lock. But in unp_detach() which
> > is called on the close(2)'ing socket we assume `unp_lock' protects
> > `v_socket'.
> > 
> > I propose to use exclusive vnode(9) lock to protect `v_socket'. With the
> > fine grained locking, the `v_socket' dereference in unp_bind() or
> > unp_connect() threads will be safe because unp_detach() thread will wait
> > the vnode(9) lock release. The vnode referenced by `unp_vnod' has
> > reference counter bumped so it's dereference is also safe without
> > `unp_lock' held.
> 
> This makes sense to me.  Using the vnode lock here seems the simplest
> approach.
> 
> > The `i_lock' should be take before `unp_lock' and unp_detach() should
> > release solock(). To prevent connections on this socket the
> > 'SO_ACCEPTCONN' bit cleared in soclose().
> 
> This is done to prevent races when solock() is released inside soabort(),
> right?  Is it the only one or some more care is needed?
> Will this stay with per-socket locks or is this only necessary because of
> the global `unp_lock'?
> 

The only "binded" sockets has the associated vnode(9). We link them
when we perform unp_bind(). I used the "binded" instead of "listening"
because datagram sockets could also be binded but they are not
connection oriented.

soabort() kills the not accept(2)ed peers of connection oriented sockets
when we close "listening" socket. I used "listening" because the socket
was bind(2)ed and then marked as "assepting connections" by listen(2).

Since the unaccepted peers we kills with soabort() has no associated
vnode(9) we don't release `unp_lock' because we don't need to call
vrele(9).

This diff solves lock dances in unp_connect(). Our thread own `so' we
passed to unp_connect(). It's file descriptor has refcouter bumped and
the `so' can't be killed by concurrent thread. But we get `so2' the
"binded" socket from vnode(9).

This time the global `unp_lock' protects both sockets, but with the
per-socket locking the only `so' will be protected. So the `so2'
dereference will be unsafe because concurrent thread could kill it.

Also we could release `unp_lock' within unp_attach(). So unp_attach()
called by sonewconn() within unp_connect() could release `unp_lock' too.
The upcoming diff which moves unp_gc() stuff from `unp_lock' will
release `unp_lock' in unp_detach() to enforce lock order `unp_gc_lock'
-> `unp_lock'.

We keep vnode(9) locked when we link the socket with vnode(9) in
unp_bind(). We also keep vnode(9) locked when we connect sockets in
unp_connect(). So unp_detach() should also lock the vnode(9) because
vnode(9) lock protects `v_socket'.

This will be actual with the upcoming fine grained locks implementation.
This time this diff mostly makes `v_socket' protection consistent.

Could I commit this diff?

> > Index: sys/kern/uipc_socket.c
> > ===
> > RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> > retrieving revision 1.265
> > diff -u -p -r1.265 uipc_socket.c
> > --- sys/kern/uipc_socket.c  14 Oct 2021 23:05:10 -  1.265
> > +++ sys/kern/uipc_socket.c  26 Oct 2021 11:05:59 -
> > @@ -315,6 +315,8 @@ soclose(struct socket *so, int flags)
> > /* Revoke async IO early. There is a final revocation in sofree(). */
> > sigio_free(&so->so_sigio);
> > if (so->so_options & SO_ACCEPTCONN) {
> > +   so->so_options &= ~SO_ACCEPTCONN;
> > +
> > while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
> > (void) soqremque(so2, 0);
> > (void) soabort(so2);
> > Index: sys/kern/uipc_usrreq.c
> > ===
> > RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> > retrieving revision 1.150
> > diff -u -p -r1.150 uipc_usrreq.c
> > --- sys/kern/uipc_usrreq.c  21 Oct 2021 22:11:07 -  1.150
> > +++ sys/kern/uipc_usrreq.c  26 Oct 2021 11:05:59 -
> > @@ -474,20 +47

Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-11-05 Thread Martin Pieuchot
On 26/10/21(Tue) 14:12, Vitaliy Makkoveev wrote:
> Another step to make UNIX sockets locking fine grained.
> 
> The listening socket has the references from file descriptors layer and
> from the vnode(9) layer. This means when we close(2)'ing such socket it
> still referenced by concurrent thread through connect(2) path.
> 
> When we bind(2) UNIX socket we link it to vnode(9) by assigning
> `v_socket'. When we connect(2)'ing socket to the socket we previously
> bind(2)'ed we finding it by namei(9) and obtain it's reference through
> `v_socket'. This socket has no extra reference in file descriptor
> layer and could be closed by concurrent thread.
> 
> This time we have `unp_lock' rwlock(9) which protects the whole layer
> and the dereference of `v_socket' is safe. But with the fine grained
> locking the `v_socket' will not be protected by global lock. When we
> obtain the vnode(9) by namei(9) in connect(9) or bind(9) paths it is
> already exclusively locked by vlode(9) lock. But in unp_detach() which
> is called on the close(2)'ing socket we assume `unp_lock' protects
> `v_socket'.
> 
> I propose to use exclusive vnode(9) lock to protect `v_socket'. With the
> fine grained locking, the `v_socket' dereference in unp_bind() or
> unp_connect() threads will be safe because unp_detach() thread will wait
> the vnode(9) lock release. The vnode referenced by `unp_vnod' has
> reference counter bumped so it's dereference is also safe without
> `unp_lock' held.

This makes sense to me.  Using the vnode lock here seems the simplest
approach.

> The `i_lock' should be take before `unp_lock' and unp_detach() should
> release solock(). To prevent connections on this socket the
> 'SO_ACCEPTCONN' bit cleared in soclose().

This is done to prevent races when solock() is released inside soabort(),
right?  Is it the only one or some more care is needed?
Will this stay with per-socket locks or is this only necessary because of
the global `unp_lock'?

> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.265
> diff -u -p -r1.265 uipc_socket.c
> --- sys/kern/uipc_socket.c14 Oct 2021 23:05:10 -  1.265
> +++ sys/kern/uipc_socket.c26 Oct 2021 11:05:59 -
> @@ -315,6 +315,8 @@ soclose(struct socket *so, int flags)
>   /* Revoke async IO early. There is a final revocation in sofree(). */
>   sigio_free(&so->so_sigio);
>   if (so->so_options & SO_ACCEPTCONN) {
> + so->so_options &= ~SO_ACCEPTCONN;
> +
>   while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
>   (void) soqremque(so2, 0);
>   (void) soabort(so2);
> Index: sys/kern/uipc_usrreq.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.150
> diff -u -p -r1.150 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c21 Oct 2021 22:11:07 -  1.150
> +++ sys/kern/uipc_usrreq.c26 Oct 2021 11:05:59 -
> @@ -474,20 +474,30 @@ void
>  unp_detach(struct unpcb *unp)
>  {
>   struct socket *so = unp->unp_socket;
> - struct vnode *vp = NULL;
> + struct vnode *vp = unp->unp_vnode;
>  
>   rw_assert_wrlock(&unp_lock);
>  
>   LIST_REMOVE(unp, unp_link);
> - if (unp->unp_vnode) {
> +
> + if (vp) {
> + unp->unp_vnode = NULL;
> +
>   /*
> -  * `v_socket' is only read in unp_connect and
> -  * unplock prevents concurrent access.
> +  * Enforce `i_lock' -> `unp_lock' because fifo
> +  * subsystem requires it.
>*/
>  
> - unp->unp_vnode->v_socket = NULL;
> - vp = unp->unp_vnode;
> - unp->unp_vnode = NULL;
> + sounlock(so, SL_LOCKED);
> +
> + VOP_LOCK(vp, LK_EXCLUSIVE);
> + vp->v_socket = NULL;
> +
> + KERNEL_LOCK();
> + vput(vp);
> + KERNEL_UNLOCK();
> +
> + solock(so);
>   }
>  
>   if (unp->unp_conn)
> @@ -500,21 +510,6 @@ unp_detach(struct unpcb *unp)
>   pool_put(&unpcb_pool, unp);
>   if (unp_rights)
>   task_add(systqmp, &unp_gc_task);
> -
> - if (vp != NULL) {
> - /*
> -  * Enforce `i_lock' -> `unplock' because fifo subsystem
> -  * requires it. The socket can't be closed concurrently
> -  * because the file descriptor reference is
> -  * still hold.
> -  */
> -
> - sounlock(so, SL_LOCKED);
> - KERNEL_LOCK();
> - vrele(vp);
> - KERNEL_UNLOCK();
> - solock(so);
> - }
>  }
>  
>  int
> 



Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-10-28 Thread Vitaliy Makkoveev
ping...

> On 26 Oct 2021, at 14:12, Vitaliy Makkoveev  wrote:
> 
> Another step to make UNIX sockets locking fine grained.
> 
> The listening socket has the references from file descriptors layer and
> from the vnode(9) layer. This means when we close(2)'ing such socket it
> still referenced by concurrent thread through connect(2) path.
> 
> When we bind(2) UNIX socket we link it to vnode(9) by assigning
> `v_socket'. When we connect(2)'ing socket to the socket we previously
> bind(2)'ed we finding it by namei(9) and obtain it's reference through
> `v_socket'. This socket has no extra reference in file descriptor
> layer and could be closed by concurrent thread.
> 
> This time we have `unp_lock' rwlock(9) which protects the whole layer
> and the dereference of `v_socket' is safe. But with the fine grained
> locking the `v_socket' will not be protected by global lock. When we
> obtain the vnode(9) by namei(9) in connect(9) or bind(9) paths it is
> already exclusively locked by vlode(9) lock. But in unp_detach() which
> is called on the close(2)'ing socket we assume `unp_lock' protects
> `v_socket'.
> 
> I propose to use exclusive vnode(9) lock to protect `v_socket'. With the
> fine grained locking, the `v_socket' dereference in unp_bind() or
> unp_connect() threads will be safe because unp_detach() thread will wait
> the vnode(9) lock release. The vnode referenced by `unp_vnod' has
> reference counter bumped so it's dereference is also safe without
> `unp_lock' held.
> 
> The `i_lock' should be take before `unp_lock' and unp_detach() should
> release solock(). To prevent connections on this socket the
> 'SO_ACCEPTCONN' bit cleared in soclose().
> 
> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.265
> diff -u -p -r1.265 uipc_socket.c
> --- sys/kern/uipc_socket.c14 Oct 2021 23:05:10 -  1.265
> +++ sys/kern/uipc_socket.c26 Oct 2021 11:05:59 -
> @@ -315,6 +315,8 @@ soclose(struct socket *so, int flags)
>   /* Revoke async IO early. There is a final revocation in sofree(). */
>   sigio_free(&so->so_sigio);
>   if (so->so_options & SO_ACCEPTCONN) {
> + so->so_options &= ~SO_ACCEPTCONN;
> +
>   while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
>   (void) soqremque(so2, 0);
>   (void) soabort(so2);
> Index: sys/kern/uipc_usrreq.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.150
> diff -u -p -r1.150 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c21 Oct 2021 22:11:07 -  1.150
> +++ sys/kern/uipc_usrreq.c26 Oct 2021 11:05:59 -
> @@ -474,20 +474,30 @@ void
> unp_detach(struct unpcb *unp)
> {
>   struct socket *so = unp->unp_socket;
> - struct vnode *vp = NULL;
> + struct vnode *vp = unp->unp_vnode;
> 
>   rw_assert_wrlock(&unp_lock);
> 
>   LIST_REMOVE(unp, unp_link);
> - if (unp->unp_vnode) {
> +
> + if (vp) {
> + unp->unp_vnode = NULL;
> +
>   /*
> -  * `v_socket' is only read in unp_connect and
> -  * unplock prevents concurrent access.
> +  * Enforce `i_lock' -> `unp_lock' because fifo
> +  * subsystem requires it.
>*/
> 
> - unp->unp_vnode->v_socket = NULL;
> - vp = unp->unp_vnode;
> - unp->unp_vnode = NULL;
> + sounlock(so, SL_LOCKED);
> +
> + VOP_LOCK(vp, LK_EXCLUSIVE);
> + vp->v_socket = NULL;
> +
> + KERNEL_LOCK();
> + vput(vp);
> + KERNEL_UNLOCK();
> +
> + solock(so);
>   }
> 
>   if (unp->unp_conn)
> @@ -500,21 +510,6 @@ unp_detach(struct unpcb *unp)
>   pool_put(&unpcb_pool, unp);
>   if (unp_rights)
>   task_add(systqmp, &unp_gc_task);
> -
> - if (vp != NULL) {
> - /*
> -  * Enforce `i_lock' -> `unplock' because fifo subsystem
> -  * requires it. The socket can't be closed concurrently
> -  * because the file descriptor reference is
> -  * still hold.
> -  */
> -
> - sounlock(so, SL_LOCKED);
> - KERNEL_LOCK();
> - vrele(vp);
> - KERNEL_UNLOCK();
> - solock(so);
> - }
> }
> 
> int
> 



UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-10-26 Thread Vitaliy Makkoveev
Another step to make UNIX sockets locking fine grained.

The listening socket has the references from file descriptors layer and
from the vnode(9) layer. This means when we close(2)'ing such socket it
still referenced by concurrent thread through connect(2) path.

When we bind(2) UNIX socket we link it to vnode(9) by assigning
`v_socket'. When we connect(2)'ing socket to the socket we previously
bind(2)'ed we finding it by namei(9) and obtain it's reference through
`v_socket'. This socket has no extra reference in file descriptor
layer and could be closed by concurrent thread.

This time we have `unp_lock' rwlock(9) which protects the whole layer
and the dereference of `v_socket' is safe. But with the fine grained
locking the `v_socket' will not be protected by global lock. When we
obtain the vnode(9) by namei(9) in connect(9) or bind(9) paths it is
already exclusively locked by vlode(9) lock. But in unp_detach() which
is called on the close(2)'ing socket we assume `unp_lock' protects
`v_socket'.

I propose to use exclusive vnode(9) lock to protect `v_socket'. With the
fine grained locking, the `v_socket' dereference in unp_bind() or
unp_connect() threads will be safe because unp_detach() thread will wait
the vnode(9) lock release. The vnode referenced by `unp_vnod' has
reference counter bumped so it's dereference is also safe without
`unp_lock' held.

The `i_lock' should be take before `unp_lock' and unp_detach() should
release solock(). To prevent connections on this socket the
'SO_ACCEPTCONN' bit cleared in soclose().

Index: sys/kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.265
diff -u -p -r1.265 uipc_socket.c
--- sys/kern/uipc_socket.c  14 Oct 2021 23:05:10 -  1.265
+++ sys/kern/uipc_socket.c  26 Oct 2021 11:05:59 -
@@ -315,6 +315,8 @@ soclose(struct socket *so, int flags)
/* Revoke async IO early. There is a final revocation in sofree(). */
sigio_free(&so->so_sigio);
if (so->so_options & SO_ACCEPTCONN) {
+   so->so_options &= ~SO_ACCEPTCONN;
+
while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
(void) soqremque(so2, 0);
(void) soabort(so2);
Index: sys/kern/uipc_usrreq.c
===
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.150
diff -u -p -r1.150 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c  21 Oct 2021 22:11:07 -  1.150
+++ sys/kern/uipc_usrreq.c  26 Oct 2021 11:05:59 -
@@ -474,20 +474,30 @@ void
 unp_detach(struct unpcb *unp)
 {
struct socket *so = unp->unp_socket;
-   struct vnode *vp = NULL;
+   struct vnode *vp = unp->unp_vnode;
 
rw_assert_wrlock(&unp_lock);
 
LIST_REMOVE(unp, unp_link);
-   if (unp->unp_vnode) {
+
+   if (vp) {
+   unp->unp_vnode = NULL;
+
/*
-* `v_socket' is only read in unp_connect and
-* unplock prevents concurrent access.
+* Enforce `i_lock' -> `unp_lock' because fifo
+* subsystem requires it.
 */
 
-   unp->unp_vnode->v_socket = NULL;
-   vp = unp->unp_vnode;
-   unp->unp_vnode = NULL;
+   sounlock(so, SL_LOCKED);
+
+   VOP_LOCK(vp, LK_EXCLUSIVE);
+   vp->v_socket = NULL;
+
+   KERNEL_LOCK();
+   vput(vp);
+   KERNEL_UNLOCK();
+
+   solock(so);
}
 
if (unp->unp_conn)
@@ -500,21 +510,6 @@ unp_detach(struct unpcb *unp)
pool_put(&unpcb_pool, unp);
if (unp_rights)
task_add(systqmp, &unp_gc_task);
-
-   if (vp != NULL) {
-   /*
-* Enforce `i_lock' -> `unplock' because fifo subsystem
-* requires it. The socket can't be closed concurrently
-* because the file descriptor reference is
-* still hold.
-*/
-
-   sounlock(so, SL_LOCKED);
-   KERNEL_LOCK();
-   vrele(vp);
-   KERNEL_UNLOCK();
-   solock(so);
-   }
 }
 
 int