(resend, sorry, fscked up the address list)

> A recv() on an AF_UNIX, SOCK_STREAM socket can race with a
> send()+close() on the peer, causing recv() to return zero, even though
> the sent data should be received.
> 
> This happens if the send() and the close() are performed between
> skb_dequeue() and checking sk->sk_shutdown in unix_stream_recvmsg():
> 
> process A  skb_dequeue() returns NULL, there's no data in the socket queue
> process B  new data is inserted onto the queue by unix_stream_sendmsg()
> process B  sk->sk_shutdown is set to SHUTDOWN_MASK by unix_release_sock()
> process A  sk->sk_shutdown is checked, unix_stream_recvmsg() returns zero

This is only part of the story.  It turns out there are other races
involving the garbage collector that can throw away perfectly good
packets with AF_UNIX sockets in them.

The problems arise when a socket goes from installed to in-flight or
vice versa during garbage collection.  Since gc is done with a
spinlock held, this only shows up on SMP.

The following patch fixes it for me, but it's possibly the wrong
approach.

Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>
---

Index: linux-2.6.22-rc2/net/unix/garbage.c
===================================================================
--- linux-2.6.22-rc2.orig/net/unix/garbage.c    2007-06-03 23:58:11.000000000 +0200
+++ linux-2.6.22-rc2/net/unix/garbage.c 2007-06-04 11:39:42.000000000 +0200
@@ -90,6 +90,7 @@
 static struct sock *gc_current = GC_HEAD; /* stack of objects to mark */
 
 atomic_t unix_tot_inflight = ATOMIC_INIT(0);
+DECLARE_RWSEM(unix_gc_sem);
 
 
 static struct sock *unix_get_socket(struct file *filp)
@@ -169,7 +170,7 @@ static void maybe_unmark_and_push(struct
 
 void unix_gc(void)
 {
-       static DEFINE_MUTEX(unix_gc_sem);
+       static DEFINE_MUTEX(unix_gc_local_lock);
        int i;
        struct sock *s;
        struct sk_buff_head hitlist;
@@ -179,9 +180,22 @@ void unix_gc(void)
         *      Avoid a recursive GC.
         */
 
-       if (!mutex_trylock(&unix_gc_sem))
+       if (!mutex_trylock(&unix_gc_local_lock))
                return;
 
+
+       /*
+        * unix_gc_sem protects against sockets going from in-flight to
+        * installed
+        *
+        * Can't sleep on this, because skb_recv_datagram could be
+        * waiting for a packet that is to be sent by the thread which
+        * invoked the gc
+        */
+       if (!down_write_trylock(&unix_gc_sem)) {
+               mutex_unlock(&unix_gc_local_lock);
+               return;
+       }
        spin_lock(&unix_table_lock);
 
        forall_unix_sockets(i, s)
@@ -207,8 +221,6 @@ void unix_gc(void)
 
        forall_unix_sockets(i, s)
        {
-               int open_count = 0;
-
                /*
                 *      If all instances of the descriptor are not
                 *      in flight we are in use.
@@ -218,10 +230,20 @@ void unix_gc(void)
                 *      In this case (see unix_create1()) we set artificial
                 *      negative inflight counter to close race window.
                 *      It is trick of course and dirty one.
+                *
+                *      Get the inflight counter first, then the open
+                *      counter.  This avoids problems if racing with
+                *      sendmsg
+                *
+                *      If just created socket is not yet attached to
+                *      a file descriptor, assume open_count of 1
                 */
+               int inflight_count = atomic_read(&unix_sk(s)->inflight);
+               int open_count = 1;
+
                if (s->sk_socket && s->sk_socket->file)
                        open_count = file_count(s->sk_socket->file);
-               if (open_count > atomic_read(&unix_sk(s)->inflight))
+               if (open_count > inflight_count)
                        maybe_unmark_and_push(s);
        }
 
@@ -302,11 +324,12 @@ void unix_gc(void)
                u->gc_tree = GC_ORPHAN;
        }
        spin_unlock(&unix_table_lock);
+       up_write(&unix_gc_sem);
 
        /*
         *      Here we are. Hitlist is filled. Die.
         */
 
        __skb_queue_purge(&hitlist);
-       mutex_unlock(&unix_gc_sem);
+       mutex_unlock(&unix_gc_local_lock);
 }
Index: linux-2.6.22-rc2/include/net/af_unix.h
===================================================================
--- linux-2.6.22-rc2.orig/include/net/af_unix.h 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/include/net/af_unix.h      2007-06-04 09:13:56.000000000 +0200
@@ -14,6 +14,7 @@ extern void unix_gc(void);
 
 extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 extern spinlock_t unix_table_lock;
+extern struct rw_semaphore unix_gc_sem;
 
 extern atomic_t unix_tot_inflight;
 
Index: linux-2.6.22-rc2/net/unix/af_unix.c
===================================================================
--- linux-2.6.22-rc2.orig/net/unix/af_unix.c    2007-06-03 23:58:11.000000000 +0200
+++ linux-2.6.22-rc2/net/unix/af_unix.c 2007-06-04 11:04:15.000000000 +0200
@@ -1572,6 +1572,7 @@ static int unix_dgram_recvmsg(struct kio
 
        msg->msg_namelen = 0;
 
+       down_read(&unix_gc_sem);
        mutex_lock(&u->readlock);
 
        skb = skb_recv_datagram(sk, flags, noblock, &err);
@@ -1629,6 +1630,7 @@ out_free:
        skb_free_datagram(sk,skb);
 out_unlock:
        mutex_unlock(&u->readlock);
+       up_read(&unix_gc_sem);
 out:
        return err;
 }
@@ -1704,6 +1706,7 @@ static int unix_stream_recvmsg(struct ki
                memset(&tmp_scm, 0, sizeof(tmp_scm));
        }
 
+       down_read(&unix_gc_sem);
        mutex_lock(&u->readlock);
 
        do
@@ -1732,6 +1735,7 @@ static int unix_stream_recvmsg(struct ki
                        if (!timeo)
                                break;
                        mutex_unlock(&u->readlock);
+                       up_read(&unix_gc_sem);
 
                        timeo = unix_stream_data_wait(sk, timeo);
 
@@ -1739,6 +1743,7 @@ static int unix_stream_recvmsg(struct ki
                                err = sock_intr_errno(timeo);
                                goto out;
                        }
+                       down_read(&unix_gc_sem);
                        mutex_lock(&u->readlock);
                        continue;
  unlock:
@@ -1810,6 +1815,7 @@ static int unix_stream_recvmsg(struct ki
        } while (size);
 
        mutex_unlock(&u->readlock);
+       up_read(&unix_gc_sem);
        scm_recv(sock, msg, siocb->scm, flags);
 out:
        return copied ? : err;

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to