From: "David S. Miller" <[EMAIL PROTECTED]>
Date: Sat, 15 Apr 2006 01:23:27 -0700 (PDT)

> I came up with one more possible approach, it goes something like
> this:
> 
> 1) Get rid of the skb->destructor callback for TCP receive
>    data.
> 
> 2) Adjust sk_rmem_alloc and sk_forward_alloc explicitly when we add
>    packets to sk_receive_queue/out_of_order_queue, and when we unlink
>    them from sk_receive_queue and __kfree_skb() them up.

It turns out DCCP doesn't use the sk_forward_alloc memory
scheduling, at least not for receive.

So, as food for thought, below is a compile-tested-only
implementation of the above idea.  The basic transformations
are:

1) sk_eat_skb --> sk_stream_eat_skb which does the sk_forward_alloc
   et al. accounting by hand which would have been done by the
   skb->destructor sk_stream_rfree()

2) __skb_queue_purge --> sk_stream_eat_queue

3) sk_stream_set_owner_r --> sk_stream_charge_r which does
   the accounting for a new receive skb but does not hook up
   the destructor

4) unlink and free SKB not received to userspace yet -->
   sk_stream_eat_skb()

... and then I noticed that ip_rcv() unshares the SKB, so
skb->users should always be 1 and therefore it is impossible
for sk_stream_rfree() to be called from a non socket locked
context.

And all of this was a wild goose chase, oh well :-)
So the BUG_ON(!sk_forward_alloc) problem is elsewhere. :-/

diff --git a/arch/x86_64/kernel/functionlist b/arch/x86_64/kernel/functionlist
index 2bcebdc..da3379a 100644
--- a/arch/x86_64/kernel/functionlist
+++ b/arch/x86_64/kernel/functionlist
@@ -751,7 +751,6 @@
 *(.text.strcmp)
 *(.text.steal_locks)
 *(.text.sock_create)
-*(.text.sk_stream_rfree)
 *(.text.sk_stream_mem_schedule)
 *(.text.skip_atoi)
 *(.text.sk_alloc)
diff --git a/include/net/sock.h b/include/net/sock.h
index af2b054..54d5a2f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -442,14 +442,56 @@ static inline int sk_stream_memory_free(
        return sk->sk_wmem_queued < sk->sk_sndbuf;
 }
 
-extern void sk_stream_rfree(struct sk_buff *skb);
-
-static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
+/* Charge SKB as receive queue memory to SK.  The socket must be locked
+ * when we get here.
+ */
+static inline void sk_stream_charge_r(struct sk_buff *skb, struct sock *sk)
 {
-       skb->sk = sk;
-       skb->destructor = sk_stream_rfree;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        sk->sk_forward_alloc -= skb->truesize;
+}
+
+/* Release SKB as receive queue memory from SK.  The socket must be
+ * locked when we get here.
+ */
+static inline void sk_stream_release_r(struct sk_buff *skb, struct sock *sk)
+{
+       atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+       sk->sk_forward_alloc += skb->truesize;
+}
+
+/**
+ * sk_stream_eat_skb - Release a skb if it is no longer needed
+ * @sk: socket to eat this skb from
+ * @skb: socket buffer to eat
+ * @queue: skb queue to unlink from
+ *
+ * This routine must be called with interrupts disabled or with the socket
+ * locked so that the sk_buff queue operation is ok.
+ */
+static inline void sk_stream_eat_skb(struct sk_buff *skb, struct sock *sk,
+				     struct sk_buff_head *queue)
+{
+       __skb_unlink(skb, queue);
+       sk_stream_release_r(skb, sk);
+       __kfree_skb(skb);
+}
+
+/**
+ * sk_stream_eat_queue - Release an entire queue of skbs, when no longer needed
+ * @sk: socket to eat these skbs from
+ * @queue: queue of skbs to eat
+ *
+ * This routine must be called with interrupts disabled or with the socket
+ * locked so that the sk_buff queue operation is ok.
+ */
+static inline void sk_stream_eat_queue(struct sock *sk,
+				       struct sk_buff_head *queue)
+{
+       struct sk_buff *skb;
+
+       while ((skb = __skb_dequeue(queue)) != NULL) {
+               sk_stream_release_r(skb, sk);
+               __kfree_skb(skb);
+       }
 }
 
 static inline void sk_stream_free_skb(struct sock *sk, struct sk_buff *skb)
diff --git a/net/core/stream.c b/net/core/stream.c
index 35e2525..a034f2d 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -172,16 +172,6 @@ do_interrupted:
 
 EXPORT_SYMBOL(sk_stream_wait_memory);
 
-void sk_stream_rfree(struct sk_buff *skb)
-{
-       struct sock *sk = skb->sk;
-
-       atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
-       sk->sk_forward_alloc += skb->truesize;
-}
-
-EXPORT_SYMBOL(sk_stream_rfree);
-
 int sk_stream_error(struct sock *sk, int flags, int err)
 {
        if (err == -EPIPE)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 87f68e7..c45a9c2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_
                                break;
                }
                if (skb->h.th->fin) {
-                       sk_eat_skb(sk, skb);
+                       sk_stream_eat_skb(skb, sk, &sk->sk_receive_queue);
                        ++seq;
                        break;
                }
-               sk_eat_skb(sk, skb);
+               sk_stream_eat_skb(skb, sk, &sk->sk_receive_queue);
                if (!desc->count)
                        break;
        }
@@ -1355,15 +1355,17 @@ skip_copy:
 
                if (skb->h.th->fin)
                        goto found_fin_ok;
-               if (!(flags & MSG_PEEK))
-                       sk_eat_skb(sk, skb);
+               if (!(flags & MSG_PEEK)) {
+                       sk_stream_eat_skb(skb, sk, &sk->sk_receive_queue);
+               }
                continue;
 
        found_fin_ok:
                /* Process the FIN. */
                ++*seq;
-               if (!(flags & MSG_PEEK))
-                       sk_eat_skb(sk, skb);
+               if (!(flags & MSG_PEEK)) {
+                       sk_stream_eat_skb(skb, sk, &sk->sk_receive_queue);
+               }
                break;
        } while (len > 0);
 
@@ -1489,6 +1491,7 @@ void tcp_close(struct sock *sk, long tim
                u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
                          skb->h.th->fin;
                data_was_unread += len;
+               sk_stream_release_r(skb, sk);
                __kfree_skb(skb);
        }
 
@@ -1650,9 +1653,9 @@ int tcp_disconnect(struct sock *sk, int 
                sk->sk_err = ECONNRESET;
 
        tcp_clear_xmit_timers(sk);
-       __skb_queue_purge(&sk->sk_receive_queue);
+       sk_stream_eat_queue(sk, &sk->sk_receive_queue);
        sk_stream_writequeue_purge(sk);
-       __skb_queue_purge(&tp->out_of_order_queue);
+       sk_stream_eat_queue(sk, &tp->out_of_order_queue);
 
        inet->dport = 0;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9f0cca4..5a30648 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2875,7 +2875,7 @@ static void tcp_fin(struct sk_buff *skb,
        /* It _is_ possible, that we have something out-of-order _after_ FIN.
         * Probably, we should reset in this case. For now drop them.
         */
-       __skb_queue_purge(&tp->out_of_order_queue);
+       sk_stream_eat_queue(sk, &tp->out_of_order_queue);
        if (tp->rx_opt.sack_ok)
                tcp_sack_reset(&tp->rx_opt);
        sk_stream_mem_reclaim(sk);
@@ -3093,8 +3093,7 @@ static void tcp_ofo_queue(struct sock *s
 
                if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
                        SOCK_DEBUG(sk, "ofo packet was already received \n");
-                       __skb_unlink(skb, &tp->out_of_order_queue);
-                       __kfree_skb(skb);
+                       sk_stream_eat_skb(skb, sk, &tp->out_of_order_queue);
                        continue;
                }
                SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
@@ -3166,7 +3165,7 @@ queue_and_out:
                                    !sk_stream_rmem_schedule(sk, skb))
                                        goto drop;
                        }
-                       sk_stream_set_owner_r(skb, sk);
+                       sk_stream_charge_r(skb, sk);
                        __skb_queue_tail(&sk->sk_receive_queue, skb);
                }
                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -3248,7 +3247,7 @@ drop:
        SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
                   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 
-       sk_stream_set_owner_r(skb, sk);
+       sk_stream_charge_r(skb, sk);
 
        if (!skb_peek(&tp->out_of_order_queue)) {
                /* Initial out of order segment, build 1 SACK. */
@@ -3290,6 +3289,7 @@ drop:
                    before(seq, TCP_SKB_CB(skb1)->end_seq)) {
                        if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
                                /* All the bits are present. Drop. */
+                               sk_stream_release_r(skb, sk);
                                __kfree_skb(skb);
                                tcp_dsack_set(tp, seq, end_seq);
                                goto add_sack;
@@ -3311,9 +3311,8 @@ drop:
                               tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
                               break;
                       }
-                      __skb_unlink(skb1, &tp->out_of_order_queue);
                       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
-                      __kfree_skb(skb1);
+                      sk_stream_eat_skb(skb1, sk, &tp->out_of_order_queue);
                }
 
 add_sack:
@@ -3340,8 +3339,7 @@ tcp_collapse(struct sock *sk, struct sk_
                /* No new bits? It is possible on ofo queue. */
                if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
                        struct sk_buff *next = skb->next;
-                       __skb_unlink(skb, list);
-                       __kfree_skb(skb);
+                       sk_stream_eat_skb(skb, sk, list);
                        NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
                        skb = next;
                        continue;
@@ -3387,7 +3385,7 @@ tcp_collapse(struct sock *sk, struct sk_
                memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
                TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
                __skb_insert(nskb, skb->prev, skb, list);
-               sk_stream_set_owner_r(nskb, sk);
+               sk_stream_charge_r(nskb, sk);
 
                /* Copy data, releasing collapsed skbs. */
                while (copy > 0) {
@@ -3405,8 +3403,7 @@ tcp_collapse(struct sock *sk, struct sk_
                        }
                        if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
                                struct sk_buff *next = skb->next;
-                               __skb_unlink(skb, list);
-                               __kfree_skb(skb);
+                               sk_stream_eat_skb(skb, sk, list);
                                NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
                                skb = next;
                                if (skb == tail || skb->h.th->syn || skb->h.th->fin)
@@ -3494,7 +3491,7 @@ static int tcp_prune_queue(struct sock *
        /* First, purge the out_of_order queue. */
        if (!skb_queue_empty(&tp->out_of_order_queue)) {
                NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
-               __skb_queue_purge(&tp->out_of_order_queue);
+               sk_stream_eat_queue(sk, &tp->out_of_order_queue);
 
                /* Reset SACK state.  A conforming SACK implementation will
                 * do the same at a timeout based retransmit.  When a connection
@@ -3703,10 +3700,8 @@ static void tcp_check_urg(struct sock * 
            tp->copied_seq != tp->rcv_nxt) {
                struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
                tp->copied_seq++;
-               if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
-                       __skb_unlink(skb, &sk->sk_receive_queue);
-                       __kfree_skb(skb);
-               }
+               if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))
+                       sk_stream_eat_skb(skb, sk, &sk->sk_receive_queue);
        }
 
        tp->urg_data   = TCP_URG_NOTYET;
@@ -3950,7 +3945,7 @@ int tcp_rcv_established(struct sock *sk,
                                /* Bulk data transfer: receiver */
                                __skb_pull(skb,tcp_header_len);
                                __skb_queue_tail(&sk->sk_receive_queue, skb);
-                               sk_stream_set_owner_r(skb, sk);
+                               sk_stream_charge_r(skb, sk);
                                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                        }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 672950e..e518d97 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1294,7 +1294,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
        sk_stream_writequeue_purge(sk);
 
        /* Cleans up our, hopefully empty, out_of_order_queue. */
-       __skb_queue_purge(&tp->out_of_order_queue);
+       sk_stream_eat_queue(sk, &tp->out_of_order_queue);
 
        /* Clean prequeue, it must be empty really */
        __skb_queue_purge(&tp->ucopy.prequeue);
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to