Regression introduced by commit 2dc49d1680.

tcp_v6_fill_cb() will be called twice if the socket's state changes from
TCP_TIME_WAIT to TCP_LISTEN. That results in a performance loss and in
control buffer data corruption: by the time of the second call, IP6CB()
no longer holds the IPv6 control data, so the copy fills 'header' with
the freshly written 'seq', 'end_seq', etc. instead.
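
A minimal user-space sketch of the aliasing at fault (the structs below
only mimic the relevant cb[] layout, they are not the kernel
definitions): IP6CB() points at the start of skb->cb[], so once
seq/end_seq have been written there, a second memmove() into 'header'
copies TCP sequence data instead of the IPv6 control data:

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  struct inet6_skb_parm { int iif; };     /* stand-in for the IP6CB() data */

  struct tcp_skb_cb {                     /* current layout: 'header' last */
          uint32_t seq;
          uint32_t end_seq;
          struct inet6_skb_parm h6;
  };

  int main(void)
  {
          char cb[48] __attribute__((aligned(8))) = { 0 };
          struct inet6_skb_parm *ip6cb = (struct inet6_skb_parm *)cb; /* IP6CB(skb)      */
          struct tcp_skb_cb *tcb = (struct tcp_skb_cb *)cb;           /* TCP_SKB_CB(skb) */

          ip6cb->iif = 42;                /* real incoming interface index */

          /* 1st tcp_v6_fill_cb(): the header copy is still correct */
          memmove(&tcb->h6, ip6cb, sizeof(*ip6cb));
          tcb->seq = 0xdeadbeef;          /* overwrites the bytes IP6CB() covers */

          /* 2nd tcp_v6_fill_cb() (TIME_WAIT -> TCP_LISTEN path): IP6CB() now
           * aliases seq, so 'header' receives TCP sequence data instead */
          memmove(&tcb->h6, ip6cb, sizeof(*ip6cb));

          printf("iif after second fill: %#x\n", (unsigned int)tcb->h6.iif);
          return 0;                       /* prints 0xdeadbeef, not 0x2a */
  }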

Reproduced with the LTP tcp_fastopen test and with netperf -t TCP_CRR,
i.e. with workloads that constantly close and reopen TCP connections
after a few requests/responses from the client.

This can be fixed by moving the 'header' union back to the beginning of
'struct tcp_skb_cb', so that the copy is no longer needed, and by keeping
skb->next, TCP_SKB_CB(skb)->seq and TCP_SKB_CB(skb)->end_seq on the same
cache line: move 'cb[48]' closer to 'skb->next', above the *sk and *dev
pointers.
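
For reference, a minimal user-space sketch of the invariant the new
layout relies on (again a simplified model, not the kernel structs):
with the union first, IP6CB() and TCP_SKB_CB(skb)->header.h6 name the
same bytes, so nothing has to be copied and repeated tcp_v6_fill_cb()
calls only rewrite seq/end_seq:

  #include <assert.h>
  #include <stddef.h>
  #include <stdint.h>

  struct inet6_skb_parm { int iif; };

  struct tcp_skb_cb {                     /* new layout: 'header' first */
          struct inet6_skb_parm h6;
          uint32_t seq;
          uint32_t end_seq;
  };

  int main(void)
  {
          char cb[48] __attribute__((aligned(8))) = { 0 };

          /* IP6CB(skb) and TCP_SKB_CB(skb)->header.h6 alias the same storage */
          assert((void *)&((struct tcp_skb_cb *)cb)->h6 == (void *)cb);
          assert(offsetof(struct tcp_skb_cb, h6) == 0);
          return 0;
  }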

Signed-off-by: Alexey Kodanev <alexey.koda...@oracle.com>
---
 include/linux/skbuff.h |    5 +++--
 include/net/tcp.h      |   17 ++++++++---------
 net/ipv4/tcp_ipv4.c    |    6 ------
 net/ipv4/tcp_output.c  |    4 ----
 net/ipv6/tcp_ipv6.c    |   21 ++++-----------------
 5 files changed, 15 insertions(+), 38 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f54d665..8870d16 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -524,8 +524,6 @@ struct sk_buff {
                };
                struct rb_node  rbnode; /* used in netem & tcp stack */
        };
-       struct sock             *sk;
-       struct net_device       *dev;
 
        /*
         * This is the control buffer. It is free to use for every
@@ -535,6 +533,9 @@ struct sk_buff {
         */
        char                    cb[48] __aligned(8);
 
+       struct sock             *sk;
+       struct net_device       *dev;
+
        unsigned long           _skb_refdst;
        void                    (*destructor)(struct sk_buff *skb);
 #ifdef CONFIG_XFRM
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8d6b983..1a5cf12 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -694,6 +694,13 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
  * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
  */
 struct tcp_skb_cb {
+       union {
+               struct inet_skb_parm    h4;
+#if IS_ENABLED(CONFIG_IPV6)
+               struct inet6_skb_parm   h6;
+#endif
+       } header;       /* For incoming frames          */
+
        __u32           seq;            /* Starting sequence number     */
        __u32           end_seq;        /* SEQ + FIN + SYN + datalen    */
        union {
@@ -721,21 +728,13 @@ struct tcp_skb_cb {
        __u8            ip_dsfield;     /* IPv4 tos or IPv6 dsfield     */
        /* 1 byte hole */
        __u32           ack_seq;        /* Sequence number ACK'd        */
-       union {
-               struct inet_skb_parm    h4;
-#if IS_ENABLED(CONFIG_IPV6)
-               struct inet6_skb_parm   h6;
-#endif
-       } header;       /* For incoming frames          */
 };
 
 #define TCP_SKB_CB(__skb)      ((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
 
 #if IS_ENABLED(CONFIG_IPV6)
-/* This is the variant of inet6_iif() that must be used by TCP,
- * as TCP moves IP6CB into a different location in skb->cb[]
- */
+/* This is the variant of inet6_iif() that must be used by TCP */
 static inline int tcp_v6_iif(const struct sk_buff *skb)
 {
        return TCP_SKB_CB(skb)->header.h6.iif;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5a2dfed..43d6cd2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1622,12 +1622,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
 
        th = tcp_hdr(skb);
        iph = ip_hdr(skb);
-       /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
-        * barrier() makes sure compiler wont play fool^Waliasing games.
-        */
-       memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
-               sizeof(struct inet_skb_parm));
-       barrier();
 
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a2a796c..7de50d0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1011,10 +1011,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
        /* Our usage of tstamp should remain private */
        skb->tstamp.tv64 = 0;
 
-       /* Cleanup our debris for IP stacks */
-       memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
-                              sizeof(struct inet6_skb_parm)));
-
        err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
 
        if (likely(err <= 0))
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5d46832..e8cc440 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1392,15 +1392,6 @@ ipv6_pktoptions:
 static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
                           const struct tcphdr *th)
 {
-       /* This is tricky: we move IP6CB at its correct location into
-        * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because
-        * _decode_session6() uses IP6CB().
-        * barrier() makes sure compiler won't play aliasing games.
-        */
-       memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb),
-               sizeof(struct inet6_skb_parm));
-       barrier();
-
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff*4);
@@ -1443,8 +1434,10 @@ static int tcp_v6_rcv(struct sk_buff *skb)
        th = tcp_hdr(skb);
        hdr = ipv6_hdr(skb);
 
+       tcp_v6_fill_cb(skb, hdr, th);
+
        sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest,
-                               inet6_iif(skb));
+                               tcp_v6_iif(skb));
        if (!sk)
                goto no_tcp_socket;
 
@@ -1460,8 +1453,6 @@ process:
        if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;
 
-       tcp_v6_fill_cb(skb, hdr, th);
-
 #ifdef CONFIG_TCP_MD5SIG
        if (tcp_v6_inbound_md5_hash(sk, skb))
                goto discard_and_relse;
@@ -1493,8 +1484,6 @@ no_tcp_socket:
        if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;
 
-       tcp_v6_fill_cb(skb, hdr, th);
-
        if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
 csum_error:
                TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
@@ -1518,8 +1507,6 @@ do_time_wait:
                goto discard_it;
        }
 
-       tcp_v6_fill_cb(skb, hdr, th);
-
        if (skb->len < (th->doff<<2)) {
                inet_twsk_put(inet_twsk(sk));
                goto bad_packet;
@@ -1538,7 +1525,7 @@ do_time_wait:
                                            &ipv6_hdr(skb)->saddr, th->source,
                                            &ipv6_hdr(skb)->daddr,
                                            ntohs(th->dest), tcp_v6_iif(skb));
-               if (sk2 != NULL) {
+               if (sk2) {
                        struct inet_timewait_sock *tw = inet_twsk(sk);
                        inet_twsk_deschedule(tw, &tcp_death_row);
                        inet_twsk_put(tw);
-- 
1.7.1
