This is a note to let you know that I've just added the patch titled

    ipv4: tcp: get rid of ugly unicast_sock

to the 3.14-stable tree which can be found at:
    
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     ipv4-tcp-get-rid-of-ugly-unicast_sock.patch
and it can be found in the queue-3.14 subdirectory.

If you, or anyone else, feel it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.


>From foo@baz Thu Feb 12 09:26:08 HKT 2015
From: Eric Dumazet <eduma...@google.com>
Date: Thu, 29 Jan 2015 21:35:05 -0800
Subject: ipv4: tcp: get rid of ugly unicast_sock

From: Eric Dumazet <eduma...@google.com>

[ Upstream commit bdbbb8527b6f6a358dbcb70dac247034d665b8e4 ]

In commit be9f4a44e7d41 ("ipv4: tcp: remove per net tcp_sock")
I tried to address contention on a socket lock, but the solution
I chose was horrible:

commit 3a7c384ffd57e ("ipv4: tcp: unicast_sock should not land outside
of TCP stack") addressed a selinux regression.

commit 0980e56e506b ("ipv4: tcp: set unicast_sock uc_ttl to -1")
took care of another regression.

commit b5ec8eeac46 ("ipv4: fix ip_send_skb()") fixed another regression.

commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate")
was another shot in the dark.

Really, just use a proper socket per cpu, and remove the skb_orphan()
call, to re-enable flow control.

This solves a serious problem with FQ packet scheduler when used in
hostile environments, as we do not want to allocate a flow structure
for every RST packet sent in response to a spoofed packet.

Signed-off-by: Eric Dumazet <eduma...@google.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
---
 include/net/ip.h         |    2 +-
 include/net/netns/ipv4.h |    1 +
 net/ipv4/ip_output.c     |   30 +++---------------------------
 net/ipv4/tcp_ipv4.c      |   37 ++++++++++++++++++++++++++++++++-----
 4 files changed, 37 insertions(+), 33 deletions(-)

--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -175,7 +175,7 @@ static inline __u8 ip_reply_arg_flowi_fl
        return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
 }
 
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
                           __be32 saddr, const struct ip_reply_arg *arg,
                           unsigned int len);
 
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -47,6 +47,7 @@ struct netns_ipv4 {
        struct inet_peer_base   *peers;
        struct tcpm_hash_bucket *tcp_metrics_hash;
        unsigned int            tcp_metrics_hash_log;
+       struct sock  * __percpu *tcp_sk;
        struct netns_frags      frags;
 #ifdef CONFIG_NETFILTER
        struct xt_table         *iptable_filter;
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1460,24 +1460,8 @@ static int ip_reply_glue_bits(void *dptr
 /*
  *     Generic function to send a packet as reply to another packet.
  *     Used to send some TCP resets/acks so far.
- *
- *     Use a fake percpu inet socket to avoid false sharing and contention.
  */
-static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
-       .sk = {
-               .__sk_common = {
-                       .skc_refcnt = ATOMIC_INIT(1),
-               },
-               .sk_wmem_alloc  = ATOMIC_INIT(1),
-               .sk_allocation  = GFP_ATOMIC,
-               .sk_flags       = (1UL << SOCK_USE_WRITE_QUEUE),
-               .sk_pacing_rate = ~0U,
-       },
-       .pmtudisc       = IP_PMTUDISC_WANT,
-       .uc_ttl         = -1,
-};
-
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
                           __be32 saddr, const struct ip_reply_arg *arg,
                           unsigned int len)
 {
@@ -1485,9 +1469,8 @@ void ip_send_unicast_reply(struct net *n
        struct ipcm_cookie ipc;
        struct flowi4 fl4;
        struct rtable *rt = skb_rtable(skb);
+       struct net *net = sock_net(sk);
        struct sk_buff *nskb;
-       struct sock *sk;
-       struct inet_sock *inet;
        int err;
 
        if (ip_options_echo(&replyopts.opt.opt, skb))
@@ -1517,15 +1500,11 @@ void ip_send_unicast_reply(struct net *n
        if (IS_ERR(rt))
                return;
 
-       inet = &get_cpu_var(unicast_sock);
+       inet_sk(sk)->tos = arg->tos;
 
-       inet->tos = arg->tos;
-       sk = &inet->sk;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
-       sock_net_set(sk, net);
-       __skb_queue_head_init(&sk->sk_write_queue);
        sk->sk_sndbuf = sysctl_wmem_default;
        err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
                             len, 0, &ipc, &rt, MSG_DONTWAIT);
@@ -1541,13 +1520,10 @@ void ip_send_unicast_reply(struct net *n
                          arg->csumoffset) = csum_fold(csum_add(nskb->csum,
                                                                arg->csum));
                nskb->ip_summed = CHECKSUM_NONE;
-               skb_orphan(nskb);
                skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
                ip_push_pending_frames(sk, &fl4);
        }
 out:
-       put_cpu_var(unicast_sock);
-
        ip_rt_put(rt);
 }
 
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -691,7 +691,8 @@ static void tcp_v4_send_reset(struct soc
 
        net = dev_net(skb_dst(skb)->dev);
        arg.tos = ip_hdr(skb)->tos;
-       ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
+       ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+                             skb, ip_hdr(skb)->saddr,
                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 
        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -774,7 +775,8 @@ static void tcp_v4_send_ack(struct sk_bu
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
-       ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
+       ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+                             skb, ip_hdr(skb)->saddr,
                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 
        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -2769,14 +2771,39 @@ struct proto tcp_prot = {
 };
 EXPORT_SYMBOL(tcp_prot);
 
+static void __net_exit tcp_sk_exit(struct net *net)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
+       free_percpu(net->ipv4.tcp_sk);
+}
+
 static int __net_init tcp_sk_init(struct net *net)
 {
+       int res, cpu;
+
+       net->ipv4.tcp_sk = alloc_percpu(struct sock *);
+       if (!net->ipv4.tcp_sk)
+               return -ENOMEM;
+
+       for_each_possible_cpu(cpu) {
+               struct sock *sk;
+
+               res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+                                          IPPROTO_TCP, net);
+               if (res)
+                       goto fail;
+               *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
+       }
        net->ipv4.sysctl_tcp_ecn = 2;
        return 0;
-}
 
-static void __net_exit tcp_sk_exit(struct net *net)
-{
+fail:
+       tcp_sk_exit(net);
+
+       return res;
 }
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)


Patches currently in stable-queue which might be from eduma...@google.com are

queue-3.14/tcp-ipv4-initialize-unicast_sock-sk_pacing_rate.patch
queue-3.14/ip-zero-sockaddr-returned-on-error-queue.patch
queue-3.14/ping-fix-race-in-free-in-receive-path.patch
queue-3.14/net-rps-fix-cpu-unplug.patch
queue-3.14/netxen-fix-netxen_nic_poll-logic.patch
queue-3.14/ipv4-tcp-get-rid-of-ugly-unicast_sock.patch
queue-3.14/bnx2x-fix-napi-poll-return-value-for-repoll.patch
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to