Note that when a new netns is created, it inherits its
sysctl_tcp_rmem and sysctl_tcp_wmem from the initial netns.

This change is needed so that we can refine TCP rcvbuf autotuning,
to take RTT into consideration.

Signed-off-by: Eric Dumazet <eduma...@google.com>
Cc: Wei Wang <wei...@google.com>
---
 include/net/netns/ipv4.h   |  2 ++
 include/net/tcp.h          |  2 --
 net/ipv4/sysctl_net_ipv4.c | 32 ++++++++++++++++----------------
 net/ipv4/tcp.c             | 21 ++++++++-------------
 net/ipv4/tcp_input.c       | 14 ++++++++------
 net/ipv4/tcp_ipv4.c        | 13 ++++++++++---
 net/ipv4/tcp_output.c      |  2 +-
 net/ipv6/tcp_ipv6.c        |  4 ++--
 8 files changed, 47 insertions(+), 43 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 379550f8124a1fca9987e644dbe00b7fa4bd9fdd..5e12975fc658b8671ad74678d25df1ada6fab1c8 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -155,6 +155,8 @@ struct netns_ipv4 {
        int sysctl_tcp_invalid_ratelimit;
        int sysctl_tcp_pacing_ss_ratio;
        int sysctl_tcp_pacing_ca_ratio;
+       int sysctl_tcp_wmem[3];
+       int sysctl_tcp_rmem[3];
        struct inet_timewait_death_row tcp_death_row;
        int sysctl_max_syn_backlog;
        int sysctl_tcp_fastopen;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index babfd4da1515841f7cf02506736c2c4bb0006e7d..2f2c69ad31b28d7c479768b52bd1388220086f14 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -242,8 +242,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* sysctl variables for tcp */
 extern int sysctl_tcp_max_orphans;
 extern long sysctl_tcp_mem[3];
-extern int sysctl_tcp_wmem[3];
-extern int sysctl_tcp_rmem[3];
 
 #define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
 #define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a82b4403830816351f070fa47e2a7338bcc6abf5..ef0ff3357a44757d59f658614f3858687ef2ef47 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -440,22 +440,6 @@ static struct ctl_table ipv4_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_doulongvec_minmax,
        },
-       {
-               .procname       = "tcp_wmem",
-               .data           = &sysctl_tcp_wmem,
-               .maxlen         = sizeof(sysctl_tcp_wmem),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &one,
-       },
-       {
-               .procname       = "tcp_rmem",
-               .data           = &sysctl_tcp_rmem,
-               .maxlen         = sizeof(sysctl_tcp_rmem),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &one,
-       },
        {
                .procname       = "tcp_low_latency",
                .data           = &sysctl_tcp_low_latency,
@@ -1164,6 +1148,22 @@ static struct ctl_table ipv4_net_table[] = {
                .extra1         = &zero,
                .extra2         = &thousand,
        },
+       {
+               .procname       = "tcp_wmem",
+               .data           = &init_net.ipv4.sysctl_tcp_wmem,
+               .maxlen         = sizeof(init_net.ipv4.sysctl_tcp_wmem),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &one,
+       },
+       {
+               .procname       = "tcp_rmem",
+               .data           = &init_net.ipv4.sysctl_tcp_rmem,
+               .maxlen         = sizeof(init_net.ipv4.sysctl_tcp_rmem),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &one,
+       },
        { }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c4cb19ed4628180ce627f9731def66feb6140baf..bc71a27d5ad98208a9b5f744b04d9abf9b0b9a6a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -289,12 +289,7 @@ struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
 long sysctl_tcp_mem[3] __read_mostly;
-int sysctl_tcp_wmem[3] __read_mostly;
-int sysctl_tcp_rmem[3] __read_mostly;
-
 EXPORT_SYMBOL(sysctl_tcp_mem);
-EXPORT_SYMBOL(sysctl_tcp_rmem);
-EXPORT_SYMBOL(sysctl_tcp_wmem);
 
 atomic_long_t tcp_memory_allocated;    /* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
@@ -456,8 +451,8 @@ void tcp_init_sock(struct sock *sk)
 
        icsk->icsk_sync_mss = tcp_sync_mss;
 
-       sk->sk_sndbuf = sysctl_tcp_wmem[1];
-       sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+       sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
+       sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
 
        sk_sockets_allocated_inc(sk);
 }
@@ -3636,13 +3631,13 @@ void __init tcp_init(void)
        max_wshare = min(4UL*1024*1024, limit);
        max_rshare = min(6UL*1024*1024, limit);
 
-       sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
-       sysctl_tcp_wmem[1] = 16*1024;
-       sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+       init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+       init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
+       init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
 
-       sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
-       sysctl_tcp_rmem[1] = 87380;
-       sysctl_tcp_rmem[2] = max(87380, max_rshare);
+       init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+       init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
+       init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
 
        pr_info("Hash tables configured (established %u bind %u)\n",
                tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0ada8bfc2ebdf56b785d419edbe87a2ac3156389..e41123d854797910890dbd84900b5a7393089394 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -320,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
        sndmem *= nr_segs * per_mss;
 
        if (sk->sk_sndbuf < sndmem)
-               sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+               sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
 }
 
 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -354,7 +354,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
        struct tcp_sock *tp = tcp_sk(sk);
        /* Optimize this! */
        int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
-       int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1;
+       int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
 
        while (tp->rcv_ssthresh <= window) {
                if (truesize <= skb->len)
@@ -409,7 +409,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
                rcvmem <<= 2;
 
        if (sk->sk_rcvbuf < rcvmem)
-               sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
+               sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
 }
 
 /* 4. Try to fixup all. It is made immediately after connection enters
@@ -457,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
+       struct net *net = sock_net(sk);
 
        icsk->icsk_ack.quick = 0;
 
-       if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+       if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
            !tcp_under_memory_pressure(sk) &&
            sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
                sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
-                                   sysctl_tcp_rmem[2]);
+                                   net->ipv4.sysctl_tcp_rmem[2]);
        }
        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
                tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -623,7 +624,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
                while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
                        rcvmem += 128;
 
-               rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+               rcvbuf = min(rcvwin / tp->advmss * rcvmem,
+                            sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
                if (rcvbuf > sk->sk_rcvbuf) {
                        sk->sk_rcvbuf = rcvbuf;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0162c577bb9cf85160ed833e3e069daf66a2cd15..1eac84b8044e70fe99c6d3c4d8c5319392141b40 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2409,8 +2409,8 @@ struct proto tcp_prot = {
        .memory_allocated       = &tcp_memory_allocated,
        .memory_pressure        = &tcp_memory_pressure,
        .sysctl_mem             = sysctl_tcp_mem,
-       .sysctl_wmem            = sysctl_tcp_wmem,
-       .sysctl_rmem            = sysctl_tcp_rmem,
+       .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+       .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
        .max_header             = MAX_TCP_HEADER,
        .obj_size               = sizeof(struct tcp_sock),
        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
@@ -2509,7 +2509,14 @@ static int __net_init tcp_sk_init(struct net *net)
        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
-
+       if (net != &init_net) {
+               memcpy(net->ipv4.sysctl_tcp_rmem,
+                      init_net.ipv4.sysctl_tcp_rmem,
+                      sizeof(init_net.ipv4.sysctl_tcp_rmem));
+               memcpy(net->ipv4.sysctl_tcp_wmem,
+                      init_net.ipv4.sysctl_tcp_wmem,
+                      sizeof(init_net.ipv4.sysctl_tcp_wmem));
+       }
        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a9d917e4dad5c6fec1dc81e96c16b78223786d4c..9b98d35aa0d8d0a829e4a41985d805d4e2895a8e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -220,7 +220,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
        (*rcv_wscale) = 0;
        if (wscale_ok) {
                /* Set window scaling on max possible window */
-               space = max_t(u32, space, sysctl_tcp_rmem[2]);
+               space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
                space = max_t(u32, space, sysctl_rmem_max);
                space = min_t(u32, space, *window_clamp);
                while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0e2529958b52c422ac9ca66186dae17d69628f0b..6bb98c93edfe2ed2f16fe5229605f8108cfc7f9a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1940,8 +1940,8 @@ struct proto tcpv6_prot = {
        .memory_pressure        = &tcp_memory_pressure,
        .orphan_count           = &tcp_orphan_count,
        .sysctl_mem             = sysctl_tcp_mem,
-       .sysctl_wmem            = sysctl_tcp_wmem,
-       .sysctl_rmem            = sysctl_tcp_rmem,
+       .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+       .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
        .max_header             = MAX_TCP_HEADER,
        .obj_size               = sizeof(struct tcp6_sock),
        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
-- 
2.15.0.403.gc27cc4dac6-goog

Reply via email to