When TCP_CLOSE_NORST is set before a close(), offload the sinking of unwanted incoming data to the kernel with low resource usage, bounded by the TCP_LINGER2 timeout. The socket will transition to FIN_WAIT1 and then FIN_WAIT2, where it will ACK incoming data until either the timeout expires or a RST or FIN is received.
Signed-off-by: Debabrata Banerjee <dbane...@akamai.com> --- include/linux/tcp.h | 4 +++- include/uapi/linux/tcp.h | 2 +- net/ipv4/tcp.c | 23 +++++++++++++++++++++-- net/ipv4/tcp_input.c | 16 ++++++++++++---- net/ipv4/tcp_minisocks.c | 15 +++++++++++++++ 5 files changed, 52 insertions(+), 8 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 72705eaf4b84..bd44bc99b480 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -226,7 +226,8 @@ struct tcp_sock { fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ - unused:2; + norst:1, /* Don't send RST on shutdown() socket */ + unused:1; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ @@ -429,6 +430,7 @@ struct tcp_timewait_sock { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif + int tw_norst; }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 29eb659aa77a..369f3402b669 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -124,8 +124,8 @@ enum { #define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ #define TCP_ZEROCOPY_RECEIVE 35 #define TCP_INQ 36 /* Notify bytes available to read as a cmsg on read */ - #define TCP_CM_INQ TCP_INQ +#define TCP_CLOSE_NORST 37 /* Don't send RST on close()'d socket */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0a2ea0bbf867..29fe763002e5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2318,8 +2318,10 @@ void tcp_close(struct sock *sk, long timeout) struct sk_buff *skb; int data_was_unread = 0; int state; + struct tcp_sock *tp; lock_sock(sk); + tp = tcp_sk(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == 
TCP_LISTEN) { @@ -2362,8 +2364,19 @@ void tcp_close(struct sock *sk, long timeout) } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); - tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, sk->sk_allocation); + + if (unlikely(tp->norst)) { + if (tcp_close_state(sk)) { + /* We will discard all new incoming data + * set window to max of current or init. + */ + tp->rcv_wnd = max(tp->rcv_wnd, MAX_TCP_WINDOW); + tcp_send_fin(sk); + } + } else { + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, sk->sk_allocation); + } } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); @@ -3040,6 +3053,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->recvmsg_inq = val; break; + case TCP_CLOSE_NORST: + tp->norst = !!val; + break; default: err = -ENOPROTOOPT; break; @@ -3523,6 +3539,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return err; } #endif + case TCP_CLOSE_NORST: + val = tp->norst; + break; default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index aebb29ab2fdf..e0aa6e126700 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6054,7 +6054,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; } - if (tp->linger2 < 0) { + if (likely(!tp->norst) && tp->linger2 < 0) { tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; @@ -6064,9 +6064,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) /* Receive out of order FIN after close() */ if (tp->syn_fastopen && th->fin) tcp_fastopen_active_disable(sk); - tcp_done(sk); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - return 1; + + if (likely(!tp->norst)) { + tcp_done(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + return 1; + } } tmo = tcp_fin_time(sk); @@ -6123,6 
+6126,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + if (unlikely(tp->norst)) { + tcp_send_ack(sk); + goto discard; + } + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk); return 1; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f867658b4b30..48a9d5351478 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -133,6 +133,20 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, return TCP_TW_SUCCESS; } + if (tcptw->tw_norst) { + /* ack and discard new data */ + tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tmp_opt.saw_tstamp) { + tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; + } + + if (th->fin) /* active remote close, we can die now */ + inet_twsk_deschedule_put(tw); + + return TCP_TW_ACK; + } + /* New data or FIN. If new data arrive after half-duplex close, * reset. */ @@ -272,6 +286,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; + tcptw->tw_norst = tp->norst; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { -- 2.17.0