When TCP_CLOSE_NORST is set before a close(), sinking of unwanted data
is offloaded to the kernel with low resource usage, bounded by a
TCP_LINGER2 timeout. The socket transitions to FIN_WAIT1 and then
FIN_WAIT2, where it acks incoming data until either the timeout is hit
or a RST or FIN is received.

Signed-off-by: Debabrata Banerjee <dbane...@akamai.com>
---
 include/linux/tcp.h      |  4 +++-
 include/uapi/linux/tcp.h |  2 +-
 net/ipv4/tcp.c           | 23 +++++++++++++++++++++--
 net/ipv4/tcp_input.c     | 16 ++++++++++++----
 net/ipv4/tcp_minisocks.c | 15 +++++++++++++++
 5 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 72705eaf4b84..bd44bc99b480 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -226,7 +226,8 @@ struct tcp_sock {
                fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
                fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
                is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
-               unused:2;
+               norst:1,        /* Don't send RST on shutdown() socket */
+               unused:1;
        u8      nonagle     : 4,/* Disable Nagle algorithm?             */
                thin_lto    : 1,/* Use linear timeouts for thin streams */
                recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
@@ -429,6 +430,7 @@ struct tcp_timewait_sock {
 #ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key     *tw_md5_key;
 #endif
+       int                       tw_norst;
 };
 
 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 29eb659aa77a..369f3402b669 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -124,8 +124,8 @@ enum {
 #define TCP_FASTOPEN_NO_COOKIE 34      /* Enable TFO without a TFO cookie */
 #define TCP_ZEROCOPY_RECEIVE   35
 #define TCP_INQ                        36      /* Notify bytes available to read as a cmsg on read */
-
 #define TCP_CM_INQ             TCP_INQ
+#define TCP_CLOSE_NORST                37      /* Don't send RST on close()'d socket */
 
 struct tcp_repair_opt {
        __u32   opt_code;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0a2ea0bbf867..29fe763002e5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2318,8 +2318,10 @@ void tcp_close(struct sock *sk, long timeout)
        struct sk_buff *skb;
        int data_was_unread = 0;
        int state;
+       struct tcp_sock *tp;
 
        lock_sock(sk);
+       tp = tcp_sk(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;
 
        if (sk->sk_state == TCP_LISTEN) {
@@ -2362,8 +2364,19 @@ void tcp_close(struct sock *sk, long timeout)
        } else if (data_was_unread) {
                /* Unread data was tossed, zap the connection. */
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
-               tcp_set_state(sk, TCP_CLOSE);
-               tcp_send_active_reset(sk, sk->sk_allocation);
+
+               if (unlikely(tp->norst)) {
+                       if (tcp_close_state(sk)) {
+                               /* We will discard all new incoming data
+                                * set window to max of current or init.
+                                */
+                               tp->rcv_wnd = max(tp->rcv_wnd, MAX_TCP_WINDOW);
+                               tcp_send_fin(sk);
+                       }
+               } else {
+                       tcp_set_state(sk, TCP_CLOSE);
+                       tcp_send_active_reset(sk, sk->sk_allocation);
+               }
        } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
                /* Check zero linger _after_ checking for unread data. */
                sk->sk_prot->disconnect(sk, 0);
@@ -3040,6 +3053,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                else
                        tp->recvmsg_inq = val;
                break;
+       case TCP_CLOSE_NORST:
+               tp->norst = !!val;
+               break;
        default:
                err = -ENOPROTOOPT;
                break;
@@ -3523,6 +3539,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
                return err;
        }
 #endif
+       case TCP_CLOSE_NORST:
+               val = tp->norst;
+               break;
        default:
                return -ENOPROTOOPT;
        }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aebb29ab2fdf..e0aa6e126700 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6054,7 +6054,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                        break;
                }
 
-               if (tp->linger2 < 0) {
+               if (likely(!tp->norst) && tp->linger2 < 0) {
                        tcp_done(sk);
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                        return 1;
@@ -6064,9 +6064,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                        /* Receive out of order FIN after close() */
                        if (tp->syn_fastopen && th->fin)
                                tcp_fastopen_active_disable(sk);
-                       tcp_done(sk);
-                       NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
-                       return 1;
+
+                       if (likely(!tp->norst)) {
+                               tcp_done(sk);
+                               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+                               return 1;
+                       }
                }
 
                tmo = tcp_fin_time(sk);
@@ -6123,6 +6126,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                if (sk->sk_shutdown & RCV_SHUTDOWN) {
                        if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
                            after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+                               if (unlikely(tp->norst)) {
+                                       tcp_send_ack(sk);
+                                       goto discard;
+                               }
+
                                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                                tcp_reset(sk);
                                return 1;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f867658b4b30..48a9d5351478 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -133,6 +133,20 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
                        return TCP_TW_SUCCESS;
                }
 
+               if (tcptw->tw_norst) {
+                       /* ack and discard new data */
+                       tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+                       if (tmp_opt.saw_tstamp) {
+                               tcptw->tw_ts_recent_stamp = get_seconds();
+                               tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
+                       }
+
+                       if (th->fin) /* active remote close, we can die now */
+                               inet_twsk_deschedule_put(tw);
+
+                       return TCP_TW_ACK;
+               }
+
                /* New data or FIN. If new data arrive after half-duplex close,
                 * reset.
                 */
@@ -272,6 +286,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
                tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
                tcptw->tw_ts_offset     = tp->tsoffset;
                tcptw->tw_last_oow_ack_time = 0;
+               tcptw->tw_norst         = tp->norst;
 
 #if IS_ENABLED(CONFIG_IPV6)
                if (tw->tw_family == PF_INET6) {
-- 
2.17.0

Reply via email to