Re: [PATCH v3 net-next 4/4] tcp: implement coalescing on backlog queue

2018-11-28 Thread Neal Cardwell
On Tue, Nov 27, 2018 at 5:42 PM Eric Dumazet  wrote:
>
> In case GRO is not as efficient as it should be, or is disabled,
> we might have a user thread trapped in __release_sock() while
> the softirq handler floods it with packets, up to the point
> where we have to drop them.
>
> This patch balances the work done by the user thread and the
> softirq handler, to give __release_sock() more chances to
> complete its work before new packets are added to the backlog.
>
> This also helps when we receive many ACK packets, since GRO
> does not aggregate them.
>
> This patch brings a ~60% throughput increase on a receiver
> without GRO, but the spectacular gain is really the 1000x
> release_sock() latency reduction I have measured.
>
> Signed-off-by: Eric Dumazet 
> Cc: Neal Cardwell 
> Cc: Yuchung Cheng 
> ---

Acked-by: Neal Cardwell 

Thanks!

neal


[PATCH v3 net-next 4/4] tcp: implement coalescing on backlog queue

2018-11-27 Thread Eric Dumazet
In case GRO is not as efficient as it should be, or is disabled,
we might have a user thread trapped in __release_sock() while
the softirq handler floods it with packets, up to the point
where we have to drop them.

This patch balances the work done by the user thread and the
softirq handler, to give __release_sock() more chances to
complete its work before new packets are added to the backlog.

This also helps when we receive many ACK packets, since GRO
does not aggregate them.

This patch brings a ~60% throughput increase on a receiver
without GRO, but the spectacular gain is really the 1000x
release_sock() latency reduction I have measured.

Signed-off-by: Eric Dumazet 
Cc: Neal Cardwell 
Cc: Yuchung Cheng 
---
 include/uapi/linux/snmp.h |  1 +
 net/ipv4/proc.c   |  1 +
 net/ipv4/tcp_ipv4.c   | 92 ---
 3 files changed, 88 insertions(+), 6 deletions(-)
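
A minimal, self-contained user-space sketch of the coalescing idea,
for readers without the kernel context: append an in-sequence
segment's payload to the queue tail instead of linking a new node,
and count each merge (the analogue of the new TCPBacklogCoalesce
counter). All names below (seg, backlog, backlog_try_coalesce) are
illustrative, not kernel APIs; the real patch additionally checks TCP
flags, DSCP, header options and skb geometry before merging.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for a queued TCP segment: payload plus sequence metadata. */
struct seg {
	uint32_t seq;             /* first sequence number covered */
	uint32_t end_seq;         /* seq + payload length */
	size_t len;
	unsigned char data[2048];
	struct seg *next;
};

struct backlog {
	struct seg *head, *tail;
	unsigned long coalesced;  /* stand-in for TCPBacklogCoalesce */
};

/* Merge s's payload into the queue tail if s starts exactly where the
 * tail ends and there is room; otherwise the caller links a new node
 * (the "no_coalesce" path in the patch). */
static int backlog_try_coalesce(struct backlog *q, const struct seg *s)
{
	struct seg *tail = q->tail;

	if (!tail || tail->end_seq != s->seq ||
	    tail->len + s->len > sizeof(tail->data))
		return 0;
	memcpy(tail->data + tail->len, s->data, s->len);
	tail->len += s->len;
	tail->end_seq = s->end_seq;
	q->coalesced++;
	return 1;
}

static void backlog_add(struct backlog *q, const struct seg *s)
{
	struct seg *node;

	if (backlog_try_coalesce(q, s))
		return;
	node = malloc(sizeof(*node));
	if (!node)
		abort();
	*node = *s;
	node->next = NULL;
	if (q->tail)
		q->tail->next = node;
	else
		q->head = node;
	q->tail = node;
}

int main(void)
{
	struct backlog q = { NULL, NULL, 0 };
	struct seg a = { .seq = 1000, .end_seq = 1004, .len = 4, .data = "abcd" };
	struct seg b = { .seq = 1004, .end_seq = 1008, .len = 4, .data = "efgh" };

	backlog_add(&q, &a);
	backlog_add(&q, &b);  /* contiguous with a: merged into the same node */
	printf("coalesced=%lu tail_len=%zu\n", q.coalesced, q.tail->len);
	return 0;
}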

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index f80135e5feaa88609db6dff75b2bc2d637b2..86dc24a96c90ab047d5173d625450facd6c6dd79 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -243,6 +243,7 @@ enum
LINUX_MIB_TCPREQQFULLDROP,  /* TCPReqQFullDrop */
LINUX_MIB_TCPRETRANSFAIL,   /* TCPRetransFail */
LINUX_MIB_TCPRCVCOALESCE,   /* TCPRcvCoalesce */
+   LINUX_MIB_TCPBACKLOGCOALESCE,   /* TCPBacklogCoalesce */
LINUX_MIB_TCPOFOQUEUE,  /* TCPOFOQueue */
LINUX_MIB_TCPOFODROP,   /* TCPOFODrop */
LINUX_MIB_TCPOFOMERGE,  /* TCPOFOMerge */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 70289682a6701438aed99a00a9705c39fa4394d3..c3610b37bb4ce665b1976d8cc907b6dd0de42ab9 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+   SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 795605a2327504b8a025405826e7e0ca8dc8501d..4904250a9aac5001410f9454258cbb8978bb8202 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
-   /* Only socket owner can try to collapse/prune rx queues
-* to reduce memory overhead, so add a little headroom here.
-* Few sockets backlog are possibly concurrently non empty.
-*/
-   limit += 64*1024;
+   struct skb_shared_info *shinfo;
+   const struct tcphdr *th;
+   struct tcphdr *thtail;
+   struct sk_buff *tail;
+   unsigned int hdrlen;
+   bool fragstolen;
+   u32 gso_segs;
+   int delta;
 
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 * we can fix skb->truesize to its real value to avoid future drops.
@@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 
skb_dst_drop(skb);
 
+   if (unlikely(tcp_checksum_complete(skb))) {
+   bh_unlock_sock(sk);
+   __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+   __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+   return true;
+   }
+
+   /* Attempt coalescing to last skb in backlog, even if we are
+* above the limits.
+* This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+*/
+   th = (const struct tcphdr *)skb->data;
+   hdrlen = th->doff * 4;
+   shinfo = skb_shinfo(skb);
+
+   if (!shinfo->gso_size)
+   shinfo->gso_size = skb->len - hdrlen;
+
+   if (!shinfo->gso_segs)
+   shinfo->gso_segs = 1;
+
+   tail = sk->sk_backlog.tail;
+   if (!tail)
+   goto no_coalesce;
+   thtail = (struct tcphdr *)tail->data;
+
+   if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+   TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+   ((TCP_SKB_CB(tail)->tcp_flags |
+ TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+   ((TCP_SKB_CB(tail)->tcp_flags ^
+ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+   tail->decrypted != skb->decrypted ||
+#endif
+   thtail->doff != th->doff ||
+   memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+   goto no_coalesce;
+
+   __skb_pull(skb, hdrlen);
+   if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
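
For testing, the effect can be observed through the new counter
(assuming a kernel carrying this patch): it shows up as
TCPBacklogCoalesce in the TcpExt section of /proc/net/netstat, so
running "nstat -az TCPBacklogCoalesce" before and after a
receive-heavy workload shows how many backlog merges occurred.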