This patch is not a proper fix and mainly serves for discussion purpose. It is based on net-next which I have been using to debug the issue.
The change that works around the issue is in ensure_mtu_is_adequate(). Other changes are the rippling effect in function arg. This bug was uncovered by one of our legacy service that are still using ipvs for load balancing. In that setup, the ipvs encap the ipv6-tcp packet in another ipv6 hdr before tx it out to eth0. The problem is the kernel stack could pass a skb (which was originated from a sys_write(tcp_fd)) to the driver with skb->len bigger than the device MTU. In one NIC setup (with gso and tso off) that we are using, it upset the NIC/driver and caused the tx queue stalled for tens of seconds which is how it got uncovered. (On the NIC side, the NIC firmware and driver have been fixed to avoid this tx queue stall after seeing this skb). On the kernel side, based on the commit log, this bug should have been exposed after commit 815d22e55b0e ("ip6ip6: Support for GSO/GRO"). Before commit 815d22e55b0e, ipv6_gso_segment() would just error out (-EPROTONOSUPPORT) because the tx-ing packet is an ip6ip6. Due to this error out, it avoid passing it to the driver. The TCP stack then timeout and the TCP mtu probing eventually kicked in to lower the skb->len enough to avoid gso_segment. After commit 815d22e55b0e, ipv6_gso_segment() -> ipv6_gso_segment() -> tcp6_gso_segment() which segment the packet based on a mss that does not account for the extra IPv6 hdr. Here is a stack from the WARN_ON() that we added to the driver to capture the issue: [ 1128.611875] WARNING: CPU: 40 PID: 31495 at drivers/net/ethernet/mellanox/mlx5/core/en_tx.c:424 mlx5e_xmit+0x814 ... [ 1129.016536] Call Trace: [ 1129.021412] ? skb_release_data+0xfc/0x120 [ 1129.029587] ? kfree_skbmem+0x64/0x70 [ 1129.036905] dev_hard_start_xmit+0xa4/0x200 [ 1129.045262] sch_direct_xmit+0x10f/0x280 [ 1129.053111] __qdisc_run+0x223/0x5a0 [ 1129.060251] __dev_queue_xmit+0x245/0x7d0 [ 1129.068268] dev_queue_xmit+0x10/0x20 [ 1129.075573] ? dev_queue_xmit+0x10/0x20 [ 1129.083218] ip6_finish_output2+0x2db/0x490 [ 1129.091573] ip6_finish_output+0x125/0x190 [ 1129.099754] ip6_output+0x5f/0x100 [ 1129.106548] ? ip6_fragment+0x9f0/0x9f0 [ 1129.114212] ip6_local_out+0x35/0x40 [ 1129.121356] ip_vs_tunnel_xmit_v6+0x267/0x290 [ip_vs] [ 1129.131443] ip_vs_in.part.24+0x302/0x710 [ip_vs] [ 1129.140837] ? ip_vs_in.part.24+0x302/0x710 [ip_vs] [ 1129.150578] ? ip_vs_conn_out_get+0x17/0x140 [ip_vs] [ 1129.160493] ? ip_vs_conn_out_get_proto+0x25/0x30 [ip_vs] [ 1129.171273] ip_vs_in+0x43/0x130 [ip_vs] [ 1129.179109] ip_vs_local_request6+0x26/0x30 [ip_vs] [ 1129.188849] nf_hook_slow+0x3e/0xc0 [ 1129.195800] ip6_xmit+0x30b/0x540 [ 1129.202421] ? ac6_proc_exit+0x20/0x20 [ 1129.209909] inet6_csk_xmit+0x82/0xd0 [ 1129.217207] ? lock_timer_base+0x76/0xa0 [ 1129.225043] tcp_transmit_skb+0x56f/0xa40 [ 1129.233051] tcp_write_xmit+0x2b2/0x11b0 [ 1129.240885] __tcp_push_pending_frames+0x33/0xa0 [ 1129.250106] tcp_push+0xde/0x100 [ 1129.256554] tcp_sendmsg_locked+0x9ca/0xca0 [ 1129.264910] tcp_sendmsg+0x2c/0x50 [ 1129.271703] inet_sendmsg+0x31/0xb0 [ 1129.278672] sock_write_iter+0xf8/0x110 [ 1129.286335] new_sync_write+0xd9/0x120 [ 1129.293823] vfs_write+0x18d/0x1e0 [ 1129.300614] SyS_write+0x48/0xa0 [ 1129.307045] do_syscall_64+0x69/0x1e0 [ 1129.314361] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 ... [ 1129.648183] ---[ end trace 635061c9c300799e ]--- [ 1129.657407] skb->len:1554 MTU:1522 The tcp flow is connecting from the address ending ':27:0' to the ':85'. [host-a] > ip -6 r show table local local 2401:db00:1011:1f01:face:b00c:0:85 dev lo src 2401:db00:1011:10af:face:0:27:0 metric 1024 advmss 1440 pref medium [host-a] > ip -6 a 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 state UNKNOWN qlen 1000 inet6 2401:db00:1011:1f01:face:b00c:0:85/128 scope global valid_lft forever preferred_lft forever 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 state UP qlen 1000 inet6 2401:db00:1011:10af:face:0:27:0/64 scope global valid_lft forever preferred_lft forever [host-a] > cat /proc/net/ip_vs TCP [2401:db00:1011:1f01:face:b00c:0000:0085]:01BB rr -> [2401:db00:1011:10cc:face:0000:0091:0000]:01BB Tunnel 6772 9 6 -> [2401:db00:1011:10d8:face:0000:0091:0000]:01BB Tunnel 6772 8 6 -> [2401:db00:1011:10d2:face:0000:0091:0000]:01BB Tunnel 6772 19 7 [host-a] > openssl s_client -connect [2401:db00:1011:1f01:face:b00c:0:85]:443 send-something-long-here-to-trigger-the-bug Changing the local route mtu to 1460 to account for the extra ipv6 tunnel header can also side step the issue. Like this: > ip -6 r show table local local 2401:db00:1011:1f01:face:b00c:0:85 dev lo src 2401:db00:1011:10af:face:0:27:0 metric 1024 mtu 1460 advmss 1440 pref medium Signed-off-by: Martin KaFai Lau <ka...@fb.com> --- net/netfilter/ipvs/ip_vs_xmit.c | 49 +++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 11c416f3d6e3..88cc0d53ebce 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -212,13 +212,15 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } -static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, +static inline bool ensure_mtu_is_adequate(struct ip_vs_conn *cp, int rt_mode, struct ip_vs_iphdr *ipvsh, struct sk_buff *skb, int mtu) { + struct netns_ipvs *ipvs = cp->ipvs; + #ifdef CONFIG_IP_VS_IPV6 - if (skb_af == AF_INET6) { + if (cp->af == AF_INET6) { struct net *net = ipvs->net; if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { @@ -251,6 +253,17 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, } } + if (skb_shinfo(skb)->gso_size && cp->protocol == IPPROTO_TCP) { + const struct tcphdr *th = (struct tcphdr *)skb_transport_header(skb); + unsigned short hdr_len = (th->doff << 2) + + skb_network_header_len(skb); + + if (mtu > hdr_len && mtu - hdr_len < skb_shinfo(skb)->gso_size) + skb_decrease_gso_size(skb_shinfo(skb), + skb_shinfo(skb)->gso_size - + (mtu - hdr_len)); + } + return true; } @@ -305,13 +318,15 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, /* Get route to destination or remote server */ static int -__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, +__ip_vs_get_out_rt(struct ip_vs_conn *cp, struct sk_buff *skb, struct ip_vs_dest *dest, __be32 daddr, int rt_mode, __be32 *ret_saddr, struct ip_vs_iphdr *ipvsh) { + struct netns_ipvs *ipvs = cp->ipvs; struct net *net = ipvs->net; struct ip_vs_dest_dst *dest_dst; + int skb_af = cp->af; struct rtable *rt; /* Route to the other host */ int mtu; int local, noref = 1; @@ -389,7 +404,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, maybe_update_pmtu(skb_af, skb, mtu); } - if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) + if (!ensure_mtu_is_adequate(cp, rt_mode, ipvsh, skb, mtu)) goto err_put; skb_dst_drop(skb); @@ -455,13 +470,15 @@ __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, * Get route to destination or remote server */ static int -__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, +__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp, struct sk_buff *skb, struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { + struct netns_ipvs *ipvs = cp->ipvs; struct net *net = ipvs->net; struct ip_vs_dest_dst *dest_dst; + int skb_af = cp->af; struct rt6_info *rt; /* Route to the other host */ struct dst_entry *dst; int mtu; @@ -541,7 +558,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, maybe_update_pmtu(skb_af, skb, mtu); } - if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) + if (!ensure_mtu_is_adequate(cp, rt_mode, ipvsh, skb, mtu)) goto err_put; skb_dst_drop(skb); @@ -679,7 +696,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, + if (__ip_vs_get_out_rt(cp, skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) goto tx_error; @@ -708,7 +725,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL, + if (__ip_vs_get_out_rt_v6(cp, skb, NULL, &iph->daddr, NULL, ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; @@ -753,7 +770,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } was_input = rt_is_input_route(skb_rtable(skb)); - local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR, NULL, ipvsh); @@ -839,7 +856,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + local = __ip_vs_get_out_rt_v6(cp, skb, cp->dest, &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | @@ -1031,7 +1048,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_CONNECT | @@ -1129,7 +1146,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + local = __ip_vs_get_out_rt_v6(cp, skb, cp->dest, &cp->daddr.in6, &saddr, ipvsh, 1, IP_VS_RT_MODE_LOCAL | @@ -1218,7 +1235,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); @@ -1252,7 +1269,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + local = __ip_vs_get_out_rt_v6(cp, skb, cp->dest, &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | @@ -1317,7 +1334,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rt_mode = (hooknum != NF_INET_FORWARD) ? IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, + local = __ip_vs_get_out_rt(cp, skb, cp->dest, cp->daddr.ip, rt_mode, NULL, iph); if (local < 0) goto tx_error; @@ -1406,7 +1423,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, rt_mode = (hooknum != NF_INET_FORWARD) ? IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + local = __ip_vs_get_out_rt_v6(cp, skb, cp->dest, &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; -- 2.9.5