[ updated my email address, please use this one for v6+ ] From: <fan....@zte.com.cn> Date: Tue, 15 Jul 2025 23:52:49 +0800 (CST) > From: Fan Yu <fan....@zte.com.cn> > > Background > ========== > When TCP retransmits a packet due to missing ACKs, the > retransmission may fail for various reasons (e.g., packets > stuck in driver queues, receiver zero windows, or routing issues). > > The original tcp_retransmit_skb tracepoint: > 'commit e086101b150a ("tcp: add a tracepoint for tcp > retransmission")'
Looks like your email client converts "'" into "'" and corrupts the diff, see below. > lacks visibility into these failure causes, making production > diagnostics difficult. > > Solution > ======== > Adds the retval("err") to the tcp_retransmit_skb tracepoint. > Enables users to know why some tcp retransmission failed and > users can filter retransmission failures by retval. > > Compatibility description > ========================= > This patch extends the tcp_retransmit_skb tracepoint > by adding a new "err" field at the end of its > existing structure (within TP_STRUCT__entry). The > compatibility implications are detailed as follows: > > 1) Structural compatibility for legacy user-space tools > > Legacy tools/BPF programs accessing existing fields > (by offset or name) can still work without modification > or recompilation.The new field is appended to the end, > preserving original memory layout. > > 2) Note: semantic changes > > The original tracepoint primarily only focused on > successfully retransmitted packets. With this patch, > the tracepoint now can figure out packets that may > terminate early due to specific reasons. For accurate > statistics, users should filter using "err" to > distinguish outcomes. > > Before patched: > # cat /sys/kernel/debug/tracing/events/tcp/tcp_retransmit_skb/format > field:const void * skbaddr; offset:8; size:8; signed:0; > field:const void * skaddr; offset:16; size:8; signed:0; > field:int state; offset:24; size:4; signed:1; > field:__u16 sport; offset:28; size:2; signed:0; > field:__u16 dport; offset:30; size:2; signed:0; > field:__u16 family; offset:32; size:2; signed:0; > field:__u8 saddr[4]; offset:34; size:4; signed:0; > field:__u8 daddr[4]; offset:38; size:4; signed:0; > field:__u8 saddr_v6[16]; offset:42; size:16; signed:0; > field:__u8 daddr_v6[16]; offset:58; size:16; signed:0; > > print fmt: "skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 > daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s" > > After patched: > # cat /sys/kernel/debug/tracing/events/tcp/tcp_retransmit_skb/format > field:const void * skbaddr; offset:8; size:8; signed:0; > field:const void * skaddr; offset:16; size:8; signed:0; > field:int state; offset:24; size:4; signed:1; > field:__u16 sport; offset:28; size:2; signed:0; > field:__u16 dport; offset:30; size:2; signed:0; > field:__u16 family; offset:32; size:2; signed:0; > field:__u8 saddr[4]; offset:34; size:4; signed:0; > field:__u8 daddr[4]; offset:38; size:4; signed:0; > field:__u8 saddr_v6[16]; offset:42; size:16; signed:0; > field:__u8 daddr_v6[16]; offset:58; size:16; signed:0; > field:int err; offset:76; size:4; signed:1; > > print fmt: "skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 > daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s err=%d" > > Change Log > ========= > v4->v5: > Some fixes according to > https://lore.kernel.org/all/20250715072058.12f34...@kernel.org/ > 1. Instead of introducing new TCP_RETRANS_* enums, directly > passing the retval to the tracepoint. > > v3->v4: > Some fixes according to > https://lore.kernel.org/all/CANn89i+JGSt=_ctwfhdxypww-34a6sop3razwq9b9vl4+ph...@mail.gmail.com/ > 1. Consolidate ENOMEMs into a unified TCP_RETRANS_NOMEM. > > v2->v3: > Some fixes according to > https://lore.kernel.org/all/CANn89iJvyYjiweCESQL8E-Si7M=gosyvh1bavwwawycxw8g...@mail.gmail.com/ > 1. Rename "quit_reason" to "result". Also, keep "key=val" format concise(no > space in vals). > > v1->v2: > Some fixes according to > https://lore.kernel.org/all/cann89ik-6kt-zupnrmjpy9_tkqj-dlukrdqtvo1140q4eus...@mail.gmail.com/ > 1.Rename TCP_RETRANS_QUIT_UNDEFINED to TCP_RETRANS_ERR_DEFAULT. > 2.Added detailed compatibility consequences section. > > Suggested-by: Jakub Kicinski <k...@kernel.org> > Suggested-by: Eric Dumazet <eduma...@google.com> > Co-developed-by: xu xin <xu.xi...@zte.com.cn> > Signed-off-by: xu xin <xu.xi...@zte.com.cn> > Signed-off-by: Fan Yu <fan....@zte.com.cn> > --- Please add changelog here so it will disappear when the patch is merged. > include/trace/events/tcp.h | 27 ++++++++-------------- > net/ipv4/tcp_output.c | 46 ++++++++++++++++++++++++-------------- > 2 files changed, 38 insertions(+), 35 deletions(-) > > diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h > index 54e60c6009e3..9d2c36c6a0ed 100644 > --- a/include/trace/events/tcp.h > +++ b/include/trace/events/tcp.h > @@ -13,17 +13,11 @@ > #include <linux/sock_diag.h> > #include <net/rstreason.h> > > -/* > - * tcp event with arguments sk and skb > - * > - * Note: this class requires a valid sk pointer; while skb pointer could > - * be NULL. > - */ > -DECLARE_EVENT_CLASS(tcp_event_sk_skb, > +TRACE_EVENT(tcp_retransmit_skb, > > - TP_PROTO(const struct sock *sk, const struct sk_buff *skb), > + TP_PROTO(const struct sock *sk, const struct sk_buff *skb, int err), > > - TP_ARGS(sk, skb), > + TP_ARGS(sk, skb, err), > > TP_STRUCT__entry( > __field(const void *, skbaddr) > @@ -36,6 +30,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb, > __array(__u8, daddr, 4) > __array(__u8, saddr_v6, 16) > __array(__u8, daddr_v6, 16) > + __field(int, err) > ), > > TP_fast_assign( > @@ -58,21 +53,17 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb, > > TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, > sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); > + > + __entry->err = err; > ), > > - TP_printk("skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu > saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s", > + TP_printk("skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu > saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s err=%d", > __entry->skbaddr, __entry->skaddr, > show_family_name(__entry->family), > __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, > __entry->saddr_v6, __entry->daddr_v6, > - show_tcp_state_name(__entry->state)) > -); > - > -DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, > - > - TP_PROTO(const struct sock *sk, const struct sk_buff *skb), > - > - TP_ARGS(sk, skb) > + show_tcp_state_name(__entry->state), > + __entry->err) > ); > > #undef FN > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c > index b616776e3354..c31e164693d5 100644 > --- a/net/ipv4/tcp_output.c > +++ b/net/ipv4/tcp_output.c > @@ -3330,8 +3330,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct > sk_buff *skb, int segs) > if (icsk->icsk_mtup.probe_size) > icsk->icsk_mtup.probe_size = 0; > > - if (skb_still_in_host_queue(sk, skb)) > - return -EBUSY; > + if (skb_still_in_host_queue(sk, skb)) { > + err = -EBUSY; > + goto out; > + } > > start: > if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { > @@ -3342,14 +3344,19 @@ int __tcp_retransmit_skb(struct sock *sk, struct > sk_buff *skb, int segs) > } > if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { > WARN_ON_ONCE(1); > - return -EINVAL; > + err = -EINVAL; > + goto out; > + } > + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) > { > + err = -ENOMEM; > + goto out; > } > - if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) > - return -ENOMEM; > } > > - if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) > - return -EHOSTUNREACH; /* Routing failure or similar. */ > + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) { > + err = -EHOSTUNREACH; /* Routing failure or similar. */ > + goto out; > + } > > cur_mss = tcp_current_mss(sk); > avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; > @@ -3360,8 +3367,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct > sk_buff *skb, int segs) > * our retransmit of one segment serves as a zero window probe. > */ > if (avail_wnd <= 0) { > - if (TCP_SKB_CB(skb)->seq != tp->snd_una) > - return -EAGAIN; > + if (TCP_SKB_CB(skb)->seq != tp->snd_una) { > + err = -EAGAIN; > + goto out; > + } > avail_wnd = cur_mss; > } > > @@ -3373,11 +3382,15 @@ int __tcp_retransmit_skb(struct sock *sk, struct > sk_buff *skb, int segs) > } > if (skb->len > len) { > if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, > - cur_mss, GFP_ATOMIC)) > - return -ENOMEM; /* We'll try again later. */ > + cur_mss, GFP_ATOMIC)) { > + err = -ENOMEM; /* We'll try again later. */ This chunk is corrupted by "'" conversion. Please check your email client config. > + goto out; > + } > } else { > - if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) > - return -ENOMEM; > + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) { > + err = -ENOMEM; > + goto out; > + } > > diff = tcp_skb_pcount(skb); > tcp_set_skb_tso_segs(skb, cur_mss); > @@ -3431,17 +3444,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct > sk_buff *skb, int segs) > tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, > TCP_SKB_CB(skb)->seq, segs, err); > > - if (likely(!err)) { > - trace_tcp_retransmit_skb(sk, skb); > - } else if (err != -EBUSY) { > + if (err != -EBUSY) > NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); Now this is counted when !err. > - } > > /* To avoid taking spuriously low RTT samples based on a timestamp > * for a transmit that never happened, always mark EVER_RETRANS > */ > TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; > > +out: > + trace_tcp_retransmit_skb(sk, skb, err); > return err; > } > > -- > 2.25.1 >