This patch allows a user process to pass MSG_EOR to tcp_sendmsg to tell the kernel that the last byte of the data being sent is the last byte of an application response message.
It is currently useful when the end-user has turned on any bit in SOF_TIMESTAMPING_TX_RECORD_MASK (either by setsockopt or cmsg). The kernel will then mark the newly added tcb->eor_info bit so that the shinfo->tskey will not be overwritten (i.e. lost) in a later skb append/collapse operation. With selective SOF_TIMESTAMPING_TX_ACK (by cmsg) and MSG_EOR (this patch), the user application can specify exactly which outgoing byte it wants to have its ACK tracked and ask the kernel not to lose this tracking info in a later skb append/collapse action. This patch handles the append case in tcp_sendmsg. The later patches will handle the collapse during retransmission and skb slicing in tcp_fragment()/tso_fragment(). One of our use cases is at the webserver. The webserver tracks the HTTP2 response latency by measuring the time from when the webserver sends the first byte to the socket until the TCP ACK of the last byte is received. In the cases where we don't have client-side measurement, measuring from the server side is the only option. In the cases where we do have client-side measurement, the server-side data can also be used to validate/cross-check the client-side data. Signed-off-by: Martin KaFai Lau <ka...@fb.com> Cc: Eric Dumazet <eduma...@google.com> Cc: Neal Cardwell <ncardw...@google.com> Cc: Soheil Hassas Yeganeh <soheil.k...@gmail.com> Cc: Willem de Bruijn <will...@google.com> Cc: Yuchung Cheng <ych...@google.com> --- include/net/tcp.h | 5 ++++- net/ipv4/tcp.c | 21 +++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index c0ef054..f3c5dcb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -762,7 +762,10 @@ struct tcp_skb_cb { __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ - unused:7; + eor_info:1, /* Any EOR marked info that prevents + * skbs from merging. 
+ */ + unused:6; __u32 ack_seq; /* Sequence number ACK'd */ union { struct inet_skb_parm h4; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4d73858..2918f42 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -428,15 +428,18 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) +static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb, + int flags) { if (sk->sk_tsflags || tsflags) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); - if (shinfo->tx_flags & SKBTX_ANY_TSTAMP) + if (shinfo->tx_flags & SKBTX_ANY_TSTAMP) { shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + tcb->eor_info = !!(flags & MSG_EOR); + } tcb->txstamp_ack = !!(shinfo->tx_flags & SKBTX_ACK_TSTAMP); } } @@ -874,6 +877,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } +static bool tcp_sendmsg_noappend(struct sock *sk, u16 tx_tsflags) +{ + return unlikely((tx_tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) && + tcp_send_head(sk) && + TCP_SKB_CB(tcp_write_queue_tail(sk))->eor_info); +} + static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -959,7 +969,7 @@ new_segment: offset += copy; size -= copy; if (!size) { - tcp_tx_timestamp(sk, sk->sk_tsflags, skb); + tcp_tx_timestamp(sk, sk->sk_tsflags, skb, 0); goto out; } @@ -1145,6 +1155,9 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) sg = !!(sk->sk_route_caps & NETIF_F_SG); + if (tcp_sendmsg_noappend(sk, sockc.tsflags)) + goto new_segment; + while (msg_data_left(msg)) { int copy = 0; int max = size_goal; @@ -1249,7 +1262,7 @@ new_segment: copied += copy; if (!msg_data_left(msg)) { - tcp_tx_timestamp(sk, sockc.tsflags, skb); + tcp_tx_timestamp(sk, sockc.tsflags, skb, flags); goto out; } -- 2.5.1