On Mon, 2016-04-18 at 19:27 -0700, Martin KaFai Lau wrote:
> On Mon, Apr 18, 2016 at 05:06:57PM -0700, Eric Dumazet wrote:
> > It should only be a request from user space to ask TCP to not aggregate
> > stuff on future sendpage()/sendmsg() on the skb carrying this new flag.
> >
> How about something like this.  Please advise if tcp_sendmsg_noappend can
> be simpler.
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index c0ef054..ac31798 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -762,7 +762,8 @@ struct tcp_skb_cb {
> 
>       __u8            ip_dsfield;     /* IPv4 tos or IPv6 dsfield     */
>       __u8            txstamp_ack:1,  /* Record TX timestamp for ack? */
> -                     unused:7;
> +                     eor:1,          /* Is skb MSG_EOR marked */
> +                     unused:6;
>       __u32           ack_seq;        /* Sequence number ACK'd        */
>       union {
>               struct inet_skb_parm    h4;
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 4d73858..12772be 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -874,6 +874,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
>       return mss_now;
>  }
> 
> +static bool tcp_sendmsg_noappend(const struct sock *sk)
> +{
> +     const struct sk_buff *skb = tcp_write_queue_tail(sk);
> +
> +     return (!skb || TCP_SKB_CB(skb)->eor);
> +}
> +
>  static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
>                               size_t size, int flags)
>  {
> @@ -903,6 +910,9 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
>       if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
>               goto out_err;
> 
> +     if (tcp_sendmsg_noappend(sk))
> +             goto new_segment;
> +
>       while (size > 0) {
>               struct sk_buff *skb = tcp_write_queue_tail(sk);
>               int copy, i;
> @@ -960,6 +970,7 @@ new_segment:
>               size -= copy;
>               if (!size) {
>                       tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
> +                     TCP_SKB_CB(skb)->eor = !!(flags & MSG_EOR);
>                       goto out;
>               }
> 
> @@ -1145,6 +1156,9 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
> 
>       sg = !!(sk->sk_route_caps & NETIF_F_SG);
> 
> +     if (tcp_sendmsg_noappend(sk))
> +             goto new_segment;
> +
>       while (msg_data_left(msg)) {
>               int copy = 0;
>               int max = size_goal;
> @@ -1250,6 +1264,7 @@ new_segment:
>               copied += copy;
>               if (!msg_data_left(msg)) {
>                       tcp_tx_timestamp(sk, sockc.tsflags, skb);
> +                     TCP_SKB_CB(skb)->eor = !!(flags & MSG_EOR);
>                       goto out;
>               }

I believe it is slightly wrong to do the goto new_segment when there is
no data to send.

I would instead use this fast path, doing the test _when_ we already
have an skb to test for.

diff --git a/include/net/tcp.h b/include/net/tcp.h
index fd40f8c64d5f..015851634195 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -760,7 +760,8 @@ struct tcp_skb_cb {
 
        __u8            ip_dsfield;     /* IPv4 tos or IPv6 dsfield     */
        __u8            txstamp_ack:1,  /* Record TX timestamp for ack? */
-                       unused:7;
+                       eor:1,          /* Is skb MSG_EOR marked */
+                       unused:6;
        __u32           ack_seq;        /* Sequence number ACK'd        */
        union {
                struct inet_skb_parm    h4;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4d73858991af..1944c19885ab 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -908,7 +908,9 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                int copy, i;
                bool can_coalesce;
 
-               if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+               if (!tcp_send_head(sk) ||
+                   (copy = size_goal - skb->len) <= 0 ||
+                   TCP_SKB_CB(skb)->eor) {
 new_segment:
                        if (!sk_stream_memory_free(sk))
                                goto wait_for_sndbuf;
@@ -960,6 +962,7 @@ new_segment:
                size -= copy;
                if (!size) {
                        tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
+                       TCP_SKB_CB(skb)->eor = !!(flags & MSG_EOR);
                        goto out;
                }
 
@@ -1156,7 +1159,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
                        copy = max - skb->len;
                }
 
-               if (copy <= 0) {
+               if (copy <= 0 || TCP_SKB_CB(skb)->eor) {
 new_segment:
                        /* Allocate new segment. If the interface is SG,
                         * allocate skb fitting to single page.
@@ -1250,6 +1253,7 @@ new_segment:
                copied += copy;
                if (!msg_data_left(msg)) {
                        tcp_tx_timestamp(sk, sockc.tsflags, skb);
+                       TCP_SKB_CB(skb)->eor = !!(flags & MSG_EOR);
                        goto out;
                }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6451b83d81e9..acfbff81ef47 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1171,6 +1171,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
        TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
        TCP_SKB_CB(buff)->tcp_flags = flags;
        TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+       TCP_SKB_CB(buff)->eor = TCP_SKB_CB(skb)->eor;
+       TCP_SKB_CB(skb)->eor = 0;
 
        if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
                /* Copy and checksum data tail into the new buffer. */
@@ -1730,6 +1732,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
        /* This packet was never sent out yet, so no SACK bits. */
        TCP_SKB_CB(buff)->sacked = 0;
+       TCP_SKB_CB(buff)->eor = TCP_SKB_CB(skb)->eor;
+       TCP_SKB_CB(skb)->eor = 0;
 
        buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
        skb_split(skb, buff, len);
@@ -2471,6 +2475,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 
        /* Merge over control information. This moves PSH/FIN etc. over */
        TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
+       TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
 
        /* All done, get rid of second SKB and account for it so
         * packet counting does not break.
@@ -2502,7 +2507,8 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
        /* Some heurestics for collapsing over SACK'd could be invented */
        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                return false;
-
+       if (TCP_SKB_CB(skb)->eor)
+               return false;
        return true;
 }
 





Reply via email to