This is the RX counterpart of commit bec1f6f69736 ("udp: generate gso
with UDP_SEGMENT"). When UDP_GRO is enabled on a socket, that socket
also becomes eligible for GRO in the rx path: UDP segments directed to
it are assembled into a larger GSO_UDP_L4 packet.

The core UDP GRO support is enabled with setsockopt(UDP_GRO).

Initial benchmark numbers:

Before:
udp rx:   1079 MB/s   769065 calls/s

After:
udp rx:   1466 MB/s    24877 calls/s

This change introduces a side effect with respect to UDP tunnels:
once a UDP tunnel has been created, the kernel now performs a socket
lookup for every ingress UDP packet, whereas previously the lookup
happened only if the ingress packet carried a valid internal header csum.

rfc v2 -> rfc v3:
 - fixed typos in macro name and comments
 - really enforce UDP_GRO_CNT_MAX, instead of UDP_GRO_CNT_MAX + 1
 - acquire socket lock in UDP_GRO setsockopt

rfc v1 -> rfc v2:
 - use a new option to enable UDP GRO
 - use static keys to protect the UDP GRO socket lookup

Signed-off-by: Paolo Abeni <pab...@redhat.com>
--
Note: I opted to acquire the socket lock only for the newly introduced
setsockopt instead of for every value, despite the previous conversation
on this topic, to avoid introducing somewhat larger and unrelated changes.
---
 include/linux/udp.h      |   3 +-
 include/uapi/linux/udp.h |   1 +
 net/ipv4/udp.c           |   8 +++
 net/ipv4/udp_offload.c   | 109 +++++++++++++++++++++++++++++++--------
 net/ipv6/udp_offload.c   |   6 +--
 5 files changed, 99 insertions(+), 28 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index a4dafff407fb..f613b329852e 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -50,11 +50,12 @@ struct udp_sock {
        __u8             encap_type;    /* Is this an Encapsulation socket? */
        unsigned char    no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
                         no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
-                        encap_enabled:1; /* This socket enabled encap
+                        encap_enabled:1, /* This socket enabled encap
                                           * processing; UDP tunnels and
                                           * different encapsulation layer set
                                           * this
                                           */
+                        gro_enabled:1; /* Can accept GRO packets */
        /*
         * Following member retains the information to create a UDP header
         * when the socket is uncorked.
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 09502de447f5..30baccb6c9c4 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -33,6 +33,7 @@ struct udphdr {
 #define UDP_NO_CHECK6_TX 101   /* Disable sending checksum for UDP6X */
 #define UDP_NO_CHECK6_RX 102   /* Disable accpeting checksum for UDP6 */
 #define UDP_SEGMENT    103     /* Set GSO segmentation size */
+#define UDP_GRO                104     /* This socket can receive UDP GRO packets */
 
 /* UDP encapsulation types */
 #define UDP_ENCAP_ESPINUDP_NON_IKE     1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0ed715a72249..0d447fd194c5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2476,6 +2476,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
                up->gso_size = val;
                break;
 
+       case UDP_GRO:
+               lock_sock(sk);
+               if (valbool)
+                       udp_tunnel_encap_enable(sk->sk_socket);
+               up->gro_enabled = valbool;
+               release_sock(sk);
+               break;
+
        /*
         *      UDP-Lite's partial checksum coverage (RFC 3828).
         */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 802f2bc00d69..0646d61f4fa8 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,6 +343,54 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
        return segs;
 }
 
+#define UDP_GRO_CNT_MAX 64
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+                                              struct sk_buff *skb)
+{
+       struct udphdr *uh = udp_hdr(skb);
+       struct sk_buff *pp = NULL;
+       struct udphdr *uh2;
+       struct sk_buff *p;
+
+       /* requires non zero csum, for symmetry with GSO */
+       if (!uh->check) {
+               NAPI_GRO_CB(skb)->flush = 1;
+               return NULL;
+       }
+
+       /* pull encapsulating udp header */
+       skb_gro_pull(skb, sizeof(struct udphdr));
+       skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+
+       list_for_each_entry(p, head, list) {
+               if (!NAPI_GRO_CB(p)->same_flow)
+                       continue;
+
+               uh2 = udp_hdr(p);
+
+               /* Match ports only, as csum is always non zero */
+               if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+                       NAPI_GRO_CB(p)->same_flow = 0;
+                       continue;
+               }
+
+               /* Terminate the flow on len mismatch or if it grows "too much".
+                * Under small packet flood GRO count could otherwise grow a lot
+                * leading to excessive truesize values
+                */
+               if (!skb_gro_receive(p, skb) &&
+                   NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
+                       pp = p;
+               else if (uh->len != uh2->len)
+                       pp = p;
+
+               return pp;
+       }
+
+       /* mismatch, but we never need to flush */
+       return NULL;
+}
+
 struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
                                struct udphdr *uh, udp_lookup_t lookup)
 {
@@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
        int flush = 1;
        struct sock *sk;
 
+       rcu_read_lock();
+       sk = (*lookup)(skb, uh->source, uh->dest);
+       if (!sk)
+               goto out_unlock;
+
+       if (udp_sk(sk)->gro_enabled) {
+               pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+               rcu_read_unlock();
+               return pp;
+       }
+
        if (NAPI_GRO_CB(skb)->encap_mark ||
            (skb->ip_summed != CHECKSUM_PARTIAL &&
             NAPI_GRO_CB(skb)->csum_cnt == 0 &&
-            !NAPI_GRO_CB(skb)->csum_valid))
-               goto out;
+            !NAPI_GRO_CB(skb)->csum_valid) ||
+           !udp_sk(sk)->gro_receive)
+               goto out_unlock;
 
        /* mark that this skb passed once through the tunnel gro layer */
        NAPI_GRO_CB(skb)->encap_mark = 1;
 
-       rcu_read_lock();
-       sk = (*lookup)(skb, uh->source, uh->dest);
-
-       if (sk && udp_sk(sk)->gro_receive)
-               goto unflush;
-       goto out_unlock;
-
-unflush:
        flush = 0;
 
        list_for_each_entry(p, head, list) {
@@ -394,7 +446,6 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 
 out_unlock:
        rcu_read_unlock();
-out:
        skb_gro_flush_final(skb, pp, flush);
        return pp;
 }
@@ -427,6 +478,19 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head,
        return NULL;
 }
 
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+       struct udphdr *uh = udp_hdr(skb);
+
+       skb->csum_start = (unsigned char *)uh - skb->head;
+       skb->csum_offset = offsetof(struct udphdr, check);
+       skb->ip_summed = CHECKSUM_PARTIAL;
+
+       skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+       skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+       return 0;
+}
+
 int udp_gro_complete(struct sk_buff *skb, int nhoff,
                     udp_lookup_t lookup)
 {
@@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
 
        uh->len = newlen;
 
-       /* Set encapsulation before calling into inner gro_complete() functions
-        * to make them set up the inner offsets.
-        */
-       skb->encapsulation = 1;
-
        rcu_read_lock();
        sk = (*lookup)(skb, uh->source, uh->dest);
-       if (sk && udp_sk(sk)->gro_complete)
+       if (sk && udp_sk(sk)->gro_enabled) {
+               err = udp_gro_complete_segment(skb);
+       } else if (sk && udp_sk(sk)->gro_complete) {
+               skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+                                       : SKB_GSO_UDP_TUNNEL;
+
+               /* Set encapsulation before calling into inner gro_complete()
+                * functions to make them set up the inner offsets.
+                */
+               skb->encapsulation = 1;
                err = udp_sk(sk)->gro_complete(sk, skb,
                                nhoff + sizeof(struct udphdr));
+       }
        rcu_read_unlock();
 
        if (skb->remcsum_offload)
@@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
        const struct iphdr *iph = ip_hdr(skb);
        struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
-       if (uh->check) {
-               skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+       if (uh->check)
                uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
                                          iph->daddr, 0);
-       } else {
-               skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
-       }
 
        return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 1b8e161ac527..828b2457f97b 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -147,13 +147,9 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
        const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
        struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
-       if (uh->check) {
-               skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+       if (uh->check)
                uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
                                          &ipv6h->daddr, 0);
-       } else {
-               skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
-       }
 
        return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
-- 
2.17.2

Reply via email to