This patch sorts RX skb lists into separate flows at the
IP input layer, using a flow dissector. Packets of the
same flow are chained at the frag_list pointer of the
first skb of that flow.

After ip_list_rcv_finish() the skb list has this layout:

|---------|        |---------|        |---------|
|flow 1   |        |flow 1   |        |flow 1   |
|---------|        |---------|        |---------|
|frag_list|<-\     |frag_list|        |frag_list|
|---------|   \    |---------|        |---------|
|next     |<-\ \---|next     |<-------|next     |
|---------|   \    |---------|        |---------|
              |
              |
              |    |---------|        |---------|        |---------|
              |    |flow 2   |        |flow 2   |        |flow 2   |
              |    |---------|        |---------|        |---------|
              |    |frag_list|<-\     |frag_list|        |frag_list|
              |    |---------|   \    |---------|        |---------|
              |----|next     |<-\ \---|next     |<-------|next     |
                   |---------|   \    |---------|        |---------|
                                 |
                                 |
                                 |    |---------|        |---------|        |---------|
                                 |    |flow 3   |        |flow 3   |        |flow 3   |
                                 |    |---------|        |---------|        |---------|
                                 |    |frag_list|<-\     |frag_list|        |frag_list|
                                 |    |---------|   \    |---------|        |---------|
                                 |----|next     |    \---|next     |<-------|next     |
                                      |---------|        |---------|        |---------|

With this approach, route lookups etc. are done just once
for a representative packet of a given flow instead of for
each packet.
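
As a rough sketch of the chaining step (not the exact patch
code; flow_match() and last_in_chain() are hypothetical
helpers standing in for the flow key comparison and chain
bookkeeping that ip_flow_dissect() does via the skb control
block):

    list_for_each_entry(p, rx_list, list) {
            if (!flow_match(p, skb))        /* hypothetical key compare */
                    continue;

            if (!skb_shinfo(p)->frag_list)
                    /* first chained packet of this flow */
                    skb_shinfo(p)->frag_list = skb;
            else
                    /* append behind the last chained packet */
                    last_in_chain(p)->next = skb;

            /* the flow head now accounts for the whole chain */
            p->len += skb->len;
            p->data_len += skb->len;
            p->truesize += skb->truesize;
            skb_shinfo(p)->gso_segs++;

            return NULL;            /* skb was absorbed into the chain */
    }
    return skb;                     /* no match: skb starts a new flow */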

ip_sublist_rcv_finish() then splits these lists into:

|---------|        |---------|        |---------|
|flow 1   |        |flow 1   |        |flow 1   |
|---------|        |---------|        |---------|
|frag_list|<-\     |frag_list|        |frag_list|
|---------|   \    |---------|        |---------|
|next     |    \---|next     |<-------|next     |
|---------|        |---------|        |---------|

Packets of the same flow can still travel together after that point.
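
A minimal sketch of that split, assuming the flow heads are
linked via skb->next as in the diagram above (the patch does
this with list handling in ip_list_rcv_finish() and
ip_sublist_rcv_finish()):

    while (skb) {
            struct sk_buff *next = skb->next;

            skb->next = NULL;       /* cut the link to the next flow */
            dst_input(skb);         /* one routing decision per flow */
            skb = next;
    }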

On input, this is plumbed through to ip_local_deliver_finish(),
where the skb chain is split back into single packets.
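
In essence this undoes the chaining; a simplified sketch of
what skb_segment_list() below does (move the frag_list chain
onto skb->next and shrink the head back to a single packet):

    struct sk_buff *nskb = skb_shinfo(skb)->frag_list;

    skb_shinfo(skb)->frag_list = NULL;
    skb->next = nskb;

    for (; nskb; nskb = nskb->next) {
            /* the head no longer accounts for the chained packets */
            skb->len -= nskb->len;
            skb->data_len -= nskb->len;
            skb->truesize -= nskb->truesize;
    }
    skb_gso_reset(skb);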

My hope is that this can be plumbed through to the socket's
receive queue. I have a patch for UDP, but it still has
problems with UDP encapsulation, so it is not included here.

On forwarding, the skb chain can travel together to the TX
path, where __skb_gso_segment() builds a standard skb list
from it.

For now, this is only enabled if the receiving device allows
forwarding, as the forwarding path currently gains the most
from this.

Known issues:

- I don't have a NIC whose driver supports building skb lists
  to be received by netif_receive_skb_list(). To test this
  codepath I used a hack that builds skb lists at the NAPI
  layer (see the sketch after this list).

- Performance measurements were done with this hack, so I
  don't know how meaningful they really are.

- This is early stage work, so functional testing has only
  been done on a basic level; it might still be buggy.

- This still uses the skb->next and skb->prev pointers to
  build skb lists, so it needs to be converted to standard
  list handling at some point.
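
For illustration, the NAPI layer hack mentioned above could
look roughly like this (a hypothetical sketch, not the code
actually used; rx_batch, rx_batch_cnt and RX_BATCH_MAX are
made-up names):

    static LIST_HEAD(rx_batch);
    static unsigned int rx_batch_cnt;

    /* called instead of napi_gro_receive() for each received skb */
    static void rx_batch_add(struct sk_buff *skb)
    {
            list_add_tail(&skb->list, &rx_batch);

            if (++rx_batch_cnt >= RX_BATCH_MAX) {
                    netif_receive_skb_list(&rx_batch);
                    INIT_LIST_HEAD(&rx_batch);
                    rx_batch_cnt = 0;
            }
    }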

Signed-off-by: Steffen Klassert <steffen.klass...@secunet.com>
---
 include/linux/skbuff.h    |   5 ++
 net/core/dev.c            |  45 +++++++++++-
 net/core/flow_dissector.c |  40 +++++++++++
 net/core/skbuff.c         |  52 ++++++++++++++
 net/ipv4/ip_input.c       | 139 ++++++++++++++++++++++++++++++++++----
 net/ipv4/ip_output.c      |   3 +-
 6 files changed, 270 insertions(+), 14 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 17a13e4785fc..d070d073a1dc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -575,6 +575,8 @@ enum {
        SKB_GSO_UDP = 1 << 16,
 
        SKB_GSO_UDP_L4 = 1 << 17,
+
+       SKB_GSO_FRAGLIST = 1 << 18,
 };
 
 #if BITS_PER_LONG > 32
@@ -1226,6 +1228,8 @@ skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb,
                                  data, proto, nhoff, hlen, flags);
 }
 
+u32 skb_flow_keys_rx_digest(struct sk_buff *skb, struct flow_keys_digest *digest);
+
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
                             struct flow_dissector *flow_dissector,
@@ -3302,6 +3306,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
 void skb_scrub_packet(struct sk_buff *skb, bool xnet);
 bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);
 bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
+void skb_segment_list(struct sk_buff *skb);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
 struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 int skb_ensure_writable(struct sk_buff *skb, int write_len);
diff --git a/net/core/dev.c b/net/core/dev.c
index ca78dc5a79a3..147da35d7380 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2998,6 +2998,34 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
        return skb->ip_summed == CHECKSUM_NONE;
 }
 
+static void skb_segment_list_ip(struct sk_buff *skb)
+{
+       unsigned int tnl_hlen = 0;
+       struct sk_buff *nskb;
+       int id;
+
+       id = ntohs(ip_hdr(skb)->id);
+       skb_segment_list(skb);
+
+       tnl_hlen = skb_tnl_header_len(skb);
+
+       nskb = skb->next;
+
+       do {
+               skb_push(nskb, skb_network_header(nskb) - skb_mac_header(nskb));
+               skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
+               skb_copy_from_linear_data_offset(skb, -tnl_hlen,
+                                                nskb->data - tnl_hlen,
+                                                skb_transport_header(nskb) -
+                                                skb_mac_header(nskb) +
+                                                tnl_hlen);
+
+               ip_hdr(nskb)->id = htons(++id);
+               ip_send_check(ip_hdr(nskb));
+               nskb = nskb->next;
+       } while (nskb);
+}
+
 /**
  *     __skb_gso_segment - Perform segmentation on skb.
  *     @skb: buffer to segment
@@ -3016,6 +3044,21 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 {
        struct sk_buff *segs;
 
+       if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) {
+               int dummy;
+
+               if (skb_network_protocol(skb, &dummy) != htons(ETH_P_IP))
+                       return ERR_PTR(-EINVAL);
+
+               skb_segment_list_ip(skb);
+
+               if (skb_needs_linearize(skb, features) &&
+                   __skb_linearize(skb))
+                       return ERR_PTR(-EINVAL);
+
+               return skb;
+       }
+
        if (unlikely(skb_needs_check(skb, tx_path))) {
                int err;
 
@@ -3289,7 +3332,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
                segs = skb_gso_segment(skb, features);
                if (IS_ERR(segs)) {
                        goto out_kfree_skb;
-               } else if (segs) {
+               } else if (segs && segs != skb) {
                        consume_skb(skb);
                        skb = segs;
                }
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index ce9eeeb7c024..8ca7e09dca5e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1211,6 +1211,46 @@ static inline u32 ___skb_get_hash(const struct sk_buff *skb,
        return __flow_hash_from_keys(keys, keyval);
 }
 
+struct _flow_keys_rx_digest_data {
+       __be16  n_proto;
+       u8      ip_proto;
+       u8      poff;
+       __be32  ports;
+       __be32  src;
+       __be32  dst;
+};
+
+u32 skb_flow_keys_rx_digest(struct sk_buff *skb, struct flow_keys_digest *digest)
+{
+       struct flow_keys keys;
+       struct _flow_keys_rx_digest_data *data =
+           (struct _flow_keys_rx_digest_data *)digest;
+       struct flow_keys_basic *bkeys;
+       u32 poff;
+
+       __flow_hash_secret_init();
+
+       skb_flow_dissect_flow_keys(skb, &keys,
+                                  FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+       bkeys = (struct flow_keys_basic *)&keys;
+       poff = __skb_get_poff(skb, skb->data, bkeys, skb_headlen(skb));
+       if (poff > 255)
+               poff = 0;
+
+       BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));
+
+       data->n_proto = keys.basic.n_proto;
+       data->ip_proto = keys.basic.ip_proto;
+       data->ports = keys.ports.ports;
+       data->poff = poff;
+       data->src = keys.addrs.v4addrs.src;
+       data->dst = keys.addrs.v4addrs.dst;
+
+       return poff;
+}
+EXPORT_SYMBOL(skb_flow_keys_rx_digest);
+
 struct _flow_keys_digest_data {
        __be16  n_proto;
        u8      ip_proto;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c996c09d095f..8f725a78dc93 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3495,6 +3495,58 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
        return head_frag;
 }
 
+void skb_segment_list(struct sk_buff *skb)
+{
+       struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
+       unsigned int delta_truesize = 0;
+       unsigned int delta_len = 0;
+       struct sk_buff *tail = NULL;
+       struct sk_buff *nskb;
+
+
+       skb_shinfo(skb)->frag_list = NULL;
+
+       do {
+               nskb = list_skb;
+               list_skb = list_skb->next;
+
+               if (!tail)
+                       skb->next = nskb;
+               else
+                       tail->next = nskb;
+
+               tail = nskb;
+
+               delta_len += nskb->len;
+               delta_truesize += nskb->truesize;
+
+               if (!secpath_exists(nskb))
+                       nskb->sp = secpath_get(skb->sp);
+
+               memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+
+               nskb->tstamp = skb->tstamp;
+               nskb->dev = skb->dev;
+               nskb->queue_mapping = skb->queue_mapping;
+
+               nskb->mac_len = skb->mac_len;
+               nskb->mac_header = skb->mac_header;
+               nskb->transport_header = skb->transport_header;
+               nskb->network_header = skb->network_header;
+               skb_dst_copy(nskb, skb);
+
+       } while (list_skb);
+
+       skb->truesize = skb->truesize - delta_truesize;
+       skb->data_len = skb->data_len - delta_len;
+       skb->len = skb->len - delta_len;
+
+       skb_gso_reset(skb);
+
+       skb->prev = tail;
+}
+EXPORT_SYMBOL_GPL(skb_segment_list);
+
 /**
  *     skb_segment - Perform protocol segmentation on skb.
  *     @head_skb: buffer to segment
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3196cf58f418..bf710bf95fea 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -190,14 +190,20 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 
 static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-       __skb_pull(skb, skb_network_header_len(skb));
+       if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+               skb_segment_list(skb);
 
        rcu_read_lock();
-       {
+       do {
                int protocol = ip_hdr(skb)->protocol;
                const struct net_protocol *ipprot;
+               struct sk_buff *nskb = skb->next;
                int raw;
 
+               skb->next = NULL;
+
+               __skb_pull(skb, skb_network_header_len(skb));
+
        resubmit:
                raw = raw_local_deliver(skb, protocol);
 
@@ -208,7 +214,7 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b
                        if (!ipprot->no_policy) {
                                if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                                        kfree_skb(skb);
-                                       goto out;
+                                       continue;
                                }
                                nf_reset(skb);
                        }
@@ -231,8 +237,8 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b
                                consume_skb(skb);
                        }
                }
-       }
- out:
+               skb = nskb;
+       } while (skb);
        rcu_read_unlock();
 
        return 0;
@@ -403,6 +409,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
        int ret;
 
+       /* Remove any debris in the socket control block */
+       memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+       IPCB(skb)->iif = skb->skb_iif;
+
        /* if ingress device is enslaved to an L3 master device pass the
         * skb to its handler for processing
         */
@@ -416,10 +426,108 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
        return ret;
 }
 
+struct dissect_skb_cb {
+       struct sk_buff *last;
+       struct  flow_keys_digest keys;
+};
+
+static inline struct dissect_skb_cb *dissect_skb_cb(const struct sk_buff *skb)
+{
+       return (struct dissect_skb_cb *)skb->cb;
+}
+
+static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+                          struct net *net);
+
+static struct sk_buff *ip_flow_dissect(struct sk_buff *skb, struct list_head *rx_list)
+{
+       unsigned int maclen = skb->dev->hard_header_len;
+       const struct iphdr *iph  = ip_hdr(skb);
+       unsigned int gso_type = 0;
+       struct sk_buff *p;
+       u32 poff;
+
+       if (*(u8 *)iph != 0x45)
+               goto out;
+
+       if (ip_is_fragment(iph))
+               goto out;
+
+       dissect_skb_cb(skb)->last = NULL;
+       poff = skb_flow_keys_rx_digest(skb, &dissect_skb_cb(skb)->keys);
+       if (!poff)
+               goto out;
+
+       switch (iph->protocol) {
+       case IPPROTO_TCP:
+               gso_type = SKB_GSO_TCPV4;
+               break;
+       case IPPROTO_UDP:
+               gso_type = SKB_GSO_UDP_L4;
+               break;
+       default:
+               goto out;
+       }
+
+       list_for_each_entry(p, rx_list, list) {
+               unsigned long diffs;
+
+               diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+               diffs |= p->vlan_tci ^ skb->vlan_tci;
+               diffs |= skb_metadata_dst_cmp(p, skb);
+               diffs |= skb_metadata_differs(p, skb);
+               if (maclen == ETH_HLEN)
+                       diffs |= compare_ether_header(skb_mac_header(p),
+                                                     skb_mac_header(skb));
+               else if (!diffs)
+                       diffs = memcmp(skb_mac_header(p),
+                                      skb_mac_header(skb),
+                                      maclen);
+
+               if (diffs)
+                       continue;
+
+               if (memcmp(&dissect_skb_cb(p)->keys,
+                          &dissect_skb_cb(skb)->keys,
+                          sizeof(dissect_skb_cb(skb)->keys)))
+                       continue;
+
+               if (p->len != skb->len) {
+                       if (!list_empty(rx_list))
+                               ip_sublist_rcv(rx_list, p->dev, dev_net(p->dev));
+                       INIT_LIST_HEAD(rx_list);
+                       goto out;
+               }
+
+               skb->next = NULL;
+               skb->prev = NULL;
+
+               if (!dissect_skb_cb(p)->last) {
+                       skb_shinfo(p)->gso_size = p->len - poff;
+                       skb_shinfo(p)->gso_type |= (SKB_GSO_FRAGLIST | gso_type);
+                       skb_shinfo(p)->frag_list = skb;
+                       skb_shinfo(p)->gso_segs = 1;
+               } else {
+                       dissect_skb_cb(p)->last->next = skb;
+               }
+
+               dissect_skb_cb(p)->last = skb;
+
+               skb_shinfo(p)->gso_segs++;
+               p->data_len += skb->len;
+               p->truesize += skb->truesize;
+               p->len += skb->len;
+
+               return NULL;
+       }
+
+out:
+       return skb;
+}
+
 /*
  *     Main IP Receive routine.
  */
-static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
+static struct sk_buff *ip_rcv_core(struct list_head *head, struct sk_buff *skb, struct net *net)
 {
        const struct iphdr *iph;
        u32 len;
@@ -491,13 +599,14 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 
        skb->transport_header = skb->network_header + iph->ihl*4;
 
-       /* Remove any debris in the socket control block */
-       memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
-       IPCB(skb)->iif = skb->skb_iif;
-
        /* Must drop socket now because of tproxy. */
        skb_orphan(skb);
 
+       if (IN_DEV_FORWARD(__in_dev_get_rcu(skb->dev))) {
+               if (head)
+                       return ip_flow_dissect(skb, head);
+       }
+
        return skb;
 
 csum_error:
@@ -518,9 +627,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 {
        struct net *net = dev_net(dev);
 
-       skb = ip_rcv_core(skb, net);
+       skb = ip_rcv_core(NULL, skb, net);
        if (skb == NULL)
                return NET_RX_DROP;
+
        return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
                       net, NULL, skb, dev, NULL,
                       ip_rcv_finish);
@@ -552,6 +662,11 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
                struct dst_entry *dst;
 
                list_del(&skb->list);
+
+               /* Remove any debris in the socket control block */
+               memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+               IPCB(skb)->iif = skb->skb_iif;
+
                /* if ingress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
@@ -599,7 +714,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
                struct net *net = dev_net(dev);
 
                list_del(&skb->list);
-               skb = ip_rcv_core(skb, net);
+               skb = ip_rcv_core(&sublist, skb, net);
                if (skb == NULL)
                        continue;
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9c4e72e9c60a..00d8a2576266 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -272,7 +272,8 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
                return -ENOMEM;
        }
 
-       consume_skb(skb);
+       if (segs != skb)
+               consume_skb(skb);
 
        do {
                struct sk_buff *nskb = segs->next;
-- 
2.17.1
