This patch sorts RX skb lists into separate flows, using a flow dissector, at the IP input layer. Packets of the same flow are chained at the frag_list pointer of the first skb of this flow.
After ip_list_rcv_finish() the skb list has this layout: |---------| |---------| |---------| |flow 1 | |flow 1 | |flow 1 | |---------| |---------| |---------| |frag_list|<-\ |frag_list| |frag_list| |---------| \ |---------| |---------| |next |<-\ \---|next |<-------|next | |---------| \ |---------| |---------| | | | |---------| |---------| |---------| | |flow 2 | |flow 2 | |flow 2 | | |---------| |---------| |---------| | |frag_list|<-\ |frag_list| |frag_list| | |---------| \ |---------| |---------| |----|next |<-\ \---|next |<-------|next | |---------| \ |---------| |---------| | | | |---------| |---------| |---------| | |flow 3 | |flow 3 | |flow 3 | | |---------| |---------| |---------| | |frag_list|<-\ |frag_list| |frag_list| | |---------| \ |---------| |---------| |----|next | \---|next |<------|next | |---------| |---------| |---------| With this approach route lookups etc. are done just for one representative packet of a given flow instead of for each packet. ip_sublist_rcv_finish() then splits these lists into: |---------| |---------| |---------| |flow 1 | |flow 1 | |flow 1 | |---------| |---------| |---------| |frag_list|<-\ |frag_list| |frag_list| |---------| \ |---------| |---------| |next | \---|next |<-------|next | |---------| |---------| |---------| Packets of the same flow can still travel together after that point. On input, this is plumbed through to ip_local_deliver_finish(), where the skb chain is split back into single packets. My hope is that this can be plumbed through to the sockets' receive queue. I have a patch for UDP, but it still has problems with UDP encapsulation, so it is not included here. On forward, the skb chain can travel together to the TX path. __skb_gso_segment() will build a standard skb list from this. For now, this is only enabled if the receiving device allows forwarding, as the forwarding path currently has the most gain from this. 
Known issues: - I don't have a NIC whose driver supports building skb lists to be received by netif_receive_skb_list(). To test this codepath I used a hack that builds skb lists at the napi layer. - Performance measurements were done with this hack, so I don't know if these measurements are really meaningful. - This is early stage work, so the functional tests are only done on a basic level, it might still be buggy. - This still uses the skb->next, skb->prev pointers to build skb lists. So it needs to be converted to standard list handling at some point. Signed-off-by: Steffen Klassert <steffen.klass...@secunet.com> --- include/linux/skbuff.h | 5 ++ net/core/dev.c | 45 +++++++++++- net/core/flow_dissector.c | 40 +++++++++++ net/core/skbuff.c | 52 ++++++++++++++ net/ipv4/ip_input.c | 139 ++++++++++++++++++++++++++++++++++---- net/ipv4/ip_output.c | 3 +- 6 files changed, 270 insertions(+), 14 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 17a13e4785fc..d070d073a1dc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -575,6 +575,8 @@ enum { SKB_GSO_UDP = 1 << 16, SKB_GSO_UDP_L4 = 1 << 17, + + SKB_GSO_FRAGLIST = 1 << 18, }; #if BITS_PER_LONG > 32 @@ -1226,6 +1228,8 @@ skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb, data, proto, nhoff, hlen, flags); } +u32 skb_flow_keys_rx_digest(struct sk_buff *skb, struct flow_keys_digest *digest); + void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -3302,6 +3306,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); +void skb_segment_list(struct sk_buff *skb); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int 
skb_ensure_writable(struct sk_buff *skb, int write_len); diff --git a/net/core/dev.c b/net/core/dev.c index ca78dc5a79a3..147da35d7380 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2998,6 +2998,34 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) return skb->ip_summed == CHECKSUM_NONE; } +static void skb_segment_list_ip(struct sk_buff *skb) +{ + unsigned int tnl_hlen = 0; + struct sk_buff *nskb; + int id; + + id = ntohs(ip_hdr(skb)->id); + skb_segment_list(skb); + + tnl_hlen = skb_tnl_header_len(skb); + + nskb = skb->next; + + do { + skb_push(nskb, skb_network_header(nskb) - skb_mac_header(nskb)); + skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); + skb_copy_from_linear_data_offset(skb, -tnl_hlen, + nskb->data - tnl_hlen, + skb_transport_header(nskb) - + skb_mac_header(nskb) + + tnl_hlen); + + ip_hdr(nskb)->id = htons(++id); + ip_send_check(ip_hdr(nskb)); + nskb = nskb->next; + } while (nskb); +} + /** * __skb_gso_segment - Perform segmentation on skb. 
* @skb: buffer to segment @@ -3016,6 +3044,21 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, { struct sk_buff *segs; + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) { + int dummy; + + if (skb_network_protocol(skb, &dummy) != htons(ETH_P_IP)) + return ERR_PTR(-EINVAL); + + skb_segment_list_ip(skb); + + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + return ERR_PTR(-EINVAL); + + return skb; + } + if (unlikely(skb_needs_check(skb, tx_path))) { int err; @@ -3289,7 +3332,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { goto out_kfree_skb; - } else if (segs) { + } else if (segs && segs != skb) { consume_skb(skb); skb = segs; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ce9eeeb7c024..8ca7e09dca5e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1211,6 +1211,46 @@ static inline u32 ___skb_get_hash(const struct sk_buff *skb, return __flow_hash_from_keys(keys, keyval); } +struct _flow_keys_rx_digest_data { + __be16 n_proto; + u8 ip_proto; + u8 poff; + __be32 ports; + __be32 src; + __be32 dst; +}; + +u32 skb_flow_keys_rx_digest(struct sk_buff *skb, struct flow_keys_digest *digest) +{ + struct flow_keys keys; + struct _flow_keys_rx_digest_data *data = + (struct _flow_keys_rx_digest_data *)digest; + struct flow_keys_basic *bkeys; + u32 poff; + + __flow_hash_secret_init(); + + skb_flow_dissect_flow_keys(skb, &keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + + bkeys = (struct flow_keys_basic *)&keys; + poff = __skb_get_poff(skb, skb->data, bkeys, skb_headlen(skb)); + if (poff > 255) + poff = 0; + + BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); + + data->n_proto = keys.basic.n_proto; + data->ip_proto = keys.basic.ip_proto; + data->ports = keys.ports.ports; + data->poff = poff; + data->src = keys.addrs.v4addrs.src; + data->dst = keys.addrs.v4addrs.dst; + + return poff; +} 
+EXPORT_SYMBOL(skb_flow_keys_rx_digest); + struct _flow_keys_digest_data { __be16 n_proto; u8 ip_proto; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c996c09d095f..8f725a78dc93 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3495,6 +3495,58 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) return head_frag; } +void skb_segment_list(struct sk_buff *skb) +{ + struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; + unsigned int delta_truesize = 0; + unsigned int delta_len = 0; + struct sk_buff *tail = NULL; + struct sk_buff *nskb; + + + skb_shinfo(skb)->frag_list = NULL; + + do { + nskb = list_skb; + list_skb = list_skb->next; + + if (!tail) + skb->next = nskb; + else + tail->next = nskb; + + tail = nskb; + + delta_len += nskb->len; + delta_truesize += nskb->truesize; + + if (!secpath_exists(nskb)) + nskb->sp = secpath_get(skb->sp); + + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + + nskb->tstamp = skb->tstamp; + nskb->dev = skb->dev; + nskb->queue_mapping = skb->queue_mapping; + + nskb->mac_len = skb->mac_len; + nskb->mac_header = skb->mac_header; + nskb->transport_header = skb->transport_header; + nskb->network_header = skb->network_header; + skb_dst_copy(nskb, skb); + + } while (list_skb); + + skb->truesize = skb->truesize - delta_truesize; + skb->data_len = skb->data_len - delta_len; + skb->len = skb->len - delta_len; + + skb_gso_reset(skb); + + skb->prev = tail; +} +EXPORT_SYMBOL_GPL(skb_segment_list); + /** * skb_segment - Perform protocol segmentation on skb. 
* @head_skb: buffer to segment diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 3196cf58f418..bf710bf95fea 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -190,14 +190,20 @@ bool ip_call_ra_chain(struct sk_buff *skb) static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - __skb_pull(skb, skb_network_header_len(skb)); + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) + skb_segment_list(skb); rcu_read_lock(); - { + do { int protocol = ip_hdr(skb)->protocol; const struct net_protocol *ipprot; + struct sk_buff *nskb = skb->next; int raw; + skb->next = NULL; + + __skb_pull(skb, skb_network_header_len(skb)); + resubmit: raw = raw_local_deliver(skb, protocol); @@ -208,7 +214,7 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb(skb); - goto out; + continue; } nf_reset(skb); } @@ -231,8 +237,8 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b consume_skb(skb); } } - } - out: + skb = nskb; + } while (skb); rcu_read_unlock(); return 0; @@ -403,6 +409,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; + /* Remove any debris in the socket control block */ + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + IPCB(skb)->iif = skb->skb_iif; + /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -416,10 +426,108 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) return ret; } +struct dissect_skb_cb { + struct sk_buff *last; + struct flow_keys_digest keys; +}; + +static inline struct dissect_skb_cb *dissect_skb_cb(const struct sk_buff *skb) { + return (struct dissect_skb_cb *)skb->cb; +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net); + +static struct sk_buff *ip_flow_dissect(struct 
sk_buff *skb, struct list_head *rx_list) +{ + unsigned int maclen = skb->dev->hard_header_len; + const struct iphdr *iph = ip_hdr(skb); + unsigned int gso_type = 0; + struct sk_buff *p; + u32 poff; + + if (*(u8 *)iph != 0x45) + goto out; + + if (ip_is_fragment(iph)) + goto out; + + dissect_skb_cb(skb)->last = NULL; + poff = skb_flow_keys_rx_digest(skb, &dissect_skb_cb(skb)->keys); + if (!poff) + goto out; + + switch (iph->protocol) { + case IPPROTO_TCP: + gso_type = SKB_GSO_TCPV4; + break; + case IPPROTO_UDP: + gso_type = SKB_GSO_UDP_L4; + break; + default: + goto out; + } + + list_for_each_entry(p, rx_list, list) { + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_metadata_differs(p, skb); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_mac_header(skb), + maclen); + + if (diffs) + continue; + + if (memcmp(&dissect_skb_cb(p)->keys, + &dissect_skb_cb(skb)->keys, + sizeof(dissect_skb_cb(skb)->keys))) + continue; + + if (p->len != skb->len) { + if (!list_empty(rx_list)) + ip_sublist_rcv(rx_list, p->dev, dev_net(p->dev)); + INIT_LIST_HEAD(rx_list); + goto out; + } + + skb->next = NULL; + skb->prev = NULL; + + if (!dissect_skb_cb(p)->last) { + skb_shinfo(p)->gso_size = p->len - poff; + skb_shinfo(p)->gso_type |= (SKB_GSO_FRAGLIST | gso_type); + skb_shinfo(p)->frag_list = skb; + skb_shinfo(p)->gso_segs = 1; + } else { + dissect_skb_cb(p)->last->next = skb; + } + + dissect_skb_cb(p)->last = skb; + + skb_shinfo(p)->gso_segs++; + p->data_len += skb->len; + p->truesize += skb->truesize; + p->len += skb->len; + + return NULL; + } + +out: + return skb; +} + /* * Main IP Receive routine. 
*/ -static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) +static struct sk_buff *ip_rcv_core(struct list_head *head, struct sk_buff *skb, struct net *net) { const struct iphdr *iph; u32 len; @@ -491,13 +599,14 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) skb->transport_header = skb->network_header + iph->ihl*4; - /* Remove any debris in the socket control block */ - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - IPCB(skb)->iif = skb->skb_iif; - /* Must drop socket now because of tproxy. */ skb_orphan(skb); + if (IN_DEV_FORWARD(__in_dev_get_rcu(skb->dev))) { + if (head) + return ip_flow_dissect(skb, head); + } + return skb; csum_error: @@ -518,9 +627,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, { struct net *net = dev_net(dev); - skb = ip_rcv_core(skb, net); + skb = ip_rcv_core(NULL, skb, net); if (skb == NULL) return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); @@ -552,6 +662,11 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, struct dst_entry *dst; list_del(&skb->list); + + /* Remove any debris in the socket control block */ + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + IPCB(skb)->iif = skb->skb_iif; + /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -599,7 +714,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net *net = dev_net(dev); list_del(&skb->list); - skb = ip_rcv_core(skb, net); + skb = ip_rcv_core(&sublist, skb, net); if (skb == NULL) continue; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9c4e72e9c60a..00d8a2576266 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -272,7 +272,8 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, return -ENOMEM; } - consume_skb(skb); + if (segs != skb) + consume_skb(skb); do { struct sk_buff *nskb = 
segs->next; -- 2.17.1