-----邮件原件----- 发件人: dev [mailto:ovs-dev-boun...@openvswitch.org] 代表 Ilya Maximets 发送时间: 2020年10月27日 21:03 收件人: yang_y...@163.com; ovs-dev@openvswitch.org 抄送: f...@sysclose.org; i.maxim...@ovn.org 主题: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path
On 8/7/20 12:56 PM, yang_y...@163.com wrote: > From: Yi Yang <yangy...@inspur.com> > > GSO(Generic Segment Offload) can segment large UDP and TCP packet to > small packets per MTU of destination , especially for the case that > physical NIC can't do hardware offload VXLAN TSO and VXLAN UFO, GSO > can make sure userspace TSO can still work but not drop. > > In addition, GSO can help improve UDP performane when UFO is enabled > in VM. > > GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is done in Tx > function of physical NIC. > > Signed-off-by: Yi Yang <yangy...@inspur.com> > --- > lib/dp-packet.h | 21 +++- > lib/netdev-dpdk.c | 358 > +++++++++++++++++++++++++++++++++++++++++++++++++---- > lib/netdev-linux.c | 17 ++- > lib/netdev.c | 67 +++++++--- > 4 files changed, 417 insertions(+), 46 deletions(-) > > diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 79895f2..c33868d > 100644 > --- a/lib/dp-packet.h > +++ b/lib/dp-packet.h > @@ -83,6 +83,8 @@ enum dp_packet_offload_mask { > DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800), > /* VXLAN TCP Segmentation Offload. */ > DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN, > 0x1000), > + /* UDP Segmentation Offload. */ > + DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_SEG, PKT_TX_UDP_SEG, 0x2000), > /* Adding new field requires adding to > DP_PACKET_OL_SUPPORTED_MASK. */ }; > > @@ -97,7 +99,8 @@ enum dp_packet_offload_mask { > DP_PACKET_OL_TX_IPV6 | \ > DP_PACKET_OL_TX_TCP_CKSUM | \ > DP_PACKET_OL_TX_UDP_CKSUM | \ > - DP_PACKET_OL_TX_SCTP_CKSUM) > + DP_PACKET_OL_TX_SCTP_CKSUM | \ > + DP_PACKET_OL_TX_UDP_SEG) > > #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \ > DP_PACKET_OL_TX_UDP_CKSUM | \ @@ > -956,6 +959,13 @@ dp_packet_hwol_is_tso(const struct dp_packet *b) > return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TCP_SEG); > } > > +/* Returns 'true' if packet 'b' is marked for UDP segmentation > +offloading. */ static inline bool dp_packet_hwol_is_uso(const struct > +dp_packet *b) { > + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_UDP_SEG); > +} > + > /* Returns 'true' if packet 'b' is marked for IPv4 checksum > offloading. */ static inline bool dp_packet_hwol_is_ipv4(const > struct dp_packet *b) @@ -1034,6 +1044,15 @@ > dp_packet_hwol_set_tcp_seg(struct dp_packet *b) > *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG; } > > +/* Mark packet 'b' for UDP segmentation offloading. It implies that > + * either the packet 'b' is marked for IPv4 or IPv6 checksum > +offloading > + * and also for UDP checksum offloading. */ static inline void > +dp_packet_hwol_set_udp_seg(struct dp_packet *b) { > + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_SEG; } > + > #ifdef DPDK_NETDEV > /* Mark packet 'b' for VXLAN TCP segmentation offloading. */ static > inline void diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index > 30493ed..888a45e 100644 > --- a/lib/netdev-dpdk.c > +++ b/lib/netdev-dpdk.c > @@ -38,13 +38,15 @@ > #include <rte_errno.h> > #include <rte_ethdev.h> > #include <rte_flow.h> > +#include <rte_gso.h> > +#include <rte_ip.h> > #include <rte_malloc.h> > #include <rte_mbuf.h> > #include <rte_meter.h> > #include <rte_pci.h> > #include <rte_version.h> > #include <rte_vhost.h> > -#include <rte_ip.h> > +#include <rte_ip_frag.h> > > #include "cmap.h" > #include "coverage.h" > @@ -162,6 +164,7 @@ typedef uint16_t dpdk_port_t; > | DEV_TX_OFFLOAD_UDP_CKSUM \ > | DEV_TX_OFFLOAD_IPV4_CKSUM) > > +#define MAX_GSO_MBUFS 64 > > static const struct rte_eth_conf port_conf = { > .rxmode = { > @@ -2171,6 +2174,16 @@ is_local_to_local(uint16_t src_port_id, struct > netdev_dpdk *dev) > return ret; > } > > +static uint16_t > +get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype) { > + if (ethertype == htons(RTE_ETHER_TYPE_IPV4)) { > + return rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr); > + } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */ > + return rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr); > + } > +} > + > /* Prepare the packet for HWOL. > * Return True if the packet is OK to continue. */ static bool @@ > -2203,10 +2216,9 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, > struct rte_mbuf *mbuf) > * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared > * outer_l2_len and outer_l3_len must be zeroed. > */ > - if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO) > - || (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)) > + if (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)) > && (mbuf->pkt_len <= 1450 + mbuf->outer_l2_len + > mbuf->outer_l3_len > - + mbuf->l2_len))) { > + + mbuf->l2_len)) { > mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN; > mbuf->l2_len -= sizeof(struct udp_header) > + sizeof(struct vxlanhdr); @@ -2249,7 +2261,7 > @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf > *mbuf) > /* inner IP checksum offload offload */ > mbuf->ol_flags |= PKT_TX_IP_CKSUM; > } > - } else if (mbuf->ol_flags & PKT_TX_L4_MASK) { > + } else if (mbuf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) { > /* Handle VLAN TSO */ > /* no inner IP checksum for IPV6 */ > if (mbuf->ol_flags & PKT_TX_IPV4) { @@ -2273,6 +2285,18 @@ > netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) > mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt); > mbuf->outer_l2_len = 0; > mbuf->outer_l3_len = 0; > + > + /* In case of GRO, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't set by GRO > + * APIs, here is a place we can mark it. > + */ > + if ((mbuf->pkt_len > 1464) > + && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) { > + if (l4_proto == IPPROTO_UDP) { > + mbuf->ol_flags |= PKT_TX_UDP_SEG; > + } else if (l4_proto == IPPROTO_TCP) { > + mbuf->ol_flags |= PKT_TX_TCP_SEG; > + } > + } > } > > /* It is possible that l4_len isn't set for vhostuserclient */ @@ > -2284,6 +2308,10 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, > struct rte_mbuf *mbuf) > mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2; > } > > + if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) { > + return true; > + } > + > if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) { > if (l4_proto != IPPROTO_UDP) { > VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header" > @@ -2294,11 +2322,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, > struct rte_mbuf *mbuf) > mbuf->ol_flags & PKT_TX_TCP_CKSUM) { > if (l4_proto != IPPROTO_TCP) { > VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header" > - " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); > + " pkt len: %"PRIu32" l4_proto = %d", > + dev->up.name, mbuf->pkt_len, l4_proto); > return false; > } > > - if (mbuf->pkt_len - mbuf->l2_len > 1450) { > + if (mbuf->pkt_len > 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len > + + mbuf->l2_len) { > dp_packet_hwol_set_tcp_seg(pkt); > } > > @@ -2308,7 +2338,66 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, > struct rte_mbuf *mbuf) > } else { > mbuf->tso_segsz = 0; > } > + > + if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { > + /* PKT_TX_TCP_CKSUM must be cleaned for GSO because > + * tcp checksum only can be caculated by software for > + * GSO case. > + */ > + mbuf->ol_flags &= ~PKT_TX_TCP_CKSUM; > + } > } > + > + /* UDP GSO if necessary */ > + if (l4_proto == IPPROTO_UDP) { > + /* VXLAN GSO can be done here */ > + if ((mbuf->ol_flags & PKT_TX_UDP_SEG) || > + (mbuf->pkt_len > (1450 + mbuf->outer_l2_len + mbuf->outer_l3_len > + > + mbuf->l2_len))) { > + dp_packet_hwol_set_udp_seg(pkt); > + > + /* For UDP GSO, udp checksum must be calculated by software */ > + if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) { > + void *l3_hdr, *l4_hdr; > + struct rte_udp_hdr *udp_hdr; > + > + /* PKT_TX_UDP_CKSUM must be cleaned for GSO because > + * udp checksum only can be caculated by software for > + * GSO case. > + */ > + mbuf->ol_flags &= ~PKT_TX_UDP_CKSUM; > + > + eth_hdr = (struct rte_ether_hdr *) > + ((uint8_t *)eth_hdr + mbuf->outer_l2_len + > + mbuf->outer_l3_len + > + sizeof(struct udp_header) + > + sizeof(struct vxlanhdr)); > + l3_hdr = (uint8_t *)eth_hdr + mbuf->l2_len - > + sizeof(struct udp_header) - > + sizeof(struct vxlanhdr); > + l4_hdr = (uint8_t *)l3_hdr + mbuf->l3_len; > + ip_hdr = (struct rte_ipv4_hdr *)l3_hdr; > + ip_hdr->hdr_checksum = 0; > + ip_hdr->hdr_checksum = rte_ipv4_cksum(ip_hdr); > + /* Don't touch UDP checksum if it is ip fragment */ > + if (!rte_ipv4_frag_pkt_is_fragmented(ip_hdr)) { > + udp_hdr = (struct rte_udp_hdr *)l4_hdr; > + udp_hdr->dgram_cksum = 0; > + udp_hdr->dgram_cksum = > + get_udptcp_checksum(l3_hdr, l4_hdr, > + eth_hdr->ether_type); > + } > + } > + > + /* FOR GSO, gso_size includes l2_len + l3_len */ > + mbuf->tso_segsz = 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len > + > + mbuf->l2_len; > + if (mbuf->tso_segsz > dev->mtu) { > + mbuf->tso_segsz = dev->mtu; > + } > + } > + } > + > return true; > } > > @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, > struct rte_mbuf **pkts, > return cnt; > } > > -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes > ownership of > - * 'pkts', even in case of failure. > - * > - * Returns the number of packets that weren't transmitted. */ static > inline int -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, > - struct rte_mbuf **pkts, int cnt) > +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, > + struct rte_mbuf **pkts, int cnt) > { > uint32_t nb_tx = 0; > - uint16_t nb_tx_prep = cnt; > + uint32_t nb_tx_prep; > > - if (userspace_tso_enabled()) { > - nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt); > - if (nb_tx_prep != cnt) { > - VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. " > - "Only %u/%u are valid: %s", dev->up.name, > nb_tx_prep, > - cnt, rte_strerror(rte_errno)); > - } > + nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt); > + if (nb_tx_prep != cnt) { > + VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. " > + "Only %u/%u are valid: %s", > + dev->up.name, nb_tx_prep, > + cnt, rte_strerror(rte_errno)); > } > > while (nb_tx != nb_tx_prep) { > @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int > qid, > return cnt - nb_tx; > } > > +static inline void > +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf) I didn't review the patch, only had a quick glance, but this part bothers me. OVS doesn't support multi-segment mbufs, so it should not be possible for such mbufs being transmitted by OVS. So, I do not understand why this function needs to work with such mbufs. [Yi Yang] Only DPDK driver/Tx function will use it, not OVS, set_multiseg_udptcp_cksum is called in GSO part, it is last step before Tx function, it is a big external mbuf before rte_gso_segment, that isn't a multi-segmented mbuf. _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev