On 12/11/2025 17:04, David Marchand via dev wrote:
> Let's consider a TCP packet in a VxLAN tunnel:
> Ethernet / IP / UDP / VxLAN / Ethernet / IP / TCP / Data
>
> The outer UDP checksum is an accumulation of a pseudo header of the
> outer IP infos (addresses, length, next proto) and the whole packet data:
> UDP / VxLAN / Ethernet / IP / TCP / Data.
>
> Similarly to the outer UDP checksum, the inner TCP checksum is an
> accumulation of a pseudo header of the inner IP infos and the rest of
> the packet data.
>
> The inner TCP header will contain this inner checksum, so when computing
> the outer UDP checksum the inner checksum will cancel any participation
> of the TCP data.
>
> As a consequence, the outer UDP checksum depends on the headers content
> only and can be computed without looking at the data payload.
>
> The same principle applies to inner UDP.
>
> Thanks to this, we can re-enable IPv4, UDP and TCP inner checksums when
> outer UDP checksum is not supported.
>
> TCP over IPv4 geneve (with checksum on tunnel) on a mlx5 nic:
> Before: 4.37 Gbits/sec, 100% cpu ("full" csum + SW segmentation)
> After: 7.80 Gbits/sec, 100% cpu (constant csum + SW segmentation)
>
Nice. Just a couple of minor comments below.
> Reported-at: https://issues.redhat.com/browse/FDP-1897
> Signed-off-by: David Marchand <[email protected]>
> ---
> Changes since v1:
> - fixed outer UDP checksum for inner UDP traffic with no checksum,
> - fixed inner L4 bad or unknown checksums handling,
> - dropped computing and setting inner IP checksum (for partial status),
> instead inner IPv4 header content is simply skipped for good and
> partial status,
> - fixed inner IP bad or unknown checksums handling (we can still
> apply the optimisation),
> - added unit tests,
>
> ---
> lib/dp-packet-gso.c | 10 ++---
> lib/dp-packet.c | 21 +++++----
> lib/dp-packet.h | 24 ++++++++++
> lib/netdev-dpdk.c | 84 ++++++++++++++---------------------
> lib/packets.c | 103 +++++++++++++++++++++++++++++++++++++++++++
> lib/packets.h | 1 +
> tests/dpif-netdev.at | 81 +++++++++++++++++++++++++++-------
> 7 files changed, 242 insertions(+), 82 deletions(-)
>
> diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
> index 362bc8f66d..fe7186ddf4 100644
> --- a/lib/dp-packet-gso.c
> +++ b/lib/dp-packet-gso.c
> @@ -66,17 +66,15 @@ int
> dp_packet_gso_nr_segs(struct dp_packet *p)
> {
> uint16_t segsz = dp_packet_get_tso_segsz(p);
> - const char *data_tail;
> - const char *data_pos;
> + uint32_t data_length;
>
> if (dp_packet_tunnel(p)) {
> - data_pos = dp_packet_get_inner_tcp_payload(p);
> + data_length = dp_packet_get_inner_tcp_payload_length(p);
> } else {
> - data_pos = dp_packet_get_tcp_payload(p);
> + data_length = dp_packet_get_tcp_payload_length(p);
> }
> - data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
>
> - return DIV_ROUND_UP(data_tail - data_pos, segsz);
> + return DIV_ROUND_UP(data_length, segsz);
> }
>
> /* Perform software segmentation on packet 'p'.
> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
> index 3093bd2163..967e5a301b 100644
> --- a/lib/dp-packet.c
> +++ b/lib/dp-packet.c
> @@ -593,19 +593,22 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t
> flags)
> return;
> }
>
> - if (dp_packet_tunnel_geneve(p)
> - || dp_packet_tunnel_vxlan(p)) {
> -
> + if (dp_packet_tunnel_geneve(p) || dp_packet_tunnel_vxlan(p)) {
> /* If the TX interface doesn't support UDP tunnel offload but does
> - * support inner checksum offload and an outer UDP checksum is
> - * required, then we can't offload inner checksum either. As that
> would
> + * support inner SCTP checksum offload and an outer UDP checksum is
> + * required, then we can't offload inner checksum either as that
> would
> * invalidate the outer checksum. */
> if (!(flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)
> && dp_packet_l4_checksum_partial(p)) {
> - flags &= ~(NETDEV_TX_OFFLOAD_TCP_CKSUM |
> - NETDEV_TX_OFFLOAD_UDP_CKSUM |
> - NETDEV_TX_OFFLOAD_SCTP_CKSUM |
> - NETDEV_TX_OFFLOAD_IPV4_CKSUM);
> + flags &= ~NETDEV_TX_OFFLOAD_SCTP_CKSUM;
> + if (!packet_udp_tunnel_csum(p)) {
> + /* Similarly to the previous comment, since the outer UDP
> + * checksum optimisation did not happen, invalidate inner
> + * checksum offloads support. */
> + flags &= ~(NETDEV_TX_OFFLOAD_TCP_CKSUM |
> + NETDEV_TX_OFFLOAD_UDP_CKSUM |
> + NETDEV_TX_OFFLOAD_IPV4_CKSUM);
> + }
> }
> }
>
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 285d0e43f6..6a4a61922b 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -585,6 +585,18 @@ dp_packet_get_tcp_payload_length(const struct dp_packet
> *pkt)
> }
> }
>
> +static inline uint32_t
> +dp_packet_get_inner_tcp_payload_length(const struct dp_packet *pkt)
> +{
> + const char *tcp_payload = dp_packet_get_inner_tcp_payload(pkt);
> + if (tcp_payload) {
> + return ((char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)
> + - tcp_payload);
> + } else {
> + return 0;
> + }
> +}
> +
> static inline const void *
> dp_packet_get_udp_payload(const struct dp_packet *b)
> {
> @@ -1171,6 +1183,12 @@ dp_packet_inner_ip_checksum_set_partial(struct
> dp_packet *p)
> p->offloads |= DP_PACKET_OL_INNER_IP_CKSUM_MASK;
> }
>
> +static inline bool OVS_WARN_UNUSED_RESULT
> +dp_packet_inner_ip_checksum_valid(const struct dp_packet *p)
> +{
> + return !!(p->offloads & DP_PACKET_OL_INNER_IP_CKSUM_GOOD);
> +}
> +
> /* Calculate and set the IPv4 header checksum in packet 'p'. */
> static inline void
> dp_packet_ip_set_header_csum(struct dp_packet *p, bool inner)
> @@ -1364,6 +1382,12 @@ dp_packet_inner_l4_checksum_set_partial(struct
> dp_packet *p)
> p->offloads |= DP_PACKET_OL_INNER_L4_CKSUM_MASK;
> }
>
> +static inline bool OVS_WARN_UNUSED_RESULT
> +dp_packet_inner_l4_checksum_valid(const struct dp_packet *p)
> +{
> + return !!(p->offloads & DP_PACKET_OL_INNER_L4_CKSUM_GOOD);
> +}
> +
> static inline void
> dp_packet_reset_packet(struct dp_packet *b, int off)
> {
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 29b1b21d64..38cf0ebb2e 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -2648,59 +2648,43 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev,
> struct rte_mbuf *mbuf)
> }
>
> if (dp_packet_tunnel(pkt)) {
> - if (dp_packet_ip_checksum_partial(pkt)
> - || dp_packet_l4_checksum_partial(pkt)) {
> - mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
> - (char *) dp_packet_eth(pkt);
> - mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
> - (char *) dp_packet_l3(pkt);
> -
> - if (dp_packet_tunnel_geneve(pkt)) {
> - mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GENEVE;
> - } else if (dp_packet_tunnel_vxlan(pkt)) {
> - mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_VXLAN;
> - } else {
> - ovs_assert(dp_packet_tunnel_gre(pkt));
> - mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GRE;
> - }
> -
> - if (dp_packet_ip_checksum_partial(pkt)) {
> - mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_IP_CKSUM;
> - }
> + mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
> + (char *) dp_packet_eth(pkt);
> + mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
> + (char *) dp_packet_l3(pkt);
> +
> + if (dp_packet_tunnel_geneve(pkt)) {
> + mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GENEVE;
> + } else if (dp_packet_tunnel_vxlan(pkt)) {
> + mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_VXLAN;
> + } else {
> + ovs_assert(dp_packet_tunnel_gre(pkt));
> + mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GRE;
> + }
>
> - if (dp_packet_l4_checksum_partial(pkt)) {
> - ovs_assert(dp_packet_l4_proto_udp(pkt));
> - mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_UDP_CKSUM;
> - }
> + if (dp_packet_ip_checksum_partial(pkt)) {
> + mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_IP_CKSUM;
> + }
>
> - ip = dp_packet_l3(pkt);
> - mbuf->ol_flags |= IP_VER(ip->ip_ihl_ver) == 4
> - ? RTE_MBUF_F_TX_OUTER_IPV4
> - : RTE_MBUF_F_TX_OUTER_IPV6;
> -
> - /* Inner L2 length must account for the tunnel header length. */
> - l2 = dp_packet_l4(pkt);
> - l3 = dp_packet_inner_l3(pkt);
> - l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
> - l4 = dp_packet_inner_l4(pkt);
> - l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
> - is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
> - is_udp = dp_packet_inner_l4_proto_udp(pkt);
> - is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
> - } else {
> - mbuf->outer_l2_len = 0;
> - mbuf->outer_l3_len = 0;
> -
> - /* Skip outer headers. */
> - l2 = dp_packet_eth(pkt);
> - l3 = dp_packet_inner_l3(pkt);
> - l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
> - l4 = dp_packet_inner_l4(pkt);
> - l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
> - is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
> - is_udp = dp_packet_inner_l4_proto_udp(pkt);
> - is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
> + if (dp_packet_l4_checksum_partial(pkt)) {
> + ovs_assert(dp_packet_l4_proto_udp(pkt));
> + mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_UDP_CKSUM;
> }
> +
> + ip = dp_packet_l3(pkt);
> + mbuf->ol_flags |= IP_VER(ip->ip_ihl_ver) == 4
> + ? RTE_MBUF_F_TX_OUTER_IPV4
> + : RTE_MBUF_F_TX_OUTER_IPV6;
> +
> + /* Inner L2 length must account for the tunnel header length. */
> + l2 = dp_packet_l4(pkt);
> + l3 = dp_packet_inner_l3(pkt);
> + l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
> + l4 = dp_packet_inner_l4(pkt);
> + l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
> + is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
> + is_udp = dp_packet_inner_l4_proto_udp(pkt);
> + is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
> } else {
> mbuf->outer_l2_len = 0;
> mbuf->outer_l3_len = 0;
> diff --git a/lib/packets.c b/lib/packets.c
> index a0bb2ad482..c05b4abcc8 100644
> --- a/lib/packets.c
> +++ b/lib/packets.c
> @@ -2085,6 +2085,109 @@ out:
> }
> }
>
> +/* This helper computes a "constant" UDP checksum without looking at the
> + * L4 payload.
> + *
> + * This is possible when L4 is either TCP or UDP: the L4 payload checksum
> + * is either computed in SW or in HW later, but its contribution to the
> + * outer checksum is cancelled by the L4 payload being part of the global
> + * packet sum. */
> +bool
> +packet_udp_tunnel_csum(struct dp_packet *p)
> +{
> + const ovs_be16 *inner_l4_csum_p;
> + struct ip_header *inner_ip;
> + const void *inner_l4_data;
> + struct udp_header *udp;
> + ovs_be16 inner_l4_csum;
> + uint32_t partial_csum;
> + struct ip_header *ip;
> + uint32_t inner_csum;
> + void *inner_l4;
> +
> + inner_ip = dp_packet_inner_l3(p);
> + inner_l4 = dp_packet_inner_l4(p);
> + ip = dp_packet_l3(p);
> + udp = dp_packet_l4(p);
> +
> + if (!dp_packet_inner_l4_proto_tcp(p)
> + && !dp_packet_inner_l4_proto_udp(p)) {
> + return false;
> + }
> +
> + if (!dp_packet_inner_l4_checksum_valid(p)) {
> + /* We have no idea about the contribution of the payload data
> + * and what the L4 checksum put in the packet data looks like.
> + * Simpler is to let a full checksum happen. */
> + return false;
> + }
> +
> + if (dp_packet_inner_l4_proto_tcp(p)) {
> + inner_l4_csum_p = &(((struct tcp_header *) inner_l4)->tcp_csum);
> + inner_l4_data = dp_packet_get_inner_tcp_payload(p);
> + } else {
> + ovs_assert(dp_packet_inner_l4_proto_udp(p));
> + inner_l4_csum_p = &(((struct udp_header *) inner_l4)->udp_csum);
> + inner_l4_data = (char *) inner_l4 + sizeof (struct udp_header);
> + if (*inner_l4_csum_p == 0) {
> + /* There is no nested checksum.
> + * No choice but compute a full checksum. */
> + return false;
> + }
> + }
> +
> + if (IP_VER(inner_ip->ip_ihl_ver) == 4) {
> + inner_csum = packet_csum_pseudoheader(inner_ip);
> + } else {
> + struct ovs_16aligned_ip6_hdr *inner_ip6 = dp_packet_inner_l3(p);
> +
> + inner_csum = packet_csum_pseudoheader6(inner_ip6);
> + }
> +
> + ovs_assert(inner_l4_data);
> + inner_csum = csum_continue(inner_csum, inner_l4,
> + (char *) inner_l4_csum_p - (char *) inner_l4);
> + inner_l4_csum = csum_finish(csum_continue(inner_csum, inner_l4_csum_p +
> 1,
> + (char *) inner_l4_data - (char *)(inner_l4_csum_p + 1)));
> + if (dp_packet_inner_l4_proto_udp(p) && !inner_l4_csum) {
inner_l4_csum == 0 is the less likely of the two conditions, so you could
test it first (reverse the order of the checks, or nest them).
> + inner_l4_csum = htons(0xffff);
> + }
I think you could remove this block, since leaving inner_l4_csum == 0 will
have the same effect later on (0 and 0xffff both represent zero in
one's-complement arithmetic); my only concern would be code clarity.
> +
> + udp->udp_csum = 0;
> + if (IP_VER(ip->ip_ihl_ver) == 4) {
> + partial_csum = packet_csum_pseudoheader(ip);
> + } else {
> + struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(p);
> +
> + partial_csum = packet_csum_pseudoheader6(ip6);
> + }
> +
> + partial_csum = csum_continue(partial_csum, udp,
> + (char *) inner_ip - (char *) udp);
> + if (IP_VER(inner_ip->ip_ihl_ver) != 4
> + || !dp_packet_inner_ip_checksum_valid(p)) {
> + /* IPv6 has no checksum, so for inner IPv6, we need to sum the
> header.
> + *
> + * In IPv4 case, if inner checksum is already good or HW offload
> + * has been requested, the (final) sum of the IPv4 header will be 0.
> + * Otherwise, we need to sum the header like for IPv6. */
> + partial_csum = csum_continue(partial_csum, inner_ip,
> + (char *) inner_l4 - (char *) inner_ip);
> + }
> + partial_csum = csum_continue(partial_csum, inner_l4,
> + (char *) inner_l4_csum_p - (char *) inner_l4);
> + partial_csum = csum_add16(partial_csum, inner_l4_csum);
> + partial_csum = csum_continue(partial_csum, inner_l4_csum_p + 1,
> + (char *) inner_l4_data - (char *)(inner_l4_csum_p + 1));
> + udp->udp_csum = csum_finish(partial_csum);
> + if (!udp->udp_csum) {
> + udp->udp_csum = htons(0xffff);
> + }
> + dp_packet_l4_checksum_set_good(p);
> +
> + return true;
> +}
> +
> /* Set SCTP checksum field in packet 'p' with complete checksum.
> * The packet must have the L3 and L4 offsets. */
> void
> diff --git a/lib/packets.h b/lib/packets.h
> index 6eba07700a..843e9653a8 100644
> --- a/lib/packets.h
> +++ b/lib/packets.h
> @@ -1689,6 +1689,7 @@ bool packet_rh_present(struct dp_packet *packet,
> uint8_t *nexthdr,
> void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6);
> void packet_tcp_complete_csum(struct dp_packet *, bool is_inner);
> void packet_udp_complete_csum(struct dp_packet *, bool is_inner);
> +bool packet_udp_tunnel_csum(struct dp_packet *);
> void packet_sctp_complete_csum(struct dp_packet *, bool is_inner);
>
> #define DNS_HEADER_LEN 12
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev