On 12/11/2025 17:04, David Marchand via dev wrote:
> Let's consider a TCP packet in a VxLAN tunnel:
> Ethernet / IP / UDP / VxLAN / Ethernet / IP / TCP / Data
> 
> The outer UDP checksum is an accumulation of a pseudo header of the
> outer IP infos (addresses, length, next proto) and the whole packet data:
> UDP / VxLAN / Ethernet / IP / TCP / Data.
> 
> Similarly to the outer UDP checksum, the inner TCP checksum is an
> accumulation of a pseudo header of the inner IP infos and the rest of
> the packet data.
> 
> The inner TCP header will contain this inner checksum, so when computing
> the outer UDP checksum the inner checksum will cancel any participation
> of the TCP data.
> 
> As a consequence, the outer UDP checksum depends on the headers content
> only and can be computed without looking at the data payload.
> 
> The same principle applies to inner UDP.
> 
> Thanks to this, we can re-enable IPv4, UDP and TCP inner checksums when
> outer UDP checksum is not supported.
> 
> TCP over IPv4 geneve (with checksum on tunnel) on a mlx5 nic:
> Before:  4.37 Gbits/sec, 100% cpu ("full" csum + SW segmentation)
> After:   7.80 Gbits/sec, 100% cpu (constant csum + SW segmentation)
> 

Nice. Just a couple of minor comments below

> Reported-at: https://issues.redhat.com/browse/FDP-1897
> Signed-off-by: David Marchand <[email protected]>
> ---
> Changes since v1:
> - fixed outer UDP checksum for inner UDP traffic with no checksum,
> - fixed inner L4 bad or unknown checksums handling,
> - dropped computing and setting inner IP checksum (for partial status),
>   instead inner IPv4 header content is simply skipped for good and
>   partial status,
> - fixed inner IP bad or unknown checksums handling (we can still
>   apply the optimisation),
> - added unit tests,
> 
> ---
>  lib/dp-packet-gso.c  |  10 ++---
>  lib/dp-packet.c      |  21 +++++----
>  lib/dp-packet.h      |  24 ++++++++++
>  lib/netdev-dpdk.c    |  84 ++++++++++++++---------------------
>  lib/packets.c        | 103 +++++++++++++++++++++++++++++++++++++++++++
>  lib/packets.h        |   1 +
>  tests/dpif-netdev.at |  81 +++++++++++++++++++++++++++-------
>  7 files changed, 242 insertions(+), 82 deletions(-)
> 
> diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
> index 362bc8f66d..fe7186ddf4 100644
> --- a/lib/dp-packet-gso.c
> +++ b/lib/dp-packet-gso.c
> @@ -66,17 +66,15 @@ int
>  dp_packet_gso_nr_segs(struct dp_packet *p)
>  {
>      uint16_t segsz = dp_packet_get_tso_segsz(p);
> -    const char *data_tail;
> -    const char *data_pos;
> +    uint32_t data_length;
>  
>      if (dp_packet_tunnel(p)) {
> -        data_pos = dp_packet_get_inner_tcp_payload(p);
> +        data_length = dp_packet_get_inner_tcp_payload_length(p);
>      } else {
> -        data_pos = dp_packet_get_tcp_payload(p);
> +        data_length = dp_packet_get_tcp_payload_length(p);
>      }
> -    data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
>  
> -    return DIV_ROUND_UP(data_tail - data_pos, segsz);
> +    return DIV_ROUND_UP(data_length, segsz);
>  }
>  
>  /* Perform software segmentation on packet 'p'.
> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
> index 3093bd2163..967e5a301b 100644
> --- a/lib/dp-packet.c
> +++ b/lib/dp-packet.c
> @@ -593,19 +593,22 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t 
> flags)
>          return;
>      }
>  
> -    if (dp_packet_tunnel_geneve(p)
> -        || dp_packet_tunnel_vxlan(p)) {
> -
> +    if (dp_packet_tunnel_geneve(p) || dp_packet_tunnel_vxlan(p)) {
>          /* If the TX interface doesn't support UDP tunnel offload but does
> -         * support inner checksum offload and an outer UDP checksum is
> -         * required, then we can't offload inner checksum either. As that 
> would
> +         * support inner SCTP checksum offload and an outer UDP checksum is
> +         * required, then we can't offload inner checksum either as that 
> would
>           * invalidate the outer checksum. */
>          if (!(flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)
>              && dp_packet_l4_checksum_partial(p)) {
> -            flags &= ~(NETDEV_TX_OFFLOAD_TCP_CKSUM |
> -                       NETDEV_TX_OFFLOAD_UDP_CKSUM |
> -                       NETDEV_TX_OFFLOAD_SCTP_CKSUM |
> -                       NETDEV_TX_OFFLOAD_IPV4_CKSUM);
> +            flags &= ~NETDEV_TX_OFFLOAD_SCTP_CKSUM;
> +            if (!packet_udp_tunnel_csum(p)) {
> +                /* Similarly to the previous comment, since the outer UDP
> +                 * checksum optimisation did not happen, invalidate inner
> +                 * checksum offloads support. */
> +                flags &= ~(NETDEV_TX_OFFLOAD_TCP_CKSUM |
> +                           NETDEV_TX_OFFLOAD_UDP_CKSUM |
> +                           NETDEV_TX_OFFLOAD_IPV4_CKSUM);
> +            }
>          }
>      }
>  
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 285d0e43f6..6a4a61922b 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -585,6 +585,18 @@ dp_packet_get_tcp_payload_length(const struct dp_packet 
> *pkt)
>      }
>  }
>  
> +static inline uint32_t
> +dp_packet_get_inner_tcp_payload_length(const struct dp_packet *pkt)
> +{
> +    const char *tcp_payload = dp_packet_get_inner_tcp_payload(pkt);
> +    if (tcp_payload) {
> +        return ((char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)
> +                - tcp_payload);
> +    } else {
> +        return 0;
> +    }
> +}
> +
>  static inline const void *
>  dp_packet_get_udp_payload(const struct dp_packet *b)
>  {
> @@ -1171,6 +1183,12 @@ dp_packet_inner_ip_checksum_set_partial(struct 
> dp_packet *p)
>      p->offloads |= DP_PACKET_OL_INNER_IP_CKSUM_MASK;
>  }
>  
> +static inline bool OVS_WARN_UNUSED_RESULT
> +dp_packet_inner_ip_checksum_valid(const struct dp_packet *p)
> +{
> +    return !!(p->offloads & DP_PACKET_OL_INNER_IP_CKSUM_GOOD);
> +}
> +
>  /* Calculate and set the IPv4 header checksum in packet 'p'. */
>  static inline void
>  dp_packet_ip_set_header_csum(struct dp_packet *p, bool inner)
> @@ -1364,6 +1382,12 @@ dp_packet_inner_l4_checksum_set_partial(struct 
> dp_packet *p)
>      p->offloads |= DP_PACKET_OL_INNER_L4_CKSUM_MASK;
>  }
>  
> +static inline bool OVS_WARN_UNUSED_RESULT
> +dp_packet_inner_l4_checksum_valid(const struct dp_packet *p)
> +{
> +    return !!(p->offloads & DP_PACKET_OL_INNER_L4_CKSUM_GOOD);
> +}
> +
>  static inline void
>  dp_packet_reset_packet(struct dp_packet *b, int off)
>  {
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 29b1b21d64..38cf0ebb2e 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -2648,59 +2648,43 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, 
> struct rte_mbuf *mbuf)
>      }
>  
>      if (dp_packet_tunnel(pkt)) {
> -        if (dp_packet_ip_checksum_partial(pkt)
> -            || dp_packet_l4_checksum_partial(pkt)) {
> -            mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
> -                                 (char *) dp_packet_eth(pkt);
> -            mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
> -                                 (char *) dp_packet_l3(pkt);
> -
> -            if (dp_packet_tunnel_geneve(pkt)) {
> -                mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GENEVE;
> -            } else if (dp_packet_tunnel_vxlan(pkt)) {
> -                mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_VXLAN;
> -            } else {
> -                ovs_assert(dp_packet_tunnel_gre(pkt));
> -                mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GRE;
> -            }
> -
> -            if (dp_packet_ip_checksum_partial(pkt)) {
> -                mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_IP_CKSUM;
> -            }
> +        mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
> +                             (char *) dp_packet_eth(pkt);
> +        mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
> +                             (char *) dp_packet_l3(pkt);
> +
> +        if (dp_packet_tunnel_geneve(pkt)) {
> +            mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GENEVE;
> +        } else if (dp_packet_tunnel_vxlan(pkt)) {
> +            mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_VXLAN;
> +        } else {
> +            ovs_assert(dp_packet_tunnel_gre(pkt));
> +            mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GRE;
> +        }
>  
> -            if (dp_packet_l4_checksum_partial(pkt)) {
> -                ovs_assert(dp_packet_l4_proto_udp(pkt));
> -                mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_UDP_CKSUM;
> -            }
> +        if (dp_packet_ip_checksum_partial(pkt)) {
> +            mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_IP_CKSUM;
> +        }
>  
> -            ip = dp_packet_l3(pkt);
> -            mbuf->ol_flags |= IP_VER(ip->ip_ihl_ver) == 4
> -                              ? RTE_MBUF_F_TX_OUTER_IPV4
> -                              : RTE_MBUF_F_TX_OUTER_IPV6;
> -
> -            /* Inner L2 length must account for the tunnel header length. */
> -            l2 = dp_packet_l4(pkt);
> -            l3 = dp_packet_inner_l3(pkt);
> -            l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
> -            l4 = dp_packet_inner_l4(pkt);
> -            l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
> -            is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
> -            is_udp = dp_packet_inner_l4_proto_udp(pkt);
> -            is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
> -        } else {
> -            mbuf->outer_l2_len = 0;
> -            mbuf->outer_l3_len = 0;
> -
> -            /* Skip outer headers. */
> -            l2 = dp_packet_eth(pkt);
> -            l3 = dp_packet_inner_l3(pkt);
> -            l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
> -            l4 = dp_packet_inner_l4(pkt);
> -            l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
> -            is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
> -            is_udp = dp_packet_inner_l4_proto_udp(pkt);
> -            is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
> +        if (dp_packet_l4_checksum_partial(pkt)) {
> +            ovs_assert(dp_packet_l4_proto_udp(pkt));
> +            mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_UDP_CKSUM;
>          }
> +
> +        ip = dp_packet_l3(pkt);
> +        mbuf->ol_flags |= IP_VER(ip->ip_ihl_ver) == 4
> +                          ? RTE_MBUF_F_TX_OUTER_IPV4
> +                          : RTE_MBUF_F_TX_OUTER_IPV6;
> +
> +        /* Inner L2 length must account for the tunnel header length. */
> +        l2 = dp_packet_l4(pkt);
> +        l3 = dp_packet_inner_l3(pkt);
> +        l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
> +        l4 = dp_packet_inner_l4(pkt);
> +        l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
> +        is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
> +        is_udp = dp_packet_inner_l4_proto_udp(pkt);
> +        is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
>      } else {
>          mbuf->outer_l2_len = 0;
>          mbuf->outer_l3_len = 0;
> diff --git a/lib/packets.c b/lib/packets.c
> index a0bb2ad482..c05b4abcc8 100644
> --- a/lib/packets.c
> +++ b/lib/packets.c
> @@ -2085,6 +2085,109 @@ out:
>      }
>  }
>  
> +/* This helper computes a "constant" UDP checksum without looking at the
> + * L4 payload.
> + *
> + * This is possible when L4 is either TCP or UDP: the L4 payload checksum
> + * is either computed in SW or in HW later, but its contribution to the
> + * outer checksum is cancelled by the L4 payload being part of the global
> + * packet sum. */
> +bool
> +packet_udp_tunnel_csum(struct dp_packet *p)
> +{
> +    const ovs_be16 *inner_l4_csum_p;
> +    struct ip_header *inner_ip;
> +    const void *inner_l4_data;
> +    struct udp_header *udp;
> +    ovs_be16 inner_l4_csum;
> +    uint32_t partial_csum;
> +    struct ip_header *ip;
> +    uint32_t inner_csum;
> +    void *inner_l4;
> +
> +    inner_ip = dp_packet_inner_l3(p);
> +    inner_l4 = dp_packet_inner_l4(p);
> +    ip = dp_packet_l3(p);
> +    udp = dp_packet_l4(p);
> +
> +    if (!dp_packet_inner_l4_proto_tcp(p)
> +        && !dp_packet_inner_l4_proto_udp(p)) {
> +        return false;
> +    }
> +
> +    if (!dp_packet_inner_l4_checksum_valid(p)) {
> +        /* We have no idea about the contribution of the payload data
> +         * and what the L4 checksum put in the packet data looks like.
> +         * Simpler is to let a full checksum happen. */
> +        return false;
> +    }
> +
> +    if (dp_packet_inner_l4_proto_tcp(p)) {
> +        inner_l4_csum_p = &(((struct tcp_header *) inner_l4)->tcp_csum);
> +        inner_l4_data = dp_packet_get_inner_tcp_payload(p);
> +    } else {
> +        ovs_assert(dp_packet_inner_l4_proto_udp(p));
> +        inner_l4_csum_p = &(((struct udp_header *) inner_l4)->udp_csum);
> +        inner_l4_data = (char *) inner_l4 + sizeof (struct udp_header);
> +        if (*inner_l4_csum_p == 0) {
> +            /* There is no nested checksum.
> +             * No choice but compute a full checksum. */
> +            return false;
> +        }
> +    }
> +
> +    if (IP_VER(inner_ip->ip_ihl_ver) == 4) {
> +        inner_csum = packet_csum_pseudoheader(inner_ip);
> +    } else {
> +        struct ovs_16aligned_ip6_hdr *inner_ip6 = dp_packet_inner_l3(p);
> +
> +        inner_csum = packet_csum_pseudoheader6(inner_ip6);
> +    }
> +
> +    ovs_assert(inner_l4_data);
> +    inner_csum = csum_continue(inner_csum, inner_l4,
> +        (char *) inner_l4_csum_p - (char *) inner_l4);
> +    inner_l4_csum = csum_finish(csum_continue(inner_csum, inner_l4_csum_p + 
> 1,
> +        (char *) inner_l4_data - (char *)(inner_l4_csum_p + 1)));

> +    if (dp_packet_inner_l4_proto_udp(p) && !inner_l4_csum) {

inner_l4_csum == 0 is the less likely condition, so you could reverse the
order of the two checks (or nest them) so that it is tested first.

> +        inner_l4_csum = htons(0xffff);
> +    }

I think you could remove this block, as inner_l4_csum == 0 will have the
same effect later on; my only concern would be code clarity.

> +
> +    udp->udp_csum = 0;
> +    if (IP_VER(ip->ip_ihl_ver) == 4) {
> +        partial_csum = packet_csum_pseudoheader(ip);
> +    } else {
> +        struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(p);
> +
> +        partial_csum = packet_csum_pseudoheader6(ip6);
> +    }
> +
> +    partial_csum = csum_continue(partial_csum, udp,
> +        (char *) inner_ip - (char *) udp);
> +    if (IP_VER(inner_ip->ip_ihl_ver) != 4
> +        || !dp_packet_inner_ip_checksum_valid(p)) {
> +        /* IPv6 has no checksum, so for inner IPv6, we need to sum the 
> header.
> +         *
> +         * In IPv4 case, if inner checksum is already good or HW offload
> +         * has been requested, the (final) sum of the IPv4 header will be 0.
> +         * Otherwise, we need to sum the header like for IPv6. */
> +        partial_csum = csum_continue(partial_csum, inner_ip,
> +            (char *) inner_l4 - (char *) inner_ip);
> +    }
> +    partial_csum = csum_continue(partial_csum, inner_l4,
> +        (char *) inner_l4_csum_p - (char *) inner_l4);
> +    partial_csum = csum_add16(partial_csum, inner_l4_csum);
> +    partial_csum = csum_continue(partial_csum, inner_l4_csum_p + 1,
> +        (char *) inner_l4_data - (char *)(inner_l4_csum_p + 1));
> +    udp->udp_csum = csum_finish(partial_csum);
> +    if (!udp->udp_csum) {
> +        udp->udp_csum = htons(0xffff);
> +    }
> +    dp_packet_l4_checksum_set_good(p);
> +
> +    return true;
> +}
> +
>  /* Set SCTP checksum field in packet 'p' with complete checksum.
>   * The packet must have the L3 and L4 offsets. */
>  void
> diff --git a/lib/packets.h b/lib/packets.h
> index 6eba07700a..843e9653a8 100644
> --- a/lib/packets.h
> +++ b/lib/packets.h
> @@ -1689,6 +1689,7 @@ bool packet_rh_present(struct dp_packet *packet, 
> uint8_t *nexthdr,
>  void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6);
>  void packet_tcp_complete_csum(struct dp_packet *, bool is_inner);
>  void packet_udp_complete_csum(struct dp_packet *, bool is_inner);
> +bool packet_udp_tunnel_csum(struct dp_packet *);
>  void packet_sctp_complete_csum(struct dp_packet *, bool is_inner);
>  
>  #define DNS_HEADER_LEN 12

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to