On 12/11/2025 17:04, David Marchand via dev wrote:
> If we can predict that the segmented traffic will have the same headers,
> it is possible to rely on HW segmentation.
>
> TCP over IPv4 geneve (with checksum on tunnel) on a mlx5 nic:
> Before: 7.80 Gbits/sec, 100% cpu (SW segmentation)
> After: 10.8 Gbits/sec, 27% cpu (HW segmentation)
>
Nice, I don't see any real issues in the code. A couple of minor comments
below.
> For this optimisation, no touching of original tso_segsz should be
> allowed, so revert the commit
> 844a7cfa6edd ("netdev-dpdk: Use guest TSO segmentation size hint.").
>
> Yet, there might still be some non optimal cases where the L4 payload
> size would not be a multiple of tso_segsz.
> For this condition, "partially" segment: split the "last" segment and keep
> n-1 segments data in the original packet which will be segmented by HW.
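Just to check I am reading the partial segmentation right, a made-up example
(numbers are mine, not from the patch):

    tso_segsz = 1400, inner TCP payload = 24200 bytes
    n_segs = DIV_ROUND_UP(24200, 1400) = 18
    24200 != 18 * 1400, so SW splits off the trailing
    24200 - 17 * 1400 = 400 bytes as its own packet, and the original
    packet keeps 17 * 1400 = 23800 bytes of payload for HW segmentation.

Is that the intended behaviour?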
>
> Reported-at: https://issues.redhat.com/browse/FDP-1897
> Signed-off-by: David Marchand <[email protected]>
> ---
> lib/dp-packet-gso.c | 72 ++++++++++++++++++++++++++++++++++++++-------
> lib/dp-packet-gso.h | 2 ++
> lib/netdev-dpdk.c | 7 -----
> lib/netdev.c | 34 ++++++++++++++-------
> lib/packets.c | 34 +++++++++++++++++++++
> 5 files changed, 121 insertions(+), 28 deletions(-)
>
It probably needs a bit of documentation somewhere for users to know
when they can expect to see the optimization and when not. We could point
them to the DPDK documentation for checking which offloads are supported
on specific NICs.
> diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
> index 6b8da8b370..d8d3a87500 100644
> --- a/lib/dp-packet-gso.c
> +++ b/lib/dp-packet-gso.c
> @@ -77,6 +77,20 @@ dp_packet_gso_nr_segs(struct dp_packet *p)
> return DIV_ROUND_UP(data_length, segsz);
> }
>
Could add a comment for this function to explain what it does.
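Maybe something along these lines (rough draft, wording is up to you):

    /* Returns 2 if 'p' is a Geneve or VXLAN packet with a partial (offloaded)
     * inner L4 checksum whose inner TCP payload length is not a multiple of
     * the TSO segment size, i.e. SW segmentation will split off the trailing
     * partial segment and leave the rest to HW TSO.  Returns 1 otherwise. */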
> +int
> +dp_packet_gso_partial_nr_segs(struct dp_packet *p)
> +{
> + if ((dp_packet_tunnel_geneve(p)
> + || dp_packet_tunnel_vxlan(p))
These two lines can be combined into a single line.
> + && dp_packet_l4_checksum_partial(p)
> + && dp_packet_get_inner_tcp_payload_length(p)
> + != dp_packet_gso_nr_segs(p) * dp_packet_get_tso_segsz(p)) {
> + return 2;
> + }
> +
> + return 1;
> +}
> +
> static void
> dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
> uint16_t tso_segsz)
> @@ -139,7 +153,7 @@ dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
> tcp_seq += seg_no * tso_segsz;
> put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
>
> - if (OVS_LIKELY(seg_no < (n_segs - 1))) {
> + if (seg_no < (n_segs - 1) && !dp_packet_get_tso_segsz(seg)) {
> uint16_t tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> /* Reset flags PUSH and FIN unless it is the last segment. */
> uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
> @@ -160,12 +174,9 @@ dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
> }
> }
>
> -/* Perform software segmentation on packet 'p'.
> - *
> - * Segments packet 'p' into the array of preallocated batches in 'batches',
> - * updating the 'batches' pointer as needed. */
> -void
> -dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> +static void
> +dp_packet_gso__(struct dp_packet *p, struct dp_packet_batch **batches,
> + bool partial_seg)
> {
> struct dp_packet_batch *curr_batch = *batches;
> struct dp_packet *seg;
> @@ -199,6 +210,13 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> data_len = dp_packet_get_tcp_payload_length(p);
> }
>
> + if (partial_seg) {
> + if (dp_packet_gso_partial_nr_segs(p) != 1) {
> + goto last_seg;
> + }
> + goto first_seg;
> + }
> +
> for (int i = 1; i < n_segs - 1; i++) {
> seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + i * tso_segsz,
> tso_segsz);
> @@ -210,6 +228,7 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> dp_packet_batch_add(curr_batch, seg);
> }
>
> +last_seg:
> /* Create the last segment. */
> seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + (n_segs - 1) * tso_segsz,
> data_len - (n_segs - 1) * tso_segsz);
> @@ -220,11 +239,44 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> }
> dp_packet_batch_add(curr_batch, seg);
>
> - /* Trim the first segment and reset TSO. */
> - dp_packet_set_size(p, hdr_len + tso_segsz);
> - dp_packet_set_tso_segsz(p, 0);
> +first_seg:
> + if (partial_seg) {
> + if (dp_packet_gso_partial_nr_segs(p) != 1) {
> + dp_packet_set_size(p, hdr_len + (n_segs - 1) * tso_segsz);
> + if (n_segs == 2) {
> + /* No need to ask HW segmentation, we already did the job. */
> + dp_packet_set_tso_segsz(p, 0);
> + }
> + }
> + } else {
> + /* Trim the first segment and reset TSO. */
> + dp_packet_set_size(p, hdr_len + tso_segsz);
> + dp_packet_set_tso_segsz(p, 0);
> + }
> dp_packet_gso_update_segment(p, 0, n_segs, tso_segsz);
>
> out:
> *batches = curr_batch;
> }
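And to double check the n_segs == 2 corner case with made-up numbers, if I
read dp_packet_gso_partial() correctly:

    tso_segsz = 1400, inner TCP payload = 1800 bytes -> n_segs = 2
    The trailing 400 bytes are split off as their own packet, the original
    packet is trimmed to hdr_len + 1400 and tso_segsz is cleared, so neither
    resulting packet needs HW TSO anymore.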
> +
> +/* Perform software segmentation on packet 'p'.
> + *
> + * Segments packet 'p' into the array of preallocated batches in 'batches',
> + * updating the 'batches' pointer as needed. */
> +void
> +dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> +{
> + dp_packet_gso__(p, batches, false);
> +}
> +
> +/* Perform partial software segmentation on packet 'p'.
> + *
> + * For UDP tunnels, if the packet payload length is not aligned on the
> + * segmentation size, segments the last segment of packet 'p' into the array
> + * of preallocated batches in 'batches', updating the 'batches' pointer
> + * as needed. */
> +void
> +dp_packet_gso_partial(struct dp_packet *p, struct dp_packet_batch **batches)
> +{
> + dp_packet_gso__(p, batches, true);
> +}
> diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h
> index 2abc19ea31..ed55a11d79 100644
> --- a/lib/dp-packet-gso.h
> +++ b/lib/dp-packet-gso.h
> @@ -19,5 +19,7 @@
>
> void dp_packet_gso(struct dp_packet *, struct dp_packet_batch **);
> int dp_packet_gso_nr_segs(struct dp_packet *);
> +void dp_packet_gso_partial(struct dp_packet *, struct dp_packet_batch **);
> +int dp_packet_gso_partial_nr_segs(struct dp_packet *);
>
> #endif /* dp-packet-gso.h */
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 38cf0ebb2e..55278c2450 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -2725,20 +2725,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>
> if (mbuf->tso_segsz) {
> struct tcp_header *th = l4;
> - uint16_t link_tso_segsz;
> int hdr_len;
>
> mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
>
> hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
> - link_tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> if (dp_packet_tunnel(pkt)) {
> hdr_len += mbuf->outer_l2_len + mbuf->outer_l3_len;
> - link_tso_segsz -= mbuf->outer_l3_len + mbuf->l2_len;
> - }
> -
> - if (mbuf->tso_segsz > link_tso_segsz) {
> - mbuf->tso_segsz = link_tso_segsz;
> }
>
> if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) {
> diff --git a/lib/netdev.c b/lib/netdev.c
> index f493dab005..5413ca61d0 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -71,6 +71,7 @@ COVERAGE_DEFINE(netdev_add_router);
> COVERAGE_DEFINE(netdev_get_stats);
> COVERAGE_DEFINE(netdev_push_header_drops);
> COVERAGE_DEFINE(netdev_soft_seg_good);
> +COVERAGE_DEFINE(netdev_partial_seg_good);
>
> struct netdev_saved_flags {
> struct netdev *netdev;
> @@ -802,7 +803,8 @@ netdev_get_pt_mode(const struct netdev *netdev)
> * from netdev_class->send() if at least one batch failed to send. */
> static int
> netdev_send_tso(struct netdev *netdev, int qid,
> - struct dp_packet_batch *batch, bool concurrent_txq)
> + struct dp_packet_batch *batch, bool concurrent_txq,
> + bool partial_seg)
> {
> struct dp_packet_batch *batches;
> struct dp_packet *packet;
> @@ -812,11 +814,15 @@ netdev_send_tso(struct netdev *netdev, int qid,
> int error;
>
> /* Calculate the total number of packets in the batch after
> - * the segmentation. */
> + * the (partial?) segmentation. */
> n_packets = 0;
> DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> if (dp_packet_get_tso_segsz(packet)) {
> - n_packets += dp_packet_gso_nr_segs(packet);
> + if (partial_seg) {
> + n_packets += dp_packet_gso_partial_nr_segs(packet);
> + } else {
> + n_packets += dp_packet_gso_nr_segs(packet);
> + }
> } else {
> n_packets++;
> }
> @@ -842,8 +848,13 @@ netdev_send_tso(struct netdev *netdev, int qid,
> curr_batch = batches;
> DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
> if (dp_packet_get_tso_segsz(packet)) {
> - dp_packet_gso(packet, &curr_batch);
> - COVERAGE_INC(netdev_soft_seg_good);
> + if (partial_seg) {
> + dp_packet_gso_partial(packet, &curr_batch);
> + COVERAGE_INC(netdev_partial_seg_good);
> + } else {
> + dp_packet_gso(packet, &curr_batch);
> + COVERAGE_INC(netdev_soft_seg_good);
> + }
> } else {
> if (dp_packet_batch_is_full(curr_batch)) {
> curr_batch++;
> @@ -906,7 +917,8 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
> if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> if (dp_packet_get_tso_segsz(packet)) {
> - return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> + return netdev_send_tso(netdev, qid, batch, concurrent_txq,
> + false);
> }
> }
> } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO |
> @@ -915,16 +927,16 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
> DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> if (dp_packet_get_tso_segsz(packet)
> && dp_packet_tunnel(packet)) {
> - return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> + return netdev_send_tso(netdev, qid, batch, concurrent_txq,
> + false);
> }
> }
> } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) {
> DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> if (dp_packet_get_tso_segsz(packet)
> - && (dp_packet_tunnel_vxlan(packet)
> - || dp_packet_tunnel_geneve(packet))
> - && dp_packet_l4_checksum_partial(packet)) {
> - return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> + && dp_packet_gso_partial_nr_segs(packet) != 1) {
> + return netdev_send_tso(netdev, qid, batch, concurrent_txq,
> + true);
> }
> }
> }
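Not an issue, just summarising the send path decisions after this patch to
make sure I have them right:

    - no TCP TSO offload on the netdev: full SW segmentation
    - TSO supported, but no VXLAN/Geneve tunnel TSO: full SW segmentation
      for tunneled packets
    - tunnel TSO supported, but no outer UDP csum offload: partial SW
      segmentation, and only when the inner payload is not a multiple of
      tso_segsz
    - everything supported: plain HW TSO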
> diff --git a/lib/packets.c b/lib/packets.c
> index c05b4abcc8..4e5cac66e4 100644
> --- a/lib/packets.c
> +++ b/lib/packets.c
> @@ -33,6 +33,7 @@
> #include "ovs-thread.h"
> #include "odp-util.h"
> #include "dp-packet.h"
> +#include "dp-packet-gso.h"
> #include "unaligned.h"
>
> const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
> @@ -2103,6 +2104,7 @@ packet_udp_tunnel_csum(struct dp_packet *p)
> uint32_t partial_csum;
> struct ip_header *ip;
> uint32_t inner_csum;
> + uint16_t tso_segsz;
> void *inner_l4;
>
> inner_ip = dp_packet_inner_l3(p);
> @@ -2180,6 +2182,38 @@ packet_udp_tunnel_csum(struct dp_packet *p)
> partial_csum = csum_continue(partial_csum, inner_l4_csum_p + 1,
> (char *) inner_l4_data - (char *)(inner_l4_csum_p + 1));
> udp->udp_csum = csum_finish(partial_csum);
> + tso_segsz = dp_packet_get_tso_segsz(p);
> + if (tso_segsz) {
> + uint16_t payload_len = dp_packet_get_inner_tcp_payload_length(p);
> +
> + ovs_assert(payload_len == tso_segsz * dp_packet_gso_nr_segs(p));
> +
> + /* The pseudo header used in the outer UDP checksum is dependent on
> + * the ip_tot_len / ip6_plen which was a reflection of the TSO frame
> + * size. The segmented packet will be shorter. */
> + udp->udp_csum = recalc_csum16(udp->udp_csum, htons(payload_len),
> + htons(tso_segsz));
> +
> + /* When segmenting the packet, various headers get updated:
> + * - inner L3
> + * - for IPv4, ip_tot_len is updated, BUT it is not affecting the
> + * outer UDP checksum because the IPv4 header itself contains
> + * a checksum that compensates for this update,
> + * - for IPv6, ip6_plen is updated, and this must be considered,
> + * - inner L4
> + * - inner pseudo header used in the TCP checksum is dependent on
> + * the inner ip_tot_len / ip6_plen,
> + * - TCP seq number is updated,
> + * - the HW may change some TCP flags (think PSH/FIN),
> + * BUT the TCP checksum will compensate for those updates,
> + *
> + * Summary: we only to care about the inner IPv6 header update.
"we only care" or "we only need to care"
> + */
> + if (IP_VER(inner_ip->ip_ihl_ver) != 4) {
> + udp->udp_csum = recalc_csum16(udp->udp_csum, htons(payload_len),
> + htons(tso_segsz));
> + }
> + }
> if (!udp->udp_csum) {
> udp->udp_csum = htons(0xffff);
> }
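If I follow the checksum math here, the trick is that every affected 16-bit
length field shrinks by the same delta, so the same recalc_csum16() update
can be reused.  With made-up numbers:

    payload_len = 23800 (17 segments), tso_segsz = 1400
    The length reflected in the outer pseudo header, and for an inner IPv6
    packet the inner ip6_plen, each shrink by 23800 - 1400 = 22400 per
    segment, which is exactly the delta that
    recalc_csum16(csum, htons(23800), htons(1400)) compensates for.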
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev