On Fri, 12 Dec 2025 at 19:08, Kevin Traynor <[email protected]> wrote:
>
> On 12/11/2025 17:04, David Marchand via dev wrote:
> > If we can predict that the segmented traffic will have the same headers,
> > it is possible to rely on HW segmentation.
> >
> > TCP over IPv4 geneve (with checksum on tunnel) on a mlx5 nic:
> > Before: 7.80 Gbits/sec, 100% cpu (SW segmentation)
> > After: 10.8 Gbits/sec, 27% cpu (HW segmentation)
> >
>
> Nice, I don't see any real issues in the code. A couple of minor
> comments below.
>
> > For this optimisation, the original tso_segsz must be left untouched,
> > so revert commit
> > 844a7cfa6edd ("netdev-dpdk: Use guest TSO segmentation size hint.").
> >
> > Yet, there may still be suboptimal cases where the L4 payload size
> > is not a multiple of tso_segsz.
> > For this condition, "partially" segment: split off the "last" segment
> > and keep the first n - 1 segments' worth of data in the original
> > packet, which will then be segmented by HW.
> >
> > Reported-at: https://issues.redhat.com/browse/FDP-1897
> > Signed-off-by: David Marchand <[email protected]>
> > ---
> > lib/dp-packet-gso.c | 72 ++++++++++++++++++++++++++++++++++++++-------
> > lib/dp-packet-gso.h | 2 ++
> > lib/netdev-dpdk.c | 7 -----
> > lib/netdev.c | 34 ++++++++++++++-------
> > lib/packets.c | 34 +++++++++++++++++++++
> > 5 files changed, 121 insertions(+), 28 deletions(-)
> >
>
> It probably needs a bit of documentation somewhere so that users know
> when they can expect to see the optimization and when not. We can point
> them to DPDK for checking the supported offloads on specific NICs.
Ok, I'll see where I can add this (probably worth combining a
description of the offload capabilities of a port with the new
coverage counter).
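
To give an idea of the arithmetic behind it, here is a sketch with
made-up numbers (not code from the patch):

    /* tso_segsz = 1400, inner TCP payload = 4200 bytes:
     * 4200 == 3 * 1400, the payload is a multiple of tso_segsz, so the
     * whole packet can be handed to HW segmentation untouched.
     *
     * tso_segsz = 1400, inner TCP payload = 4000 bytes:
     * DIV_ROUND_UP(4000, 1400) == 3 segments, but 3 * 1400 != 4000, so
     * the packet is "partially" segmented: the trailing
     * 4000 - 2 * 1400 = 1200 bytes are split off in SW, and the
     * remaining 2 * 1400 bytes stay in the original packet, to be cut
     * into two segments by HW. */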
>
> > diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
> > index 6b8da8b370..d8d3a87500 100644
> > --- a/lib/dp-packet-gso.c
> > +++ b/lib/dp-packet-gso.c
> > @@ -77,6 +77,20 @@ dp_packet_gso_nr_segs(struct dp_packet *p)
> > return DIV_ROUND_UP(data_length, segsz);
> > }
> >
>
> Could add a comment for this function to explain it
Ack.
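Maybe something along these lines, as a first draft:

    /* Returns the number of packets that partially segmenting 'p' would
     * produce: 2 if 'p' is a Geneve or VXLAN packet with a partial L4
     * checksum whose inner TCP payload is not a multiple of the TSO
     * segment size (the trailing remainder then has to be split off in
     * SW), 1 otherwise ('p' can be left to HW segmentation as is). */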
>
> > +int
> > +dp_packet_gso_partial_nr_segs(struct dp_packet *p)
> > +{
> > + if ((dp_packet_tunnel_geneve(p)
> > + || dp_packet_tunnel_vxlan(p))
>
> Can be a single line
Ack.
>
> > + && dp_packet_l4_checksum_partial(p)
> > + && dp_packet_get_inner_tcp_payload_length(p)
> > + != dp_packet_gso_nr_segs(p) * dp_packet_get_tso_segsz(p)) {
> > + return 2;
> > + }
> > +
> > + return 1;
> > +}
> > +
> > static void
> > dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
> > uint16_t tso_segsz)
> > @@ -139,7 +153,7 @@ dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
> > tcp_seq += seg_no * tso_segsz;
> > put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
> >
> > - if (OVS_LIKELY(seg_no < (n_segs - 1))) {
> > + if (seg_no < (n_segs - 1) && !dp_packet_get_tso_segsz(seg)) {
> > uint16_t tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> > /* Reset flags PUSH and FIN unless it is the last segment. */
> > uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
> > @@ -160,12 +174,9 @@ dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
> > }
> > }
> >
> > -/* Perform software segmentation on packet 'p'.
> > - *
> > - * Segments packet 'p' into the array of preallocated batches in 'batches',
> > - * updating the 'batches' pointer as needed. */
> > -void
> > -dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> > +static void
> > +dp_packet_gso__(struct dp_packet *p, struct dp_packet_batch **batches,
> > + bool partial_seg)
> > {
> > struct dp_packet_batch *curr_batch = *batches;
> > struct dp_packet *seg;
> > @@ -199,6 +210,13 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> > data_len = dp_packet_get_tcp_payload_length(p);
> > }
> >
> > + if (partial_seg) {
> > + if (dp_packet_gso_partial_nr_segs(p) != 1) {
> > + goto last_seg;
> > + }
> > + goto first_seg;
> > + }
> > +
> > for (int i = 1; i < n_segs - 1; i++) {
> > seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + i * tso_segsz,
> > tso_segsz);
> > @@ -210,6 +228,7 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> > dp_packet_batch_add(curr_batch, seg);
> > }
> >
> > +last_seg:
> > /* Create the last segment. */
> >     seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + (n_segs - 1) * tso_segsz,
> >                                 data_len - (n_segs - 1) * tso_segsz);
> > @@ -220,11 +239,44 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> > }
> > dp_packet_batch_add(curr_batch, seg);
> >
> > - /* Trim the first segment and reset TSO. */
> > - dp_packet_set_size(p, hdr_len + tso_segsz);
> > - dp_packet_set_tso_segsz(p, 0);
> > +first_seg:
> > + if (partial_seg) {
> > + if (dp_packet_gso_partial_nr_segs(p) != 1) {
> > + dp_packet_set_size(p, hdr_len + (n_segs - 1) * tso_segsz);
> > + if (n_segs == 2) {
> > +                /* No need to ask HW segmentation, we already did the job. */
> > + dp_packet_set_tso_segsz(p, 0);
> > + }
> > + }
> > + } else {
> > + /* Trim the first segment and reset TSO. */
> > + dp_packet_set_size(p, hdr_len + tso_segsz);
> > + dp_packet_set_tso_segsz(p, 0);
> > + }
> > dp_packet_gso_update_segment(p, 0, n_segs, tso_segsz);
> >
> > out:
> > *batches = curr_batch;
> > }
> > +
> > +/* Perform software segmentation on packet 'p'.
> > + *
> > + * Segments packet 'p' into the array of preallocated batches in 'batches',
> > + * updating the 'batches' pointer as needed. */
> > +void
> > +dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> > +{
> > + dp_packet_gso__(p, batches, false);
> > +}
> > +
> > +/* Perform partial software segmentation on packet 'p'.
> > + *
> > + * For UDP tunnels, if the packet payload length is not aligned on the
> > + * segmentation size, segments the last segment of packet 'p' into the array
> > + * of preallocated batches in 'batches', updating the 'batches' pointer
> > + * as needed. */
> > +void
> > +dp_packet_gso_partial(struct dp_packet *p, struct dp_packet_batch **batches)
> > +{
> > + dp_packet_gso__(p, batches, true);
> > +}
> > diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h
> > index 2abc19ea31..ed55a11d79 100644
> > --- a/lib/dp-packet-gso.h
> > +++ b/lib/dp-packet-gso.h
> > @@ -19,5 +19,7 @@
> >
> > void dp_packet_gso(struct dp_packet *, struct dp_packet_batch **);
> > int dp_packet_gso_nr_segs(struct dp_packet *);
> > +void dp_packet_gso_partial(struct dp_packet *, struct dp_packet_batch **);
> > +int dp_packet_gso_partial_nr_segs(struct dp_packet *);
> >
> > #endif /* dp-packet-gso.h */
> > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> > index 38cf0ebb2e..55278c2450 100644
> > --- a/lib/netdev-dpdk.c
> > +++ b/lib/netdev-dpdk.c
> > @@ -2725,20 +2725,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
> >
> > if (mbuf->tso_segsz) {
> > struct tcp_header *th = l4;
> > - uint16_t link_tso_segsz;
> > int hdr_len;
> >
> > mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> >
> > hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
> > - link_tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> > if (dp_packet_tunnel(pkt)) {
> > hdr_len += mbuf->outer_l2_len + mbuf->outer_l3_len;
> > - link_tso_segsz -= mbuf->outer_l3_len + mbuf->l2_len;
> > - }
> > -
> > - if (mbuf->tso_segsz > link_tso_segsz) {
> > - mbuf->tso_segsz = link_tso_segsz;
> > }
> >
> >         if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) {
> > diff --git a/lib/netdev.c b/lib/netdev.c
> > index f493dab005..5413ca61d0 100644
> > --- a/lib/netdev.c
> > +++ b/lib/netdev.c
> > @@ -71,6 +71,7 @@ COVERAGE_DEFINE(netdev_add_router);
> > COVERAGE_DEFINE(netdev_get_stats);
> > COVERAGE_DEFINE(netdev_push_header_drops);
> > COVERAGE_DEFINE(netdev_soft_seg_good);
> > +COVERAGE_DEFINE(netdev_partial_seg_good);
> >
> > struct netdev_saved_flags {
> > struct netdev *netdev;
> > @@ -802,7 +803,8 @@ netdev_get_pt_mode(const struct netdev *netdev)
> > * from netdev_class->send() if at least one batch failed to send. */
> > static int
> > netdev_send_tso(struct netdev *netdev, int qid,
> > - struct dp_packet_batch *batch, bool concurrent_txq)
> > + struct dp_packet_batch *batch, bool concurrent_txq,
> > + bool partial_seg)
> > {
> > struct dp_packet_batch *batches;
> > struct dp_packet *packet;
> > @@ -812,11 +814,15 @@ netdev_send_tso(struct netdev *netdev, int qid,
> > int error;
> >
> > /* Calculate the total number of packets in the batch after
> > - * the segmentation. */
> > + * the (partial?) segmentation. */
> > n_packets = 0;
> > DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > if (dp_packet_get_tso_segsz(packet)) {
> > - n_packets += dp_packet_gso_nr_segs(packet);
> > + if (partial_seg) {
> > + n_packets += dp_packet_gso_partial_nr_segs(packet);
> > + } else {
> > + n_packets += dp_packet_gso_nr_segs(packet);
> > + }
> > } else {
> > n_packets++;
> > }
> > @@ -842,8 +848,13 @@ netdev_send_tso(struct netdev *netdev, int qid,
> > curr_batch = batches;
> > DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
> > if (dp_packet_get_tso_segsz(packet)) {
> > - dp_packet_gso(packet, &curr_batch);
> > - COVERAGE_INC(netdev_soft_seg_good);
> > + if (partial_seg) {
> > + dp_packet_gso_partial(packet, &curr_batch);
> > + COVERAGE_INC(netdev_partial_seg_good);
> > + } else {
> > + dp_packet_gso(packet, &curr_batch);
> > + COVERAGE_INC(netdev_soft_seg_good);
> > + }
> > } else {
> > if (dp_packet_batch_is_full(curr_batch)) {
> > curr_batch++;
> > @@ -906,7 +917,8 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
> > if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> > DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > if (dp_packet_get_tso_segsz(packet)) {
> > -                return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> > +                return netdev_send_tso(netdev, qid, batch, concurrent_txq,
> > +                                       false);
> > }
> > }
> > } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO |
> > @@ -915,16 +927,16 @@ netdev_send(struct netdev *netdev, int qid, struct
> > dp_packet_batch *batch,
> > DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > if (dp_packet_get_tso_segsz(packet)
> > && dp_packet_tunnel(packet)) {
> > -                return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> > +                return netdev_send_tso(netdev, qid, batch, concurrent_txq,
> > +                                       false);
> > }
> > }
> > } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) {
> > DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > if (dp_packet_get_tso_segsz(packet)
> > - && (dp_packet_tunnel_vxlan(packet)
> > - || dp_packet_tunnel_geneve(packet))
> > - && dp_packet_l4_checksum_partial(packet)) {
> > -                return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> > +                && dp_packet_gso_partial_nr_segs(packet) != 1) {
> > +                return netdev_send_tso(netdev, qid, batch, concurrent_txq,
> > +                                       true);
> > }
> > }
> > }
> > diff --git a/lib/packets.c b/lib/packets.c
> > index c05b4abcc8..4e5cac66e4 100644
> > --- a/lib/packets.c
> > +++ b/lib/packets.c
> > @@ -33,6 +33,7 @@
> > #include "ovs-thread.h"
> > #include "odp-util.h"
> > #include "dp-packet.h"
> > +#include "dp-packet-gso.h"
> > #include "unaligned.h"
> >
> > const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
> > @@ -2103,6 +2104,7 @@ packet_udp_tunnel_csum(struct dp_packet *p)
> > uint32_t partial_csum;
> > struct ip_header *ip;
> > uint32_t inner_csum;
> > + uint16_t tso_segsz;
> > void *inner_l4;
> >
> > inner_ip = dp_packet_inner_l3(p);
> > @@ -2180,6 +2182,38 @@ packet_udp_tunnel_csum(struct dp_packet *p)
> > partial_csum = csum_continue(partial_csum, inner_l4_csum_p + 1,
> > (char *) inner_l4_data - (char *)(inner_l4_csum_p + 1));
> > udp->udp_csum = csum_finish(partial_csum);
> > + tso_segsz = dp_packet_get_tso_segsz(p);
> > + if (tso_segsz) {
> > + uint16_t payload_len = dp_packet_get_inner_tcp_payload_length(p);
> > +
> > + ovs_assert(payload_len == tso_segsz * dp_packet_gso_nr_segs(p));
> > +
> > + /* The pseudo header used in the outer UDP checksum is dependent on
> > +         * the ip_tot_len / ip6_plen which was a reflection of the TSO frame
> > + * size. The segmented packet will be shorter. */
> > + udp->udp_csum = recalc_csum16(udp->udp_csum, htons(payload_len),
> > + htons(tso_segsz));
> > +
> > + /* When segmenting the packet, various headers get updated:
> > + * - inner L3
> > + * - for IPv4, ip_tot_len is updated, BUT it is not affecting the
> > + * outer UDP checksum because the IPv4 header itself contains
> > + * a checksum that compensates for this update,
> > + * - for IPv6, ip6_plen is updated, and this must be considered,
> > + * - inner L4
> > + * - inner pseudo header used in the TCP checksum is dependent on
> > + * the inner ip_tot_len / ip6_plen,
> > + * - TCP seq number is updated,
> > + * - the HW may change some TCP flags (think PSH/FIN),
> > + * BUT the TCP checksum will compensate for those updates,
> > + *
> > + * Summary: we only to care about the inner IPv6 header update.
>
> "we only care" or "we only need to care"
Indeed..
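
As an aside for readers of this thread: recalc_csum16() above is the
incremental checksum update of RFC 1624 (sketch, equation only):

    /* New checksum HC' when a 16-bit field changes from 'old' to 'new'
     * (RFC 1624, eqn. 3), in one's-complement arithmetic:
     *     HC' = ~(~HC + ~old + new)
     * This is how the outer udp_csum is adjusted when the length
     * reflected in the pseudo header shrinks from the full TSO payload
     * length down to tso_segsz. */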
--
David Marchand