If we can predict that the segmented traffic will have the same headers,
it is possible to rely on HW segmentation.
TCP over IPv4 Geneve (with checksum on tunnel) on an mlx5 NIC:
Before: 7.80 Gbits/sec, 100% CPU (SW segmentation)
After: 10.8 Gbits/sec, 27% CPU (HW segmentation)
For this optimisation, the original tso_segsz must not be modified, so
revert commit
844a7cfa6edd ("netdev-dpdk: Use guest TSO segmentation size hint.").
Yet, there might still be some non-optimal cases where the L4 payload
size is not a multiple of tso_segsz.
For this condition, "partially" segment: split off the last segment in
software and keep the data of the first n-1 segments in the original packet,
which will be segmented by HW.
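To give a rough idea of the split (illustrative numbers, not taken from the
measurements above): with tso_segsz = 1400 and an inner TCP payload of
10000 bytes, n_segs = DIV_ROUND_UP(10000, 1400) = 8. The original packet
keeps the first (8 - 1) * 1400 = 9800 bytes of payload and is left to HW
segmentation, while the remaining 200 bytes are split off into a second,
shorter packet in software.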
Reported-at: https://issues.redhat.com/browse/FDP-1897
Signed-off-by: David Marchand <[email protected]>
---
Changes since v2:
- added some description on dp_packet_gso_partial_nr_segs(),
- added some documentation on coverage counters.
---
Documentation/topics/userspace-tso.rst | 26 +++++++++
lib/dp-packet-gso.c | 75 ++++++++++++++++++++++----
lib/dp-packet-gso.h | 2 +
lib/netdev-dpdk.c | 7 ---
lib/netdev.c | 34 ++++++++----
lib/packets.c | 34 ++++++++++++
6 files changed, 150 insertions(+), 28 deletions(-)
diff --git a/Documentation/topics/userspace-tso.rst b/Documentation/topics/userspace-tso.rst
index ed4d36edd1..a47965768b 100644
--- a/Documentation/topics/userspace-tso.rst
+++ b/Documentation/topics/userspace-tso.rst
@@ -105,6 +105,32 @@ then started again. OvS will then report::
$ ovs-vsctl get interface vhost0 status:userspace-tso
ovs-vsctl: no key "userspace-tso" in Interface record "vhost0" column status
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Software segmentation fallback
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When a `TSO` packet is transmitted over a port that does not support
+segmentation, the OvS userspace datapath segments the packet in software
+(see lib/dp-packet-gso.c).
+
+This case is common for ports that do not support TSO at all.
+A coverage counter exists to reflect when the software fallback helper
+is called::
+
+ $ ovs-appctl coverage/read-counter netdev_soft_seg_good
+ 178760
+
+Another possibility is when the `TSO` packet involves a header encapsulation
+requested by OvS.
+When the tunnel encapsulation is UDP based, a checksum on the tunnel header
+is requested, and the transmitting port does not support this combination,
+OvS performs partial segmentation based on the segmentation size coming
+from the receiving port.
+A coverage counter exists to reflect when this optimisation is applied::
+
+ $ ovs-appctl coverage/read-counter netdev_partial_seg_good
+ 1345
+
~~~~~~~~~~~
Limitations
~~~~~~~~~~~
diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
index a7915dfe56..9076c674a1 100644
--- a/lib/dp-packet-gso.c
+++ b/lib/dp-packet-gso.c
@@ -77,6 +77,23 @@ dp_packet_gso_nr_segs(struct dp_packet *p)
return DIV_ROUND_UP(data_length, segsz);
}
+/* For partial segmentation, we try to pack as much data as we can in a first
+ * packet (up to the final number of segments on the wire).
+ * If there is still some data left, we need an extra "little" packet
+ * (shorter than tso_segsz). */
+int
+dp_packet_gso_partial_nr_segs(struct dp_packet *p)
+{
+ if ((dp_packet_tunnel_geneve(p) || dp_packet_tunnel_vxlan(p))
+ && dp_packet_l4_checksum_partial(p)
+ && dp_packet_get_inner_tcp_payload_length(p)
+ != dp_packet_gso_nr_segs(p) * dp_packet_get_tso_segsz(p)) {
+ return 2;
+ }
+
+ return 1;
+}
+
static void
dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
uint16_t tso_segsz, bool udp_tnl, bool gre_tnl)
@@ -136,7 +153,7 @@ dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
tcp_seq += seg_no * tso_segsz;
put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
- if (OVS_LIKELY(seg_no < (n_segs - 1))) {
+ if (seg_no < (n_segs - 1) && !dp_packet_get_tso_segsz(seg)) {
uint16_t tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
/* Reset flags PUSH and FIN unless it is the last segment. */
uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
@@ -157,12 +174,9 @@ dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
}
}
-/* Perform software segmentation on packet 'p'.
- *
- * Segments packet 'p' into the array of preallocated batches in 'batches',
- * updating the 'batches' pointer as needed. */
-void
-dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
+static void
+dp_packet_gso__(struct dp_packet *p, struct dp_packet_batch **batches,
+ bool partial_seg)
{
struct dp_packet_batch *curr_batch = *batches;
struct dp_packet *seg;
@@ -200,6 +214,13 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
data_len = dp_packet_get_tcp_payload_length(p);
}
+ if (partial_seg) {
+ if (dp_packet_gso_partial_nr_segs(p) != 1) {
+ goto last_seg;
+ }
+ goto first_seg;
+ }
+
for (int i = 1; i < n_segs - 1; i++) {
seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + i * tso_segsz,
tso_segsz);
@@ -212,6 +233,7 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
dp_packet_batch_add(curr_batch, seg);
}
+last_seg:
/* Create the last segment. */
seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + (n_segs - 1) * tso_segsz,
data_len - (n_segs - 1) * tso_segsz);
@@ -223,11 +245,44 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
}
dp_packet_batch_add(curr_batch, seg);
- /* Trim the first segment and reset TSO. */
- dp_packet_set_size(p, hdr_len + tso_segsz);
- dp_packet_set_tso_segsz(p, 0);
+first_seg:
+ if (partial_seg) {
+ if (dp_packet_gso_partial_nr_segs(p) != 1) {
+ dp_packet_set_size(p, hdr_len + (n_segs - 1) * tso_segsz);
+ if (n_segs == 2) {
+ /* No need to ask for HW segmentation, we already did the job. */
+ dp_packet_set_tso_segsz(p, 0);
+ }
+ }
+ } else {
+ /* Trim the first segment and reset TSO. */
+ dp_packet_set_size(p, hdr_len + tso_segsz);
+ dp_packet_set_tso_segsz(p, 0);
+ }
dp_packet_gso_update_segment(p, 0, n_segs, tso_segsz, udp_tnl, gre_tnl);
out:
*batches = curr_batch;
}
+
+/* Perform software segmentation on packet 'p'.
+ *
+ * Segments packet 'p' into the array of preallocated batches in 'batches',
+ * updating the 'batches' pointer as needed. */
+void
+dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
+{
+ dp_packet_gso__(p, batches, false);
+}
+
+/* Perform partial software segmentation on packet 'p'.
+ *
+ * For UDP tunnels, if the packet payload length is not a multiple of the
+ * segmentation size, splits off the last segment of packet 'p' into the array
+ * of preallocated batches in 'batches', updating the 'batches' pointer
+ * as needed. */
+void
+dp_packet_gso_partial(struct dp_packet *p, struct dp_packet_batch **batches)
+{
+ dp_packet_gso__(p, batches, true);
+}
diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h
index 2abc19ea31..ed55a11d79 100644
--- a/lib/dp-packet-gso.h
+++ b/lib/dp-packet-gso.h
@@ -19,5 +19,7 @@
void dp_packet_gso(struct dp_packet *, struct dp_packet_batch **);
int dp_packet_gso_nr_segs(struct dp_packet *);
+void dp_packet_gso_partial(struct dp_packet *, struct dp_packet_batch **);
+int dp_packet_gso_partial_nr_segs(struct dp_packet *);
#endif /* dp-packet-gso.h */
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 38cf0ebb2e..55278c2450 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2725,20 +2725,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
if (mbuf->tso_segsz) {
struct tcp_header *th = l4;
- uint16_t link_tso_segsz;
int hdr_len;
mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
- link_tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
if (dp_packet_tunnel(pkt)) {
hdr_len += mbuf->outer_l2_len + mbuf->outer_l3_len;
- link_tso_segsz -= mbuf->outer_l3_len + mbuf->l2_len;
- }
-
- if (mbuf->tso_segsz > link_tso_segsz) {
- mbuf->tso_segsz = link_tso_segsz;
}
if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) {
diff --git a/lib/netdev.c b/lib/netdev.c
index f493dab005..5413ca61d0 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -71,6 +71,7 @@ COVERAGE_DEFINE(netdev_add_router);
COVERAGE_DEFINE(netdev_get_stats);
COVERAGE_DEFINE(netdev_push_header_drops);
COVERAGE_DEFINE(netdev_soft_seg_good);
+COVERAGE_DEFINE(netdev_partial_seg_good);
struct netdev_saved_flags {
struct netdev *netdev;
@@ -802,7 +803,8 @@ netdev_get_pt_mode(const struct netdev *netdev)
* from netdev_class->send() if at least one batch failed to send. */
static int
netdev_send_tso(struct netdev *netdev, int qid,
- struct dp_packet_batch *batch, bool concurrent_txq)
+ struct dp_packet_batch *batch, bool concurrent_txq,
+ bool partial_seg)
{
struct dp_packet_batch *batches;
struct dp_packet *packet;
@@ -812,11 +814,15 @@ netdev_send_tso(struct netdev *netdev, int qid,
int error;
/* Calculate the total number of packets in the batch after
- * the segmentation. */
+ * the (partial?) segmentation. */
n_packets = 0;
DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
if (dp_packet_get_tso_segsz(packet)) {
- n_packets += dp_packet_gso_nr_segs(packet);
+ if (partial_seg) {
+ n_packets += dp_packet_gso_partial_nr_segs(packet);
+ } else {
+ n_packets += dp_packet_gso_nr_segs(packet);
+ }
} else {
n_packets++;
}
@@ -842,8 +848,13 @@ netdev_send_tso(struct netdev *netdev, int qid,
curr_batch = batches;
DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
if (dp_packet_get_tso_segsz(packet)) {
- dp_packet_gso(packet, &curr_batch);
- COVERAGE_INC(netdev_soft_seg_good);
+ if (partial_seg) {
+ dp_packet_gso_partial(packet, &curr_batch);
+ COVERAGE_INC(netdev_partial_seg_good);
+ } else {
+ dp_packet_gso(packet, &curr_batch);
+ COVERAGE_INC(netdev_soft_seg_good);
+ }
} else {
if (dp_packet_batch_is_full(curr_batch)) {
curr_batch++;
@@ -906,7 +917,8 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
if (dp_packet_get_tso_segsz(packet)) {
- return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+ return netdev_send_tso(netdev, qid, batch, concurrent_txq,
+ false);
}
}
} else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO |
@@ -915,16 +927,16 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
if (dp_packet_get_tso_segsz(packet)
&& dp_packet_tunnel(packet)) {
- return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+ return netdev_send_tso(netdev, qid, batch, concurrent_txq,
+ false);
}
}
} else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) {
DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
if (dp_packet_get_tso_segsz(packet)
- && (dp_packet_tunnel_vxlan(packet)
- || dp_packet_tunnel_geneve(packet))
- && dp_packet_l4_checksum_partial(packet)) {
- return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+ && dp_packet_gso_partial_nr_segs(packet) != 1) {
+ return netdev_send_tso(netdev, qid, batch, concurrent_txq,
+ true);
}
}
}
diff --git a/lib/packets.c b/lib/packets.c
index 3d12563e3b..d1254676f9 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -33,6 +33,7 @@
#include "ovs-thread.h"
#include "odp-util.h"
#include "dp-packet.h"
+#include "dp-packet-gso.h"
#include "unaligned.h"
const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
@@ -2103,6 +2104,7 @@ packet_udp_tunnel_csum(struct dp_packet *p)
uint32_t partial_csum;
struct ip_header *ip;
uint32_t inner_csum;
+ uint16_t tso_segsz;
bool inner_ipv4;
void *inner_l4;
@@ -2183,6 +2185,38 @@ packet_udp_tunnel_csum(struct dp_packet *p)
partial_csum = csum_continue(partial_csum, inner_l4_csum_p + 1,
(char *) inner_l4_data - (char *)(inner_l4_csum_p + 1));
udp->udp_csum = csum_finish(partial_csum);
+ tso_segsz = dp_packet_get_tso_segsz(p);
+ if (tso_segsz) {
+ uint16_t payload_len = dp_packet_get_inner_tcp_payload_length(p);
+
+ ovs_assert(payload_len == tso_segsz * dp_packet_gso_nr_segs(p));
+
+ /* The pseudo header used in the outer UDP checksum is dependent on
+ * the ip_tot_len / ip6_plen which was a reflection of the TSO frame
+ * size. The segmented packet will be shorter. */
+ udp->udp_csum = recalc_csum16(udp->udp_csum, htons(payload_len),
+ htons(tso_segsz));
+
+ /* When segmenting the packet, various headers get updated:
+ * - inner L3
+ * - for IPv4, ip_tot_len is updated, BUT it does not affect the
+ * outer UDP checksum because the IPv4 header itself contains
+ * a checksum that compensates for this update,
+ * - for IPv6, ip6_plen is updated, and this must be considered,
+ * - inner L4
+ * - inner pseudo header used in the TCP checksum is dependent on
+ * the inner ip_tot_len / ip6_plen,
+ * - TCP seq number is updated,
+ * - the HW may change some TCP flags (think PSH/FIN),
+ * BUT the TCP checksum will compensate for those updates,
+ *
+ * Summary: we only care about the inner IPv6 header update.
+ */
+ if (IP_VER(inner_ip->ip_ihl_ver) != 4) {
+ udp->udp_csum = recalc_csum16(udp->udp_csum, htons(payload_len),
+ htons(tso_segsz));
+ }
+ }
if (!udp->udp_csum) {
udp->udp_csum = htons(0xffff);
}
--
2.51.1