This patch adds GSO support for IPv4 TCP when userspace-tso is enabled. It was tested by having a veth device send a TSO packet to OVS, which segments it into smaller TCP segments and forwards them to a netdev-afxdp port in another namespace.
Future work includes: 1. GSO for UDP, and IPv6 TCP/UDP GSO. 2. Tunnel GSO: VxLan GSO, Geneve GSO, GRE GSO... Tested using $ make check-afxdp TESTSUITEFLAGS='3' Or script below: ovs-vsctl set Open_vSwitch . other_config:userspace-tso-enable=true ovs-vsctl -- add-br br0 -- set Bridge br0 datapath_type=netdev ip netns add at_ns0 ip link add p0 type veth peer name afxdp-p0 ip link set p0 netns at_ns0 ip link set dev afxdp-p0 up ovs-vsctl add-port br0 afxdp-p0 ip netns exec at_ns0 sh << NS_EXEC_HEREDOC ip addr add "10.1.1.1/24" dev p0 ip link set dev p0 up NS_EXEC_HEREDOC ip netns add at_ns1 ip link add p1 type veth peer name afxdp-p1 ip link set p1 netns at_ns1 ip link set dev afxdp-p1 up ovs-vsctl add-port br0 afxdp-p1 -- set int afxdp-p1 type=afxdp ip netns exec at_ns1 sh << NS_EXEC_HEREDOC ip addr add "10.1.1.2/24" dev p1 ip link set dev p1 up NS_EXEC_HEREDOC ip netns exec at_ns0 ping -c 3 -i .2 10.1.1.2 ip netns exec at_ns1 ethtool -K p1 tx off ip netns exec at_ns1 iperf -s ip netns exec at_ns0 iperf -c 10.1.1.2 -t1 Tested-at: https://github.com/williamtu/ovs-travis/actions/runs/553156643 Signed-off-by: William Tu <u9012...@gmail.com> --- lib/automake.mk | 2 + lib/dp-packet-gso.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/dp-packet-gso.h | 27 +++++++++ lib/netdev-afxdp.c | 6 ++ lib/netdev.c | 88 +++++++++++++++++++++++------ lib/packets.c | 35 ++++++++++++ lib/packets.h | 1 + tests/system-afxdp.at | 32 +++++++++++ 8 files changed, 324 insertions(+), 16 deletions(-) create mode 100644 lib/dp-packet-gso.c create mode 100644 lib/dp-packet-gso.h diff --git a/lib/automake.mk b/lib/automake.mk index 39afbff9d1a0..57f504d52f5c 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -104,6 +104,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/dpctl.h \ lib/dp-packet.h \ lib/dp-packet.c \ + lib/dp-packet-gso.h \ + lib/dp-packet-gso.c \ lib/dpdk.h \ lib/dpif-netdev-lookup.h \ lib/dpif-netdev-lookup.c \ diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c new file 
mode 100644
index 000000000000..5ae7c88298a5
--- /dev/null
+++ b/lib/dp-packet-gso.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2021 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <config.h>
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "coverage.h"
+#include "csum.h"
+#include "dp-packet.h"
+#include "dp-packet-gso.h"
+#include "dpif-netdev.h"
+#include "openvswitch/compiler.h"
+#include "openvswitch/dynamic-string.h"
+#include "openvswitch/vlog.h"
+#include "packets.h"
+#include "util.h"
+
+VLOG_DEFINE_THIS_MODULE(dp_packet_gso);
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+
+/* Fixes up the IPv4 and TCP headers of the 'nb_segs' segments in 'pkts',
+ * each of which starts with a copy of the headers of 'src'.
+ *
+ * Every segment gets its own IP total length, consecutive IP IDs starting
+ * at the ID of 'src', and a TCP sequence number advanced by the payload of
+ * the segments before it.  CWR is kept on the first segment only and
+ * FIN/PSH on the last segment only, since they refer to the start and the
+ * end of the original payload.  The IP and TCP checksums are recomputed
+ * once the other header fields are final. */
+static void
+update_ipv4_tcp_headers(const struct dp_packet *src, struct dp_packet **pkts,
+                        uint16_t nb_segs)
+{
+    struct tcp_header *tcp;
+    struct ip_header *ip;
+    struct dp_packet *p;
+    uint16_t tcp_flags;
+    uint32_t tcp_seq;
+    uint16_t ipid;
+    int i;
+
+    ip = dp_packet_l3(src);
+    ipid = ntohs(ip->ip_id);
+    tcp = dp_packet_l4(src);
+    tcp_seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+
+    for (i = 0; i < nb_segs; i++) {
+        p = pkts[i];
+
+        ip = dp_packet_l3(p);
+        ip->ip_tot_len = htons(dp_packet_l3_size(p));
+        ip->ip_id = htons(ipid);
+        ip->ip_csum = 0;
+        /* Cover the whole IP header, including any options. */
+        ip->ip_csum = csum(ip, IP_IHL(ip->ip_ihl_ver) * 4);
+
+        tcp = dp_packet_l4(p);
+        put_16aligned_be32(&tcp->tcp_seq, htonl(tcp_seq));
+        tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+        if (i > 0) {
+            tcp_flags &= ~TCP_CWR;
+        }
+        if (i < nb_segs - 1) {
+            tcp_flags &= ~(TCP_FIN | TCP_PSH);
+        }
+        tcp->tcp_ctl = TCP_CTL(tcp_flags, TCP_OFFSET(tcp->tcp_ctl));
+        packet_csum_tcpudp(p);
+
+        ipid += 1;
+        tcp_seq += (const char *) dp_packet_tail(p) -
+                   (const char *) dp_packet_l4(p) -
+                   TCP_OFFSET(tcp->tcp_ctl) * 4;
+    }
+}
+
+/* Initializes the metadata of segment 'dst' from the original packet 'src'
+ * and clears the offload flags of 'dst'. */
+static void
+hdr_segment_init(struct dp_packet *dst, const struct dp_packet *src)
+{
+    /* Copy the following fields into the returned buffer: l2_pad_size,
+     * l2_5_ofs, l3_ofs, l4_ofs, cutlen, packet_type and md. */
+    memcpy(&dst->l2_pad_size, &src->l2_pad_size,
+           sizeof(struct dp_packet) -
+           offsetof(struct dp_packet, l2_pad_size));
+
+    /* The segment is completed in software, so it requests no offload. */
+    *dp_packet_ol_flags_ptr(dst) = 0;
+}
+
+/* Splits the payload of 'p', that is, everything past its first
+ * 'hdr_offset' bytes, into chunks of at most 'pyld_unit_size' bytes.  Every
+ * chunk goes into a newly allocated packet, behind a verbatim copy of the
+ * first 'hdr_offset' bytes of 'p'; fixing up those headers is left to the
+ * caller.  The new packets are stored in 'pout', which has room for
+ * 'nb_pout' of them.
+ *
+ * Returns the number of packets stored in 'pout', or a negative errno value,
+ * in which case no packet is left behind in 'pout'. */
+static int
+gso_do_segment(const struct dp_packet *p, uint16_t hdr_offset,
+               uint16_t pyld_unit_size, struct dp_packet **pout,
+               uint16_t nb_pout)
+{
+    /* 32 bits wide: an offset into a TSO packet may exceed UINT16_MAX. */
+    uint32_t pos = hdr_offset;
+    uint32_t bytes_remaining;
+    struct dp_packet *pkt;
+    uint16_t seg_size;
+    int nb_segs = 0;
+
+    if (!pyld_unit_size) {
+        /* Empty segments would never consume the payload. */
+        return -EINVAL;
+    }
+    bytes_remaining = dp_packet_size(p) > hdr_offset
+                      ? dp_packet_size(p) - hdr_offset : 0;
+
+    while (bytes_remaining > 0) {
+        if (nb_segs >= nb_pout) {
+            /* 'pout' is full.  Check before storing so that it is never
+             * overrun, and free the segments built so far. */
+            VLOG_WARN_RL(&rl, "Not enough memory to process GSO.");
+            while (nb_segs > 0) {
+                dp_packet_delete(pout[--nb_segs]);
+            }
+            return -ENOBUFS;
+        }
+
+        seg_size = (bytes_remaining >= pyld_unit_size) ?
+                   pyld_unit_size : bytes_remaining;
+
+        /* Create a new dp_packet, put payload, push header. */
+        pkt = dp_packet_new_with_headroom(seg_size, hdr_offset);
+        hdr_segment_init(pkt, p);
+        dp_packet_put(pkt, (char *) dp_packet_data(p) + pos, seg_size);
+        dp_packet_push(pkt, dp_packet_data(p), hdr_offset);
+
+        pos += seg_size;
+        bytes_remaining -= seg_size;
+        pout[nb_segs++] = pkt;
+    }
+    return nb_segs;
+}
+
+/* Segments the IPv4 TCP packet 'p' in software.  Every resulting packet
+ * carries a copy of the headers of 'p' followed by a slice of its TCP
+ * payload, and is at most 'gso_size' bytes long, headers included.  The new
+ * packets are stored in 'pout', which has room for 'nb_pout' of them; 'p'
+ * itself is not freed.
+ *
+ * Returns the number of packets stored in 'pout', or a negative errno value
+ * if 'p' cannot be segmented. */
+int
+gso_tcp4_segment(struct dp_packet *p, uint16_t gso_size,
+                 struct dp_packet **pout, uint16_t nb_pout)
+{
+    const char *payload, *eth;
+    uint16_t hdr_offset;
+    int nb_segs;
+
+    if (OVS_UNLIKELY(dp_packet_size(p) < ETH_PAYLOAD_MAX)) {
+        VLOG_WARN_RL(&rl, "Packet size %u bytes too small for GSO.",
+                     dp_packet_size(p));
+        return -EINVAL;
+    }
+
+    eth = dp_packet_eth(p);
+    payload = dp_packet_get_tcp_payload(p);
+    if (OVS_UNLIKELY(!eth || !payload || payload < eth
+                     || payload - eth >= gso_size)) {
+        /* The headers could not be located, or 'gso_size' leaves no room
+         * for payload behind them (the subtraction below would wrap). */
+        return -EINVAL;
+    }
+    hdr_offset = payload - eth;
+
+    nb_segs = gso_do_segment(p, hdr_offset, gso_size - hdr_offset, pout,
+                             nb_pout);
+    if (nb_segs > 0) {
+        /* Fix up the copied IP and TCP headers of every segment. */
+        update_ipv4_tcp_headers(p, pout, nb_segs);
+    }
+
+    return nb_segs;
+}
diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h
new file mode 100644
index 000000000000..d33d904c9e22
--- /dev/null
+++ b/lib/dp-packet-gso.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef DP_PACKET_GSO_H +#define DP_PACKET_GSO_H 1 + +#include <stdint.h> +#include <stdbool.h> + +int gso_tcp4_segment(struct dp_packet *p, uint16_t gso_size, + struct dp_packet **pouts, uint16_t nb_pouts); +int gso_udp4_segment(struct dp_packet *p, uint16_t gso_size, + struct dp_packet **pouts, uint16_t nb_pouts); +#endif /* dp-packet-gso.h */ diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c index 482400d8d135..bf7b85d73a63 100644 --- a/lib/netdev-afxdp.c +++ b/lib/netdev-afxdp.c @@ -47,6 +47,7 @@ #include "ovs-numa.h" #include "packets.h" #include "socket-util.h" +#include "userspace-tso.h" #include "util.h" #ifndef SOL_XDP @@ -867,6 +868,7 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, FRAME_SIZE - FRAME_HEADROOM, OVS_XDP_HEADROOM); dp_packet_set_size(packet, len); + *dp_packet_ol_flags_ptr(packet) = 0; /* Add packet into batch, increase batch->count. */ dp_packet_batch_add(batch, packet); @@ -1187,6 +1189,10 @@ netdev_afxdp_construct(struct netdev *netdev) dev->xsks = NULL; dev->tx_locks = NULL; + if (userspace_tso_enabled()) { + netdev->ol_flags = 0; + } + netdev_request_reconfigure(netdev); return 0; } diff --git a/lib/netdev.c b/lib/netdev.c index 91e91955c09b..691ce81a01be 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -34,6 +34,7 @@ #include "cmap.h" #include "coverage.h" #include "dpif.h" +#include "dp-packet-gso.h" #include "dp-packet.h" #include "openvswitch/dynamic-string.h" #include "fatal-signal.h" @@ -797,7 +798,6 @@ netdev_send_prepare_packet(const uint64_t netdev_flags, if (dp_packet_hwol_is_tso(packet) && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { /* Fall back to GSO in software. */ - VLOG_ERR_BUF(errormsg, "No TSO support"); return false; } @@ -806,8 +806,8 @@ netdev_send_prepare_packet(const uint64_t netdev_flags, if (dp_packet_hwol_l4_is_tcp(packet)) { if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) { /* Fall back to TCP csum in software. 
*/ - VLOG_ERR_BUF(errormsg, "No TCP checksum support"); - return false; + packet_csum_tcpudp(packet); + return true; } } else if (dp_packet_hwol_l4_is_udp(packet)) { if (!(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) { @@ -835,7 +835,8 @@ netdev_send_prepare_packet(const uint64_t netdev_flags, * otherwise either fall back to software implementation or drop it. */ static void netdev_send_prepare_batch(const struct netdev *netdev, - struct dp_packet_batch *batch) + struct dp_packet_batch *batch, + struct dp_packet_batch *gso_batch) { struct dp_packet *packet; size_t i, size = dp_packet_batch_size(batch); @@ -846,11 +847,16 @@ netdev_send_prepare_batch(const struct netdev *netdev, if (netdev_send_prepare_packet(netdev->ol_flags, packet, &errormsg)) { dp_packet_batch_refill(batch, packet, i); } else { - dp_packet_delete(packet); - COVERAGE_INC(netdev_send_prepare_drops); - VLOG_WARN_RL(&rl, "%s: Packet dropped: %s", - netdev_get_name(netdev), errormsg); - free(errormsg); + if (dp_packet_hwol_is_tso(packet) && + !(netdev->ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { + dp_packet_batch_add(gso_batch, packet); + } else { + dp_packet_delete(packet); + COVERAGE_INC(netdev_send_prepare_drops); + VLOG_WARN_RL(&rl, "%s: Packet dropped: %s", + netdev_get_name(netdev), errormsg); + free(errormsg); + } } } }
@@ -884,17 +890,79 @@ int
 netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
             bool concurrent_txq)
 {
-    int error;
+    struct dp_packet_batch *gso_batch_ptr;
+    struct dp_packet_batch gso_batch;
+    struct dp_packet **gso_pkts;
+    struct dp_packet *packet;
+    uint16_t gso_pkts_len;
+    int error = 0;
 
-    netdev_send_prepare_batch(netdev, batch);
-    if (OVS_UNLIKELY(dp_packet_batch_is_empty(batch))) {
-        return 0;
+    dp_packet_batch_init(&gso_batch);
+    netdev_send_prepare_batch(netdev, batch, &gso_batch);
+
+    if (!dp_packet_batch_is_empty(batch)) {
+        error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq);
+        if (!error) {
+            COVERAGE_INC(netdev_sent);
+        }
     }
 
-    error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq);
-    if (!error) {
-        COVERAGE_INC(netdev_sent);
+    if (dp_packet_batch_is_empty(&gso_batch)) {
+        return error;
     }
 
+    /* One scratch array is shared by all packets and freed exactly once. */
+    gso_pkts_len = 2 * NETDEV_MAX_BURST;
+    gso_pkts = xmalloc(gso_pkts_len * sizeof *gso_pkts);
+
+    gso_batch_ptr = &gso_batch;
+    DP_PACKET_BATCH_FOR_EACH (i, packet, gso_batch_ptr) {
+        struct dp_packet_batch seg_batch;
+        uint16_t gso_size = 1000; /* How to decide gso_size? */
+        int nb_segs, err;
+
+        /* 'nb_segs' is signed so that a negative error return is not taken
+         * for a segment count; counts beyond the array are rejected too. */
+        nb_segs = gso_tcp4_segment(packet, gso_size, gso_pkts, gso_pkts_len);
+        if (nb_segs <= 0 || nb_segs > gso_pkts_len) {
+            VLOG_WARN("GSO tcp4 segment failed");
+            error = EINVAL;
+            break;
+        }
+        dp_packet_batch_init(&seg_batch);
+
+        /* 'i' indexes the enclosing loop, so the segments use 'j'. */
+        for (int j = 0; j < nb_segs; j++) {
+            dp_packet_batch_add(&seg_batch, gso_pkts[j]);
+
+            if (dp_packet_batch_is_full(&seg_batch)) {
+                /* Send the first batch when full. */
+                err = netdev->netdev_class->send(netdev, qid, &seg_batch,
+                                                 concurrent_txq);
+                if (!err) {
+                    COVERAGE_INC(netdev_sent);
+                } else if (!error) {
+                    error = err;
+                }
+                dp_packet_batch_init(&seg_batch);
+            }
+        }
+        if (!dp_packet_batch_is_empty(&seg_batch)) {
+            /* Send the rest. */
+            err = netdev->netdev_class->send(netdev, qid, &seg_batch,
+                                             concurrent_txq);
+            if (!err) {
+                COVERAGE_INC(netdev_sent);
+            } else if (!error) {
+                error = err;
+            }
+        }
+    }
+
+    /* Only the segments were handed to the netdev; the packets they were
+     * cut from still have to be freed here. */
+    dp_packet_delete_batch(gso_batch_ptr, true);
+    free(gso_pkts);
+
     return error;
 }
diff --git a/lib/packets.c b/lib/packets.c
index 4a7643c5dd3a..20702d25c2af 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -1887,3 +1887,51 @@ IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6)
         }
     }
 }
+
+/* Computes the L4 checksum of the IPv4 TCP or UDP packet 'p' in software and
+ * stores it in the TCP or UDP header.  Packets whose Ethernet type is not
+ * IPv4, that carry another L4 protocol, or whose L4 header is truncated are
+ * left untouched.
+ *
+ * NOTE(review): 'eth_type' of a VLAN-tagged frame does not match
+ * ETH_TYPE_IP, so such frames are presumably skipped as well -- confirm
+ * that this is intended. */
+void
+packet_csum_tcpudp(struct dp_packet *p)
+{
+    struct eth_header *eth;
+    struct ip_header *ip;
+    struct tcp_header *tcp;
+    struct udp_header *udp;
+    uint32_t pseudo_hdr_csum;
+    uint8_t l4proto;
+    size_t l4_size;
+
+    eth = dp_packet_eth(p);
+    ip = dp_packet_l3(p);
+    if (!eth || !ip || eth->eth_type != htons(ETH_TYPE_IP)) {
+        return;
+    }
+
+    l4proto = ip->ip_proto;
+    l4_size = dp_packet_l4_size(p);
+
+    if (l4proto == IPPROTO_TCP && l4_size >= sizeof *tcp) {
+        pseudo_hdr_csum = packet_csum_pseudoheader(ip);
+        tcp = dp_packet_l4(p);
+        tcp->tcp_csum = 0;
+        tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
+                                                  tcp, l4_size));
+    } else if (l4proto == IPPROTO_UDP && l4_size >= sizeof *udp) {
+        pseudo_hdr_csum = packet_csum_pseudoheader(ip);
+        udp = dp_packet_l4(p);
+        udp->udp_csum = 0;
+        udp->udp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
+                                                  udp, l4_size));
+        if (!udp->udp_csum) {
+            /* Zero means "no checksum" in a UDP header, so a checksum that
+             * computes to zero is transmitted as all ones (RFC 768). */
+            udp->udp_csum = htons(0xffff);
+        }
+    }
+}
diff --git a/lib/packets.h b/lib/packets.h index 481bc22fa1fe..108087f916ac 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -1635,6 +1635,7 @@ void packet_put_ra_prefix_opt(struct dp_packet *, const ovs_be128 router_prefix); uint32_t packet_csum_pseudoheader(const struct ip_header *); void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6); +void packet_csum_tcpudp(struct dp_packet *p); #define DNS_HEADER_LEN 12 struct dns_header { diff --git a/tests/system-afxdp.at b/tests/system-afxdp.at index 0d09906fb6c8..3c6a7708435c 100644 --- a/tests/system-afxdp.at +++ b/tests/system-afxdp.at @@ -45,3 +45,35 @@ NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +dnl p0 at at_ns0 sends TSO packet to ovs-p0 at OVS.
+dnl ovs-p1 attached to OVS as type=afxdp +AT_SETUP([AF_XDP - enable userspace TSO]) +AT_KEYWORDS([afxdp tso]) +OVS_TRAFFIC_VSWITCHD_START() + +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:userspace-tso-enable=true]) +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +dnl Create and add ovs-p0 as system port +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +AT_CHECK([ovs-vsctl del-port ovs-p0]) +AT_CHECK([ovs-vsctl add-port br0 ovs-p0]) +dnl Enable tx offload at p0, so ovs-p0 sees TSO packets +NS_CHECK_EXEC([at_ns0], [ethtool -K p0 tx on > /dev/null 2>&1]) + +dnl Create and add ovs-p1 as afxdp port +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Send a TSO from ns0 to ns1 +NETNS_DAEMONIZE([at_ns1], [iperf -s], [iperf.pid]) +NS_CHECK_EXEC([at_ns0], [iperf -c 10.1.1.2 -t1 1> /dev/null], [0]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP -- 2.7.4 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev