By default, DPDK-based dp-packets point to data buffers that can't be expanded dynamically. Their layout is as follows: - a minimum 128-byte headroom chosen at DPDK build time (RTE_PKTMBUF_HEADROOM), - a maximum size chosen at mempool creation.
In some usecases though (like encapsulating with multiple tunnels), a 128 bytes headroom is too short. Keep on using mono segment packets but dynamically allocate buffers in DPDK memory and make use of DPDK external buffers API (previously used for userspace TSO). Signed-off-by: David Marchand <[email protected]> --- Changes since v4: - fixed tailroom, - added a check on configured DPDK headroom, - added more description and renamed ifaces in the unit test, Changes since v3: - split buffer length calculation in a helper, - handled running test without qdisc (net/tap does not require those qdiscs, but spews ERR level logs if absent), - added check on firewall, Changes since v2: - moved check on uint16_t overflow in netdev_dpdk_extbuf_allocate(), Changes since v1: - fixed new segment length (reset by extbuf attach helper), - added a system-dpdk unit test, --- acinclude.m4 | 7 +++ lib/dp-packet.c | 21 ++++++++- lib/netdev-dpdk.c | 47 +++++++++++++++++--- lib/netdev-dpdk.h | 4 ++ tests/atlocal.in | 1 + tests/system-dpdk.at | 100 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 174 insertions(+), 6 deletions(-) diff --git a/acinclude.m4 b/acinclude.m4 index e4e48cb531..060c416f8a 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -431,6 +431,13 @@ AC_DEFUN([OVS_CHECK_DPDK], [ AC_MSG_ERROR([unable to find rte_config.h in $with_dpdk]) ], [AC_INCLUDES_DEFAULT]) + AC_COMPUTE_INT([dpdk_mbuf_headroom], [RTE_PKTMBUF_HEADROOM], + [AC_INCLUDES_DEFAULT], + [AC_MSG_ERROR([unable to determine RTE_PKTMBUF_HEADROOM])]) + AC_DEFINE_UNQUOTED([DPDK_MBUF_HEADROOM], [$dpdk_mbuf_headroom], + [Value of RTE_PKTMBUF_HEADROOM from DPDK]) + AC_SUBST([DPDK_MBUF_HEADROOM], [$dpdk_mbuf_headroom]) + AC_CHECK_DECLS([RTE_LIBRTE_VHOST_NUMA, RTE_EAL_NUMA_AWARE_HUGEPAGES], [ OVS_FIND_DEPENDENCY([get_mempolicy], [numa], [libnuma]) ], [], [[#include <rte_config.h>]]) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index c04d608be6..30fd013c29 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ 
-255,8 +255,27 @@ dp_packet_resize(struct dp_packet *b, size_t new_headroom, size_t new_tailroom) new_allocated = new_headroom + dp_packet_size(b) + new_tailroom; switch (b->source) { - case DPBUF_DPDK: + case DPBUF_DPDK: { +#ifdef DPDK_NETDEV + uint32_t extbuf_len; + + extbuf_len = netdev_dpdk_extbuf_size(new_allocated); + ovs_assert(extbuf_len <= UINT16_MAX); + new_base = netdev_dpdk_extbuf_allocate(extbuf_len); + if (!new_base) { + out_of_memory(); + } + dp_packet_copy__(b, new_base, new_headroom, new_tailroom); + netdev_dpdk_extbuf_replace(b, new_base, extbuf_len); + /* Because of alignment, we may have gained a bit more tailroom than + * expected. Rely on this mbuf buf_len which got adjusted by + * rte_pktmbuf_attach_extbuf(). */ + new_allocated = b->mbuf.buf_len; + break; +#else OVS_NOT_REACHED(); +#endif + } case DPBUF_MALLOC: if (new_headroom == dp_packet_headroom(b)) { diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index b5d72283c9..54959ff0d4 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -3074,12 +3074,51 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts, return cnt; } +uint32_t +netdev_dpdk_extbuf_size(uint32_t data_len) +{ + uint32_t buf_len = data_len; + + buf_len += sizeof(struct rte_mbuf_ext_shared_info) + sizeof(uintptr_t); + buf_len = RTE_ALIGN_CEIL(buf_len, sizeof(uintptr_t)); + + return buf_len; +} + +void * +netdev_dpdk_extbuf_allocate(uint32_t buf_len) +{ + return rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE); +} + static void netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque) { rte_free(opaque); } +void +netdev_dpdk_extbuf_replace(struct dp_packet *b, void *buf, uint32_t data_len) +{ + struct rte_mbuf *pkt = (struct rte_mbuf *) b; + struct rte_mbuf_ext_shared_info *shinfo; + uint16_t buf_len = data_len; + + shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len, + netdev_dpdk_extbuf_free, + buf); + ovs_assert(shinfo != NULL); + + if (RTE_MBUF_HAS_EXTBUF(pkt)) { + 
rte_pktmbuf_detach_extbuf(pkt); + } + rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), buf_len, + shinfo); + /* OVS only supports mono segment. + * Packet size did not change, restore the current segment length. */ + pkt->data_len = pkt->pkt_len; +} + static struct rte_mbuf * dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len) { @@ -3088,16 +3127,14 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len) uint16_t buf_len; void *buf; - total_len += sizeof *shinfo + sizeof(uintptr_t); - total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t)); - + total_len = netdev_dpdk_extbuf_size(total_len); if (OVS_UNLIKELY(total_len > UINT16_MAX)) { VLOG_ERR("Can't copy packet: too big %u", total_len); return NULL; } buf_len = total_len; - buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE); + buf = netdev_dpdk_extbuf_allocate(buf_len); if (OVS_UNLIKELY(buf == NULL)) { VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len); return NULL; @@ -3108,7 +3145,7 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len) netdev_dpdk_extbuf_free, buf); if (OVS_UNLIKELY(shinfo == NULL)) { - rte_free(buf); + netdev_dpdk_extbuf_free(NULL, buf); VLOG_ERR("Failed to initialize shared info for mbuf while " "attempting to attach an external buffer."); return NULL; diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index e6779d478a..0029372ee3 100644 --- a/lib/netdev-dpdk.h +++ b/lib/netdev-dpdk.h @@ -32,6 +32,10 @@ struct netdev; void netdev_dpdk_register(const struct smap *); void free_dpdk_buf(struct dp_packet *); +uint32_t netdev_dpdk_extbuf_size(uint32_t); +void *netdev_dpdk_extbuf_allocate(uint32_t); +void netdev_dpdk_extbuf_replace(struct dp_packet *, void *, uint32_t); + bool netdev_dpdk_flow_api_supported(struct netdev *, bool check_only); int diff --git a/tests/atlocal.in b/tests/atlocal.in index e70c03f8c1..e4c82a7407 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -8,6 +8,7 @@ PYTHON3='@PYTHON3@' 
CFLAGS='@CFLAGS@' HAVE_TCA_HTB_RATE64='@HAVE_TCA_HTB_RATE64@' HAVE_TCA_POLICE_PKTRATE64='@HAVE_TCA_POLICE_PKTRATE64@' +DPDK_MBUF_HEADROOM='@DPDK_MBUF_HEADROOM@' # PYTHONCOERCECLOCALE=0 disables the Unicode compatibility warning on # stderr that breaks almost any Python3 test (PEP 0538) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index 17d3d25955..47a70f2b03 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -976,3 +976,103 @@ AT_CHECK([ovs-appctl --format json --pretty dpif/offload/show], [0], [dnl OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- + +dnl -------------------------------------------------------------------------- +dnl Test headroom expansion. +dnl +dnl Interesting packets in this test are the ones going from p2 (netns at_ns1) +dnl to vxlan2 (netns at_ns0). +dnl Those packets from the kernel stack (netns at_ns1) are received on an +dnl OVS dpdk port p2. +dnl Then OVS userspace datapath encapsulates them twice and sends them +dnl to the kernel stack (netns at_ns0) via an OVS dpdk port p0. +dnl +dnl ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +dnl at_ns0 . init_net . at_ns1 +dnl ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +dnl . . +dnl . +--------------------.--+ +dnl . | . | +dnl vxlan2 fc00:2::1/64 . br2 fc00:2::100/64 . p2 fc00:2::200/64 +dnl (remote: fc00:1::100) . | . +dnl . ovs-vxlan2 . +dnl . (remote: fc00:1::1) . +dnl . . +dnl . . +dnl vxlan1 fc00:1::1/64 . br1 fc00:1::100/64 . +dnl (remote: fc00:0::100) . | . +dnl . ovs-vxlan1 . +dnl . (remote: fc00:0::1) . +dnl . . +dnl . . +dnl p0 fc00:0::1/64 . br0 fc00:0::100/64 . +dnl | . | . +dnl +----------------------.--+ . 
+ +AT_SETUP([OVS-DPDK - headroom expansion]) +AT_KEYWORDS([dpdk]) +OVS_CHECK_FIREWALL() +OVS_DPDK_PRE_CHECK() +dnl This test uses 2 IPv6 VxLAN encapsulations (140 bytes of tunnel headers) +dnl to ensure that the headroom can't fit those headers. +AT_SKIP_IF([test $DPDK_MBUF_HEADROOM -ge 140]) +OVS_DPDK_START([--no-pci]) + +ADD_BR([br0]) +ADD_BR([br1]) +ADD_BR([br2]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br1 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br2 "actions=normal"]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +AT_CHECK([ip link set dev br0 up]) +AT_CHECK([ip link set dev br1 up]) + +AT_CHECK([ip addr add dev br0 "fc00::100/64" nodad]) +AT_CHECK([ovs-vsctl add-port br0 p0 -- \ + set interface p0 type=dpdk -- \ + set interface p0 options:n_rxq=$(lscpu | awk '/NUMA node\(s\)/ { print $NF + 1 }') -- \ + set interface p0 options:dpdk-devargs=net_tap0,iface=p0]) +AT_CHECK([ip link set p0 netns at_ns0]) +NS_CHECK_EXEC([at_ns0], [ip link set p0 up]) +OVS_WAIT_UNTIL([ip -n at_ns0 link show dev p0 | grep -qw LOWER_UP]) +NS_CHECK_EXEC([at_ns0], [ip -6 addr add "fc00::1/64" nodad dev p0]) + +ADD_OVS_TUNNEL6([vxlan], [br1], [ovs-vxlan1], [fc00::1], + ["fc00:1::100/64" nodad], [options:key=0]) +ADD_NATIVE_TUNNEL6([vxlan], [vxlan1], [at_ns0], [fc00::100], + ["fc00:1::1/64" nodad], [id 0 dstport 4789]) + +ADD_OVS_TUNNEL6([vxlan], [br2], [ovs-vxlan2], [fc00:1::1], + ["fc00:2::100/64" nodad], [options:key=1]) +ADD_NATIVE_TUNNEL6([vxlan], [vxlan2], [at_ns0], [fc00:1::100], + ["fc00:2::1/64" nodad], [id 1 dstport 4789]) + +AT_CHECK([ovs-vsctl add-port br2 p2 -- \ + set interface p2 type=dpdk -- \ + set interface p2 options:n_rxq=$(lscpu | awk '/NUMA node\(s\)/ { print $NF + 1 }') -- \ + set interface p2 options:dpdk-devargs=net_tap2,iface=p2]) +AT_CHECK([ip link set p2 netns at_ns1]) +NS_CHECK_EXEC([at_ns1], [ip link set p2 up]) +OVS_WAIT_UNTIL([ip -n at_ns1 link show dev p2 | grep -qw LOWER_UP]) +NS_CHECK_EXEC([at_ns1], [ip -6 addr 
add "fc00:2::200/64" nodad dev p2]) + +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::100]) +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:2::100]) +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:2::200 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Clean up +OVS_DPDK_STOP_VSWITCHD(["dnl +/Failed to send start req to secondary 95/d +/eth_dev_tap_create(): .*: failed to create multiq qdisc./d +/eth_dev_tap_create(): Disabling rte flow support: No such file or directory/d +/qdisc_create_multiq(): Could not add multiq qdisc (2): No such file or directory/d +/tap_nl_dump_ext_ack(): Specified qdisc kind is unknown/d"]) +AT_CLEANUP +dnl -------------------------------------------------------------------------- -- 2.53.0 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
