On 3/5/26 12:04 AM, Ilya Maximets wrote:
> On 2/25/26 12:03 PM, David Marchand via dev wrote:
>> By default, DPDK based dp-packets points to data buffers that can't be
>> expanded dynamically.
>> Their layout is as follows:
>> - a minimum 128 bytes headroom chosen at DPDK build time
>> (RTE_PKTMBUF_HEADROOM),
>> - a maximum size chosen at mempool creation,
>>
>> In some usecases though (like encapsulating with multiple tunnels),
>> a 128 bytes headroom is too short.
>>
>> Keep on using mono segment packets but dynamically allocate buffers
>> in DPDK memory and make use of DPDK external buffers API
>> (previously used for userspace TSO).
>>
>> Signed-off-by: David Marchand <[email protected]>
>> ---
>> Changes since v3:
>> - split buffer length calculation in a helper,
>> - handled running test without qdisc (net/tap does not require
>> those qdiscs, but spews ERR level logs if absent),
>> - added check on firewall,
>>
>> Changes since v2:
>> - moved check on uint16_t overflow in netdev_dpdk_extbuf_allocate(),
>>
>> Changes since v1:
>> - fixed new segment length (reset by extbuf attach helper),
>> - added a system-dpdk unit test,
>>
>> ---
>> lib/dp-packet.c | 17 +++++++++++-
>> lib/netdev-dpdk.c | 47 ++++++++++++++++++++++++++++----
>> lib/netdev-dpdk.h | 4 +++
>> tests/system-dpdk.at | 65 ++++++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 127 insertions(+), 6 deletions(-)
>>
>> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
>> index c04d608be6..4c45636039 100644
>> --- a/lib/dp-packet.c
>> +++ b/lib/dp-packet.c
>> @@ -255,8 +255,23 @@ dp_packet_resize(struct dp_packet *b, size_t
>> new_headroom, size_t new_tailroom)
>> new_allocated = new_headroom + dp_packet_size(b) + new_tailroom;
>>
>> switch (b->source) {
>> - case DPBUF_DPDK:
>> + case DPBUF_DPDK: {
>> +#ifdef DPDK_NETDEV
>> + uint32_t buf_len;
>> +
>> + buf_len = netdev_dpdk_extbuf_size(new_allocated);
>
> Shouldn't we assign into new_allocated here? The rte_pktmbuf_attach_extbuf()
> will update the mbuf->buf_len to the result of this call. However, there
> is the dp_packet_set_allocated(b, new_allocated); call that will overwrite
> that value with a potentially smaller 'new_allocated'. I'm not sure if that
> can cause any issues, since the value is smaller, but it doesn't feel right.
>
May also need to adjust the new_tailroom.
> Or am I missing something here?
>
>> + ovs_assert(buf_len <= UINT16_MAX);
>> + new_base = netdev_dpdk_extbuf_allocate(buf_len);
>> + if (!new_base) {
>> + out_of_memory();
>> + }
>> + dp_packet_copy__(b, new_base, new_headroom, new_tailroom);
>> + netdev_dpdk_extbuf_replace(b, new_base, buf_len);
>> + break;
>> +#else
>> OVS_NOT_REACHED();
>> +#endif
>> + }
>>
>> case DPBUF_MALLOC:
>> if (new_headroom == dp_packet_headroom(b)) {
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> index 923191da84..cfd641b493 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -3072,12 +3072,51 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk
>> *dev, struct rte_mbuf **pkts,
>> return cnt;
>> }
>>
>> +uint32_t
>> +netdev_dpdk_extbuf_size(uint32_t data_len)
>> +{
>> + uint32_t buf_len = data_len;
>> +
>> + buf_len += sizeof(struct rte_mbuf_ext_shared_info) + sizeof(uintptr_t);
>> + buf_len = RTE_ALIGN_CEIL(buf_len, sizeof(uintptr_t));
>> +
>> + return buf_len;
>> +}
>> +
>> +void *
>> +netdev_dpdk_extbuf_allocate(uint32_t buf_len)
>> +{
>> + return rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
>> +}
>> +
>> static void
>> netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
>> {
>> rte_free(opaque);
>> }
>>
>> +void
>> +netdev_dpdk_extbuf_replace(struct dp_packet *b, void *buf, uint32_t
>> data_len)
>> +{
>> + struct rte_mbuf *pkt = (struct rte_mbuf *) b;
>> + struct rte_mbuf_ext_shared_info *shinfo;
>> + uint16_t buf_len = data_len;
>> +
>> + shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
>> + netdev_dpdk_extbuf_free,
>> + buf);
>> + ovs_assert(shinfo != NULL);
>> +
>> + if (RTE_MBUF_HAS_EXTBUF(pkt)) {
>> + rte_pktmbuf_detach_extbuf(pkt);
>> + }
>> + rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), buf_len,
>> + shinfo);
>> + /* OVS only supports mono segment.
>> + * Packet size did not change, restore the current segment length. */
>> + pkt->data_len = pkt->pkt_len;
>> +}
>> +
>> static struct rte_mbuf *
>> dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
>> {
>> @@ -3086,16 +3125,14 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt,
>> uint32_t data_len)
>> uint16_t buf_len;
>> void *buf;
>>
>> - total_len += sizeof *shinfo + sizeof(uintptr_t);
>> - total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
>> -
>> + total_len = netdev_dpdk_extbuf_size(total_len);
>> if (OVS_UNLIKELY(total_len > UINT16_MAX)) {
>> VLOG_ERR("Can't copy packet: too big %u", total_len);
>> return NULL;
>> }
>>
>> buf_len = total_len;
>> - buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
>> + buf = netdev_dpdk_extbuf_allocate(buf_len);
>> if (OVS_UNLIKELY(buf == NULL)) {
>> VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len);
>> return NULL;
>> @@ -3106,7 +3143,7 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt,
>> uint32_t data_len)
>> netdev_dpdk_extbuf_free,
>> buf);
>> if (OVS_UNLIKELY(shinfo == NULL)) {
>> - rte_free(buf);
>> + netdev_dpdk_extbuf_free(NULL, buf);
>> VLOG_ERR("Failed to initialize shared info for mbuf while "
>> "attempting to attach an external buffer.");
>> return NULL;
>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
>> index e6779d478a..0029372ee3 100644
>> --- a/lib/netdev-dpdk.h
>> +++ b/lib/netdev-dpdk.h
>> @@ -32,6 +32,10 @@ struct netdev;
>> void netdev_dpdk_register(const struct smap *);
>> void free_dpdk_buf(struct dp_packet *);
>>
>> +uint32_t netdev_dpdk_extbuf_size(uint32_t);
>> +void *netdev_dpdk_extbuf_allocate(uint32_t);
>> +void netdev_dpdk_extbuf_replace(struct dp_packet *, void *, uint32_t);
>
> nit: Please, name the arguments with the non-descriptive types.
>
>> +
>> bool netdev_dpdk_flow_api_supported(struct netdev *, bool check_only);
>>
>> int
>> diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at
>> index 17d3d25955..57d20acf3b 100644
>> --- a/tests/system-dpdk.at
>> +++ b/tests/system-dpdk.at
>> @@ -976,3 +976,68 @@ AT_CHECK([ovs-appctl --format json --pretty
>> dpif/offload/show], [0], [dnl
>> OVS_DPDK_STOP_VSWITCHD
>> AT_CLEANUP
>> dnl
>> --------------------------------------------------------------------------
>> +
>> +dnl
>> --------------------------------------------------------------------------
>> +dnl Test headroom expansion.
>> +AT_SETUP([OVS-DPDK - headroom expansion])
>> +AT_KEYWORDS([dpdk])
>> +OVS_CHECK_FIREWALL()
>> +OVS_DPDK_PRE_CHECK()
>> +OVS_DPDK_START([--no-pci])
>
> I think, this test deserves an explanation of the topology or a diagram.
>
> The naming scheme also can be cleaned up, as it's hard to follow what is
> what. AFAIU, we have tap0, vxlan3 and vxlan1 in the test namespace and
> we have corresponding br_underlay1, br_underlay0 and br0 in the root.
>
> br_underlay1 and tap0 are in fc00:0::/64 network and are directly connected.
> br_underlay0 and vxlan3 are terminating first vxlan tunnel within fc00:1::/64.
> br0 and vxlan1 are terminating the second vxlan tunnel within fc00:2::/64.
>
> This naming scheme doesn't make a lot of sense to me and is very confusing.
>
> I'd suggest to rename:
>
> br_underlay1 --> br0 (fc00:0::100)
> br_underlay0 --> br1 (fc00:1::100, remote: fc00:0::1)
> br0 --> br2 (fc00:2::100, remote: fc00:1::1)
> tap1 --> p0 (fc00:0::1)
> vxlan3 --> vxlan1 (fc00:1::1, remote: fc00:0::100)
> vxlan1 --> vxlan2 (fc00:2::1, remote: fc00:1::100)
>
> Also, it may be better to name the OVS interface and the tap port the
> same name to avoid extra confusion.
>
> The test also creates tap2 interface in the root namespace in the same
> network as br0. I don't think that interface plays any role in the setup.
> There is no difference between pinging fc00:2::100 and fc00:2::200.
>
> And I wonder if it's maybe worth doing 3 levels of encaps instead of 2,
> just to be sure. WDYT?
>
> Best regards, Ilya Maximets.
>
>> +
>> +ADD_BR([br0])
>> +ADD_BR([br-underlay0])
>> +ADD_BR([br-underlay1])
>> +
>> +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])
>> +AT_CHECK([ovs-ofctl add-flow br-underlay0 "actions=normal"])
>> +AT_CHECK([ovs-ofctl add-flow br-underlay1 "actions=normal"])
>> +
>> +ADD_NAMESPACES(at_ns0)
>> +
>> +AT_CHECK([ip link set dev br-underlay1 up])
>> +AT_CHECK([ip link set dev br-underlay0 up])
>> +
>> +AT_CHECK([ip addr add dev br-underlay1 "fc00::100/64" nodad])
>> +AT_CHECK([ovs-vsctl add-port br-underlay1 p0 -- \
>> + set interface p0 type=dpdk -- \
>> + set interface p0 options:n_rxq=$(lscpu | awk '/NUMA node\(s\)/ {
>> print $NF + 1 }') -- \
>> + set interface p0 options:dpdk-devargs=net_tap0,iface=tap1])
>> +AT_CHECK([ip link set tap1 netns at_ns0])
>> +NS_CHECK_EXEC([at_ns0], [ip link set tap1 up])
>> +OVS_WAIT_UNTIL([ip -n at_ns0 link show dev tap1 | grep -qw LOWER_UP])
>> +NS_CHECK_EXEC([at_ns0], [ip -6 addr add "fc00::1/64" nodad dev tap1])
>> +
>> +ADD_OVS_TUNNEL6([vxlan], [br-underlay0], [at_vxlan2], [fc00::1],
>> + ["fc00:1::100/64" nodad], [options:key=0])
>> +ADD_NATIVE_TUNNEL6([vxlan], [at_vxlan3], [at_ns0], [fc00::100],
>> + ["fc00:1::1/64" nodad], [id 0 dstport 4789])
>> +
>> +ADD_OVS_TUNNEL6([vxlan], [br0], [at_vxlan0], [fc00:1::1],
>> + ["fc00:2::100/64" nodad], [options:key=1])
>> +ADD_NATIVE_TUNNEL6([vxlan], [at_vxlan1], [at_ns0], [fc00:1::100],
>> + ["fc00:2::1/64" nodad], [id 1 dstport 4789])
>> +
>> +AT_CHECK([ovs-vsctl add-port br0 p1 -- \
>> + set interface p1 type=dpdk -- \
>> + set interface p1 options:n_rxq=$(lscpu | awk '/NUMA node\(s\)/ {
>> print $NF + 1 }') -- \
>> + set interface p1 options:dpdk-devargs=net_tap1,iface=tap2])
>> +OVS_WAIT_UNTIL([ip link show dev tap2 | grep -qw LOWER_UP])
>> +AT_CHECK([ip -6 addr add "fc00:2::200/64" nodad dev tap2], [], [stdout],
>> [stderr])
>> +
>> +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100])
>> +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::100])
>> +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:2::100])
>> +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:2::200 |
>> FORMAT_PING], [0], [dnl
>> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
>> +])
>> +
>> +dnl Clean up
>> +OVS_DPDK_STOP_VSWITCHD(["dnl
>> +/Failed to send start req to secondary 95/d
>> +/eth_dev_tap_create(): .*: failed to create multiq qdisc./d
>> +/eth_dev_tap_create(): Disabling rte flow support: No such file or
>> directory/d
>> +/qdisc_create_multiq(): Could not add multiq qdisc (2): No such file or
>> directory/d
>> +/tap_nl_dump_ext_ack(): Specified qdisc kind is unknown/d"])
>> +AT_CLEANUP
>> +dnl
>> --------------------------------------------------------------------------
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev