On 3/5/26 12:04 AM, Ilya Maximets wrote:
> On 2/25/26 12:03 PM, David Marchand via dev wrote:
>> By default, DPDK based dp-packets points to data buffers that can't be
>> expanded dynamically.
>> Their layout is as follows:
>> - a minimum 128 bytes headroom chosen at DPDK build time
>>   (RTE_PKTMBUF_HEADROOM),
>> - a maximum size chosen at mempool creation,
>>
>> In some usecases though (like encapsulating with multiple tunnels),
>> a 128 bytes headroom is too short.
>>
>> Keep on using mono segment packets but dynamically allocate buffers
>> in DPDK memory and make use of DPDK external buffers API
>> (previously used for userspace TSO).
>>
>> Signed-off-by: David Marchand <[email protected]>
>> ---
>> Changes since v3:
>> - split buffer length calculation in a helper,
>> - handled running test without qdisc (net/tap does not require
>>   those qdiscs, but spews ERR level logs if absent),
>> - added check on firewall,
>>
>> Changes since v2:
>> - moved check on uint16_t overflow in netdev_dpdk_extbuf_allocate(),
>>
>> Changes since v1:
>> - fixed new segment length (reset by extbuf attach helper),
>> - added a system-dpdk unit test,
>>
>> ---
>>  lib/dp-packet.c      | 17 +++++++++++-
>>  lib/netdev-dpdk.c    | 47 ++++++++++++++++++++++++++++----
>>  lib/netdev-dpdk.h    |  4 +++
>>  tests/system-dpdk.at | 65 ++++++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 127 insertions(+), 6 deletions(-)
>>
>> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
>> index c04d608be6..4c45636039 100644
>> --- a/lib/dp-packet.c
>> +++ b/lib/dp-packet.c
>> @@ -255,8 +255,23 @@ dp_packet_resize(struct dp_packet *b, size_t 
>> new_headroom, size_t new_tailroom)
>>      new_allocated = new_headroom + dp_packet_size(b) + new_tailroom;
>>  
>>      switch (b->source) {
>> -    case DPBUF_DPDK:
>> +    case DPBUF_DPDK: {
>> +#ifdef DPDK_NETDEV
>> +        uint32_t buf_len;
>> +
>> +        buf_len = netdev_dpdk_extbuf_size(new_allocated);
> 
> Shouldn't we assign into new_allocated here?  The rte_pktmbuf_attach_extbuf()
> will update the mbuf->buf_len to the result of this call.  However, there
> is the dp_packet_set_allocated(b, new_allocated); call that will overwrite
> that value with a potentially smaller 'new_allocated'.  I'm not sure if that
> can cause any issues, since the value is smaller, but it doesn't feel right.
> 

May also need to adjust the new_tailroom.

> Or am I missing something here?
> 
>> +        ovs_assert(buf_len <= UINT16_MAX);
>> +        new_base = netdev_dpdk_extbuf_allocate(buf_len);
>> +        if (!new_base) {
>> +            out_of_memory();
>> +        }
>> +        dp_packet_copy__(b, new_base, new_headroom, new_tailroom);
>> +        netdev_dpdk_extbuf_replace(b, new_base, buf_len);
>> +        break;
>> +#else
>>          OVS_NOT_REACHED();
>> +#endif
>> +    }
>>  
>>      case DPBUF_MALLOC:
>>          if (new_headroom == dp_packet_headroom(b)) {
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> index 923191da84..cfd641b493 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -3072,12 +3072,51 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk 
>> *dev, struct rte_mbuf **pkts,
>>      return cnt;
>>  }
>>  
>> +uint32_t
>> +netdev_dpdk_extbuf_size(uint32_t data_len)
>> +{
>> +    uint32_t buf_len = data_len;
>> +
>> +    buf_len += sizeof(struct rte_mbuf_ext_shared_info) + sizeof(uintptr_t);
>> +    buf_len = RTE_ALIGN_CEIL(buf_len, sizeof(uintptr_t));
>> +
>> +    return buf_len;
>> +}
>> +
>> +void *
>> +netdev_dpdk_extbuf_allocate(uint32_t buf_len)
>> +{
>> +    return rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
>> +}
>> +
>>  static void
>>  netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
>>  {
>>      rte_free(opaque);
>>  }
>>  
>> +void
>> +netdev_dpdk_extbuf_replace(struct dp_packet *b, void *buf, uint32_t 
>> data_len)
>> +{
>> +    struct rte_mbuf *pkt = (struct rte_mbuf *) b;
>> +    struct rte_mbuf_ext_shared_info *shinfo;
>> +    uint16_t buf_len = data_len;
>> +
>> +    shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
>> +                                                netdev_dpdk_extbuf_free,
>> +                                                buf);
>> +    ovs_assert(shinfo != NULL);
>> +
>> +    if (RTE_MBUF_HAS_EXTBUF(pkt)) {
>> +        rte_pktmbuf_detach_extbuf(pkt);
>> +    }
>> +    rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), buf_len,
>> +                              shinfo);
>> +    /* OVS only supports mono segment.
>> +     * Packet size did not change, restore the current segment length. */
>> +    pkt->data_len = pkt->pkt_len;
>> +}
>> +
>>  static struct rte_mbuf *
>>  dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
>>  {
>> @@ -3086,16 +3125,14 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, 
>> uint32_t data_len)
>>      uint16_t buf_len;
>>      void *buf;
>>  
>> -    total_len += sizeof *shinfo + sizeof(uintptr_t);
>> -    total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
>> -
>> +    total_len = netdev_dpdk_extbuf_size(total_len);
>>      if (OVS_UNLIKELY(total_len > UINT16_MAX)) {
>>          VLOG_ERR("Can't copy packet: too big %u", total_len);
>>          return NULL;
>>      }
>>  
>>      buf_len = total_len;
>> -    buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
>> +    buf = netdev_dpdk_extbuf_allocate(buf_len);
>>      if (OVS_UNLIKELY(buf == NULL)) {
>>          VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len);
>>          return NULL;
>> @@ -3106,7 +3143,7 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, 
>> uint32_t data_len)
>>                                                  netdev_dpdk_extbuf_free,
>>                                                  buf);
>>      if (OVS_UNLIKELY(shinfo == NULL)) {
>> -        rte_free(buf);
>> +        netdev_dpdk_extbuf_free(NULL, buf);
>>          VLOG_ERR("Failed to initialize shared info for mbuf while "
>>                   "attempting to attach an external buffer.");
>>          return NULL;
>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
>> index e6779d478a..0029372ee3 100644
>> --- a/lib/netdev-dpdk.h
>> +++ b/lib/netdev-dpdk.h
>> @@ -32,6 +32,10 @@ struct netdev;
>>  void netdev_dpdk_register(const struct smap *);
>>  void free_dpdk_buf(struct dp_packet *);
>>  
>> +uint32_t netdev_dpdk_extbuf_size(uint32_t);
>> +void *netdev_dpdk_extbuf_allocate(uint32_t);
>> +void netdev_dpdk_extbuf_replace(struct dp_packet *, void *, uint32_t);
> 
> nit: Please, name the arguments with the non-descriptive types.
> 
>> +
>>  bool netdev_dpdk_flow_api_supported(struct netdev *, bool check_only);
>>  
>>  int
>> diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at
>> index 17d3d25955..57d20acf3b 100644
>> --- a/tests/system-dpdk.at
>> +++ b/tests/system-dpdk.at
>> @@ -976,3 +976,68 @@ AT_CHECK([ovs-appctl --format json --pretty 
>> dpif/offload/show], [0], [dnl
>>  OVS_DPDK_STOP_VSWITCHD
>>  AT_CLEANUP
>>  dnl 
>> --------------------------------------------------------------------------
>> +
>> +dnl 
>> --------------------------------------------------------------------------
>> +dnl Test headroom expansion.
>> +AT_SETUP([OVS-DPDK - headroom expansion])
>> +AT_KEYWORDS([dpdk])
>> +OVS_CHECK_FIREWALL()
>> +OVS_DPDK_PRE_CHECK()
>> +OVS_DPDK_START([--no-pci])
> 
> I think, this test deserves an explanation of the topology or a diagram.
> 
> The naming scheme also can be cleaned up, as it's hard to follow what is
> what.  AFAIU, we have tap0, vxlan3 and vxlan1 in the test namespace and
> we have corresponding br_underlay1, br_underlay0 and br0 in the root.
> 
> br_underlay1 and tap0 are in fc00:0::/64 network and are directly connected.
> br_underlay0 and vxlan3 are terminating first vxlan tunnel within fc00:1::/64.
> br0 and vxlan1 are terminating the second vxlan tunnel within fc00:2::/64.
> 
> This naming scheme doesn't make a lot of sense to me and is very confusing.
> 
> I'd suggest to rename:
> 
>   br_underlay1 --> br0    (fc00:0::100)
>   br_underlay0 --> br1    (fc00:1::100, remote: fc00:0::1)
>   br0          --> br2    (fc00:2::100, remote: fc00:1::1)
>   tap1         --> p0     (fc00:0::1)
>   vxlan3       --> vxlan1 (fc00:1::1,   remote: fc00:0::100)
>   vxlan1       --> vxlan2 (fc00:2::1,   remote: fc00:1::100)
> 
> Also, it may be better to name the OVS interface and the tap port the
> same name to avoid extra confusion.
> 
> The test also creates tap2 interface in the root namespace in the same
> network as br0.  I don't think that interface plays any role in the setup.
> There is no difference between pinging fc00:2::100 and fc00:2::200.
> 
> And I wonder if it's maybe worth doing 3 levels of encaps instead of 2,
> just to be sure.  WDYT?
> 
> Best regards, Ilya Maximets.
> 
>> +
>> +ADD_BR([br0])
>> +ADD_BR([br-underlay0])
>> +ADD_BR([br-underlay1])
>> +
>> +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])
>> +AT_CHECK([ovs-ofctl add-flow br-underlay0 "actions=normal"])
>> +AT_CHECK([ovs-ofctl add-flow br-underlay1 "actions=normal"])
>> +
>> +ADD_NAMESPACES(at_ns0)
>> +
>> +AT_CHECK([ip link set dev br-underlay1 up])
>> +AT_CHECK([ip link set dev br-underlay0 up])
>> +
>> +AT_CHECK([ip addr add dev br-underlay1 "fc00::100/64" nodad])
>> +AT_CHECK([ovs-vsctl add-port br-underlay1 p0 -- \
>> +          set interface p0 type=dpdk -- \
>> +          set interface p0 options:n_rxq=$(lscpu | awk '/NUMA node\(s\)/ { 
>> print $NF + 1 }') -- \
>> +          set interface p0 options:dpdk-devargs=net_tap0,iface=tap1])
>> +AT_CHECK([ip link set tap1 netns at_ns0])
>> +NS_CHECK_EXEC([at_ns0], [ip link set tap1 up])
>> +OVS_WAIT_UNTIL([ip -n at_ns0 link show dev tap1 | grep -qw LOWER_UP])
>> +NS_CHECK_EXEC([at_ns0], [ip -6 addr add "fc00::1/64" nodad dev tap1])
>> +
>> +ADD_OVS_TUNNEL6([vxlan], [br-underlay0], [at_vxlan2], [fc00::1],
>> +                ["fc00:1::100/64" nodad], [options:key=0])
>> +ADD_NATIVE_TUNNEL6([vxlan], [at_vxlan3], [at_ns0], [fc00::100],
>> +                   ["fc00:1::1/64" nodad], [id 0 dstport 4789])
>> +
>> +ADD_OVS_TUNNEL6([vxlan], [br0], [at_vxlan0], [fc00:1::1],
>> +                ["fc00:2::100/64" nodad], [options:key=1])
>> +ADD_NATIVE_TUNNEL6([vxlan], [at_vxlan1], [at_ns0], [fc00:1::100],
>> +                   ["fc00:2::1/64" nodad], [id 1 dstport 4789])
>> +
>> +AT_CHECK([ovs-vsctl add-port br0 p1 -- \
>> +          set interface p1 type=dpdk -- \
>> +          set interface p1 options:n_rxq=$(lscpu | awk '/NUMA node\(s\)/ { 
>> print $NF + 1 }') -- \
>> +          set interface p1 options:dpdk-devargs=net_tap1,iface=tap2])
>> +OVS_WAIT_UNTIL([ip link show dev tap2 | grep -qw LOWER_UP])
>> +AT_CHECK([ip -6 addr add "fc00:2::200/64" nodad dev tap2], [], [stdout], 
>> [stderr])
>> +
>> +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100])
>> +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::100])
>> +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:2::100])
>> +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:2::200 | 
>> FORMAT_PING], [0], [dnl
>> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
>> +])
>> +
>> +dnl Clean up
>> +OVS_DPDK_STOP_VSWITCHD(["dnl
>> +/Failed to send start req to secondary 95/d
>> +/eth_dev_tap_create(): .*: failed to create multiq qdisc./d
>> +/eth_dev_tap_create():  Disabling rte flow support: No such file or 
>> directory/d
>> +/qdisc_create_multiq(): Could not add multiq qdisc (2): No such file or 
>> directory/d
>> +/tap_nl_dump_ext_ack(): Specified qdisc kind is unknown/d"])
>> +AT_CLEANUP
>> +dnl 
>> --------------------------------------------------------------------------
> 

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to