Re: [PATCH 2/3] vhost: fix virtqueue access lock in datapath
On 27 Oct 2023, at 11:22, David Marchand wrote: > On Fri, Oct 27, 2023 at 11:05 AM Eelco Chaudron wrote: >>> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c >>> index 759a78e3e3..4116f79d4f 100644 >>> --- a/lib/vhost/virtio_net.c >>> +++ b/lib/vhost/virtio_net.c >>> @@ -1694,6 +1694,17 @@ virtio_dev_rx_packed(struct virtio_net *dev, >>> return pkt_idx; >>> } >>> >>> +static void >>> +virtio_dev_vring_translate(struct virtio_net *dev, struct vhost_virtqueue >>> *vq) >>> +{ >> >> Would it be an idea to annotate this function that it needs to be called >> with the “read locks” (and that it will free them) to avoid the duplicate: >> >> + vhost_user_iotlb_rd_unlock(vq); >> + rte_rwlock_read_unlock(&vq->access_lock); > > The "unlock" annotations do not express read/write concerns for locks. > So that would make the code less readable and potentially hide some issues. > > I prefer to keep as is, with clear calls to rd_lock / rd_unlock in > those functions. ACK, keeping this as is fine by me. Acked-by: Eelco Chaudron >>> + rte_rwlock_write_lock(&vq->access_lock); >>> + vhost_user_iotlb_rd_lock(vq); >>> + if (!vq->access_ok) >>> + vring_translate(dev, vq); >>> + vhost_user_iotlb_rd_unlock(vq); >>> + rte_rwlock_write_unlock(&vq->access_lock); >>> +} >>> + > > -- > David Marchand
Re: [PATCH 3/3] vhost: annotate virtqueue access checks
On 23 Oct 2023, at 11:55, David Marchand wrote: > Modifying vq->access_ok should be done with a write lock taken. > Annotate vring_translate() and vring_invalidate() and add missing locks. > > Signed-off-by: David Marchand The changes look good to me. Acked-by: Eelco Chaudron
Re: [PATCH 2/3] vhost: fix virtqueue access lock in datapath
On 23 Oct 2023, at 11:55, David Marchand wrote: > Now that a r/w lock is used, the access_ok field should only be updated > under a write lock. > > Since the datapath code only takes a read lock on the virtqueue to check > access_ok, this lock must be released and a write lock taken before > calling vring_translate(). > > Fixes: 03f77d66d966 ("vhost: change virtqueue access lock to a read/write > one") > Cc: sta...@dpdk.org > > Signed-off-by: David Marchand Only one question, but whatever the outcome is the change looks good to me. Acked-by: Eelco Chaudron > --- > lib/vhost/virtio_net.c | 60 +++--- > 1 file changed, 44 insertions(+), 16 deletions(-) > > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c > index 759a78e3e3..4116f79d4f 100644 > --- a/lib/vhost/virtio_net.c > +++ b/lib/vhost/virtio_net.c > @@ -1694,6 +1694,17 @@ virtio_dev_rx_packed(struct virtio_net *dev, > return pkt_idx; > } > > +static void > +virtio_dev_vring_translate(struct virtio_net *dev, struct vhost_virtqueue > *vq) > +{ Would it be an idea to annotate this function that it needs to be called with the “read locks” (and that it will free them) to avoid the duplicate: + vhost_user_iotlb_rd_unlock(vq); + rte_rwlock_read_unlock(&vq->access_lock); > + rte_rwlock_write_lock(&vq->access_lock); > + vhost_user_iotlb_rd_lock(vq); > + if (!vq->access_ok) > + vring_translate(dev, vq); > + vhost_user_iotlb_rd_unlock(vq); > + rte_rwlock_write_unlock(&vq->access_lock); > +} > + > static __rte_always_inline uint32_t > virtio_dev_rx(struct virtio_net *dev, struct vhost_virtqueue *vq, > struct rte_mbuf **pkts, uint32_t count) > @@ -1708,9 +1719,13 @@ virtio_dev_rx(struct virtio_net *dev, struct > vhost_virtqueue *vq, > > vhost_user_iotlb_rd_lock(vq); > > - if (unlikely(!vq->access_ok)) > - if (unlikely(vring_translate(dev, vq) < 0)) > - goto out; > + if (unlikely(!vq->access_ok)) { > + vhost_user_iotlb_rd_unlock(vq); > + rte_rwlock_read_unlock(&vq->access_lock); > + > + virtio_dev_vring_translate(dev, vq); > + goto out_no_unlock; > + } > > count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); > if (count == 0) > @@ -1729,6 +1744,7 @@ virtio_dev_rx(struct virtio_net *dev, struct > vhost_virtqueue *vq, > out_access_unlock: > rte_rwlock_read_unlock(&vq->access_lock); > > +out_no_unlock: > return nb_tx; > } > > @@ -2523,9 +2539,13 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, > struct vhost_virtqueue *vq, > > vhost_user_iotlb_rd_lock(vq); > > - if (unlikely(!vq->access_ok)) > - if (unlikely(vring_translate(dev, vq) < 0)) > - goto out; > + if (unlikely(!vq->access_ok)) { > + vhost_user_iotlb_rd_unlock(vq); > + rte_rwlock_read_unlock(&vq->access_lock); > + > + virtio_dev_vring_translate(dev, vq); > + goto out_no_unlock; > + } > > count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); > if (count == 0) > @@ -2546,6 +2566,7 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, > struct vhost_virtqueue *vq, > out_access_unlock: > rte_rwlock_write_unlock(&vq->access_lock); > > +out_no_unlock: > return nb_tx; > } > > @@ -3576,11 +3597,13 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, > > vhost_user_iotlb_rd_lock(vq); > > - if (unlikely(!vq->access_ok)) > - if (unlikely(vring_translate(dev, vq) < 0)) { > - count = 0; > - goto out; > - } > + if (unlikely(!vq->access_ok)) { > + vhost_user_iotlb_rd_unlock(vq); > + rte_rwlock_read_unlock(&vq->access_lock); > + > + virtio_dev_vring_translate(dev, vq); > + goto out_no_unlock; > + } > > /* >* Construct a RARP broadcast packet, and inject it to the "pkts" > @@ -3641,6 +3664,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, > if (unlikely(rarp_mbuf != NULL)) > count += 1; > > +out_no_unlock: > return count; > } > > @@ -4190,11 +4214,14 @@ rte_vhost_async_try_dequeue_burst(int vid, uint16_t > queue_id, > > vhost_user_iotlb_rd_lock(vq); > > - if (unlikely(vq->access_ok == 0)) > - if (unlikely(vring_translate(dev, vq) < 0)) { > -
Re: [PATCH 1/3] vhost: robustify virtqueue access lock asserts
On 23 Oct 2023, at 11:55, David Marchand wrote: > A simple comment in vhost_user_msg_handler() is not that robust. > > Add a lock_all_qps property to message handlers so that their > implementation can add a build check and assert a vq is locked. > > Signed-off-by: David Marchand This change looks good to me. Acked-by: Eelco Chaudron
Re: [PATCH v3] vhost: add IRQ suppression
On 29 Sep 2023, at 12:38, Maxime Coquelin wrote: > Guest notifications offloading, which has been introduced > in v23.07, aims at offloading syscalls out of the datapath. > > This patch optimizes the offloading by not offloading the > guest notification for a given virtqueue if one is already > being offloaded by the application. > > With a single VDUSE device, we can already see few > notifications being suppressed when doing throughput > testing with Iperf3. We can expect to see much more being > suppressed when the offloading thread is under pressure. > > Signed-off-by: Maxime Coquelin Thanks for adding this Maxime. I did some tests with OVS and my old determinism patchset, and it works perfectly. I have two small nits, but this change looks good to me. Acked-by: Eelco Chaudron > --- > > v3: s/0/false/ (David) > > lib/vhost/vhost.c | 4 > lib/vhost/vhost.h | 27 +-- > 2 files changed, 25 insertions(+), 6 deletions(-) > > diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c > index c03bb9c6eb..7fde412ef3 100644 > --- a/lib/vhost/vhost.c > +++ b/lib/vhost/vhost.c > @@ -49,6 +49,8 @@ static const struct vhost_vq_stats_name_off > vhost_vq_stat_strings[] = { > stats.guest_notifications_offloaded)}, > {"guest_notifications_error", offsetof(struct vhost_virtqueue, > stats.guest_notifications_error)}, > + {"guest_notifications_suppressed", offsetof(struct vhost_virtqueue, > + stats.guest_notifications_suppressed)}, > {"iotlb_hits", offsetof(struct vhost_virtqueue, > stats.iotlb_hits)}, > {"iotlb_misses", offsetof(struct vhost_virtqueue, > stats.iotlb_misses)}, > {"inflight_submitted", offsetof(struct vhost_virtqueue, > stats.inflight_submitted)}, > @@ -1517,6 +1519,8 @@ rte_vhost_notify_guest(int vid, uint16_t queue_id) > > rte_rwlock_read_lock(&vq->access_lock); > > + __atomic_store_n(&vq->irq_pending, false, __ATOMIC_RELEASE); > + > if (dev->backend_ops->inject_irq(dev, vq)) { > if (dev->flags & VIRTIO_DEV_STATS_ENABLED) > __atomic_fetch_add(&vq->stats.guest_notifications_error, > diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h > index 9723429b1c..5fc9035a1f 100644 > --- a/lib/vhost/vhost.h > +++ b/lib/vhost/vhost.h > @@ -156,6 +156,7 @@ struct virtqueue_stats { > uint64_t iotlb_misses; > uint64_t inflight_submitted; > uint64_t inflight_completed; > + uint64_t guest_notifications_suppressed; > /* Counters below are atomic, and should be incremented as such. */ > uint64_t guest_notifications; > uint64_t guest_notifications_offloaded; > @@ -346,6 +347,8 @@ struct vhost_virtqueue { > > struct vhost_vring_addr ring_addrs; > struct virtqueue_stats stats; > + > + bool irq_pending; nit: Other elements in this structure have the names aligned, not sure if this should be done for this item also. > } __rte_cache_aligned; > > /* Virtio device status as per Virtio specification */ > @@ -908,12 +911,24 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, > uint16_t old) > static __rte_always_inline void > vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq) > { > - if (dev->notify_ops->guest_notify && > - dev->notify_ops->guest_notify(dev->vid, vq->index)) { > - if (dev->flags & VIRTIO_DEV_STATS_ENABLED) > - > __atomic_fetch_add(&vq->stats.guest_notifications_offloaded, > - 1, __ATOMIC_RELAXED); > - return; > + bool expected = false; > + > + if (dev->notify_ops->guest_notify) { > + if (__atomic_compare_exchange_n(&vq->irq_pending, &expected, > true, 0, > + __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { > + if (dev->notify_ops->guest_notify(dev->vid, vq->index)) > { > + if (dev->flags & VIRTIO_DEV_STATS_ENABLED) > + > __atomic_fetch_add(&vq->stats.guest_notifications_offloaded, > + 1, __ATOMIC_RELAXED); > + return; > + } > + > + /* Offloading failed, fallback to direct IRQ injection > */ nit: Some comments end with a dot and some do not, not sure what is the preference in DPDK. > + __atomic_store_n(&vq->irq_pending, false, > __ATOMIC_RELEASE); > + } else { > + vq->stats.guest_notifications_suppressed++; > + return; > + } > } > > if (dev->backend_ops->inject_irq(dev, vq)) { > -- > 2.41.0
Re: [PATCH] net/iavf: fix checksum offloading
On 18 Aug 2023, at 11:03, David Marchand wrote: > The only presence of RTE_MBUF_F_TX_IPV4 can't be used as an indicator > that a checksum offload has been requested by an application. > Check that RTE_MBUF_F_TX_IP_CKSUM or others flags have been set. > > Fixes: 1e728b01120c ("net/iavf: rework Tx path") > Cc: sta...@dpdk.org Thanks for fixing this David! I tested and reviewed this change and it works now. Tested-by: Eelco Chaudron Acked-by: Eelco Chaudron > Signed-off-by: David Marchand > --- > drivers/net/iavf/iavf_rxtx.c | 4 > 1 file changed, 4 insertions(+) > > diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c > index f7df4665d1..b9e2879764 100644 > --- a/drivers/net/iavf/iavf_rxtx.c > +++ b/drivers/net/iavf/iavf_rxtx.c > @@ -2652,6 +2652,9 @@ iavf_build_data_desc_cmd_offset_fields(volatile > uint64_t *qw1, > offset |= (m->l2_len >> 1) > << IAVF_TX_DESC_LENGTH_MACLEN_SHIFT; > > + if ((m->ol_flags & IAVF_TX_CKSUM_OFFLOAD_MASK) == 0) > + goto skip_cksum; > + > /* Enable L3 checksum offloading inner */ > if (m->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) { > if (m->ol_flags & RTE_MBUF_F_TX_IPV4) { > @@ -2702,6 +2705,7 @@ iavf_build_data_desc_cmd_offset_fields(volatile > uint64_t *qw1, > break; > } > > +skip_cksum: > *qw1 = rte_cpu_to_le_64uint64_t)command << > IAVF_TXD_DATA_QW1_CMD_SHIFT) & IAVF_TXD_DATA_QW1_CMD_MASK) | > (((uint64_t)offset << IAVF_TXD_DATA_QW1_OFFSET_SHIFT) & > -- > 2.41.0
Re: [PATCH v3 0/4] vhost: add device op to offload the interrupt kick
On 1 Jun 2023, at 22:00, Maxime Coquelin wrote: > On 5/17/23 11:08, Eelco Chaudron wrote: >> This series adds an operation callback which gets called every time the >> library wants to call eventfd_write(). This eventfd_write() call could >> result in a system call, which could potentially block the PMD thread. >> >> The callback function can decide whether it's ok to handle the >> eventfd_write() now or have the newly introduced function, >> rte_vhost_notify_guest(), called at a later time. >> >> This can be used by 3rd party applications, like OVS, to avoid system >> calls being called as part of the PMD threads. >> >> v3: >> - Changed ABI compatibility code to no longer use a boolean >>to avoid having to disable specific GCC warnings. >> - Moved the fd check fix to a separate patch (patch 3/4). >> - Fixed some coding style issues. >> >> v2: - Used vhost_virtqueue->index to find index for operation. >> - Aligned function name to VDUSE RFC patchset. >> - Added error and offload statistics counter. >> - Mark new API as experimental. >> - Change the virtual queue spin lock to read/write spin lock. >> - Made shared counters atomic. >> - Add versioned rte_vhost_driver_callback_register() for >>ABI compliance. >> >> Eelco Chaudron (4): >>vhost: change vhost_virtqueue access lock to a read/write one >>vhost: make the guest_notifications statistic counter atomic >>vhost: fix invalid call FD handling >>vhost: add device op to offload the interrupt kick >> >> >> lib/eal/include/generic/rte_rwlock.h | 17 + >> lib/vhost/meson.build| 2 + >> lib/vhost/rte_vhost.h| 23 ++- >> lib/vhost/socket.c | 63 +-- >> lib/vhost/version.map| 9 +++ >> lib/vhost/vhost.c| 92 +--- >> lib/vhost/vhost.h| 69 ++--- >> lib/vhost/vhost_user.c | 14 ++--- >> lib/vhost/virtio_net.c | 90 +-- >> 9 files changed, 278 insertions(+), 101 deletions(-) >> > > > Applied to dpdk-next-virtio/main. Thanks Maxime! //Eelco
Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt kick
On 1 Jun 2023, at 10:29, Maxime Coquelin wrote: > On 6/1/23 10:15, Eelco Chaudron wrote: >> >> >> On 1 Jun 2023, at 4:18, Xia, Chenbo wrote: >> >>>> -Original Message- >>>> From: Maxime Coquelin >>>> Sent: Wednesday, May 31, 2023 5:29 PM >>>> To: Xia, Chenbo ; Thomas Monjalon >>>> ; Eelco Chaudron ; >>>> david.march...@redhat.com >>>> Cc: dev@dpdk.org >>>> Subject: Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt >>>> kick >>>> >>>> >>>> >>>> On 5/31/23 08:19, Xia, Chenbo wrote: >>>>>> -Original Message- >>>>>> From: Maxime Coquelin >>>>>> Sent: Tuesday, May 30, 2023 11:17 PM >>>>>> To: Thomas Monjalon ; Eelco Chaudron >>>>>> ; Xia, Chenbo ; >>>>>> david.march...@redhat.com >>>>>> Cc: dev@dpdk.org >>>>>> Subject: Re: [PATCH v3 4/4] vhost: add device op to offload the >>>> interrupt >>>>>> kick >>>>>> >>>>>> >>>>>> >>>>>> On 5/30/23 15:16, Thomas Monjalon wrote: >>>>>>> 30/05/2023 15:02, Maxime Coquelin: >>>>>>>> >>>>>>>> On 5/17/23 11:09, Eelco Chaudron wrote: >>>>>>>>> This patch adds an operation callback which gets called every time >>>> the >>>>>>>>> library wants to call eventfd_write(). This eventfd_write() call >>>> could >>>>>>>>> result in a system call, which could potentially block the PMD >>>> thread. >>>>>>>>> >>>>>>>>> The callback function can decide whether it's ok to handle the >>>>>>>>> eventfd_write() now or have the newly introduced function, >>>>>>>>> rte_vhost_notify_guest(), called at a later time. >>>>>>>>> >>>>>>>>> This can be used by 3rd party applications, like OVS, to avoid >>>> system >>>>>>>>> calls being called as part of the PMD threads. >>>>>>>>> >>>>>>>>> Signed-off-by: Eelco Chaudron >>>>>>>>> --- >>>>>>>>> lib/vhost/meson.build |2 ++ >>>>>>>>> lib/vhost/rte_vhost.h | 23 +- >>>>>>>>> lib/vhost/socket.c| 63 >>>>>> ++--- >>>>>>>>> lib/vhost/version.map |9 +++ >>>>>>>>> lib/vhost/vhost.c | 38 ++ >>>>>>>>> lib/vhost/vhost.h | 58 --- >>>> --- >>>>>> --- >>>>>>>>> 6 files changed, 171 insertions(+), 22 deletions(-) >>>>>>>>> >>>>>>>> >>>>>>>> >>>>>>>> The patch looks good to me, but that's the first time we use function >>>>>>>> versioning in Vhost library, so I'd like another pair of eyes to be >>>>>> sure >>>>>>>> I don't miss anything. >>>>>>>> >>>>>>>> Reviewed-by: Maxime Coquelin >>>>>>>> >>>>>>>> Thomas, do we need to mention it somewhere in the release note? >>>>>>> >>>>>>> If compatibility is kept, I think we don't need to mention it. >>>>>>> >>>>>>> >>>>>> >>>>>> Thanks Thomas for the information. >>>>>> >>>>>> Maxime >>>>> >>>>> About release note, except the versioning, there is also one new API >>>> introduced >>>>> in this patch, so we still need to mention this in release note >>>> >>>> Right, good catch. >>>> Eelco, let me know what you would put, I'll add it while applying (No >>>> need for a new revision). >>> >>> Btw, the vhost_lib.rst also needs a new item.. >> >> What about the following? >> >> diff --git a/doc/guides/prog_guide/vhost_lib.rst >> b/doc/guides/prog_guide/vhost_lib.rst >> index e8bb8c9b7b..0f964d7a4a 100644 >> --- a/doc/guides/prog_guide/vhost_lib.rst >> +++ b/doc/guides/prog_guide/vhost_lib.rst >> @@ -334,6 +334,11 @@ The following is an overview of some key Vhost API >> functions: >> Clean DMA vChannel finished to use. After this function is called, >> the specified DMA vChannel should no longer be used by the Vhost library. >> >> +* ``rte_vhost_notify_guest(int vid, uint16_t queue_id)`` >> + >> + Inject the offloaded interrupt received by the 'guest_notify' callback, >> + into the vhost device's queue. >> + >> Vhost-user Implementations >> -- >> >> Maxime do you want to add it, or do you want a new rev? > > That's fine, I just added it now :) Can you check the next-virtio main > branch and confirm all is OK for your series? > > https://git.dpdk.org/next/dpdk-next-virtio/log/ > One small spelling issue, the rest looks good: +* **Added callback API support for interrupt handling in the vhost library.** + + A new callback, guest_notify, is introduced that can be used to handle the + interrupt kick outside of the datapath fast path. In addition, a new API, + rte_vhost_notify_guest(), is added to raise the interrupt outside of the + past path. Last sentence spells ‘past path’ instead of ‘fast path’. > Thanks, > Maxime > >> >>> Thanks, >>> Chenbo >>> >>>> >>>> Thanks, >>>> Maxime >>>> >>>>> Thanks, >>>>> Chenbo >>
Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt kick
On 1 Jun 2023, at 4:18, Xia, Chenbo wrote: >> -Original Message- >> From: Maxime Coquelin >> Sent: Wednesday, May 31, 2023 5:29 PM >> To: Xia, Chenbo ; Thomas Monjalon >> ; Eelco Chaudron ; >> david.march...@redhat.com >> Cc: dev@dpdk.org >> Subject: Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt >> kick >> >> >> >> On 5/31/23 08:19, Xia, Chenbo wrote: >>>> -Original Message- >>>> From: Maxime Coquelin >>>> Sent: Tuesday, May 30, 2023 11:17 PM >>>> To: Thomas Monjalon ; Eelco Chaudron >>>> ; Xia, Chenbo ; >>>> david.march...@redhat.com >>>> Cc: dev@dpdk.org >>>> Subject: Re: [PATCH v3 4/4] vhost: add device op to offload the >> interrupt >>>> kick >>>> >>>> >>>> >>>> On 5/30/23 15:16, Thomas Monjalon wrote: >>>>> 30/05/2023 15:02, Maxime Coquelin: >>>>>> >>>>>> On 5/17/23 11:09, Eelco Chaudron wrote: >>>>>>> This patch adds an operation callback which gets called every time >> the >>>>>>> library wants to call eventfd_write(). This eventfd_write() call >> could >>>>>>> result in a system call, which could potentially block the PMD >> thread. >>>>>>> >>>>>>> The callback function can decide whether it's ok to handle the >>>>>>> eventfd_write() now or have the newly introduced function, >>>>>>> rte_vhost_notify_guest(), called at a later time. >>>>>>> >>>>>>> This can be used by 3rd party applications, like OVS, to avoid >> system >>>>>>> calls being called as part of the PMD threads. >>>>>>> >>>>>>> Signed-off-by: Eelco Chaudron >>>>>>> --- >>>>>>> lib/vhost/meson.build |2 ++ >>>>>>> lib/vhost/rte_vhost.h | 23 +- >>>>>>> lib/vhost/socket.c| 63 >>>> ++--- >>>>>>> lib/vhost/version.map |9 +++ >>>>>>> lib/vhost/vhost.c | 38 ++ >>>>>>> lib/vhost/vhost.h | 58 --- >> --- >>>> --- >>>>>>> 6 files changed, 171 insertions(+), 22 deletions(-) >>>>>>> >>>>>> >>>>>> >>>>>> The patch looks good to me, but that's the first time we use function >>>>>> versioning in Vhost library, so I'd like another pair of eyes to be >>>> sure >>>>>> I don't miss anything. >>>>>> >>>>>> Reviewed-by: Maxime Coquelin >>>>>> >>>>>> Thomas, do we need to mention it somewhere in the release note? >>>>> >>>>> If compatibility is kept, I think we don't need to mention it. >>>>> >>>>> >>>> >>>> Thanks Thomas for the information. >>>> >>>> Maxime >>> >>> About release note, except the versioning, there is also one new API >> introduced >>> in this patch, so we still need to mention this in release note >> >> Right, good catch. >> Eelco, let me know what you would put, I'll add it while applying (No >> need for a new revision). > > Btw, the vhost_lib.rst also needs a new item.. What about the following? diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst index e8bb8c9b7b..0f964d7a4a 100644 --- a/doc/guides/prog_guide/vhost_lib.rst +++ b/doc/guides/prog_guide/vhost_lib.rst @@ -334,6 +334,11 @@ The following is an overview of some key Vhost API functions: Clean DMA vChannel finished to use. After this function is called, the specified DMA vChannel should no longer be used by the Vhost library. +* ``rte_vhost_notify_guest(int vid, uint16_t queue_id)`` + + Inject the offloaded interrupt received by the 'guest_notify' callback, + into the vhost device's queue. + Vhost-user Implementations -- Maxime do you want to add it, or do you want a new rev? > Thanks, > Chenbo > >> >> Thanks, >> Maxime >> >>> Thanks, >>> Chenbo
Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt kick
On 31 May 2023, at 14:48, Maxime Coquelin wrote: > On 5/31/23 14:01, David Marchand wrote: >> Eelco, Maxime, >> >> On Wed, May 17, 2023 at 11:09 AM Eelco Chaudron wrote: >>> @@ -846,6 +848,11 @@ vhost_user_socket_mem_free(struct vhost_user_socket >>> *vsocket) >>> vsocket->path = NULL; >>> } >>> >>> + if (vsocket && vsocket->malloc_notify_ops) { >>> + free(vsocket->malloc_notify_ops); >>> + vsocket->malloc_notify_ops = NULL; >>> + } >>> + >>> if (vsocket) { >>> free(vsocket); >>> vsocket = NULL; >> >> Nit: we had several cleanups in the tree to remove patterns like if >> (ptr) free(ptr). >> Here, this helper could look for vsocket being NULL first thing, then >> call free() unconditionnally. >> And resetting the fields to NULL is probably not that useful, since >> the vsocket is freed at the end. >> Wdyt of: >> >> static void >> vhost_user_socket_mem_free(struct vhost_user_socket *vsocket) >> { >> if (vsocket == NULL) >> return; >> >> free(vsocket->path); >> free(vsocket->malloc_notify_ops); >> free(vsocket); >> } >> >> > > It is indeed better, I can fix while applying if Eelco agrees. Looks good to me. //Eelco
Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt kick
On 31 May 2023, at 11:29, Maxime Coquelin wrote: > On 5/31/23 08:19, Xia, Chenbo wrote: >>> -Original Message- >>> From: Maxime Coquelin >>> Sent: Tuesday, May 30, 2023 11:17 PM >>> To: Thomas Monjalon ; Eelco Chaudron >>> ; Xia, Chenbo ; >>> david.march...@redhat.com >>> Cc: dev@dpdk.org >>> Subject: Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt >>> kick >>> >>> >>> >>> On 5/30/23 15:16, Thomas Monjalon wrote: >>>> 30/05/2023 15:02, Maxime Coquelin: >>>>> >>>>> On 5/17/23 11:09, Eelco Chaudron wrote: >>>>>> This patch adds an operation callback which gets called every time the >>>>>> library wants to call eventfd_write(). This eventfd_write() call could >>>>>> result in a system call, which could potentially block the PMD thread. >>>>>> >>>>>> The callback function can decide whether it's ok to handle the >>>>>> eventfd_write() now or have the newly introduced function, >>>>>> rte_vhost_notify_guest(), called at a later time. >>>>>> >>>>>> This can be used by 3rd party applications, like OVS, to avoid system >>>>>> calls being called as part of the PMD threads. >>>>>> >>>>>> Signed-off-by: Eelco Chaudron >>>>>> --- >>>>>> lib/vhost/meson.build |2 ++ >>>>>> lib/vhost/rte_vhost.h | 23 +- >>>>>> lib/vhost/socket.c| 63 >>> ++--- >>>>>> lib/vhost/version.map |9 +++ >>>>>> lib/vhost/vhost.c | 38 ++ >>>>>> lib/vhost/vhost.h | 58 -- >>> --- >>>>>> 6 files changed, 171 insertions(+), 22 deletions(-) >>>>>> >>>>> >>>>> >>>>> The patch looks good to me, but that's the first time we use function >>>>> versioning in Vhost library, so I'd like another pair of eyes to be >>> sure >>>>> I don't miss anything. >>>>> >>>>> Reviewed-by: Maxime Coquelin >>>>> >>>>> Thomas, do we need to mention it somewhere in the release note? >>>> >>>> If compatibility is kept, I think we don't need to mention it. >>>> >>>> >>> >>> Thanks Thomas for the information. >>> >>> Maxime >> >> About release note, except the versioning, there is also one new API >> introduced >> in this patch, so we still need to mention this in release note > > Right, good catch. > Eelco, let me know what you would put, I'll add it while applying (No > need for a new revision). Thanks guys! What about the following release note addition: * **Added callback API support for interrupt handling in the vhost library.** A new callback, guest_notify, is introduced that can be used to handle the interrupt kick outside of the datapath fast path. In addition, a new API, rte_vhost_notify_guest(), is added to raise the interrupt outside of the past path. Please change if you see fit. Thanks, Eelco > Thanks, > Maxime > >> Thanks, >> Chenbo
Re: [PATCH v3 1/4] vhost: change vhost_virtqueue access lock to a read/write one
On 31 May 2023, at 11:27, Maxime Coquelin wrote: > On 5/31/23 08:37, Xia, Chenbo wrote: >> Hi Eelco, >> >>> -Original Message- >>> From: Eelco Chaudron >>> Sent: Wednesday, May 17, 2023 5:09 PM >>> To: maxime.coque...@redhat.com; Xia, Chenbo ; >>> david.march...@redhat.com >>> Cc: dev@dpdk.org >>> Subject: [PATCH v3 1/4] vhost: change vhost_virtqueue access lock to a >>> read/write one >>> >>> This change will allow the vhost interrupt datapath handling to be split >>> between two processed without one of them holding an explicit lock. >>> >>> Signed-off-by: Eelco Chaudron >>> --- >>> lib/eal/include/generic/rte_rwlock.h | 17 ++ >>> lib/vhost/vhost.c| 46 + >>> lib/vhost/vhost.h|4 +- >>> lib/vhost/vhost_user.c | 14 +++-- >>> lib/vhost/virtio_net.c | 90 + >>> - >>> 5 files changed, 94 insertions(+), 77 deletions(-) >>> >>> diff --git a/lib/eal/include/generic/rte_rwlock.h >>> b/lib/eal/include/generic/rte_rwlock.h >>> index 71e2d8d5f4..9e083bbc61 100644 >>> --- a/lib/eal/include/generic/rte_rwlock.h >>> +++ b/lib/eal/include/generic/rte_rwlock.h >>> @@ -236,6 +236,23 @@ rte_rwlock_write_unlock(rte_rwlock_t *rwl) >>> __atomic_fetch_sub(&rwl->cnt, RTE_RWLOCK_WRITE, __ATOMIC_RELEASE); >>> } >>> >>> +/** >>> + * Test if the write lock is taken. >>> + * >>> + * @param rwl >>> + * A pointer to a rwlock structure. >>> + * @return >>> + * 1 if the write lock is currently taken; 0 otherwise. >>> + */ >>> +static inline int >>> +rte_rwlock_write_is_locked(rte_rwlock_t *rwl) >>> +{ >>> + if (__atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED) & RTE_RWLOCK_WRITE) >>> + return 1; >>> + >>> + return 0; >>> +} >>> + >> >> Again we need to update release note as it's a new EAL API. >> >>> /** >>>* Try to execute critical section in a hardware memory transaction, if >>> it >>>* fails or not available take a read lock >>> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c >>> index ef37943817..74bdbfd810 100644 >>> --- a/lib/vhost/vhost.c >>> +++ b/lib/vhost/vhost.c >>> @@ -393,9 +393,9 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue >>> *vq) >>> else >>> rte_free(vq->shadow_used_split); >>> >>> - rte_spinlock_lock(&vq->access_lock); >>> + rte_rwlock_write_lock(&vq->access_lock); >>> vhost_free_async_mem(vq); >>> - rte_spinlock_unlock(&vq->access_lock); >>> + rte_rwlock_write_unlock(&vq->access_lock); >>> rte_free(vq->batch_copy_elems); >>> vhost_user_iotlb_destroy(vq); >>> rte_free(vq->log_cache); >>> @@ -630,7 +630,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t >>> vring_idx) >>> >>> dev->virtqueue[i] = vq; >>> init_vring_queue(dev, vq, i); >>> - rte_spinlock_init(&vq->access_lock); >>> + rte_rwlock_init(&vq->access_lock); >>> vq->avail_wrap_counter = 1; >>> vq->used_wrap_counter = 1; >>> vq->signalled_used_valid = false; >>> @@ -1305,14 +1305,14 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx) >>> if (!vq) >>> return -1; >>> >>> - rte_spinlock_lock(&vq->access_lock); >>> + rte_rwlock_read_lock(&vq->access_lock); >>> >>> if (vq_is_packed(dev)) >>> vhost_vring_call_packed(dev, vq); >>> else >>> vhost_vring_call_split(dev, vq); >>> >>> - rte_spinlock_unlock(&vq->access_lock); >>> + rte_rwlock_read_unlock(&vq->access_lock); >> >> Not sure about this. vhost_ring_call_packed/split is changing some field in >> Vq. Should we use write lock here? > > I don't think so, the purpose of the access_lock is not to make the > datapath threads-safe, but to protect the datapath from metadata changes > by the control path. Thanks Chinbo for the review, and see Maxime’s comment above. Does this clarify your concern/question? >> >> Thanks, >> Chenbo >>
Re: [PATCH v3 1/4] vhost: change vhost_virtqueue access lock to a read/write one
On 17 May 2023, at 19:33, Maxime Coquelin wrote: > Hi Eelco, > > On 5/17/23 11:08, Eelco Chaudron wrote: >> This change will allow the vhost interrupt datapath handling to be split >> between two processed without one of them holding an explicit lock. >> > > As I had a tuned PVP benchmarking setup at hand, I ran a 0.02% loss > RFC2544 test with and without this patch to ensure moving to RX locks > would not introduce performance regression. > > I can confirm there are no performance regression introduced with this > patch applied: > Tested-by: Maxime Coquelin > > And the patch looks good to me: > Reviewed-by: Maxime Coquelin Thanks for running the test and doing the review! //Eelco >> Signed-off-by: Eelco Chaudron >> --- >> lib/eal/include/generic/rte_rwlock.h | 17 ++ >> lib/vhost/vhost.c| 46 + >> lib/vhost/vhost.h|4 +- >> lib/vhost/vhost_user.c | 14 +++-- >> lib/vhost/virtio_net.c | 90 >> +- >> 5 files changed, 94 insertions(+), 77 deletions(-) >>
Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
On 16 May 2023, at 13:36, Eelco Chaudron wrote: > On 16 May 2023, at 12:12, David Marchand wrote: > >> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron wrote: >>> On 10 May 2023, at 13:44, David Marchand wrote: >> >> [snip] >> >>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket >>>>> *vsocket) >>>>> vsocket->path = NULL; >>>>> } >>>>> >>>>> + if (vsocket && vsocket->alloc_notify_ops) { >>>>> +#pragma GCC diagnostic push >>>>> +#pragma GCC diagnostic ignored "-Wcast-qual" >>>>> + free((struct rte_vhost_device_ops *)vsocket->notify_ops); >>>>> +#pragma GCC diagnostic pop >>>>> + vsocket->notify_ops = NULL; >>>>> + } >>>> >>>> Rather than select the behavior based on a boolean (and here force the >>>> compiler to close its eyes), I would instead add a non const pointer >>>> to ops (let's say alloc_notify_ops) in vhost_user_socket. >>>> The code can then unconditionnally call free(vsocket->alloc_notify_ops); >>> >>> Good idea, I will make the change in v3. >> >> Feel free to use a better name for this field :-). >> >>> >>>>> + >>>>> if (vsocket) { >>>>> free(vsocket); >>>>> vsocket = NULL; >> >> [snip] >> >>>>> + /* >>>>> +* Although the ops structure is a const structure, we do need to >>>>> +* override the guest_notify operation. This is because with the >>>>> +* previous APIs it was "reserved" and if any garbage value was >>>>> passed, >>>>> +* it could crash the application. >>>>> +*/ >>>>> + if (ops && !ops->guest_notify) { >>>> >>>> Hum, as described in the comment above, I don't think we should look >>>> at ops->guest_notify value at all. >>>> Checking ops != NULL should be enough. >>> >>> Not sure I get you here. If the guest_notify passed by the user is NULL, it >>> means the previously ‘reserved[1]’ field is NULL, so we do not need to use >>> a new structure. >>> >>> I guess your comment would be true if we would introduce a new field in the >>> data structure, not replacing a reserved one. >> >> Hum, I don't understand my comment either o_O'. >> Too many days off... or maybe my evil twin took over the keyboard. >> >> >>> >>>>> + struct rte_vhost_device_ops *new_ops; >>>>> + >>>>> + new_ops = malloc(sizeof(*new_ops)); >>>> >>>> Strictly speaking, we lose the numa affinity of "ops" by calling malloc. >>>> I am unclear of the impact though. >>> >>> Don’t think there is a portable API that we can use to determine the NUMA >>> for the ops memory and then allocate this on the same numa? >>> >>> Any thoughts or ideas on how to solve this? I hope most people will >>> memset() the ops structure and the reserved[1] part is zero, but it might >>> be a problem in the future if more extensions get added. >> >> Determinining current numa is doable, via 'ops' >> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR), like what is done for vq in >> numa_realloc(). >> The problem is how to allocate on this numa with the libc allocator >> for which I have no idea... >> We could go with the dpdk allocator (again, like numa_realloc()). >> >> >> In practice, the passed ops will be probably from a const variable in >> the program .data section (for which I think fields are set to 0 >> unless explicitly initialised), or a memset() will be called for a >> dynamic allocation from good citizens. >> So we can probably live with the current proposal. >> Plus, this is only for one release, since in 23.11 with the ABI bump, >> we will drop this compat code. >> >> Maxime, Chenbo, what do you think? > > Wait for their response, but for now I assume we can just keep the numa > unaware malloc(). > >> >> [snip] >> >>>> >>>> But putting indentation aside, is this change equivalent? >>>> - if ((vhost_need_event(vhost_used_event(vq), new, old) && >>>> - (vq->callfd >= 0)) || >>>> - unlikely(!signalled_used_valid)) { >>>> + if ((vhost_need_event(vhost_used_event(vq), new, old) || >>>> + unlikely(!signalled_used_valid)) && >>>> + vq->callfd >= 0) { >>> >>> They are not equal, but in the past eventfd_write() should also not have >>> been called with callfd < 0, guess this was an existing bug ;) >> >> I think this should be a separate fix. > > ACK, will add a separate patch in this series to fix it. FYI I sent out the v3 patch. //Eelco
[PATCH v3 4/4] vhost: add device op to offload the interrupt kick
This patch adds an operation callback which gets called every time the library wants to call eventfd_write(). This eventfd_write() call could result in a system call, which could potentially block the PMD thread. The callback function can decide whether it's ok to handle the eventfd_write() now or have the newly introduced function, rte_vhost_notify_guest(), called at a later time. This can be used by 3rd party applications, like OVS, to avoid system calls being called as part of the PMD threads. Signed-off-by: Eelco Chaudron --- lib/vhost/meson.build |2 ++ lib/vhost/rte_vhost.h | 23 +- lib/vhost/socket.c| 63 ++--- lib/vhost/version.map |9 +++ lib/vhost/vhost.c | 38 ++ lib/vhost/vhost.h | 58 - 6 files changed, 171 insertions(+), 22 deletions(-) diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build index 0d1abf6283..05679447db 100644 --- a/lib/vhost/meson.build +++ b/lib/vhost/meson.build @@ -38,3 +38,5 @@ driver_sdk_headers = files( 'vdpa_driver.h', ) deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev'] + +use_function_versioning = true diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index 58a5d4be92..7a10bc36cf 100644 --- a/lib/vhost/rte_vhost.h +++ b/lib/vhost/rte_vhost.h @@ -298,7 +298,13 @@ struct rte_vhost_device_ops { */ void (*guest_notified)(int vid); - void *reserved[1]; /**< Reserved for future extension */ + /** +* If this callback is registered, notification to the guest can +* be handled by the front-end calling rte_vhost_notify_guest(). +* If it's not handled, 'false' should be returned. This can be used +* to remove the "slow" eventfd_write() syscall from the datapath. +*/ + bool (*guest_notify)(int vid, uint16_t queue_id); }; /** @@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx, int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice. + * + * Inject the offloaded interrupt into the vhost device's queue. For more + * details see the 'guest_notify' vhost device operation. + * + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index + */ +__rte_experimental +void rte_vhost_notify_guest(int vid, uint16_t queue_id); + /** * Register vhost driver. path could be different for multiple * instance support. diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c index 669c322e12..f2c02075fe 100644 --- a/lib/vhost/socket.c +++ b/lib/vhost/socket.c @@ -15,6 +15,7 @@ #include #include +#include #include #include "fd_man.h" @@ -59,6 +60,7 @@ struct vhost_user_socket { struct rte_vdpa_device *vdpa_dev; struct rte_vhost_device_ops const *notify_ops; + struct rte_vhost_device_ops *malloc_notify_ops; }; struct vhost_user_connection { @@ -846,6 +848,11 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket) vsocket->path = NULL; } + if (vsocket && vsocket->malloc_notify_ops) { + free(vsocket->malloc_notify_ops); + vsocket->malloc_notify_ops = NULL; + } + if (vsocket) { free(vsocket); vsocket = NULL; @@ -1099,21 +1106,69 @@ rte_vhost_driver_unregister(const char *path) /* * Register ops so that we can add/remove device to data core. */ -int -rte_vhost_driver_callback_register(const char *path, - struct rte_vhost_device_ops const * const ops) +static int +vhost_driver_callback_register(const char *path, + struct rte_vhost_device_ops const * const ops, + struct rte_vhost_device_ops *malloc_ops) { struct vhost_user_socket *vsocket; pthread_mutex_lock(&vhost_user.mutex); vsocket = find_vhost_user_socket(path); - if (vsocket) + if (vsocket) { vsocket->notify_ops = ops; + free(vsocket->malloc_notify_ops); + vsocket->malloc_notify_ops = malloc_ops; + } pthread_mutex_unlock(&vhost_user.mutex); return vsocket ? 0 : -1; } +int __vsym +rte_vhost_driver_callback_register_v24(const char *path, + struct rte_vhost_device_ops const * const ops) +{ + return vhost_driver_callback_register(path, ops, NULL); +} + +int __vsym +rte_vhost_driver_callback_register_v23(const char *path, + struct rte_vhost_device_ops const * const ops) +{ + int ret; + + /* +* Although the ops structure is a const structure, we do need to +* override the guest_notify operation. This is because with the +
[PATCH v3 3/4] vhost: fix invalid call FD handling
This patch fixes cases where IRQ injection is tried while the call FD is not valid, which should not happen. Fixes: b1cce26af1dc ("vhost: add notification for packed ring") Fixes: e37ff954405a ("vhost: support virtqueue interrupt/notification suppression") Signed-off-by: Maxime Coquelin Signed-off-by: Eelco Chaudron --- lib/vhost/vhost.h |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index 37609c7c8d..23a4e2b1a7 100644 --- a/lib/vhost/vhost.h +++ b/lib/vhost/vhost.h @@ -903,9 +903,9 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) "%s: used_event_idx=%d, old=%d, new=%d\n", __func__, vhost_used_event(vq), old, new); - if ((vhost_need_event(vhost_used_event(vq), new, old) && - (vq->callfd >= 0)) || - unlikely(!signalled_used_valid)) { + if ((vhost_need_event(vhost_used_event(vq), new, old) || + unlikely(!signalled_used_valid)) && + vq->callfd >= 0) { eventfd_write(vq->callfd, (eventfd_t) 1); if (dev->flags & VIRTIO_DEV_STATS_ENABLED) __atomic_fetch_add(&vq->stats.guest_notifications, @@ -974,7 +974,7 @@ vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) if (vhost_need_event(off, new, old)) kick = true; kick: - if (kick) { + if (kick && vq->callfd >= 0) { eventfd_write(vq->callfd, (eventfd_t)1); if (dev->notify_ops->guest_notified) dev->notify_ops->guest_notified(dev->vid);
[PATCH v3 2/4] vhost: make the guest_notifications statistic counter atomic
Making the guest_notifications statistic counter atomic, allows it to be safely incremented while holding the read access_lock. Signed-off-by: Eelco Chaudron --- lib/vhost/vhost.c |8 lib/vhost/vhost.h |9 ++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c index 74bdbfd810..8ff6434c93 100644 --- a/lib/vhost/vhost.c +++ b/lib/vhost/vhost.c @@ -2086,6 +2086,10 @@ rte_vhost_vring_stats_get(int vid, uint16_t queue_id, rte_rwlock_write_lock(&vq->access_lock); for (i = 0; i < VHOST_NB_VQ_STATS; i++) { + /* +* No need to the read atomic counters as such, due to the +* above write access_lock preventing them to be updated. +*/ stats[i].value = *(uint64_t *)(((char *)vq) + vhost_vq_stat_strings[i].offset); stats[i].id = i; @@ -2112,6 +2116,10 @@ int rte_vhost_vring_stats_reset(int vid, uint16_t queue_id) vq = dev->virtqueue[queue_id]; rte_rwlock_write_lock(&vq->access_lock); + /* +* No need to the reset atomic counters as such, due to the +* above write access_lock preventing them to be updated. +*/ memset(&vq->stats, 0, sizeof(vq->stats)); rte_rwlock_write_unlock(&vq->access_lock); diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index 5c939ef06f..37609c7c8d 100644 --- a/lib/vhost/vhost.h +++ b/lib/vhost/vhost.h @@ -135,11 +135,12 @@ struct virtqueue_stats { uint64_t broadcast; /* Size bins in array as RFC 2819, undersized [0], 64 [1], etc */ uint64_t size_bins[8]; - uint64_t guest_notifications; uint64_t iotlb_hits; uint64_t iotlb_misses; uint64_t inflight_submitted; uint64_t inflight_completed; + /* Counters below are atomic, and should be incremented as such. */ + uint64_t guest_notifications; }; /** @@ -907,7 +908,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) unlikely(!signalled_used_valid)) { eventfd_write(vq->callfd, (eventfd_t) 1); if (dev->flags & VIRTIO_DEV_STATS_ENABLED) - vq->stats.guest_notifications++; + __atomic_fetch_add(&vq->stats.guest_notifications, + 1, __ATOMIC_RELAXED); if (dev->notify_ops->guest_notified) dev->notify_ops->guest_notified(dev->vid); } @@ -917,7 +919,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) && (vq->callfd >= 0)) { eventfd_write(vq->callfd, (eventfd_t)1); if (dev->flags & VIRTIO_DEV_STATS_ENABLED) - vq->stats.guest_notifications++; + __atomic_fetch_add(&vq->stats.guest_notifications, + 1, __ATOMIC_RELAXED); if (dev->notify_ops->guest_notified) dev->notify_ops->guest_notified(dev->vid); }
[PATCH v3 1/4] vhost: change vhost_virtqueue access lock to a read/write one
This change will allow the vhost interrupt datapath handling to be split between two processed without one of them holding an explicit lock. Signed-off-by: Eelco Chaudron --- lib/eal/include/generic/rte_rwlock.h | 17 ++ lib/vhost/vhost.c| 46 + lib/vhost/vhost.h|4 +- lib/vhost/vhost_user.c | 14 +++-- lib/vhost/virtio_net.c | 90 +- 5 files changed, 94 insertions(+), 77 deletions(-) diff --git a/lib/eal/include/generic/rte_rwlock.h b/lib/eal/include/generic/rte_rwlock.h index 71e2d8d5f4..9e083bbc61 100644 --- a/lib/eal/include/generic/rte_rwlock.h +++ b/lib/eal/include/generic/rte_rwlock.h @@ -236,6 +236,23 @@ rte_rwlock_write_unlock(rte_rwlock_t *rwl) __atomic_fetch_sub(&rwl->cnt, RTE_RWLOCK_WRITE, __ATOMIC_RELEASE); } +/** + * Test if the write lock is taken. + * + * @param rwl + * A pointer to a rwlock structure. + * @return + * 1 if the write lock is currently taken; 0 otherwise. + */ +static inline int +rte_rwlock_write_is_locked(rte_rwlock_t *rwl) +{ + if (__atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED) & RTE_RWLOCK_WRITE) + return 1; + + return 0; +} + /** * Try to execute critical section in a hardware memory transaction, if it * fails or not available take a read lock diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c index ef37943817..74bdbfd810 100644 --- a/lib/vhost/vhost.c +++ b/lib/vhost/vhost.c @@ -393,9 +393,9 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq) else rte_free(vq->shadow_used_split); - rte_spinlock_lock(&vq->access_lock); + rte_rwlock_write_lock(&vq->access_lock); vhost_free_async_mem(vq); - rte_spinlock_unlock(&vq->access_lock); + rte_rwlock_write_unlock(&vq->access_lock); rte_free(vq->batch_copy_elems); vhost_user_iotlb_destroy(vq); rte_free(vq->log_cache); @@ -630,7 +630,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) dev->virtqueue[i] = vq; init_vring_queue(dev, vq, i); - rte_spinlock_init(&vq->access_lock); + rte_rwlock_init(&vq->access_lock); vq->avail_wrap_counter = 1; vq->used_wrap_counter = 1; vq->signalled_used_valid = false; @@ -1305,14 +1305,14 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx) if (!vq) return -1; - rte_spinlock_lock(&vq->access_lock); + rte_rwlock_read_lock(&vq->access_lock); if (vq_is_packed(dev)) vhost_vring_call_packed(dev, vq); else vhost_vring_call_split(dev, vq); - rte_spinlock_unlock(&vq->access_lock); + rte_rwlock_read_unlock(&vq->access_lock); return 0; } @@ -1334,7 +1334,7 @@ rte_vhost_vring_call_nonblock(int vid, uint16_t vring_idx) if (!vq) return -1; - if (!rte_spinlock_trylock(&vq->access_lock)) + if (rte_rwlock_read_trylock(&vq->access_lock)) return -EAGAIN; if (vq_is_packed(dev)) @@ -1342,7 +1342,7 @@ rte_vhost_vring_call_nonblock(int vid, uint16_t vring_idx) else vhost_vring_call_split(dev, vq); - rte_spinlock_unlock(&vq->access_lock); + rte_rwlock_read_unlock(&vq->access_lock); return 0; } @@ -1365,7 +1365,7 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id) if (!vq) return 0; - rte_spinlock_lock(&vq->access_lock); + rte_rwlock_write_lock(&vq->access_lock); if (unlikely(!vq->enabled || vq->avail == NULL)) goto out; @@ -1373,7 +1373,7 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id) ret = *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; out: - rte_spinlock_unlock(&vq->access_lock); + rte_rwlock_write_unlock(&vq->access_lock); return ret; } @@ -1457,12 +1457,12 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) if (!vq) return -1; - rte_spinlock_lock(&vq->access_lock); + rte_rwlock_write_lock(&vq->access_lock); vq->notif_enable = enable; ret = vhost_enable_guest_notification(dev, vq, enable); - rte_spinlock_unlock(&vq->access_lock); + rte_rwlock_write_unlock(&vq->access_lock); return ret; } @@ -1520,7 +1520,7 @@ rte_vhost_rx_queue_count(int vid, uint16_t qid) if (vq == NULL) return 0; - rte_spinlock_lock(&vq->access_lock); + rte_rwlock_write_lock(&vq->access_lock); if (unlikely(!vq->enabled || vq->avail
[PATCH v3 0/4] vhost: add device op to offload the interrupt kick
This series adds an operation callback which gets called every time the library wants to call eventfd_write(). This eventfd_write() call could result in a system call, which could potentially block the PMD thread. The callback function can decide whether it's ok to handle the eventfd_write() now or have the newly introduced function, rte_vhost_notify_guest(), called at a later time. This can be used by 3rd party applications, like OVS, to avoid system calls being called as part of the PMD threads. v3: - Changed ABI compatibility code to no longer use a boolean to avoid having to disable specific GCC warnings. - Moved the fd check fix to a separate patch (patch 3/4). - Fixed some coding style issues. v2: - Used vhost_virtqueue->index to find index for operation. - Aligned function name to VDUSE RFC patchset. - Added error and offload statistics counter. - Mark new API as experimental. - Change the virtual queue spin lock to read/write spin lock. - Made shared counters atomic. - Add versioned rte_vhost_driver_callback_register() for ABI compliance. Eelco Chaudron (4): vhost: change vhost_virtqueue access lock to a read/write one vhost: make the guest_notifications statistic counter atomic vhost: fix invalid call FD handling vhost: add device op to offload the interrupt kick lib/eal/include/generic/rte_rwlock.h | 17 + lib/vhost/meson.build| 2 + lib/vhost/rte_vhost.h| 23 ++- lib/vhost/socket.c | 63 +-- lib/vhost/version.map| 9 +++ lib/vhost/vhost.c| 92 +--- lib/vhost/vhost.h| 69 ++--- lib/vhost/vhost_user.c | 14 ++--- lib/vhost/virtio_net.c | 90 +-- 9 files changed, 278 insertions(+), 101 deletions(-)
Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
On 16 May 2023, at 13:45, Maxime Coquelin wrote: > On 5/16/23 13:36, Eelco Chaudron wrote: >> >> >> On 16 May 2023, at 12:12, David Marchand wrote: >> >>> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron wrote: >>>> On 10 May 2023, at 13:44, David Marchand wrote: >>> >>> [snip] >>> >>>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket >>>>>> *vsocket) >>>>>> vsocket->path = NULL; >>>>>> } >>>>>> >>>>>> + if (vsocket && vsocket->alloc_notify_ops) { >>>>>> +#pragma GCC diagnostic push >>>>>> +#pragma GCC diagnostic ignored "-Wcast-qual" >>>>>> + free((struct rte_vhost_device_ops *)vsocket->notify_ops); >>>>>> +#pragma GCC diagnostic pop >>>>>> + vsocket->notify_ops = NULL; >>>>>> + } >>>>> >>>>> Rather than select the behavior based on a boolean (and here force the >>>>> compiler to close its eyes), I would instead add a non const pointer >>>>> to ops (let's say alloc_notify_ops) in vhost_user_socket. >>>>> The code can then unconditionnally call free(vsocket->alloc_notify_ops); >>>> >>>> Good idea, I will make the change in v3. >>> >>> Feel free to use a better name for this field :-). >>> >>>> >>>>>> + >>>>>> if (vsocket) { >>>>>> free(vsocket); >>>>>> vsocket = NULL; >>> >>> [snip] >>> >>>>>> + /* >>>>>> +* Although the ops structure is a const structure, we do need to >>>>>> +* override the guest_notify operation. This is because with the >>>>>> +* previous APIs it was "reserved" and if any garbage value was >>>>>> passed, >>>>>> +* it could crash the application. >>>>>> +*/ >>>>>> + if (ops && !ops->guest_notify) { >>>>> >>>>> Hum, as described in the comment above, I don't think we should look >>>>> at ops->guest_notify value at all. >>>>> Checking ops != NULL should be enough. >>>> >>>> Not sure I get you here. If the guest_notify passed by the user is NULL, >>>> it means the previously ‘reserved[1]’ field is NULL, so we do not need to >>>> use a new structure. >>>> >>>> I guess your comment would be true if we would introduce a new field in >>>> the data structure, not replacing a reserved one. >>> >>> Hum, I don't understand my comment either o_O'. >>> Too many days off... or maybe my evil twin took over the keyboard. >>> >>> >>>> >>>>>> + struct rte_vhost_device_ops *new_ops; >>>>>> + >>>>>> + new_ops = malloc(sizeof(*new_ops)); >>>>> >>>>> Strictly speaking, we lose the numa affinity of "ops" by calling malloc. >>>>> I am unclear of the impact though. >>>> >>>> Don’t think there is a portable API that we can use to determine the NUMA >>>> for the ops memory and then allocate this on the same numa? >>>> >>>> Any thoughts or ideas on how to solve this? I hope most people will >>>> memset() the ops structure and the reserved[1] part is zero, but it might >>>> be a problem in the future if more extensions get added. >>> >>> Determinining current numa is doable, via 'ops' >>> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR), like what is done for vq in >>> numa_realloc(). >>> The problem is how to allocate on this numa with the libc allocator >>> for which I have no idea... >>> We could go with the dpdk allocator (again, like numa_realloc()). >>> >>> >>> In practice, the passed ops will be probably from a const variable in >>> the program .data section (for which I think fields are set to 0 >>> unless explicitly initialised), or a memset() will be called for a >>> dynamic allocation from good citizens. >>> So we can probably live with the current proposal. >>> Plus, this is only for one release, since in 23.11 with the ABI
Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
On 16 May 2023, at 12:12, David Marchand wrote: > On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron wrote: >> On 10 May 2023, at 13:44, David Marchand wrote: > > [snip] > >>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket >>>> *vsocket) >>>> vsocket->path = NULL; >>>> } >>>> >>>> + if (vsocket && vsocket->alloc_notify_ops) { >>>> +#pragma GCC diagnostic push >>>> +#pragma GCC diagnostic ignored "-Wcast-qual" >>>> + free((struct rte_vhost_device_ops *)vsocket->notify_ops); >>>> +#pragma GCC diagnostic pop >>>> + vsocket->notify_ops = NULL; >>>> + } >>> >>> Rather than select the behavior based on a boolean (and here force the >>> compiler to close its eyes), I would instead add a non const pointer >>> to ops (let's say alloc_notify_ops) in vhost_user_socket. >>> The code can then unconditionnally call free(vsocket->alloc_notify_ops); >> >> Good idea, I will make the change in v3. > > Feel free to use a better name for this field :-). > >> >>>> + >>>> if (vsocket) { >>>> free(vsocket); >>>> vsocket = NULL; > > [snip] > >>>> + /* >>>> +* Although the ops structure is a const structure, we do need to >>>> +* override the guest_notify operation. This is because with the >>>> +* previous APIs it was "reserved" and if any garbage value was >>>> passed, >>>> +* it could crash the application. >>>> +*/ >>>> + if (ops && !ops->guest_notify) { >>> >>> Hum, as described in the comment above, I don't think we should look >>> at ops->guest_notify value at all. >>> Checking ops != NULL should be enough. >> >> Not sure I get you here. If the guest_notify passed by the user is NULL, it >> means the previously ‘reserved[1]’ field is NULL, so we do not need to use a >> new structure. >> >> I guess your comment would be true if we would introduce a new field in the >> data structure, not replacing a reserved one. > > Hum, I don't understand my comment either o_O'. > Too many days off... or maybe my evil twin took over the keyboard. > > >> >>>> + struct rte_vhost_device_ops *new_ops; >>>> + >>>> + new_ops = malloc(sizeof(*new_ops)); >>> >>> Strictly speaking, we lose the numa affinity of "ops" by calling malloc. >>> I am unclear of the impact though. >> >> Don’t think there is a portable API that we can use to determine the NUMA >> for the ops memory and then allocate this on the same numa? >> >> Any thoughts or ideas on how to solve this? I hope most people will memset() >> the ops structure and the reserved[1] part is zero, but it might be a >> problem in the future if more extensions get added. > > Determinining current numa is doable, via 'ops' > get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR), like what is done for vq in > numa_realloc(). > The problem is how to allocate on this numa with the libc allocator > for which I have no idea... > We could go with the dpdk allocator (again, like numa_realloc()). > > > In practice, the passed ops will be probably from a const variable in > the program .data section (for which I think fields are set to 0 > unless explicitly initialised), or a memset() will be called for a > dynamic allocation from good citizens. > So we can probably live with the current proposal. > Plus, this is only for one release, since in 23.11 with the ABI bump, > we will drop this compat code. > > Maxime, Chenbo, what do you think? Wait for their response, but for now I assume we can just keep the numa unaware malloc(). > > [snip] > >>> >>> But putting indentation aside, is this change equivalent? >>> - if ((vhost_need_event(vhost_used_event(vq), new, old) && >>> - (vq->callfd >= 0)) || >>> - unlikely(!signalled_used_valid)) { >>> + if ((vhost_need_event(vhost_used_event(vq), new, old) || >>> + unlikely(!signalled_used_valid)) && >>> + vq->callfd >= 0) { >> >> They are not equal, but in the past eventfd_write() should also not have >> been called with callfd < 0, guess this was an existing bug ;) > > I think this should be a separate fix. ACK, will add a separate patch in this series to fix it. > >> >>>> + vhost_vring_inject_irq(dev, vq); > > > -- > David Marchand
Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
On 10 May 2023, at 13:44, David Marchand wrote: > On Wed, Apr 5, 2023 at 2:42 PM Eelco Chaudron wrote: >> >> This patch adds an operation callback which gets called every time the >> library wants to call eventfd_write(). This eventfd_write() call could >> result in a system call, which could potentially block the PMD thread. >> >> The callback function can decide whether it's ok to handle the >> eventfd_write() now or have the newly introduced function, >> rte_vhost_notify_guest(), called at a later time. >> >> This can be used by 3rd party applications, like OVS, to avoid system >> calls being called as part of the PMD threads. >> >> Signed-off-by: Eelco Chaudron >> --- >> lib/vhost/meson.build |2 + >> lib/vhost/rte_vhost.h | 23 +++- >> lib/vhost/socket.c| 72 >> ++--- >> lib/vhost/version.map |9 ++ >> lib/vhost/vhost.c | 38 ++ >> lib/vhost/vhost.h | 65 +++- >> 6 files changed, 184 insertions(+), 25 deletions(-) >> >> diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build >> index 197a51d936..e93ba6b078 100644 >> --- a/lib/vhost/meson.build >> +++ b/lib/vhost/meson.build >> @@ -39,3 +39,5 @@ driver_sdk_headers = files( >> 'vdpa_driver.h', >> ) >> deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev'] >> + >> +use_function_versioning = true >> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h >> index 58a5d4be92..7a10bc36cf 100644 >> --- a/lib/vhost/rte_vhost.h >> +++ b/lib/vhost/rte_vhost.h >> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops { >> */ >> void (*guest_notified)(int vid); >> >> - void *reserved[1]; /**< Reserved for future extension */ >> + /** >> +* If this callback is registered, notification to the guest can >> +* be handled by the front-end calling rte_vhost_notify_guest(). >> +* If it's not handled, 'false' should be returned. This can be used >> +* to remove the "slow" eventfd_write() syscall from the datapath. >> +*/ >> + bool (*guest_notify)(int vid, uint16_t queue_id); >> }; >> >> /** >> @@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t >> vring_idx, >> >> int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int >> enable); >> >> +/** >> + * @warning >> + * @b EXPERIMENTAL: this API may change, or be removed, without prior >> notice. >> + * >> + * Inject the offloaded interrupt into the vhost device's queue. For more >> + * details see the 'guest_notify' vhost device operation. >> + * >> + * @param vid >> + * vhost device ID >> + * @param queue_id >> + * virtio queue index >> + */ >> +__rte_experimental >> +void rte_vhost_notify_guest(int vid, uint16_t queue_id); >> + >> /** >> * Register vhost driver. path could be different for multiple >> * instance support. >> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c >> index 669c322e12..787d6bacf8 100644 >> --- a/lib/vhost/socket.c >> +++ b/lib/vhost/socket.c >> @@ -15,6 +15,7 @@ >> #include >> #include >> >> +#include >> #include >> >> #include "fd_man.h" >> @@ -43,6 +44,7 @@ struct vhost_user_socket { >> bool async_copy; >> bool net_compliant_ol_flags; >> bool stats_enabled; >> + bool alloc_notify_ops; >> >> /* >> * The "supported_features" indicates the feature bits the >> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket >> *vsocket) >> vsocket->path = NULL; >> } >> >> + if (vsocket && vsocket->alloc_notify_ops) { >> +#pragma GCC diagnostic push >> +#pragma GCC diagnostic ignored "-Wcast-qual" >> + free((struct rte_vhost_device_ops *)vsocket->notify_ops); >> +#pragma GCC diagnostic pop >> + vsocket->notify_ops = NULL; >> + } > > Rather than select the behavior based on a boolean (and here force the > compiler to close its eyes), I would instead add a non const pointer > to ops (let's say alloc_notify_ops) in vhost_user_socket. > The code can then unconditionn
Re: [PATCH v2 0/3] vhost: add device op to offload the interrupt kick
On 5 Apr 2023, at 14:40, Eelco Chaudron wrote: > This series adds an operation callback which gets called every time the > library wants to call eventfd_write(). This eventfd_write() call could > result in a system call, which could potentially block the PMD thread. > > The callback function can decide whether it's ok to handle the > eventfd_write() now or have the newly introduced function, > rte_vhost_notify_guest(), called at a later time. > > This can be used by 3rd party applications, like OVS, to avoid system > calls being called as part of the PMD threads. Wondering if anyone had a chance to look at this patchset. Cheers, Eelco > v2: - Used vhost_virtqueue->index to find index for operation. > - Aligned function name to VDUSE RFC patchset. > - Added error and offload statistics counter. > - Mark new API as experimental. > - Change the virtual queue spin lock to read/write spin lock. > - Made shared counters atomic. > - Add versioned rte_vhost_driver_callback_register() for > ABI compliance. > > Eelco Chaudron (3): > vhost: Change vhost_virtqueue access lock to a read/write one. > vhost: make the guest_notifications statistic counter atomic. > vhost: add device op to offload the interrupt kick > > > lib/eal/include/generic/rte_rwlock.h | 17 + > lib/vhost/meson.build| 2 + > lib/vhost/rte_vhost.h| 23 ++- > lib/vhost/socket.c | 72 -- > lib/vhost/version.map| 9 +++ > lib/vhost/vhost.c| 92 +--- > lib/vhost/vhost.h| 70 ++--- > lib/vhost/vhost_user.c | 14 ++--- > lib/vhost/virtio_net.c | 90 +-- > 9 files changed, 288 insertions(+), 101 deletions(-)
[PATCH v2 3/3] vhost: add device op to offload the interrupt kick
This patch adds an operation callback which gets called every time the library wants to call eventfd_write(). This eventfd_write() call could result in a system call, which could potentially block the PMD thread. The callback function can decide whether it's ok to handle the eventfd_write() now or have the newly introduced function, rte_vhost_notify_guest(), called at a later time. This can be used by 3rd party applications, like OVS, to avoid system calls being called as part of the PMD threads. Signed-off-by: Eelco Chaudron --- lib/vhost/meson.build |2 + lib/vhost/rte_vhost.h | 23 +++- lib/vhost/socket.c| 72 ++--- lib/vhost/version.map |9 ++ lib/vhost/vhost.c | 38 ++ lib/vhost/vhost.h | 65 +++- 6 files changed, 184 insertions(+), 25 deletions(-) diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build index 197a51d936..e93ba6b078 100644 --- a/lib/vhost/meson.build +++ b/lib/vhost/meson.build @@ -39,3 +39,5 @@ driver_sdk_headers = files( 'vdpa_driver.h', ) deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev'] + +use_function_versioning = true diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index 58a5d4be92..7a10bc36cf 100644 --- a/lib/vhost/rte_vhost.h +++ b/lib/vhost/rte_vhost.h @@ -298,7 +298,13 @@ struct rte_vhost_device_ops { */ void (*guest_notified)(int vid); - void *reserved[1]; /**< Reserved for future extension */ + /** +* If this callback is registered, notification to the guest can +* be handled by the front-end calling rte_vhost_notify_guest(). +* If it's not handled, 'false' should be returned. This can be used +* to remove the "slow" eventfd_write() syscall from the datapath. +*/ + bool (*guest_notify)(int vid, uint16_t queue_id); }; /** @@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx, int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice. + * + * Inject the offloaded interrupt into the vhost device's queue. For more + * details see the 'guest_notify' vhost device operation. + * + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index + */ +__rte_experimental +void rte_vhost_notify_guest(int vid, uint16_t queue_id); + /** * Register vhost driver. path could be different for multiple * instance support. diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c index 669c322e12..787d6bacf8 100644 --- a/lib/vhost/socket.c +++ b/lib/vhost/socket.c @@ -15,6 +15,7 @@ #include #include +#include #include #include "fd_man.h" @@ -43,6 +44,7 @@ struct vhost_user_socket { bool async_copy; bool net_compliant_ol_flags; bool stats_enabled; + bool alloc_notify_ops; /* * The "supported_features" indicates the feature bits the @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket) vsocket->path = NULL; } + if (vsocket && vsocket->alloc_notify_ops) { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + free((struct rte_vhost_device_ops *)vsocket->notify_ops); +#pragma GCC diagnostic pop + vsocket->notify_ops = NULL; + } + if (vsocket) { free(vsocket); vsocket = NULL; @@ -1099,21 +1109,75 @@ rte_vhost_driver_unregister(const char *path) /* * Register ops so that we can add/remove device to data core. */ -int -rte_vhost_driver_callback_register(const char *path, - struct rte_vhost_device_ops const * const ops) +static int +rte_vhost_driver_callback_register__(const char *path, + struct rte_vhost_device_ops const * const ops, bool ops_allocated) { struct vhost_user_socket *vsocket; pthread_mutex_lock(&vhost_user.mutex); vsocket = find_vhost_user_socket(path); - if (vsocket) + if (vsocket) { + if (vsocket->alloc_notify_ops) { + vsocket->alloc_notify_ops = false; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + free((struct rte_vhost_device_ops *)vsocket->notify_ops); +#pragma GCC diagnostic pop + } vsocket->notify_ops = ops; + if (ops_allocated) + vsocket->alloc_notify_ops = true; + } pthread_mutex_unlock(&vhost_user.mutex); return vsocket ? 0 : -1; } +int __vsym +rte_vhost_driver_callback_register_v24(const char *path, + str
[PATCH v2 2/3] vhost: make the guest_notifications statistic counter atomic.
Making the guest_notifications statistic counter atomic, allows it to be safely incremented while holding the read access_lock. Signed-off-by: Eelco Chaudron --- lib/vhost/vhost.c |8 lib/vhost/vhost.h |9 ++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c index 74bdbfd810..8ff6434c93 100644 --- a/lib/vhost/vhost.c +++ b/lib/vhost/vhost.c @@ -2086,6 +2086,10 @@ rte_vhost_vring_stats_get(int vid, uint16_t queue_id, rte_rwlock_write_lock(&vq->access_lock); for (i = 0; i < VHOST_NB_VQ_STATS; i++) { + /* +* No need to the read atomic counters as such, due to the +* above write access_lock preventing them to be updated. +*/ stats[i].value = *(uint64_t *)(((char *)vq) + vhost_vq_stat_strings[i].offset); stats[i].id = i; @@ -2112,6 +2116,10 @@ int rte_vhost_vring_stats_reset(int vid, uint16_t queue_id) vq = dev->virtqueue[queue_id]; rte_rwlock_write_lock(&vq->access_lock); + /* +* No need to the reset atomic counters as such, due to the +* above write access_lock preventing them to be updated. +*/ memset(&vq->stats, 0, sizeof(vq->stats)); rte_rwlock_write_unlock(&vq->access_lock); diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index 5c939ef06f..37609c7c8d 100644 --- a/lib/vhost/vhost.h +++ b/lib/vhost/vhost.h @@ -135,11 +135,12 @@ struct virtqueue_stats { uint64_t broadcast; /* Size bins in array as RFC 2819, undersized [0], 64 [1], etc */ uint64_t size_bins[8]; - uint64_t guest_notifications; uint64_t iotlb_hits; uint64_t iotlb_misses; uint64_t inflight_submitted; uint64_t inflight_completed; + /* Counters below are atomic, and should be incremented as such. */ + uint64_t guest_notifications; }; /** @@ -907,7 +908,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) unlikely(!signalled_used_valid)) { eventfd_write(vq->callfd, (eventfd_t) 1); if (dev->flags & VIRTIO_DEV_STATS_ENABLED) - vq->stats.guest_notifications++; + __atomic_fetch_add(&vq->stats.guest_notifications, + 1, __ATOMIC_RELAXED); if (dev->notify_ops->guest_notified) dev->notify_ops->guest_notified(dev->vid); } @@ -917,7 +919,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) && (vq->callfd >= 0)) { eventfd_write(vq->callfd, (eventfd_t)1); if (dev->flags & VIRTIO_DEV_STATS_ENABLED) - vq->stats.guest_notifications++; + __atomic_fetch_add(&vq->stats.guest_notifications, + 1, __ATOMIC_RELAXED); if (dev->notify_ops->guest_notified) dev->notify_ops->guest_notified(dev->vid); }
[PATCH v2 1/3] vhost: Change vhost_virtqueue access lock to a read/write one.
This change will allow the vhost interrupt datapath handling to be split between two processed without one of them holding an explicit lock. Signed-off-by: Eelco Chaudron --- lib/eal/include/generic/rte_rwlock.h | 17 ++ lib/vhost/vhost.c| 46 + lib/vhost/vhost.h|4 +- lib/vhost/vhost_user.c | 14 +++-- lib/vhost/virtio_net.c | 90 +- 5 files changed, 94 insertions(+), 77 deletions(-) diff --git a/lib/eal/include/generic/rte_rwlock.h b/lib/eal/include/generic/rte_rwlock.h index d45c22c189..3a5f0e568f 100644 --- a/lib/eal/include/generic/rte_rwlock.h +++ b/lib/eal/include/generic/rte_rwlock.h @@ -236,6 +236,23 @@ rte_rwlock_write_unlock(rte_rwlock_t *rwl) __atomic_fetch_sub(&rwl->cnt, RTE_RWLOCK_WRITE, __ATOMIC_RELEASE); } +/** + * Test if the write lock is taken. + * + * @param rwl + * A pointer to a rwlock structure. + * @return + * 1 if the write lock is currently taken; 0 otherwise. + */ +static inline int +rte_rwlock_write_is_locked(rte_rwlock_t *rwl) +{ + if (__atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED) & RTE_RWLOCK_WRITE) + return 1; + + return 0; +} + /** * Try to execute critical section in a hardware memory transaction, if it * fails or not available take a read lock diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c index ef37943817..74bdbfd810 100644 --- a/lib/vhost/vhost.c +++ b/lib/vhost/vhost.c @@ -393,9 +393,9 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq) else rte_free(vq->shadow_used_split); - rte_spinlock_lock(&vq->access_lock); + rte_rwlock_write_lock(&vq->access_lock); vhost_free_async_mem(vq); - rte_spinlock_unlock(&vq->access_lock); + rte_rwlock_write_unlock(&vq->access_lock); rte_free(vq->batch_copy_elems); vhost_user_iotlb_destroy(vq); rte_free(vq->log_cache); @@ -630,7 +630,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) dev->virtqueue[i] = vq; init_vring_queue(dev, vq, i); - rte_spinlock_init(&vq->access_lock); + rte_rwlock_init(&vq->access_lock); vq->avail_wrap_counter = 1; vq->used_wrap_counter = 1; vq->signalled_used_valid = false; @@ -1305,14 +1305,14 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx) if (!vq) return -1; - rte_spinlock_lock(&vq->access_lock); + rte_rwlock_read_lock(&vq->access_lock); if (vq_is_packed(dev)) vhost_vring_call_packed(dev, vq); else vhost_vring_call_split(dev, vq); - rte_spinlock_unlock(&vq->access_lock); + rte_rwlock_read_unlock(&vq->access_lock); return 0; } @@ -1334,7 +1334,7 @@ rte_vhost_vring_call_nonblock(int vid, uint16_t vring_idx) if (!vq) return -1; - if (!rte_spinlock_trylock(&vq->access_lock)) + if (rte_rwlock_read_trylock(&vq->access_lock)) return -EAGAIN; if (vq_is_packed(dev)) @@ -1342,7 +1342,7 @@ rte_vhost_vring_call_nonblock(int vid, uint16_t vring_idx) else vhost_vring_call_split(dev, vq); - rte_spinlock_unlock(&vq->access_lock); + rte_rwlock_read_unlock(&vq->access_lock); return 0; } @@ -1365,7 +1365,7 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id) if (!vq) return 0; - rte_spinlock_lock(&vq->access_lock); + rte_rwlock_write_lock(&vq->access_lock); if (unlikely(!vq->enabled || vq->avail == NULL)) goto out; @@ -1373,7 +1373,7 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id) ret = *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; out: - rte_spinlock_unlock(&vq->access_lock); + rte_rwlock_write_unlock(&vq->access_lock); return ret; } @@ -1457,12 +1457,12 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) if (!vq) return -1; - rte_spinlock_lock(&vq->access_lock); + rte_rwlock_write_lock(&vq->access_lock); vq->notif_enable = enable; ret = vhost_enable_guest_notification(dev, vq, enable); - rte_spinlock_unlock(&vq->access_lock); + rte_rwlock_write_unlock(&vq->access_lock); return ret; } @@ -1520,7 +1520,7 @@ rte_vhost_rx_queue_count(int vid, uint16_t qid) if (vq == NULL) return 0; - rte_spinlock_lock(&vq->access_lock); + rte_rwlock_write_lock(&vq->access_lock); if (unlikely(!vq->enabled || vq->avail
[PATCH v2 0/3] vhost: add device op to offload the interrupt kick
This series adds an operation callback which gets called every time the library wants to call eventfd_write(). This eventfd_write() call could result in a system call, which could potentially block the PMD thread. The callback function can decide whether it's ok to handle the eventfd_write() now or have the newly introduced function, rte_vhost_notify_guest(), called at a later time. This can be used by 3rd party applications, like OVS, to avoid system calls being called as part of the PMD threads. v2: - Used vhost_virtqueue->index to find index for operation. - Aligned function name to VDUSE RFC patchset. - Added error and offload statistics counter. - Mark new API as experimental. - Change the virtual queue spin lock to read/write spin lock. - Made shared counters atomic. - Add versioned rte_vhost_driver_callback_register() for ABI compliance. Eelco Chaudron (3): vhost: Change vhost_virtqueue access lock to a read/write one. vhost: make the guest_notifications statistic counter atomic. vhost: add device op to offload the interrupt kick lib/eal/include/generic/rte_rwlock.h | 17 + lib/vhost/meson.build| 2 + lib/vhost/rte_vhost.h| 23 ++- lib/vhost/socket.c | 72 -- lib/vhost/version.map| 9 +++ lib/vhost/vhost.c| 92 +--- lib/vhost/vhost.h| 70 ++--- lib/vhost/vhost_user.c | 14 ++--- lib/vhost/virtio_net.c | 90 +-- 9 files changed, 288 insertions(+), 101 deletions(-)
Re: [EXT] [PATCH] vhost: add device op to offload the interrupt kick
On 27 Mar 2023, at 18:35, Maxime Coquelin wrote: > On 3/27/23 18:04, Eelco Chaudron wrote: >> >> >> On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote: >> >>> Hi Eelco, >>> >>>> +void >>>> +rte_vhost_notify_guest(int vid, uint16_t queue_id) { >>>> + struct virtio_net *dev = get_device(vid); >>>> + struct vhost_virtqueue *vq; >>>> + >>>> + if (!dev || queue_id >= VHOST_MAX_VRING) >>>> + return; >>>> + >>>> + vq = dev->virtqueue[queue_id]; >>>> + if (!vq) >>>> + return; >>>> + >>>> + rte_spinlock_lock(&vq->access_lock); >>>> + >>> >>> Is spin lock needed here before system call ? >> >> I assumed access_lock is protecting all the following fields in this >> structure, so I need the lock to read the vq->callfd, however, I can/should >> move the eventfd_write outside of the lock. > > The FD might be closed between the check and the call to eventfd_write > though, but I agree this is not optimal to call the eventfd_write under > the spinlock in your case, as you will block the pmd thread if it tries > to enqueue/dequeue packets on this queue, defeating the purpose of this > patch. > > Maybe the solution is to change to read-write locks for the access_lock > spinlock. The datapath (rte_vhost_enqueue_burst/rte_vhost_dequeue_burst) > and this API would use the read version, meaning they won't lock each > other, and the control path (lib/vhost/vhost_user.c) will use the write > version. > > Does that make sense? Hi Maxime, I prepped a patch, but not the read/write part yet, https://github.com/chaudron/dpdk/commit/d51c93b4ff08b43daed33e3c0fee193a6d039c25#. I was thinking that even a read/write lock does not work (or maybe we need a combination of taking the read and write lock). As we need to update statistics, which need protection. For example here you see the split (with just two locks, but the sys call could be called in the read lock): https://github.com/chaudron/dpdk/blob/d51c93b4ff08b43daed33e3c0fee193a6d039c25/lib/vhost/vhost.c#L1493 The best would be to not have a lock when calling the system call, but that does not seem safe. I do not see a clear solution, guess also as I have some trouble understanding the lock rules around vq’s. Do you have some more insights to share? I can ping you offline and discuss this. Cheers, Eelco >> >>>> + if (vq->callfd >= 0) >>>> + eventfd_write(vq->callfd, (eventfd_t)1); >>>> + >>>> + rte_spinlock_unlock(&vq->access_lock); >>>> +} >>>> + >>> >>> Thanks. >>
Re: [EXT] [PATCH] vhost: add device op to offload the interrupt kick
On 27 Mar 2023, at 18:35, Maxime Coquelin wrote: > On 3/27/23 18:04, Eelco Chaudron wrote: >> >> >> On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote: >> >>> Hi Eelco, >>> >>>> +void >>>> +rte_vhost_notify_guest(int vid, uint16_t queue_id) { >>>> + struct virtio_net *dev = get_device(vid); >>>> + struct vhost_virtqueue *vq; >>>> + >>>> + if (!dev || queue_id >= VHOST_MAX_VRING) >>>> + return; >>>> + >>>> + vq = dev->virtqueue[queue_id]; >>>> + if (!vq) >>>> + return; >>>> + >>>> + rte_spinlock_lock(&vq->access_lock); >>>> + >>> >>> Is spin lock needed here before system call ? >> >> I assumed access_lock is protecting all the following fields in this >> structure, so I need the lock to read the vq->callfd, however, I can/should >> move the eventfd_write outside of the lock. > > The FD might be closed between the check and the call to eventfd_write > though, but I agree this is not optimal to call the eventfd_write under > the spinlock in your case, as you will block the pmd thread if it tries > to enqueue/dequeue packets on this queue, defeating the purpose of this > patch. > > Maybe the solution is to change to read-write locks for the access_lock > spinlock. The datapath (rte_vhost_enqueue_burst/rte_vhost_dequeue_burst) > and this API would use the read version, meaning they won't lock each > other, and the control path (lib/vhost/vhost_user.c) will use the write > version. > > Does that make sense? Yes, this makes sense, let me investigate this and get back. >>>> + if (vq->callfd >= 0) >>>> + eventfd_write(vq->callfd, (eventfd_t)1); >>>> + >>>> + rte_spinlock_unlock(&vq->access_lock); >>>> +} >>>> + >>> >>> Thanks. >>
Re: [EXT] [PATCH] vhost: add device op to offload the interrupt kick
On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote: > Hi Eelco, > >> +void >> +rte_vhost_notify_guest(int vid, uint16_t queue_id) { >> +struct virtio_net *dev = get_device(vid); >> +struct vhost_virtqueue *vq; >> + >> +if (!dev || queue_id >= VHOST_MAX_VRING) >> +return; >> + >> +vq = dev->virtqueue[queue_id]; >> +if (!vq) >> +return; >> + >> +rte_spinlock_lock(&vq->access_lock); >> + > > Is spin lock needed here before system call ? I assumed access_lock is protecting all the following fields in this structure, so I need the lock to read the vq->callfd, however, I can/should move the eventfd_write outside of the lock. >> +if (vq->callfd >= 0) >> +eventfd_write(vq->callfd, (eventfd_t)1); >> + >> +rte_spinlock_unlock(&vq->access_lock); >> +} >> + > > Thanks.
Re: [PATCH] vhost: add device op to offload the interrupt kick
On 27 Mar 2023, at 15:21, Maxime Coquelin wrote: > Hi Eelco, > > On 3/27/23 14:51, Eelco Chaudron wrote: >> This patch adds an operation callback which gets called every time the >> library wants to call eventfd_write(). This eventfd_write() call could >> result in a system call, which could potentially block the PMD thread. >> >> The callback function can decide whether it's ok to handle the >> eventfd_write() now or have the newly introduced function, >> rte_vhost_notify_guest(), called at a later time. >> >> This can be used by 3rd party applications, like OVS, to avoid system >> calls being called as part of the PMD threads. > > That's a good idea, please find some comments inline: Thanks for the review, see inline. > >> Signed-off-by: Eelco Chaudron >> --- >> lib/vhost/rte_vhost.h | 10 +- >> lib/vhost/vhost.c | 21 + >> lib/vhost/vhost.h | 43 --- >> 3 files changed, 58 insertions(+), 16 deletions(-) >> >> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h >> index 58a5d4be92..af7a394d0f 100644 >> --- a/lib/vhost/rte_vhost.h >> +++ b/lib/vhost/rte_vhost.h >> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops { >> */ >> void (*guest_notified)(int vid); >> - void *reserved[1]; /**< Reserved for future extension */ >> +/** >> + * If this callback is registered, notification to the guest can >> + * be handled by the front-end calling rte_vhost_notify_guest(). >> + * If it's not handled, 'false' should be returned. This can be used >> + * to remove the "slow" eventfd_write() syscall from the datapath. >> + */ >> +bool (*guest_notify)(int vid, uint16_t queue_id); >> }; >>/** >> @@ -433,6 +439,8 @@ void rte_vhost_log_used_vring(int vid, uint16_t >> vring_idx, >>int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int >> enable); >> +void rte_vhost_notify_guest(int vid, uint16_t queue_id); > > The new API needs to be tagged as experimental, and also documented. (I > see rte_vhost_enable_guest_notification is not properly documented, so > not a good example!) I used exactly that as an example, so thought it should be good ;) Will add this in the next revision... >> + >> /** >>* Register vhost driver. path could be different for multiple >>* instance support. >> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c >> index ef37943817..ee090d78ef 100644 >> --- a/lib/vhost/vhost.c >> +++ b/lib/vhost/vhost.c >> @@ -1467,6 +1467,27 @@ rte_vhost_enable_guest_notification(int vid, uint16_t >> queue_id, int enable) >> return ret; >> } >> +void >> +rte_vhost_notify_guest(int vid, uint16_t queue_id) >> +{ >> +struct virtio_net *dev = get_device(vid); >> +struct vhost_virtqueue *vq; >> + >> +if (!dev || queue_id >= VHOST_MAX_VRING) >> +return; >> + >> +vq = dev->virtqueue[queue_id]; >> +if (!vq) >> +return; >> + >> +rte_spinlock_lock(&vq->access_lock); >> + >> +if (vq->callfd >= 0) >> +eventfd_write(vq->callfd, (eventfd_t)1); > > Maybe we should return an error of callfd is invalid or eventfd_write() > failed. See below >> + >> +rte_spinlock_unlock(&vq->access_lock); >> +} >> + >> void >> rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) >> { >> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h >> index 8fdab13c70..39ad8260a1 100644 >> --- a/lib/vhost/vhost.h >> +++ b/lib/vhost/vhost.h >> @@ -883,6 +883,30 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, >> uint16_t old) >> return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old); >> } >> +static __rte_always_inline void >> +vhost_vring_kick_guest(struct virtio_net *dev, struct vhost_virtqueue *vq) >> +{ >> +if (dev->notify_ops->guest_notify) { >> +uint16_t qid; >> +for (qid = 0; qid < dev->nr_vring; qid++) { >> +if (dev->virtqueue[qid] == vq) { >> +if (dev->notify_ops->guest_notify(dev->vid, >> + qid)) >> +goto done; >> +break; >> +} >> +
[PATCH] vhost: add device op to offload the interrupt kick
This patch adds an operation callback which gets called every time the library wants to call eventfd_write(). This eventfd_write() call could result in a system call, which could potentially block the PMD thread. The callback function can decide whether it's ok to handle the eventfd_write() now or have the newly introduced function, rte_vhost_notify_guest(), called at a later time. This can be used by 3rd party applications, like OVS, to avoid system calls being called as part of the PMD threads. Signed-off-by: Eelco Chaudron --- lib/vhost/rte_vhost.h | 10 +- lib/vhost/vhost.c | 21 + lib/vhost/vhost.h | 43 --- 3 files changed, 58 insertions(+), 16 deletions(-) diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index 58a5d4be92..af7a394d0f 100644 --- a/lib/vhost/rte_vhost.h +++ b/lib/vhost/rte_vhost.h @@ -298,7 +298,13 @@ struct rte_vhost_device_ops { */ void (*guest_notified)(int vid); - void *reserved[1]; /**< Reserved for future extension */ + /** +* If this callback is registered, notification to the guest can +* be handled by the front-end calling rte_vhost_notify_guest(). +* If it's not handled, 'false' should be returned. This can be used +* to remove the "slow" eventfd_write() syscall from the datapath. +*/ + bool (*guest_notify)(int vid, uint16_t queue_id); }; /** @@ -433,6 +439,8 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx, int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); +void rte_vhost_notify_guest(int vid, uint16_t queue_id); + /** * Register vhost driver. path could be different for multiple * instance support. diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c index ef37943817..ee090d78ef 100644 --- a/lib/vhost/vhost.c +++ b/lib/vhost/vhost.c @@ -1467,6 +1467,27 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) return ret; } +void +rte_vhost_notify_guest(int vid, uint16_t queue_id) +{ + struct virtio_net *dev = get_device(vid); + struct vhost_virtqueue *vq; + + if (!dev || queue_id >= VHOST_MAX_VRING) + return; + + vq = dev->virtqueue[queue_id]; + if (!vq) + return; + + rte_spinlock_lock(&vq->access_lock); + + if (vq->callfd >= 0) + eventfd_write(vq->callfd, (eventfd_t)1); + + rte_spinlock_unlock(&vq->access_lock); +} + void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) { diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index 8fdab13c70..39ad8260a1 100644 --- a/lib/vhost/vhost.h +++ b/lib/vhost/vhost.h @@ -883,6 +883,30 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old); } +static __rte_always_inline void +vhost_vring_kick_guest(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + if (dev->notify_ops->guest_notify) { + uint16_t qid; + for (qid = 0; qid < dev->nr_vring; qid++) { + if (dev->virtqueue[qid] == vq) { + if (dev->notify_ops->guest_notify(dev->vid, + qid)) + goto done; + break; + } + } + } + eventfd_write(vq->callfd, (eventfd_t) 1); + +done: + if (dev->flags & VIRTIO_DEV_STATS_ENABLED) + vq->stats.guest_notifications++; + if (dev->notify_ops->guest_notified) + dev->notify_ops->guest_notified(dev->vid); +} + + static __rte_always_inline void vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -905,21 +929,13 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) if ((vhost_need_event(vhost_used_event(vq), new, old) && (vq->callfd >= 0)) || unlikely(!signalled_used_valid)) { - eventfd_write(vq->callfd, (eventfd_t) 1); - if (dev->flags & VIRTIO_DEV_STATS_ENABLED) - vq->stats.guest_notifications++; - if (dev->notify_ops->guest_notified) - dev->notify_ops->guest_notified(dev->vid); + vhost_vring_kick_guest(dev, vq); } } else { /* Kick the guest if necessary. */ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) && (vq->callfd >= 0)) { -
Re: [dpdk-dev] [RFC v2] meter: fix ABI break due to experimental tag removal
On 30 Jan 2020, at 21:20, Thomas Monjalon wrote: Yes, sorry, this is what I meant. Given 19.11.0 already has the symbol as experimental, and that applications like OVS had to accept it as experimental, why removing experimental tag in 19.11.1? I think it was mentioned that it was preferred not to suppress the compiler warning to avoid any accidental use in the future, but the OVS maintainer(s) should answer as I might remember wrongly. Yes this is the reason, OVS compiles with -Werror so we would like to avoid the warnings. You can not disable them per include, it’s global for all of DPDK. Yes but anyway OVS must accept the experimental function as the next release will use it with DPDK 19.11.0. Yes, we do for now, but we would like to remove it ASAP as it opens up the code base for the experimental APIs to slip in…
Re: [dpdk-dev] [RFC v2] meter: fix ABI break due to experimental tag removal
On 30 Jan 2020, at 17:04, Luca Boccassi wrote: On Thu, 2020-01-30 at 16:55 +0100, Thomas Monjalon wrote: 30/01/2020 15:21, Luca Boccassi: On Thu, 2020-01-30 at 15:17 +0100, Thomas Monjalon wrote: 30/01/2020 13:57, Luca Boccassi: On Thu, 2020-01-30 at 13:33 +0100, Thomas Monjalon wrote: Hi, I disagree with the need of this patch. The symbol was experimental, meaning we can change it. Removing experimental tag is not an ABI break. Hi, This symbol change was requested for backport in 19.11.x, and experimental or not I'm not too keen on backward incompatible changes to the public interface in an _LTS point release_. The compromise was to see if we could support both symbols version, which makes the change backward compatible. If you prefer not to have this patch in mainline I'm also fine in taking it just for the LTS. I agree with you that it is not required for mainline releases (although nicer for me if it's a backport rather than a new change). I would like to avoid opening the door for maintaining the experimental ABI in the mainline. Please take it directly in the LTS. The next question is to know whether we really want to have such patch in LTS. Anyway, 19.11.0 has this symbol as experimental. How adding a non-experimental version of the function in 19.11.1 will change the ABI status of the whole 19.11 branch? The problem is not adding the new symbol, but removing the experimental one. Changing the version of the symbol was requested by OVS for inclusion in 19.11. Yes, sorry, this is what I meant. Given 19.11.0 already has the symbol as experimental, and that applications like OVS had to accept it as experimental, why removing experimental tag in 19.11.1? I think it was mentioned that it was preferred not to suppress the compiler warning to avoid any accidental use in the future, but the OVS maintainer(s) should answer as I might remember wrongly. Yes this is the reason, OVS compiles with -Werror so we would like to avoid the warnings. You can not disable them per include, it’s global for all of DPDK. //Eelco
Re: [dpdk-dev] [PATCH] vhost: fix deadlock on port deletion
On 14 Jan 2020, at 19:53, Maxime Coquelin wrote: > If the vhost-user application (e.g. OVS) deletes the vhost-user > port while Qemu sends a vhost-user request, a deadlock can > happen if the request handler tries to acquire vhost-user's > global mutex, which is also locked by the vhost-user port > deletion API (rte_vhost_driver_unregister). > > This patch prevents the deadlock by making > rte_vhost_driver_unregister() to release the mutex and try > again if a request is being handled to give a chance to > the request handler to complete. > > Fixes: 8b4b949144b8 ("vhost: fix dead lock on closing in server mode") > Fixes: 5fbb3941da9f ("vhost: introduce driver features related APIs") > Cc: sta...@dpdk.org > > Signed-off-by: Maxime Coquelin Acked-by: Eelco Chaudron
Re: [dpdk-dev] [PATCH v2] net/i40e: always re-program promiscuous mode on VF interface
Thanks! On 18 Dec 2019, at 3:57, Ye Xiaolong wrote: On 12/18, Zhang, Xiao wrote: -Original Message- From: Eelco Chaudron [mailto:echau...@redhat.com] Sent: Tuesday, November 19, 2019 9:45 PM To: Xing, Beilei ; Zhang, Qi Z Cc: Zhang, Xiao ; dev@dpdk.org Subject: [PATCH v2] net/i40e: always re-program promiscuous mode on VF interface During a kernel PF reset, this event is propagated to the VF. The DPDK VF PMD will execute the reset task before the PF is done with his. This results in the admin queue message not being responded to leaving the port in "promiscuous" mode. This patch makes sure the promiscuous mode is configured independently of the current admin state. Signed-off-by: Eelco Chaudron Reviewed-by: Xiao Zhang Applied to dpdk-next-net-intel, Thanks.
Re: [dpdk-dev] [PATCH v2] net/i40e: always re-program promiscuous mode on VF interface
Trying again? On 4 Dec 2019, at 16:18, Eelco Chaudron wrote: Any update on this patch? On 19 Nov 2019, at 14:45, Eelco Chaudron wrote: During a kernel PF reset, this event is propagated to the VF. The DPDK VF PMD will execute the reset task before the PF is done with his. This results in the admin queue message not being responded to leaving the port in "promiscuous" mode. This patch makes sure the promiscuous mode is configured independently of the current admin state. Signed-off-by: Eelco Chaudron --- v1-v2: In the earlier patch, we would force the vf->promisc_unicast_enabled state to false after a reset. Based on the review comments it was prefered to not take the current state into account when programming. v1 patch was called: net/i40e: force promiscuous state after VF reset drivers/net/i40e/i40e_ethdev_vf.c | 16 1 file changed, 16 deletions(-) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 5dba092..43f7ab5 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -2162,10 +2162,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If enabled, just return */ - if (vf->promisc_unicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, 1, vf->promisc_multicast_enabled); if (ret == 0) vf->promisc_unicast_enabled = TRUE; @@ -2181,10 +2177,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If disabled, just return */ - if (!vf->promisc_unicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, 0, vf->promisc_multicast_enabled); if (ret == 0) vf->promisc_unicast_enabled = FALSE; @@ -2200,10 +2192,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If enabled, just return */ - if (vf->promisc_multicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 1); if (ret == 0) vf->promisc_multicast_enabled = TRUE; @@ -2219,10 +2207,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If enabled, just return */ - if (!vf->promisc_multicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 0); if (ret == 0) vf->promisc_multicast_enabled = FALSE;
[dpdk-dev] [PATCH] meter: move RFC4115 trTCM APIs as none experimental
Moved RFC4115 APIs to none experimental as they have been there since 19.02. Also, these APIs are the same as the none RFC4115 APIs. Signed-off-by: Eelco Chaudron --- lib/librte_meter/rte_meter.h |6 -- lib/librte_meter/rte_meter_version.map |4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h index d69b118..62c8c1e 100644 --- a/lib/librte_meter/rte_meter.h +++ b/lib/librte_meter/rte_meter.h @@ -140,7 +140,6 @@ struct rte_meter_trtcm_rfc4115_params { * @return *0 upon success, error code otherwise */ -__rte_experimental int rte_meter_trtcm_rfc4115_profile_config( struct rte_meter_trtcm_rfc4115_profile *p, @@ -187,7 +186,6 @@ struct rte_meter_trtcm_rfc4115_params { * @return *0 upon success, error code otherwise */ -__rte_experimental int rte_meter_trtcm_rfc4115_config(struct rte_meter_trtcm_rfc4115 *m, struct rte_meter_trtcm_rfc4115_profile *p); @@ -295,7 +293,6 @@ struct rte_meter_trtcm_rfc4115_params { * @return *Color assigned to the current IP packet */ -__rte_experimental static inline enum rte_color rte_meter_trtcm_rfc4115_color_blind_check( struct rte_meter_trtcm_rfc4115 *m, @@ -322,7 +319,6 @@ struct rte_meter_trtcm_rfc4115_params { * @return *Color assigned to the current IP packet */ -__rte_experimental static inline enum rte_color rte_meter_trtcm_rfc4115_color_aware_check( struct rte_meter_trtcm_rfc4115 *m, @@ -582,7 +578,6 @@ struct rte_meter_trtcm_rfc4115 { return RTE_COLOR_GREEN; } -__rte_experimental static inline enum rte_color rte_meter_trtcm_rfc4115_color_blind_check( struct rte_meter_trtcm_rfc4115 *m, @@ -626,7 +621,6 @@ struct rte_meter_trtcm_rfc4115 { return RTE_COLOR_RED; } -__rte_experimental static inline enum rte_color rte_meter_trtcm_rfc4115_color_aware_check( struct rte_meter_trtcm_rfc4115 *m, diff --git a/lib/librte_meter/rte_meter_version.map b/lib/librte_meter/rte_meter_version.map index 46410b0..fc12cc0 100644 --- a/lib/librte_meter/rte_meter_version.map +++ b/lib/librte_meter/rte_meter_version.map @@ -13,11 +13,11 @@ DPDK_20.0 { local: *; }; -EXPERIMENTAL { +DPDK_20.0.1 { global: rte_meter_trtcm_rfc4115_color_aware_check; rte_meter_trtcm_rfc4115_color_blind_check; rte_meter_trtcm_rfc4115_config; rte_meter_trtcm_rfc4115_profile_config; -}; +} DPDK_20.0;
Re: [dpdk-dev] [PATCH v2] net/i40e: always re-program promiscuous mode on VF interface
Any update on this patch? On 19 Nov 2019, at 14:45, Eelco Chaudron wrote: During a kernel PF reset, this event is propagated to the VF. The DPDK VF PMD will execute the reset task before the PF is done with his. This results in the admin queue message not being responded to leaving the port in "promiscuous" mode. This patch makes sure the promiscuous mode is configured independently of the current admin state. Signed-off-by: Eelco Chaudron --- v1-v2: In the earlier patch, we would force the vf->promisc_unicast_enabled state to false after a reset. Based on the review comments it was prefered to not take the current state into account when programming. v1 patch was called: net/i40e: force promiscuous state after VF reset drivers/net/i40e/i40e_ethdev_vf.c | 16 1 file changed, 16 deletions(-) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 5dba092..43f7ab5 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -2162,10 +2162,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If enabled, just return */ - if (vf->promisc_unicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, 1, vf->promisc_multicast_enabled); if (ret == 0) vf->promisc_unicast_enabled = TRUE; @@ -2181,10 +2177,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If disabled, just return */ - if (!vf->promisc_unicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, 0, vf->promisc_multicast_enabled); if (ret == 0) vf->promisc_unicast_enabled = FALSE; @@ -2200,10 +2192,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If enabled, just return */ - if (vf->promisc_multicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 1); if (ret == 0) vf->promisc_multicast_enabled = TRUE; @@ -2219,10 +2207,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If enabled, just return */ - if (!vf->promisc_multicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 0); if (ret == 0) vf->promisc_multicast_enabled = FALSE;
Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset
On 13 Nov 2019, at 2:14, Zhang, Xiao wrote: -Original Message- From: Eelco Chaudron [mailto:echau...@redhat.com] Sent: Tuesday, November 12, 2019 7:09 PM To: Zhang, Xiao Cc: Zhang, Qi Z ; dev@dpdk.org; Xing, Beilei Subject: Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset On 12 Nov 2019, at 1:52, Zhang, Xiao wrote: Hi Eelco, Seems you missed this mail. Not sure why I missed this email, as it does not show up in my email client :( See below… Hi Eelco, I think you may need add more detailed message in the commit log or comments. My interpretation of the request was that Beilei wanted to know why disabling promiscuous mode in HW was failing. Beilei can you comment, is the additional description from Xiao enough? Yes, promisc_unicast_enabled flag is not cleared during vf reset because fail to disable promiscuous mode, So I think we need to root cause why fail to disable promiscuous mode first. This patch looks like a workaround but not a fix. This was debugged together with Xiao and from what I understand is that DPDK fails to reset promiscuous mode in hardware as PF and VF operations are not synced between kernel and DPDK. Xiao told me this could not be fixed in another way, Xiao can you comment? Checked again, the root cause is not synced issue between kernel and DPDK during reset. What is the root cause, so I can update the patch description for the next revision. Kernel PF do reset once VF mac changed and send reset event to VF, VF do reset once received even if reset task not done by PF, so admin q message will not get response. Thanks, added this to the v2 commit message. Suggest to remove the checking and setting of promisc_unicast_enabled flag, since this flag is only used when enable/disable promiscuous mode. Considering the un-synced issue, it will be more clean if remove the flag. Also same with flag promisc_multicast_enabled. So if I understand it correctly remove the following code: 2203/* If enabled, just return */ 2204if (vf->promisc_multicast_enabled) 2205return 0; and /* If enabled, just return */ 2223if (!vf->promisc_multicast_enabled) 2224return 0; Or to remove the flags from the i40e_vf structure (and relative code): 1051bool promisc_unicast_enabled; 1052bool promisc_multicast_enabled; Let me know and I craft up a patch… You can remove promisc_unicast_enabled/promisc_multicast_enabled and related code. Removing the flags did not work, as they are needed for programming one or the other, so I kept them. In the end, I removed the “If enabled” checks above, see the v2 patch with the name “[PATCH v2] net/i40e: always re-program promiscuous mode on VF interface”
[dpdk-dev] [PATCH v2] net/i40e: always re-program promiscuous mode on VF interface
During a kernel PF reset, this event is propagated to the VF. The DPDK VF PMD will execute the reset task before the PF is done with his. This results in the admin queue message not being responded to leaving the port in "promiscuous" mode. This patch makes sure the promiscuous mode is configured independently of the current admin state. Signed-off-by: Eelco Chaudron --- v1-v2: In the earlier patch, we would force the vf->promisc_unicast_enabled state to false after a reset. Based on the review comments it was prefered to not take the current state into account when programming. v1 patch was called: net/i40e: force promiscuous state after VF reset drivers/net/i40e/i40e_ethdev_vf.c | 16 1 file changed, 16 deletions(-) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 5dba092..43f7ab5 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -2162,10 +2162,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If enabled, just return */ - if (vf->promisc_unicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, 1, vf->promisc_multicast_enabled); if (ret == 0) vf->promisc_unicast_enabled = TRUE; @@ -2181,10 +2177,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If disabled, just return */ - if (!vf->promisc_unicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, 0, vf->promisc_multicast_enabled); if (ret == 0) vf->promisc_unicast_enabled = FALSE; @@ -2200,10 +2192,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If enabled, just return */ - if (vf->promisc_multicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 1); if (ret == 0) vf->promisc_multicast_enabled = TRUE; @@ -2219,10 +2207,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); int ret; - /* If enabled, just return */ - if (!vf->promisc_multicast_enabled) - return 0; - ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 0); if (ret == 0) vf->promisc_multicast_enabled = FALSE;
Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset
On 12 Nov 2019, at 1:52, Zhang, Xiao wrote: Hi Eelco, Seems you missed this mail. Not sure why I missed this email, as it does not show up in my email client :( See below… Hi Eelco, I think you may need add more detailed message in the commit log or comments. My interpretation of the request was that Beilei wanted to know why disabling promiscuous mode in HW was failing. Beilei can you comment, is the additional description from Xiao enough? Yes, promisc_unicast_enabled flag is not cleared during vf reset because fail to disable promiscuous mode, So I think we need to root cause why fail to disable promiscuous mode first. This patch looks like a workaround but not a fix. This was debugged together with Xiao and from what I understand is that DPDK fails to reset promiscuous mode in hardware as PF and VF operations are not synced between kernel and DPDK. Xiao told me this could not be fixed in another way, Xiao can you comment? Checked again, the root cause is not synced issue between kernel and DPDK during reset. What is the root cause, so I can update the patch description for the next revision. Suggest to remove the checking and setting of promisc_unicast_enabled flag, since this flag is only used when enable/disable promiscuous mode. Considering the un-synced issue, it will be more clean if remove the flag. Also same with flag promisc_multicast_enabled. So if I understand it correctly remove the following code: 2203/* If enabled, just return */ 2204if (vf->promisc_multicast_enabled) 2205return 0; and /* If enabled, just return */ 2223if (!vf->promisc_multicast_enabled) 2224return 0; Or to remove the flags from the i40e_vf structure (and relative code): 1051bool promisc_unicast_enabled; 1052bool promisc_multicast_enabled; Let me know and I craft up a patch… Cheers, Eelco
Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset
On 1 Nov 2019, at 9:12, Eelco Chaudron wrote: + /* +* Even though the device reset is successful disabling promiscuous +* mode might not always succeed, causing enabling it after reset to I think we need to root cause why fail to disable promiscuous mode and try to fix it first. I’ve copied in Xiao who helped me identify the issue in your driver. The issue is because the change from kernel pf was not synced to DPDK vf during the closing period of reset, so we get this failure. Xiao can you add more details? The root cause is when kernel pf generate DPDK vf reset event, the flag vf->promisc_unicast_enabled will not be cleared but promiscuous mode is disabled. When trying to enable promiscuous mode next time by calling i40evf_dev_promiscuous_enable won't work because it will check the vf->promisc_unicast_enabled flag first. static int i40evf_dev_promiscuous_enable(struct rte_eth_dev *dev) { ... /* If enabled, just return */ if (vf->promisc_unicast_enabled) return 0; ... } Hi Eelco, I think you may need add more detailed message in the commit log or comments. My interpretation of the request was that Beilei wanted to know why disabling promiscuous mode in HW was failing. Beilei can you comment, is the additional description from Xiao enough? Yes, promisc_unicast_enabled flag is not cleared during vf reset because fail to disable promiscuous mode, So I think we need to root cause why fail to disable promiscuous mode first. This patch looks like a workaround but not a fix. This was debugged together with Xiao and from what I understand is that DPDK fails to reset promiscuous mode in hardware as PF and VF operations are not synced between kernel and DPDK. Xiao told me this could not be fixed in another way, Xiao can you comment? Xiao any update you can add to this so it’s more clear for Beilei? Thanks, Eelco
Re: [dpdk-dev] [PATCH] net/af_xdp: enforce an MTU of 1500
On 7 Nov 2019, at 14:27, Ciara Loftus wrote: Packets larger than this will fail to be transmitted by AF_XDP, so limit the MTU to 1500. Signed-off-by: Ciara Loftus --- drivers/net/af_xdp/rte_eth_af_xdp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c index 2b1245ee4..62c801500 100644 --- a/drivers/net/af_xdp/rte_eth_af_xdp.c +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c @@ -595,7 +595,7 @@ eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) dev_info->max_tx_queues = internals->queue_cnt; dev_info->min_mtu = RTE_ETHER_MIN_MTU; - dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM; + dev_info->max_mtu = 1500; Why is this limited to 1500 in the AF_XDP PMD? For native AF_XDP in OVS the limit is mainly by the driver/XDP page limit: min(PAGE_SIZE, ETH_AF_XDP_FRAME_SIZE ) - ETH_XDP_DATA_HEADROOM - XDP_PACKET_HEADROOM dev_info->default_rxportconf.nb_queues = 1; dev_info->default_txportconf.nb_queues = 1; @@ -1328,6 +1328,8 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev) return -1; } + eth_dev_mtu_set(eth_dev, 1500); + rte_eth_dev_probing_finish(eth_dev); return 0; -- 2.17.1
Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset
On 1 Nov 2019, at 3:38, Xing, Beilei wrote: -Original Message- From: Eelco Chaudron [mailto:echau...@redhat.com] Sent: Tuesday, October 29, 2019 3:47 PM To: Zhang, Xiao ; Xing, Beilei Cc: Zhang, Qi Z ; dev@dpdk.org Subject: Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset On 29 Oct 2019, at 8:39, Zhang, Xiao wrote: -Original Message- From: Eelco Chaudron Sent: Friday, October 25, 2019 5:24 PM To: Xing, Beilei ; Zhang, Xiao Cc: Zhang, Qi Z ; dev@dpdk.org Subject: Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset On 17 Oct 2019, at 8:34, Xing, Beilei wrote: On 17 Sep 2019, at 9:40, Eelco Chaudron wrote: Even though the device reset is successful, disabling promiscuous mode might not always succeed, causing enabling it after reset to fail. This would happen when the kernel driver requires a reset of the VF. This patch resets the internal state, so next time promiscuous mode is configured it will be enabled. Signed-off-by: Eelco Chaudron --- drivers/net/i40e/i40e_ethdev_vf.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 551f6fa..e0f99a4 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -2276,11 +2276,21 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) i40evf_dev_reset(struct rte_eth_dev *dev) { int ret; + struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); ret = i40evf_dev_uninit(dev); if (ret) return ret; + /* +* Even though the device reset is successful disabling promiscuous +* mode might not always succeed, causing enabling it after reset to I think we need to root cause why fail to disable promiscuous mode and try to fix it first. I’ve copied in Xiao who helped me identify the issue in your driver. The issue is because the change from kernel pf was not synced to DPDK vf during the closing period of reset, so we get this failure. Xiao can you add more details? The root cause is when kernel pf generate DPDK vf reset event, the flag vf->promisc_unicast_enabled will not be cleared but promiscuous mode is disabled. When trying to enable promiscuous mode next time by calling i40evf_dev_promiscuous_enable won't work because it will check the vf->promisc_unicast_enabled flag first. static int i40evf_dev_promiscuous_enable(struct rte_eth_dev *dev) { ... /* If enabled, just return */ if (vf->promisc_unicast_enabled) return 0; ... } Hi Eelco, I think you may need add more detailed message in the commit log or comments. My interpretation of the request was that Beilei wanted to know why disabling promiscuous mode in HW was failing. Beilei can you comment, is the additional description from Xiao enough? Yes, promisc_unicast_enabled flag is not cleared during vf reset because fail to disable promiscuous mode, So I think we need to root cause why fail to disable promiscuous mode first. This patch looks like a workaround but not a fix. This was debugged together with Xiao and from what I understand is that DPDK fails to reset promiscuous mode in hardware as PF and VF operations are not synced between kernel and DPDK. Xiao told me this could not be fixed in another way, Xiao can you comment? +* fail. This would happen when the kernel driver requires a reset +* of the VF. +*/ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + vf->promisc_unicast_enabled = FALSE; + ret = i40evf_dev_init(dev); return ret; -- 1.8.3.1
Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset
On 29 Oct 2019, at 8:39, Zhang, Xiao wrote: -Original Message- From: Eelco Chaudron Sent: Friday, October 25, 2019 5:24 PM To: Xing, Beilei ; Zhang, Xiao Cc: Zhang, Qi Z ; dev@dpdk.org Subject: Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset On 17 Oct 2019, at 8:34, Xing, Beilei wrote: On 17 Sep 2019, at 9:40, Eelco Chaudron wrote: Even though the device reset is successful, disabling promiscuous mode might not always succeed, causing enabling it after reset to fail. This would happen when the kernel driver requires a reset of the VF. This patch resets the internal state, so next time promiscuous mode is configured it will be enabled. Signed-off-by: Eelco Chaudron --- drivers/net/i40e/i40e_ethdev_vf.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 551f6fa..e0f99a4 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -2276,11 +2276,21 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) i40evf_dev_reset(struct rte_eth_dev *dev) { int ret; + struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); ret = i40evf_dev_uninit(dev); if (ret) return ret; + /* +* Even though the device reset is successful disabling promiscuous + * mode might not always succeed, causing enabling it after reset to I think we need to root cause why fail to disable promiscuous mode and try to fix it first. I’ve copied in Xiao who helped me identify the issue in your driver. The issue is because the change from kernel pf was not synced to DPDK vf during the closing period of reset, so we get this failure. Xiao can you add more details? The root cause is when kernel pf generate DPDK vf reset event, the flag vf->promisc_unicast_enabled will not be cleared but promiscuous mode is disabled. When trying to enable promiscuous mode next time by calling i40evf_dev_promiscuous_enable won't work because it will check the vf->promisc_unicast_enabled flag first. static int i40evf_dev_promiscuous_enable(struct rte_eth_dev *dev) { ... /* If enabled, just return */ if (vf->promisc_unicast_enabled) return 0; ... } Hi Eelco, I think you may need add more detailed message in the commit log or comments. My interpretation of the request was that Beilei wanted to know why disabling promiscuous mode in HW was failing. Beilei can you comment, is the additional description from Xiao enough? + * fail. This would happen when the kernel driver requires a reset +* of the VF. +*/ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + vf->promisc_unicast_enabled = FALSE; + ret = i40evf_dev_init(dev); return ret; -- 1.8.3.1
Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset
On 17 Oct 2019, at 8:34, Xing, Beilei wrote: On 17 Sep 2019, at 9:40, Eelco Chaudron wrote: Even though the device reset is successful, disabling promiscuous mode might not always succeed, causing enabling it after reset to fail. This would happen when the kernel driver requires a reset of the VF. This patch resets the internal state, so next time promiscuous mode is configured it will be enabled. Signed-off-by: Eelco Chaudron --- drivers/net/i40e/i40e_ethdev_vf.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 551f6fa..e0f99a4 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -2276,11 +2276,21 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) i40evf_dev_reset(struct rte_eth_dev *dev) { int ret; + struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); ret = i40evf_dev_uninit(dev); if (ret) return ret; + /* + * Even though the device reset is successful disabling promiscuous + * mode might not always succeed, causing enabling it after reset to I think we need to root cause why fail to disable promiscuous mode and try to fix it first. I’ve copied in Xiao who helped me identify the issue in your driver. The issue is because the change from kernel pf was not synced to DPDK vf during the closing period of reset, so we get this failure. Xiao can you add more details? +* fail. This would happen when the kernel driver requires a reset +* of the VF. +*/ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + vf->promisc_unicast_enabled = FALSE; + ret = i40evf_dev_init(dev); return ret; -- 1.8.3.1
Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset
Any update on this patch? Thanks, Eelco On 17 Sep 2019, at 9:40, Eelco Chaudron wrote: Even though the device reset is successful, disabling promiscuous mode might not always succeed, causing enabling it after reset to fail. This would happen when the kernel driver requires a reset of the VF. This patch resets the internal state, so next time promiscuous mode is configured it will be enabled. Signed-off-by: Eelco Chaudron --- drivers/net/i40e/i40e_ethdev_vf.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 551f6fa..e0f99a4 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -2276,11 +2276,21 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) i40evf_dev_reset(struct rte_eth_dev *dev) { int ret; + struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); ret = i40evf_dev_uninit(dev); if (ret) return ret; + /* +* Even though the device reset is successful disabling promiscuous +* mode might not always succeed, causing enabling it after reset to +* fail. This would happen when the kernel driver requires a reset +* of the VF. +*/ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + vf->promisc_unicast_enabled = FALSE; + ret = i40evf_dev_init(dev); return ret; -- 1.8.3.1
[dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset
Even though the device reset is successful, disabling promiscuous mode might not always succeed, causing enabling it after reset to fail. This would happen when the kernel driver requires a reset of the VF. This patch resets the internal state, so next time promiscuous mode is configured it will be enabled. Signed-off-by: Eelco Chaudron --- drivers/net/i40e/i40e_ethdev_vf.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 551f6fa..e0f99a4 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -2276,11 +2276,21 @@ static int eth_i40evf_pci_remove(struct rte_pci_device *pci_dev) i40evf_dev_reset(struct rte_eth_dev *dev) { int ret; + struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); ret = i40evf_dev_uninit(dev); if (ret) return ret; + /* +* Even though the device reset is successful disabling promiscuous +* mode might not always succeed, causing enabling it after reset to +* fail. This would happen when the kernel driver requires a reset +* of the VF. +*/ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + vf->promisc_unicast_enabled = FALSE; + ret = i40evf_dev_init(dev); return ret; -- 1.8.3.1
Re: [dpdk-dev] [PATCH] net/i40e: downgrade unnecessary error log
Forgot to include the stable alias to get this in 18.11, sorry Kevin //Eelco On 12 Sep 2019, at 14:40, Ye Xiaolong wrote: On 09/12, David Marchand wrote: On Thu, Sep 12, 2019 at 12:38 PM Eelco Chaudron wrote: When receiving the unsupported AQ messages, it's taken as an error. It's not appropriate and triggers too much unnecessary print. This commit is similar to commit e130425300b0 ("net/i40e: downgrade unnecessary error log") which made the same change for the PF instance. Signed-off-by: Eelco Chaudron --- drivers/net/i40e/i40e_ethdev_vf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index c77b30c54..7b1d485c6 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -1411,7 +1411,7 @@ i40evf_handle_aq_msg(struct rte_eth_dev *dev) } break; default: - PMD_DRV_LOG(ERR, "Request %u is not supported yet", + PMD_DRV_LOG(DEBUG, "Request %u is not supported yet", aq_opc); break; } -- 2.18.1 Reviewed-by: David Marchand -- David Marchand Reviewed-by: Xiaolong Ye Applied to dpdk-next-net-intel.
Re: [dpdk-dev] [PATCH] net/i40e: downgrade unnecessary error log
On 12 Sep 2019, at 14:40, Ye Xiaolong wrote: On 09/12, David Marchand wrote: On Thu, Sep 12, 2019 at 12:38 PM Eelco Chaudron wrote: When receiving the unsupported AQ messages, it's taken as an error. It's not appropriate and triggers too much unnecessary print. This commit is similar to commit e130425300b0 ("net/i40e: downgrade unnecessary error log") which made the same change for the PF instance. Signed-off-by: Eelco Chaudron --- drivers/net/i40e/i40e_ethdev_vf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index c77b30c54..7b1d485c6 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -1411,7 +1411,7 @@ i40evf_handle_aq_msg(struct rte_eth_dev *dev) } break; default: - PMD_DRV_LOG(ERR, "Request %u is not supported yet", + PMD_DRV_LOG(DEBUG, "Request %u is not supported yet", aq_opc); break; } -- 2.18.1 Reviewed-by: David Marchand -- David Marchand Reviewed-by: Xiaolong Ye Applied to dpdk-next-net-intel. Thanks David/Ye
[dpdk-dev] [PATCH] net/i40e: downgrade unnecessary error log
When receiving the unsupported AQ messages, it's taken as an error. It's not appropriate and triggers too much unnecessary print. This commit is similar to commit e130425300b0 ("net/i40e: downgrade unnecessary error log") which made the same change for the PF instance. Signed-off-by: Eelco Chaudron --- drivers/net/i40e/i40e_ethdev_vf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index c77b30c54..7b1d485c6 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -1411,7 +1411,7 @@ i40evf_handle_aq_msg(struct rte_eth_dev *dev) } break; default: - PMD_DRV_LOG(ERR, "Request %u is not supported yet", + PMD_DRV_LOG(DEBUG, "Request %u is not supported yet", aq_opc); break; } -- 2.18.1
[dpdk-dev] [PATCH] vhost: add device opr when notification to guest is sent
This patch adds an operation callback which gets called every time the library is waking up the guest trough an eventfd_write() call. This can be used by 3rd party application, like OVS, to track the number of times interrupts where generated. This might be of interest to find out system-call were called in the fast path. Signed-off-by: Eelco Chaudron --- lib/librte_vhost/rte_vhost.h | 10 +- lib/librte_vhost/vhost.h | 15 --- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h index 7fb1729..878e339 100644 --- a/lib/librte_vhost/rte_vhost.h +++ b/lib/librte_vhost/rte_vhost.h @@ -172,7 +172,15 @@ struct vhost_device_ops { int (*new_connection)(int vid); void (*destroy_connection)(int vid); - void *reserved[2]; /**< Reserved for future extension */ + /** +* This callback gets called each time a guest gets notified +* about waiting packets. This is the interrupt handling trough +* the eventfd_write(callfd), which can be used for counting these +* "slow" syscalls. +*/ + void (*guest_notified)(int vid); + + void *reserved[1]; /**< Reserved for future extension */ }; /** diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 884befa..5131a97 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -543,13 +543,19 @@ void *vhost_alloc_copy_ind_table(struct virtio_net *dev, if ((vhost_need_event(vhost_used_event(vq), new, old) && (vq->callfd >= 0)) || - unlikely(!signalled_used_valid)) + unlikely(!signalled_used_valid)) { eventfd_write(vq->callfd, (eventfd_t) 1); + if (dev->notify_ops->guest_notified) + dev->notify_ops->guest_notified(dev->vid); + } } else { /* Kick the guest if necessary. */ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && (vq->callfd >= 0)) + && (vq->callfd >= 0)) { eventfd_write(vq->callfd, (eventfd_t)1); + if (dev->notify_ops->guest_notified) + dev->notify_ops->guest_notified(dev->vid); + } } } @@ -600,8 +606,11 @@ void *vhost_alloc_copy_ind_table(struct virtio_net *dev, if (vhost_need_event(off, new, old)) kick = true; kick: - if (kick) + if (kick) { eventfd_write(vq->callfd, (eventfd_t)1); + if (dev->notify_ops->guest_notified) + dev->notify_ops->guest_notified(dev->vid); + } } static __rte_always_inline void -- 1.8.3.1
Re: [dpdk-dev] i40e driver with VFs and their changing MAC
Intel, any feedback on the below? Thanks, Eelco On 24 Jun 2019, at 17:39, Eelco Chaudron wrote: Hi, I’m running into an issue where I have a VF used by OVS-DPDK and from the kernel side change the MAC. I used the following command to change the mac: ip link set p5p2 vf 0 mac 52:54:00:92:d3:30 In the kernel log I see the following: [ 3420.075736] i40e :05:00.1: Bring down and up the VF interface to make this change effective. This seems to bring down the VF, which seems to make the DPDK side go crazy: 2019-06-24T15:16:55.052Z|1|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|2|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|3|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|4|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|5|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|6|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|7|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|8|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|9|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00010|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00011|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00012|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00013|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00014|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00015|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00016|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00017|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00018|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00019|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00020|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00021|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00022|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00023|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00024|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00025|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00026|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00027|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00028|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00029|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00030|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00031|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00032|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00033|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00034|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00035|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00036|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00037|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00038|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00039|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00040|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00041|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00042|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00043|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00044|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00045|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00046|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00047|dpdk|ERR
[dpdk-dev] i40e driver with VFs and their changing MAC
Hi, I’m running into an issue where I have a VF used by OVS-DPDK and from the kernel side change the MAC. I used the following command to change the mac: ip link set p5p2 vf 0 mac 52:54:00:92:d3:30 In the kernel log I see the following: [ 3420.075736] i40e :05:00.1: Bring down and up the VF interface to make this change effective. This seems to bring down the VF, which seems to make the DPDK side go crazy: 2019-06-24T15:16:55.052Z|1|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|2|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|3|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|4|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|5|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|6|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|7|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|8|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|9|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00010|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00011|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00012|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00013|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00014|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00015|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00016|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00017|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00018|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00019|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00020|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00021|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00022|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00023|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00024|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00025|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00026|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00027|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00028|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00029|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00030|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00031|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00032|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00033|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00034|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00035|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00036|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00037|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00038|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00039|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00040|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00041|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00042|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00043|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00044|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00045|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00046|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00047|dpdk|ERR|i40evf_handle_aq_msg(): Request 0 is not supported yet 2019-06-24T15:16:55.052Z|00048|dpdk|ERR|i40evf_handle_aq_msg(
Re: [dpdk-dev] [dpdk-stable] [PATCH 2/2] net/ice: fix dropped packets statistics name
Looks good to me… Acked-by: Eelco Chaudron On 3 Jun 2019, at 10:31, David Marchand wrote: > Copy/paste from i40e, let's align with the fix on i40e. > > Fixes: a37bde56314d ("net/ice: support statistics") > Cc: sta...@dpdk.org > > Signed-off-by: David Marchand > --- > drivers/net/ice/ice_ethdev.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c > index bdbceb4..6cd1c74 100644 > --- a/drivers/net/ice/ice_ethdev.c > +++ b/drivers/net/ice/ice_ethdev.c > @@ -153,13 +153,13 @@ struct ice_xstats_name_off { > {"rx_unicast_packets", offsetof(struct ice_eth_stats, rx_unicast)}, > {"rx_multicast_packets", offsetof(struct ice_eth_stats, rx_multicast)}, > {"rx_broadcast_packets", offsetof(struct ice_eth_stats, rx_broadcast)}, > - {"rx_dropped", offsetof(struct ice_eth_stats, rx_discards)}, > + {"rx_dropped_packets", offsetof(struct ice_eth_stats, rx_discards)}, > {"rx_unknown_protocol_packets", offsetof(struct ice_eth_stats, > rx_unknown_protocol)}, > {"tx_unicast_packets", offsetof(struct ice_eth_stats, tx_unicast)}, > {"tx_multicast_packets", offsetof(struct ice_eth_stats, tx_multicast)}, > {"tx_broadcast_packets", offsetof(struct ice_eth_stats, tx_broadcast)}, > - {"tx_dropped", offsetof(struct ice_eth_stats, tx_discards)}, > + {"tx_dropped_packets", offsetof(struct ice_eth_stats, tx_discards)}, > }; > > #define ICE_NB_ETH_XSTATS (sizeof(ice_stats_strings) / \ > -- > 1.8.3.1
Re: [dpdk-dev] [dpdk-stable] [PATCH 1/2] net/i40e: fix dropped packets statistics name
Looks good to me… Acked-by: Eelco Chaudron On 3 Jun 2019, at 10:31, David Marchand wrote: i40e and i40evf currently use two different names for the statistic on dropped packets on the rx and tx sides. Let's prefer i40evf so that all statistics are suffixed with _packets. This also avoids a statistic name conflict in OVS. Fixes: f4a91c38b4ad ("i40e: add extended stats") Cc: sta...@dpdk.org Signed-off-by: David Marchand --- drivers/net/i40e/i40e_ethdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c index 7fa9e1b..2384d4d 100644 --- a/drivers/net/i40e/i40e_ethdev.c +++ b/drivers/net/i40e/i40e_ethdev.c @@ -526,13 +526,13 @@ struct rte_i40e_xstats_name_off { {"rx_unicast_packets", offsetof(struct i40e_eth_stats, rx_unicast)}, {"rx_multicast_packets", offsetof(struct i40e_eth_stats, rx_multicast)}, {"rx_broadcast_packets", offsetof(struct i40e_eth_stats, rx_broadcast)}, - {"rx_dropped", offsetof(struct i40e_eth_stats, rx_discards)}, + {"rx_dropped_packets", offsetof(struct i40e_eth_stats, rx_discards)}, {"rx_unknown_protocol_packets", offsetof(struct i40e_eth_stats, rx_unknown_protocol)}, {"tx_unicast_packets", offsetof(struct i40e_eth_stats, tx_unicast)}, {"tx_multicast_packets", offsetof(struct i40e_eth_stats, tx_multicast)}, {"tx_broadcast_packets", offsetof(struct i40e_eth_stats, tx_broadcast)}, - {"tx_dropped", offsetof(struct i40e_eth_stats, tx_discards)}, + {"tx_dropped_packets", offsetof(struct i40e_eth_stats, tx_discards)}, }; #define I40E_NB_ETH_XSTATS (sizeof(rte_i40e_stats_strings) / \ -- 1.8.3.1
[dpdk-dev] [PATCH v3] lib/librte_meter: fix divide by zero for RFC4115 meter
RFC 4115 allows a meter with either cir and/or eir configured. When only one is configured a divide by zero would occur. Fixes: 655796d2b5fb ("meter: support RFC4115 trTCM") Cc: echau...@redhat.com Signed-off-by: Eelco Chaudron --- v3 - Rather than using a 0 check, set up profile data such that no check at runtime is needed. v2 - Removed configuration change that got included by accident lib/librte_meter/rte_meter.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c index e55f9be65..45679444e 100644 --- a/lib/librte_meter/rte_meter.c +++ b/lib/librte_meter/rte_meter.c @@ -19,7 +19,15 @@ static void rte_meter_get_tb_params(uint64_t hz, uint64_t rate, uint64_t *tb_period, uint64_t *tb_bytes_per_period) { - double period = ((double) hz) / ((double) rate); + double period; + + if (rate == 0) { + *tb_bytes_per_period = 0; + *tb_period = RTE_METER_TB_PERIOD_MIN; + return; + } + + period = ((double) hz) / ((double) rate); if (period >= RTE_METER_TB_PERIOD_MIN) { *tb_bytes_per_period = 1;
Re: [dpdk-dev] [PATCH v2] lib/librte_meter: fix divide by zero for RFC4115 meter
On 28 Feb 2019, at 19:50, Dumitrescu, Cristian wrote: Hi Eelco, Sorry for my delayed reply No problem I was OOO also so hence my late reply ;) -Original Message- From: Eelco Chaudron [mailto:echau...@redhat.com] Sent: Friday, February 1, 2019 4:07 PM To: Dumitrescu, Cristian Cc: dev@dpdk.org Subject: [PATCH v2] lib/librte_meter: fix divide by zero for RFC4115 meter RFC 4115 allows a meter with either cir and/or eir configured. When only one is configured a divide by zero would occur. Fixes: 655796d2b5fb ("meter: support RFC4115 trTCM") Cc: echau...@redhat.com Signed-off-by: Eelco Chaudron --- v2 - Removed configuration change that got included by accident lib/librte_meter/rte_meter.h |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h index 005e4..56d85ecf0 100644 --- a/lib/librte_meter/rte_meter.h +++ b/lib/librte_meter/rte_meter.h @@ -597,8 +597,8 @@ rte_meter_trtcm_rfc4115_color_blind_check( /* Bucket update */ time_diff_tc = time - m->time_tc; time_diff_te = time - m->time_te; - n_periods_tc = time_diff_tc / p->cir_period; - n_periods_te = time_diff_te / p->eir_period; + n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 0; + n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 0; m->time_tc += n_periods_tc * p->cir_period; m->time_te += n_periods_te * p->eir_period; @@ -641,8 +641,8 @@ rte_meter_trtcm_rfc4115_color_aware_check( /* Bucket update */ time_diff_tc = time - m->time_tc; time_diff_te = time - m->time_te; - n_periods_tc = time_diff_tc / p->cir_period; - n_periods_te = time_diff_te / p->eir_period; + n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 0; + n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 0; m->time_tc += n_periods_tc * p->cir_period; m->time_te += n_periods_te * p->eir_period; Yes, this is indeed an issue, good catch! For performance reasons, we'd like to avoid a test on the fast path (rte_meter_xyz_check) and replace it with more work done on the configuration stage (rte_meter_profile_xyz_config), if possible. We can intercept the null CIR or EIR cases very early in rte_meter_trtcm_rfc4115_profile_config() and deal with them separately by setting the CIR/EIR period and bytes_per_period to some neutral values and skipping the call to rte_meter_get_tb_params(): period = RTE_METER_TB_PERIOD_MIN, bytes_per_period = 0. Excellent idea, will sent out a v3 soon, after some testing… //Eelco
[dpdk-dev] [PATCH v2] lib/librte_meter: fix divide by zero for RFC4115 meter
RFC 4115 allows a meter with either cir and/or eir configured. When only one is configured a divide by zero would occur. Fixes: 655796d2b5fb ("meter: support RFC4115 trTCM") Cc: echau...@redhat.com Signed-off-by: Eelco Chaudron --- v2 - Removed configuration change that got included by accident lib/librte_meter/rte_meter.h |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h index 005e4..56d85ecf0 100644 --- a/lib/librte_meter/rte_meter.h +++ b/lib/librte_meter/rte_meter.h @@ -597,8 +597,8 @@ rte_meter_trtcm_rfc4115_color_blind_check( /* Bucket update */ time_diff_tc = time - m->time_tc; time_diff_te = time - m->time_te; - n_periods_tc = time_diff_tc / p->cir_period; - n_periods_te = time_diff_te / p->eir_period; + n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 0; + n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 0; m->time_tc += n_periods_tc * p->cir_period; m->time_te += n_periods_te * p->eir_period; @@ -641,8 +641,8 @@ rte_meter_trtcm_rfc4115_color_aware_check( /* Bucket update */ time_diff_tc = time - m->time_tc; time_diff_te = time - m->time_te; - n_periods_tc = time_diff_tc / p->cir_period; - n_periods_te = time_diff_te / p->eir_period; + n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 0; + n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 0; m->time_tc += n_periods_tc * p->cir_period; m->time_te += n_periods_te * p->eir_period;
[dpdk-dev] [PATCH] lib/librte_meter: fix divide by zero for RFC4115 meter
RFC 4115 allows a meter with either cir and/or eir configured. When only one is configured a divide by zero would occur. Fixes: 655796d2b5fb ("meter: support RFC4115 trTCM") Cc: echau...@redhat.com Signed-off-by: Eelco Chaudron --- config/common_linuxapp |4 ++-- lib/librte_meter/rte_meter.h |8 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config/common_linuxapp b/config/common_linuxapp index 6c1c8d0f4..186323993 100644 --- a/config/common_linuxapp +++ b/config/common_linuxapp @@ -7,9 +7,9 @@ CONFIG_RTE_EXEC_ENV="linuxapp" CONFIG_RTE_EXEC_ENV_LINUXAPP=y CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y -CONFIG_RTE_EAL_IGB_UIO=y +CONFIG_RTE_EAL_IGB_UIO=n CONFIG_RTE_EAL_VFIO=y -CONFIG_RTE_KNI_KMOD=y +CONFIG_RTE_KNI_KMOD=n CONFIG_RTE_LIBRTE_KNI=y CONFIG_RTE_LIBRTE_PMD_KNI=y CONFIG_RTE_LIBRTE_VHOST=y diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h index 005e4..56d85ecf0 100644 --- a/lib/librte_meter/rte_meter.h +++ b/lib/librte_meter/rte_meter.h @@ -597,8 +597,8 @@ rte_meter_trtcm_rfc4115_color_blind_check( /* Bucket update */ time_diff_tc = time - m->time_tc; time_diff_te = time - m->time_te; - n_periods_tc = time_diff_tc / p->cir_period; - n_periods_te = time_diff_te / p->eir_period; + n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 0; + n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 0; m->time_tc += n_periods_tc * p->cir_period; m->time_te += n_periods_te * p->eir_period; @@ -641,8 +641,8 @@ rte_meter_trtcm_rfc4115_color_aware_check( /* Bucket update */ time_diff_tc = time - m->time_tc; time_diff_te = time - m->time_te; - n_periods_tc = time_diff_tc / p->cir_period; - n_periods_te = time_diff_te / p->eir_period; + n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 0; + n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 0; m->time_tc += n_periods_tc * p->cir_period; m->time_te += n_periods_te * p->eir_period;
Re: [dpdk-dev] VF of a X520 card does not process VLAN packets
Hi Konstantin, Thanks for your reply… Yes adding the VLAN to the VF from the kernel side will work, however, this is a different behaviour than the XL710. The XL710 will allow all VLANs to pass through once the PMD takes control of the VF. The real use case is using this VF interface trough Open vSwitch which needs to see all packets regardless of the VLAN. It does not make sense to create 4K VLANs through the kernel to get this working. Looking forward to your reply… Cheers, Eelco On 18 Jan 2019, at 12:25, Ananyev, Konstantin wrote: Hi It’s been while since I looked at it last time, but shouldn’t you assign desired vlan tag to the VF first: ip link set vf vlan ? Konstantin From: Eelco Chaudron [mailto:echau...@redhat.com] Sent: Thursday, January 3, 2019 4:42 PM To: Lu, Wenzhuo ; Ananyev, Konstantin Cc: dev Subject: Re: [dpdk-dev] VF of a X520 card does not process VLAN packets Hi maintainers, any feedback on the below? Thanks, Eelco On 18 Dec 2018, at 12:06, Eelco Chaudron wrote: Hi, I’m assigning a VF of an X520 card for DPDK/testpmd/OVS but it's not accepting tagged VLAN packets (it does accept tag 0). Is this a known bug/limitation of the 82599ES chipset? This is how I've tested it: $ echo 1 > /sys/bus/pci/devices/:05:00.0/sriov_numvfs $driverctl -v set-override :05:10.0 vfio-pci $./testpmd -c 7 -n 4 --socket-mem 2048,0 -w :05:10.0 -- \ --burst 64 -i --rxq=1 --txq=1 --rxd=4096 --txd=1024 \ —coremask=6 --auto-start --port-topology=chained EAL: Detected 28 lcore(s) EAL: Detected 1 NUMA nodes EAL: Multi-process socket /var/run/dpdk/rte/mp_socket ... testpmd> I’m sending broadcast ARP packets: Ethernet II, Src: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00), Dst: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff) Destination: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff) Address: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff) ..1. = LG bit: Locally administered address (this is NOT the factory default) ...1 = IG bit: Group address (multicast/broadcast) Source: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00) Address: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00) ..0. = LG bit: Globally unique address (factory default) ...0 = IG bit: Individual address (unicast) Type: 802.1Q Virtual LAN (0x8100) 802.1Q Virtual LAN, PRI: 0, CFI: 0, ID: 10 000. = Priority: Best Effort (default) (0) ...0 = CFI: Canonical (0) 1010 = ID: 10 Type: ARP (0x0806) Padding: 2e2f303132333435363738393a3b Trailer: 3c3d3e3f404142434445464748494a4b4c4d4e4f50515253... Address Resolution Protocol (request/gratuitous ARP) Hardware type: Ethernet (1) Protocol type: IP (0x0800) Hardware size: 6 Protocol size: 4 Opcode: request (1) [Is gratuitous: True] Sender MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00) Sender IP address: 0.0.0.0 (0.0.0.0) Target MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00) Target IP address: 0.0.0.0 (0.0.0.0) No traffic is received, if you replace tag 10 with 0, packets are received. Note that this is a simple reproducer with testpmd, but it's reported as part of an OVS integration. This works fine on the physical port, or with a VF on an XL710 card. Any input appreciated. Thanks, Eelco
Re: [dpdk-dev] [RFC v1 00/11] scapy/python extension
Hi Xueming, I was wondering if this patch went anywhere, could not find anything in the list archive? I like the idea, as it’s useful for quick unit testing, without relying on Trex or trafgen. Cheers, Eelco quick guide document: https://github.com/steevenlee/dpdk/blob/master_scapy/doc/guides/howto/scapy.rst github branch: https://github.com/steevenlee/dpdk/tree/master_scapy Xueming Li (11): lib/cmdline: support backspace key lib/cmdline: init parse result memeory lib/cmdline: add echo support in batch loading from file app/testpmd: support command echo in CLI batch loading test: update batch loading test lib/python: add embedded python lib app/testpmd: add python command app/testpmd: add pktgen forwarding engine app/testpmd: add pktgen engine scapy commands test/expect: add expect test scripts doc/scapy: add scapy how-to guide app/test-pmd/Makefile|6 + app/test-pmd/cmdline.c | 80 ++- app/test-pmd/pktgen.c| 1092 ++ app/test-pmd/testpmd.c |1 + app/test-pmd/testpmd.h |5 + config/common_base |6 + doc/guides/howto/scapy.rst | 300 lib/Makefile |2 + lib/librte_cmdline/cmdline_parse.c |2 + lib/librte_cmdline/cmdline_rdline.c |1 + lib/librte_cmdline/cmdline_socket.c |5 +- lib/librte_cmdline/cmdline_socket.h |3 +- lib/librte_cmdline/cmdline_vt100.c |1 + lib/librte_cmdline/cmdline_vt100.h |1 + lib/librte_eal/common/include/rte_log.h |1 + lib/librte_python/Makefile | 60 ++ lib/librte_python/rte_python.c | 387 +++ lib/librte_python/rte_python.h | 71 ++ lib/librte_python/rte_python_version.map | 12 + mk/rte.app.mk|1 + test/expect/init.exp | 28 + test/expect/rx.exp | 134 test/test/test_cmdline_lib.c | 10 +- 23 files changed, 2199 insertions(+), 10 deletions(-) create mode 100644 app/test-pmd/pktgen.c create mode 100644 doc/guides/howto/scapy.rst create mode 100644 lib/librte_python/Makefile create mode 100644 lib/librte_python/rte_python.c create mode 100644 lib/librte_python/rte_python.h create mode 100644 lib/librte_python/rte_python_version.map create mode 100644 test/expect/init.exp create mode 100644 test/expect/rx.exp -- 2.13.3
Re: [dpdk-dev] [PATCH v4 0/2] lib/librte_meter: add RFC4115 trTCM meter support
Thanks Thomas! On 10 Jan 2019, at 0:22, Thomas Monjalon wrote: > 07/01/2019 16:58, Dumitrescu, Cristian: >> From: Eelco Chaudron [mailto:echau...@redhat.com] >>> >>> This patch adds support for RFC4115 trTCM meters. >>> >>> >>> Signed-off-by: Eelco Chaudron >>> >>> v4: >>> - Fixed typo >>> - Cleaned up code to be aligned with RFC diagram >>> >> >> Series Acked-by: Cristian Dumitrescu >> >> Applied to next-qos tree, thanks! > > #include "rte_compat.h" is missing. > Fixing in-tree
Re: [dpdk-dev] [PATCH v4 1/2] lib/librte_meter: add RFC4115 trTCM meter support
On 4 Jan 2019, at 20:42, Stephen Hemminger wrote: On Fri, 4 Jan 2019 13:59:42 + Eelco Chaudron wrote: This patch adds support for RFC4115 trTCM meters. Signed-off-by: Eelco Chaudron Fix comment formatting. Hi Stephen, I left these warnings in on purpose, to match the existing comment style in the file. These specific warnings are from a structure copied from one right above which also has the same style and line lengths. As they are warning I though I should leave it as is, Christian what are your thoughts on this? ### [dpdk-dev] [PATCH v4 1/2] lib/librte_meter: add RFC4115 trTCM meter support WARNING:BLOCK_COMMENT_STYLE: Block comments use * on subsequent lines #172: FILE: lib/librte_meter/rte_meter.h:59: +/** trTCM parameters per metered traffic flow. The CIR, EIR, CBS and EBS +parameters only count bytes of IP packets and do not include link specific WARNING:BLOCK_COMMENT_STYLE: Block comments use a trailing */ on a separate line #174: FILE: lib/librte_meter/rte_meter.h:61: +none-zero respectively.*/ WARNING:LONG_LINE_COMMENT: line over 80 characters #176: FILE: lib/librte_meter/rte_meter.h:63: + uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ WARNING:LONG_LINE_COMMENT: line over 80 characters #177: FILE: lib/librte_meter/rte_meter.h:64: + uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes per second. */ WARNING:LONG_LINE_COMMENT: line over 80 characters #355: FILE: lib/librte_meter/rte_meter.h:406: + /**< Number of bytes currently available in the excess(E) token bucket */ total: 0 errors, 5 warnings, 356 lines checked
Re: [dpdk-dev] [PATCH v3 1/2] lib/librte_meter: add RFC4115 trTCM meter support
On 21 Dec 2018, at 17:15, Dumitrescu, Cristian wrote: Hi Eelco, +/** trTCM parameters per metered traffic flow. The CIR, EIT, CBS and EBS Small typo here: EIT to be replaced by EIR. +parameters only count bytes of IP packets and do not include link specific +headers. The CBS and EBS need to be greater than zero if CIR and EIR are +none-zero respectively.*/ +struct rte_meter_trtcm_rfc4115_params { + uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ + uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes per second. */ + uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */ + uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */ +}; + +static inline enum rte_meter_color __rte_experimental +rte_meter_trtcm_rfc4115_color_blind_check( + struct rte_meter_trtcm_rfc4115 *m, + struct rte_meter_trtcm_rfc4115_profile *p, + uint64_t time, + uint32_t pkt_len) +{ + uint64_t time_diff_tc, time_diff_te, n_periods_tc, n_periods_te, tc, te; + + /* Bucket update */ + time_diff_tc = time - m->time_tc; + time_diff_te = time - m->time_te; + n_periods_tc = time_diff_tc / p->cir_period; + n_periods_te = time_diff_te / p->eir_period; + m->time_tc += n_periods_tc * p->cir_period; + m->time_te += n_periods_te * p->eir_period; + + tc = m->tc + n_periods_tc * p->cir_bytes_per_period; + if (tc > p->cbs) + tc = p->cbs; + + te = m->te + n_periods_te * p->eir_bytes_per_period; + if (te > p->ebs) + te = p->ebs; + + /* Color logic */ + if (tc >= pkt_len) { + m->tc = tc - pkt_len; + m->te = te; + return e_RTE_METER_GREEN; + } else if (te >= pkt_len) { + m->tc = tc; + m->te = te - pkt_len; + return e_RTE_METER_YELLOW; + } + + /* If we end up here the color is RED */ + m->tc = tc; + m->te = te; + return e_RTE_METER_RED; +} + Since the branch (tc >= pkt_len) == TRUE always returns, I suggest we remove the following "else", as it is redundant: /* Color logic */ if (tc >= pkt_len) { m->tc = tc - pkt_len; m->te = te; return e_RTE_METER_GREEN; } if (te >= pkt_len) { m->tc = tc; m->te = te - pkt_len; return e_RTE_METER_YELLOW; } /* If we end up here the color is RED */ m->tc = tc; m->te = te; return e_RTE_METER_RED; +static inline enum rte_meter_color __rte_experimental +rte_meter_trtcm_rfc4115_color_aware_check( + struct rte_meter_trtcm_rfc4115 *m, + struct rte_meter_trtcm_rfc4115_profile *p, + uint64_t time, + uint32_t pkt_len, + enum rte_meter_color pkt_color) +{ + uint64_t time_diff_tc, time_diff_te, n_periods_tc, n_periods_te, tc, te; + + /* Bucket update */ + time_diff_tc = time - m->time_tc; + time_diff_te = time - m->time_te; + n_periods_tc = time_diff_tc / p->cir_period; + n_periods_te = time_diff_te / p->eir_period; + m->time_tc += n_periods_tc * p->cir_period; + m->time_te += n_periods_te * p->eir_period; + + tc = m->tc + n_periods_tc * p->cir_bytes_per_period; + if (tc > p->cbs) + tc = p->cbs; + + te = m->te + n_periods_te * p->eir_bytes_per_period; + if (te > p->ebs) + te = p->ebs; + + /* Color logic */ + if (pkt_color == e_RTE_METER_GREEN) { + if (tc >= pkt_len) { + m->tc = tc - pkt_len; + m->te = te; + return e_RTE_METER_GREEN; + } else if (te >= pkt_len) { + m->tc = tc; + m->te = te - pkt_len; + return e_RTE_METER_YELLOW; + } + } else if (pkt_color == e_RTE_METER_YELLOW && te >= pkt_len) { + m->tc = tc; + m->te = te - pkt_len; + return e_RTE_METER_YELLOW; + } + + /* If we end up here the color is RED */ + m->tc = tc; + m->te = te; + return e_RTE_METER_RED; +} + + I suggest we follow the logic from the diagram in the RFC rather than the logic in the text preceding the diagram. Although the two descriptions are equivalent (after a bit of thinking), the diagram seems more optimal to me: /* Color logic */ if ((pkt_color == e_RTE_METER_GREEN) && (tc >= pkt_len)) { m->tc = tc - pkt_len; m->te = te; return e_RTE_METER_GREEN; } if ((pkt_color != e_RTE_METER_RED) && (te >= pkt_len)) { m->tc = tc; m->te = te - pkt_len; return e_RTE_METER_YELLOW; } /* If we end up here
[dpdk-dev] [PATCH v4 2/2] test/test_meter: update meter test to include RFC4115 meters
Add test cases for RFC4115 meters Signed-off-by: Eelco Chaudron --- test/test/test_meter.c | 212 1 file changed, 212 insertions(+) diff --git a/test/test/test_meter.c b/test/test/test_meter.c index 8bb47e75c..f935faa53 100644 --- a/test/test/test_meter.c +++ b/test/test/test_meter.c @@ -32,8 +32,10 @@ #define TM_TEST_TRTCM_CIR_DF 4600 #define TM_TEST_TRTCM_PIR_DF 6900 +#define TM_TEST_TRTCM_EIR_DF 6900 #define TM_TEST_TRTCM_CBS_DF 2048 #define TM_TEST_TRTCM_PBS_DF 4096 +#define TM_TEST_TRTCM_EBS_DF 4096 static struct rte_meter_srtcm_params sparams = {.cir = TM_TEST_SRTCM_CIR_DF, @@ -46,6 +48,12 @@ static structrte_meter_trtcm_params tparams= .cbs = TM_TEST_TRTCM_CBS_DF, .pbs = TM_TEST_TRTCM_PBS_DF,}; +static struct rte_meter_trtcm_rfc4115_params rfc4115params = + {.cir = TM_TEST_TRTCM_CIR_DF, +.eir = TM_TEST_TRTCM_EIR_DF, +.cbs = TM_TEST_TRTCM_CBS_DF, +.ebs = TM_TEST_TRTCM_EBS_DF,}; + /** * functional test for rte_meter_srtcm_config */ @@ -148,6 +156,45 @@ tm_test_trtcm_config(void) return 0; } +/** + * functional test for rte_meter_trtcm_rfc4115_config + */ +static inline int +tm_test_trtcm_rfc4115_config(void) +{ + struct rte_meter_trtcm_rfc4115_profile tp; + struct rte_meter_trtcm_rfc4115_params rfc4115params1; +#define TRTCM_RFC4115_CFG_MSG "trtcm_rfc4115_config" + + /* invalid parameter test */ + if (rte_meter_trtcm_rfc4115_profile_config(NULL, NULL) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + if (rte_meter_trtcm_rfc4115_profile_config(&tp, NULL) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + if (rte_meter_trtcm_rfc4115_profile_config(NULL, &rfc4115params) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + /* +* cbs and pbs should be none-zero if cir and eir are none-zero +* respectively +*/ + rfc4115params1 = rfc4115params; + rfc4115params1.cbs = 0; + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params1) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + rfc4115params1 = rfc4115params; + rfc4115params1.ebs = 0; + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params1) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + /* usual parameter, should be successful */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_CFG_MSG); + + return 0; +} + /** * functional test for rte_meter_srtcm_color_blind_check */ @@ -265,6 +312,65 @@ tm_test_trtcm_color_blind_check(void) return 0; } +/** + * functional test for rte_meter_trtcm_rfc4115_color_blind_check + */ +static inline int +tm_test_trtcm_rfc4115_color_blind_check(void) +{ +#define TRTCM_RFC4115_BLIND_CHECK_MSG "trtcm_rfc4115_blind_check" + + uint64_t time; + struct rte_meter_trtcm_rfc4115_profile tp; + struct rte_meter_trtcm_rfc4115 tm; + uint64_t hz = rte_get_tsc_hz(); + + /* Test green */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_CBS_DF - 1) + != e_RTE_METER_GREEN) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" GREEN"); + + /* Test yellow */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_CBS_DF + 1) + != e_RTE_METER_YELLOW) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW"); + + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_EBS_DF - 1) + != e_RTE_METER_YELLOW) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW"); + + /*
[dpdk-dev] [PATCH v4 0/2] lib/librte_meter: add RFC4115 trTCM meter support
This patch adds support for RFC4115 trTCM meters. Signed-off-by: Eelco Chaudron v4: - Fixed typo - Cleaned up code to be aligned with RFC diagram v3: - Gave the rfc4115 meter it's own data structures v2: - Marked all functions with __rte_experimental, and added "EXPERIMENTAL:..." to doxygen comments. - Removed library version change, and merged it with first patch - Do not call rte_meter_trtcm_rfc4115_color_aware_check in rte_meter_trtcm_rfc4115_color_blind_check to avoid error with it being marked as experimental Eelco Chaudron (2): lib/librte_meter: add RFC4115 trTCM meter support test/test_meter: update meter test to include RFC4115 meters lib/librte_meter/rte_meter.c | 42 ++ lib/librte_meter/rte_meter.h | 233 lib/librte_meter/rte_meter_version.map |9 + test/test/test_meter.c | 212 + 4 files changed, 494 insertions(+), 2 deletions(-)
[dpdk-dev] [PATCH v4 1/2] lib/librte_meter: add RFC4115 trTCM meter support
This patch adds support for RFC4115 trTCM meters. Signed-off-by: Eelco Chaudron --- lib/librte_meter/rte_meter.c | 42 ++ lib/librte_meter/rte_meter.h | 233 lib/librte_meter/rte_meter_version.map |9 + 3 files changed, 282 insertions(+), 2 deletions(-) diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c index 473f69aba..e55f9be65 100644 --- a/lib/librte_meter/rte_meter.c +++ b/lib/librte_meter/rte_meter.c @@ -110,3 +110,45 @@ rte_meter_trtcm_config(struct rte_meter_trtcm *m, return 0; } + +int __rte_experimental +rte_meter_trtcm_rfc4115_profile_config( + struct rte_meter_trtcm_rfc4115_profile *p, + struct rte_meter_trtcm_rfc4115_params *params) +{ + uint64_t hz = rte_get_tsc_hz(); + + /* Check input parameters */ + if ((p == NULL) || + (params == NULL) || + (params->cir != 0 && params->cbs == 0) || + (params->eir != 0 && params->ebs == 0)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + p->cbs = params->cbs; + p->ebs = params->ebs; + rte_meter_get_tb_params(hz, params->cir, &p->cir_period, + &p->cir_bytes_per_period); + rte_meter_get_tb_params(hz, params->eir, &p->eir_period, + &p->eir_bytes_per_period); + + return 0; +} + +int __rte_experimental +rte_meter_trtcm_rfc4115_config( + struct rte_meter_trtcm_rfc4115 *m, + struct rte_meter_trtcm_rfc4115_profile *p) +{ + /* Check input parameters */ + if ((m == NULL) || (p == NULL)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + m->time_tc = m->time_te = rte_get_tsc_cycles(); + m->tc = p->cbs; + m->te = p->ebs; + + return 0; +} diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h index 58a051583..c7eee896a 100644 --- a/lib/librte_meter/rte_meter.h +++ b/lib/librte_meter/rte_meter.h @@ -1,3 +1,4 @@ + /* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2010-2014 Intel Corporation */ @@ -16,6 +17,7 @@ extern "C" { * Traffic metering algorithms: *1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 2697 *2. Two Rate Three Color Marker (trTCM): defined by IETF RFC 2698 + *3. Two Rate Three Color Marker (trTCM): defined by IETF RFC 4115 * ***/ @@ -49,10 +51,21 @@ be greater than or equal to CIR. Both CBS or EBS have to be greater than zero. * struct rte_meter_trtcm_params { uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per second. */ - uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */ + uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */ uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */ }; +/** trTCM parameters per metered traffic flow. The CIR, EIR, CBS and EBS +parameters only count bytes of IP packets and do not include link specific +headers. The CBS and EBS need to be greater than zero if CIR and EIR are +none-zero respectively.*/ +struct rte_meter_trtcm_rfc4115_params { + uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ + uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes per second. */ + uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */ + uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */ +}; + /** * Internal data structure storing the srTCM configuration profile. Typically * shared by multiple srTCM objects. @@ -65,12 +78,24 @@ struct rte_meter_srtcm_profile; */ struct rte_meter_trtcm_profile; +/** + * Internal data structure storing the trTCM RFC4115 configuration profile. + * Typically shared by multiple trTCM objects. + */ +struct rte_meter_trtcm_rfc4115_profile; + /** Internal data structure storing the srTCM run-time context per metered traffic flow. */ struct rte_meter_srtcm; /** Internal data structure storing the trTCM run-time context per metered traffic flow. */ struct rte_meter_trtcm; +/** + * Internal data structure storing the trTCM RFC4115 run-time context per + * metered traffic flow. + */ +struct rte_meter_trtcm_rfc4115; + /** * srTCM profile configuration * @@ -98,6 +123,23 @@ rte_meter_srtcm_profile_config(struct rte_meter_srtcm_profile *p, int rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p, struct rte_meter_trtcm_params *params); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * trTCM RFC 4115 profile configuration + * + * @param p + *Pointer to pre-allocated trTCM profile data structure + * @
Re: [dpdk-dev] VF of a X520 card does not process VLAN packets
Hi maintainers, any feedback on the below? Thanks, Eelco On 18 Dec 2018, at 12:06, Eelco Chaudron wrote: Hi, I’m assigning a VF of an X520 card for DPDK/testpmd/OVS but it's not accepting tagged VLAN packets (it does accept tag 0). Is this a known bug/limitation of the 82599ES chipset? This is how I've tested it: $ echo 1 > /sys/bus/pci/devices/\:05\:00.0/sriov_numvfs $driverctl -v set-override :05:10.0 vfio-pci $./testpmd -c 7 -n 4 --socket-mem 2048,0 -w :05:10.0 -- \ --burst 64 -i --rxq=1 --txq=1 --rxd=4096 --txd=1024 \ —coremask=6 --auto-start --port-topology=chained EAL: Detected 28 lcore(s) EAL: Detected 1 NUMA nodes EAL: Multi-process socket /var/run/dpdk/rte/mp_socket ... testpmd> I’m sending broadcast ARP packets: Ethernet II, Src: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00), Dst: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff) Destination: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff) Address: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff) ..1. = LG bit: Locally administered address (this is NOT the factory default) ...1 = IG bit: Group address (multicast/broadcast) Source: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00) Address: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00) ..0. = LG bit: Globally unique address (factory default) ...0 = IG bit: Individual address (unicast) Type: 802.1Q Virtual LAN (0x8100) 802.1Q Virtual LAN, PRI: 0, CFI: 0, ID: 10 000. = Priority: Best Effort (default) (0) ...0 = CFI: Canonical (0) 1010 = ID: 10 Type: ARP (0x0806) Padding: 2e2f303132333435363738393a3b Trailer: 3c3d3e3f404142434445464748494a4b4c4d4e4f50515253... Address Resolution Protocol (request/gratuitous ARP) Hardware type: Ethernet (1) Protocol type: IP (0x0800) Hardware size: 6 Protocol size: 4 Opcode: request (1) [Is gratuitous: True] Sender MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00) Sender IP address: 0.0.0.0 (0.0.0.0) Target MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00) Target IP address: 0.0.0.0 (0.0.0.0) No traffic is received, if you replace tag 10 with 0, packets are received. Note that this is a simple reproducer with testpmd, but it's reported as part of an OVS integration. This works fine on the physical port, or with a VF on an XL710 card. Any input appreciated. Thanks, Eelco
[dpdk-dev] [PATCH v3 2/2] test/test_meter: update meter test to include RFC4115 meters
Add test cases for RFC4115 meters Signed-off-by: Eelco Chaudron --- test/test/test_meter.c | 212 1 file changed, 212 insertions(+) diff --git a/test/test/test_meter.c b/test/test/test_meter.c index 8bb47e75c..f935faa53 100644 --- a/test/test/test_meter.c +++ b/test/test/test_meter.c @@ -32,8 +32,10 @@ #define TM_TEST_TRTCM_CIR_DF 4600 #define TM_TEST_TRTCM_PIR_DF 6900 +#define TM_TEST_TRTCM_EIR_DF 6900 #define TM_TEST_TRTCM_CBS_DF 2048 #define TM_TEST_TRTCM_PBS_DF 4096 +#define TM_TEST_TRTCM_EBS_DF 4096 static struct rte_meter_srtcm_params sparams = {.cir = TM_TEST_SRTCM_CIR_DF, @@ -46,6 +48,12 @@ static structrte_meter_trtcm_params tparams= .cbs = TM_TEST_TRTCM_CBS_DF, .pbs = TM_TEST_TRTCM_PBS_DF,}; +static struct rte_meter_trtcm_rfc4115_params rfc4115params = + {.cir = TM_TEST_TRTCM_CIR_DF, +.eir = TM_TEST_TRTCM_EIR_DF, +.cbs = TM_TEST_TRTCM_CBS_DF, +.ebs = TM_TEST_TRTCM_EBS_DF,}; + /** * functional test for rte_meter_srtcm_config */ @@ -148,6 +156,45 @@ tm_test_trtcm_config(void) return 0; } +/** + * functional test for rte_meter_trtcm_rfc4115_config + */ +static inline int +tm_test_trtcm_rfc4115_config(void) +{ + struct rte_meter_trtcm_rfc4115_profile tp; + struct rte_meter_trtcm_rfc4115_params rfc4115params1; +#define TRTCM_RFC4115_CFG_MSG "trtcm_rfc4115_config" + + /* invalid parameter test */ + if (rte_meter_trtcm_rfc4115_profile_config(NULL, NULL) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + if (rte_meter_trtcm_rfc4115_profile_config(&tp, NULL) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + if (rte_meter_trtcm_rfc4115_profile_config(NULL, &rfc4115params) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + /* +* cbs and pbs should be none-zero if cir and eir are none-zero +* respectively +*/ + rfc4115params1 = rfc4115params; + rfc4115params1.cbs = 0; + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params1) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + rfc4115params1 = rfc4115params; + rfc4115params1.ebs = 0; + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params1) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + /* usual parameter, should be successful */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_CFG_MSG); + + return 0; +} + /** * functional test for rte_meter_srtcm_color_blind_check */ @@ -265,6 +312,65 @@ tm_test_trtcm_color_blind_check(void) return 0; } +/** + * functional test for rte_meter_trtcm_rfc4115_color_blind_check + */ +static inline int +tm_test_trtcm_rfc4115_color_blind_check(void) +{ +#define TRTCM_RFC4115_BLIND_CHECK_MSG "trtcm_rfc4115_blind_check" + + uint64_t time; + struct rte_meter_trtcm_rfc4115_profile tp; + struct rte_meter_trtcm_rfc4115 tm; + uint64_t hz = rte_get_tsc_hz(); + + /* Test green */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_CBS_DF - 1) + != e_RTE_METER_GREEN) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" GREEN"); + + /* Test yellow */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_CBS_DF + 1) + != e_RTE_METER_YELLOW) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW"); + + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_EBS_DF - 1) + != e_RTE_METER_YELLOW) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW"); + + /*
[dpdk-dev] [PATCH v3 1/2] lib/librte_meter: add RFC4115 trTCM meter support
This patch adds support for RFC4115 trTCM meters. Signed-off-by: Eelco Chaudron --- lib/librte_meter/rte_meter.c | 42 ++ lib/librte_meter/rte_meter.h | 235 lib/librte_meter/rte_meter_version.map |9 + 3 files changed, 284 insertions(+), 2 deletions(-) diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c index 473f69aba..e55f9be65 100644 --- a/lib/librte_meter/rte_meter.c +++ b/lib/librte_meter/rte_meter.c @@ -110,3 +110,45 @@ rte_meter_trtcm_config(struct rte_meter_trtcm *m, return 0; } + +int __rte_experimental +rte_meter_trtcm_rfc4115_profile_config( + struct rte_meter_trtcm_rfc4115_profile *p, + struct rte_meter_trtcm_rfc4115_params *params) +{ + uint64_t hz = rte_get_tsc_hz(); + + /* Check input parameters */ + if ((p == NULL) || + (params == NULL) || + (params->cir != 0 && params->cbs == 0) || + (params->eir != 0 && params->ebs == 0)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + p->cbs = params->cbs; + p->ebs = params->ebs; + rte_meter_get_tb_params(hz, params->cir, &p->cir_period, + &p->cir_bytes_per_period); + rte_meter_get_tb_params(hz, params->eir, &p->eir_period, + &p->eir_bytes_per_period); + + return 0; +} + +int __rte_experimental +rte_meter_trtcm_rfc4115_config( + struct rte_meter_trtcm_rfc4115 *m, + struct rte_meter_trtcm_rfc4115_profile *p) +{ + /* Check input parameters */ + if ((m == NULL) || (p == NULL)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + m->time_tc = m->time_te = rte_get_tsc_cycles(); + m->tc = p->cbs; + m->te = p->ebs; + + return 0; +} diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h index 58a051583..e39919da0 100644 --- a/lib/librte_meter/rte_meter.h +++ b/lib/librte_meter/rte_meter.h @@ -16,6 +16,7 @@ extern "C" { * Traffic metering algorithms: *1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 2697 *2. Two Rate Three Color Marker (trTCM): defined by IETF RFC 2698 + *3. Two Rate Three Color Marker (trTCM): defined by IETF RFC 4115 * ***/ @@ -49,10 +50,21 @@ be greater than or equal to CIR. Both CBS or EBS have to be greater than zero. * struct rte_meter_trtcm_params { uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per second. */ - uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */ + uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */ uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */ }; +/** trTCM parameters per metered traffic flow. The CIR, EIT, CBS and EBS +parameters only count bytes of IP packets and do not include link specific +headers. The CBS and EBS need to be greater than zero if CIR and EIR are +none-zero respectively.*/ +struct rte_meter_trtcm_rfc4115_params { + uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ + uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes per second. */ + uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */ + uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */ +}; + /** * Internal data structure storing the srTCM configuration profile. Typically * shared by multiple srTCM objects. @@ -65,12 +77,24 @@ struct rte_meter_srtcm_profile; */ struct rte_meter_trtcm_profile; +/** + * Internal data structure storing the trTCM RFC4115 configuration profile. + * Typically shared by multiple trTCM objects. + */ +struct rte_meter_trtcm_rfc4115_profile; + /** Internal data structure storing the srTCM run-time context per metered traffic flow. */ struct rte_meter_srtcm; /** Internal data structure storing the trTCM run-time context per metered traffic flow. */ struct rte_meter_trtcm; +/** + * Internal data structure storing the trTCM RFC4115 run-time context per + * metered traffic flow. + */ +struct rte_meter_trtcm_rfc4115; + /** * srTCM profile configuration * @@ -98,6 +122,23 @@ rte_meter_srtcm_profile_config(struct rte_meter_srtcm_profile *p, int rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p, struct rte_meter_trtcm_params *params); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * trTCM RFC 4115 profile configuration + * + * @param p + *Pointer to pre-allocated trTCM profile data structure + * @param params + *trTCM profile parameters + * @return + *0 upon success, error code otherwise + */ +
[dpdk-dev] [PATCH v3 0/2] lib/librte_meter: add RFC4115 trTCM meter support
This patch adds support for RFC4115 trTCM meters. Signed-off-by: Eelco Chaudron v3: - Gave the rfc4115 meter it's own data structures v2: - Marked all functions with __rte_experimental, and added "EXPERIMENTAL:..." to doxygen comments. - Removed library version change, and merged it with first patch - Do not call rte_meter_trtcm_rfc4115_color_aware_check in rte_meter_trtcm_rfc4115_color_blind_check to avoid error with it being marked as experimental Eelco Chaudron (2): lib/librte_meter: add RFC4115 trTCM meter support test/test_meter: update meter test to include RFC4115 meters lib/librte_meter/rte_meter.c | 42 ++ lib/librte_meter/rte_meter.h | 235 lib/librte_meter/rte_meter_version.map |9 + test/test/test_meter.c | 212 + 4 files changed, 496 insertions(+), 2 deletions(-)
[dpdk-dev] VF of a X520 card does not process VLAN packets
Hi, I’m assigning a VF of an X520 card for DPDK/testpmd/OVS but it's not accepting tagged VLAN packets (it does accept tag 0). Is this a known bug/limitation of the 82599ES chipset? This is how I've tested it: $ echo 1 > /sys/bus/pci/devices/\:05\:00.0/sriov_numvfs $driverctl -v set-override :05:10.0 vfio-pci $./testpmd -c 7 -n 4 --socket-mem 2048,0 -w :05:10.0 -- \ --burst 64 -i --rxq=1 --txq=1 --rxd=4096 --txd=1024 \ —coremask=6 --auto-start --port-topology=chained EAL: Detected 28 lcore(s) EAL: Detected 1 NUMA nodes EAL: Multi-process socket /var/run/dpdk/rte/mp_socket ... testpmd> I’m sending broadcast ARP packets: Ethernet II, Src: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00), Dst: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff) Destination: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff) Address: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff) ..1. = LG bit: Locally administered address (this is NOT the factory default) ...1 = IG bit: Group address (multicast/broadcast) Source: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00) Address: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00) ..0. = LG bit: Globally unique address (factory default) ...0 = IG bit: Individual address (unicast) Type: 802.1Q Virtual LAN (0x8100) 802.1Q Virtual LAN, PRI: 0, CFI: 0, ID: 10 000. = Priority: Best Effort (default) (0) ...0 = CFI: Canonical (0) 1010 = ID: 10 Type: ARP (0x0806) Padding: 2e2f303132333435363738393a3b Trailer: 3c3d3e3f404142434445464748494a4b4c4d4e4f50515253... Address Resolution Protocol (request/gratuitous ARP) Hardware type: Ethernet (1) Protocol type: IP (0x0800) Hardware size: 6 Protocol size: 4 Opcode: request (1) [Is gratuitous: True] Sender MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00) Sender IP address: 0.0.0.0 (0.0.0.0) Target MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00) Target IP address: 0.0.0.0 (0.0.0.0) No traffic is received, if you replace tag 10 with 0, packets are received. Note that this is a simple reproducer with testpmd, but it's reported as part of an OVS integration. This works fine on the physical port, or with a VF on an XL710 card. Any input appreciated. Thanks, Eelco
Re: [dpdk-dev] [PATCH v2 1/2] lib/librte_meter: add RFC4115 trTCM meter support
On 17 Dec 2018, at 12:23, Dumitrescu, Cristian wrote: Hi Eelco, -Original Message- From: Eelco Chaudron [mailto:echau...@redhat.com] Sent: Thursday, November 29, 2018 11:29 AM To: Dumitrescu, Cristian Cc: dev@dpdk.org Subject: [PATCH v2 1/2] lib/librte_meter: add RFC4115 trTCM meter support This patch adds support for RFC4115 trTCM meters. Signed-off-by: Eelco Chaudron --- lib/librte_meter/rte_meter.c | 40 + lib/librte_meter/rte_meter.h | 236 ++-- lib/librte_meter/rte_meter_version.map |9 + 3 files changed, 269 insertions(+), 16 deletions(-) diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c index 473f69aba..817680dbb 100644 --- a/lib/librte_meter/rte_meter.c +++ b/lib/librte_meter/rte_meter.c @@ -110,3 +110,43 @@ rte_meter_trtcm_config(struct rte_meter_trtcm *m, return 0; } + +int __rte_experimental +rte_meter_trtcm_rfc4115_profile_config(struct rte_meter_trtcm_profile *p, + struct rte_meter_trtcm_params *params) +{ + uint64_t hz = rte_get_tsc_hz(); + + /* Check input parameters */ + if ((p == NULL) || + (params == NULL) || + (params->cir != 0 && params->cbs == 0) || + (params->eir != 0 && params->ebs == 0)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + p->cbs = params->cbs; + p->ebs = params->ebs; + rte_meter_get_tb_params(hz, params->cir, &p->cir_period, + &p->cir_bytes_per_period); + rte_meter_get_tb_params(hz, params->eir, &p->eir_period, + &p->eir_bytes_per_period); + + return 0; +} + +int __rte_experimental +rte_meter_trtcm_rfc4115_config(struct rte_meter_trtcm *m, + struct rte_meter_trtcm_profile *p) +{ + /* Check input parameters */ + if ((m == NULL) || (p == NULL)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + m->time_tc = m->time_te = rte_get_tsc_cycles(); + m->tc = p->cbs; + m->te = p->ebs; + + return 0; +} diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h index 58a051583..c3f88d0cb 100644 --- a/lib/librte_meter/rte_meter.h +++ b/lib/librte_meter/rte_meter.h @@ -16,6 +16,7 @@ extern "C" { * Traffic metering algorithms: *1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 2697 *2. Two Rate Three Color Marker (trTCM): defined by IETF RFC 2698 + *3. Two Rate Three Color Marker (trTCM): defined by IETF RFC 4115 * ***/ @@ -43,14 +44,25 @@ struct rte_meter_srtcm_params { uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */ }; -/** trTCM parameters per metered traffic flow. The CIR, PIR, CBS and PBS parameters -only count bytes of IP packets and do not include link specific headers. PIR has to -be greater than or equal to CIR. Both CBS or EBS have to be greater than zero. */ +/** trTCM parameters per metered traffic flow. The CIR, PIR/EIT, CBS and PBS/EBS +parameters only count bytes of IP packets and do not include link specific +headers. + +- For RFC2698 operations PIR has to be greater than or equal to CIR. Both CBS + or EBS have to be greater than zero. +- For RFC4115 operations CBS and EBS need to be greater than zero if CIR and + EIR are none-zero respectively.*/ struct rte_meter_trtcm_params { uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ - uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per second. */ - uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */ - uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */ + union { + uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per second. */ + uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes per second. */ + }; + uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */ + union { + uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */ + uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */ + }; }; In order to prevent complicating the data structures and/or creating false dependencies between algs, please create a separate data structure struct rte_meter_trtcm_rfc4115_params. This is also inline with the naming conventions from librte_ethdev/rte_mtr.h Thanks for the feedback, as this was the thing I wanted to clarify… I’ll try to send a V2 later this week (or next year ;). /** @@ -98,6 +110,22 @@ rte_meter_srtcm_profile_config(struct rte_meter_srtcm_profile *p, int rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p, struct rte_meter_trtcm_params *p
[dpdk-dev] [PATCH v2 1/2] lib/librte_meter: add RFC4115 trTCM meter support
This patch adds support for RFC4115 trTCM meters. Signed-off-by: Eelco Chaudron --- lib/librte_meter/rte_meter.c | 40 + lib/librte_meter/rte_meter.h | 236 ++-- lib/librte_meter/rte_meter_version.map |9 + 3 files changed, 269 insertions(+), 16 deletions(-) diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c index 473f69aba..817680dbb 100644 --- a/lib/librte_meter/rte_meter.c +++ b/lib/librte_meter/rte_meter.c @@ -110,3 +110,43 @@ rte_meter_trtcm_config(struct rte_meter_trtcm *m, return 0; } + +int __rte_experimental +rte_meter_trtcm_rfc4115_profile_config(struct rte_meter_trtcm_profile *p, + struct rte_meter_trtcm_params *params) +{ + uint64_t hz = rte_get_tsc_hz(); + + /* Check input parameters */ + if ((p == NULL) || + (params == NULL) || + (params->cir != 0 && params->cbs == 0) || + (params->eir != 0 && params->ebs == 0)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + p->cbs = params->cbs; + p->ebs = params->ebs; + rte_meter_get_tb_params(hz, params->cir, &p->cir_period, + &p->cir_bytes_per_period); + rte_meter_get_tb_params(hz, params->eir, &p->eir_period, + &p->eir_bytes_per_period); + + return 0; +} + +int __rte_experimental +rte_meter_trtcm_rfc4115_config(struct rte_meter_trtcm *m, + struct rte_meter_trtcm_profile *p) +{ + /* Check input parameters */ + if ((m == NULL) || (p == NULL)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + m->time_tc = m->time_te = rte_get_tsc_cycles(); + m->tc = p->cbs; + m->te = p->ebs; + + return 0; +} diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h index 58a051583..c3f88d0cb 100644 --- a/lib/librte_meter/rte_meter.h +++ b/lib/librte_meter/rte_meter.h @@ -16,6 +16,7 @@ extern "C" { * Traffic metering algorithms: *1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 2697 *2. Two Rate Three Color Marker (trTCM): defined by IETF RFC 2698 + *3. Two Rate Three Color Marker (trTCM): defined by IETF RFC 4115 * ***/ @@ -43,14 +44,25 @@ struct rte_meter_srtcm_params { uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */ }; -/** trTCM parameters per metered traffic flow. The CIR, PIR, CBS and PBS parameters -only count bytes of IP packets and do not include link specific headers. PIR has to -be greater than or equal to CIR. Both CBS or EBS have to be greater than zero. */ +/** trTCM parameters per metered traffic flow. The CIR, PIR/EIT, CBS and PBS/EBS +parameters only count bytes of IP packets and do not include link specific +headers. + +- For RFC2698 operations PIR has to be greater than or equal to CIR. Both CBS + or EBS have to be greater than zero. +- For RFC4115 operations CBS and EBS need to be greater than zero if CIR and + EIR are none-zero respectively.*/ struct rte_meter_trtcm_params { uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ - uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per second. */ - uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */ - uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */ + union { + uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per second. */ + uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes per second. */ + }; + uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */ + union { + uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */ + uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */ + }; }; /** @@ -98,6 +110,22 @@ rte_meter_srtcm_profile_config(struct rte_meter_srtcm_profile *p, int rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p, struct rte_meter_trtcm_params *params); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * trTCM RFC 4115 profile configuration + * + * @param p + *Pointer to pre-allocated trTCM profile data structure + * @param params + *trTCM profile parameters + * @return + *0 upon success, error code otherwise + */ +int __rte_experimental +rte_meter_trtcm_rfc4115_profile_config(struct rte_meter_trtcm_profile *p, + struct rte_meter_trtcm_params *params); /** * srTCM configuration per metered traffic flow @@ -127,6 +155,23 @@ int rte_meter_trtcm_config(struct rte_meter_trtcm *m, struct rte_meter_trtcm_profile *p); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice
[dpdk-dev] [PATCH v2 2/2] test/test_meter: update meter test to include RFC4115 meters
Add test cases for RFC4115 meters Signed-off-by: Eelco Chaudron --- test/test/test_meter.c | 212 1 file changed, 212 insertions(+) diff --git a/test/test/test_meter.c b/test/test/test_meter.c index 8bb47e75c..097da0143 100644 --- a/test/test/test_meter.c +++ b/test/test/test_meter.c @@ -32,8 +32,10 @@ #define TM_TEST_TRTCM_CIR_DF 4600 #define TM_TEST_TRTCM_PIR_DF 6900 +#define TM_TEST_TRTCM_EIR_DF 6900 #define TM_TEST_TRTCM_CBS_DF 2048 #define TM_TEST_TRTCM_PBS_DF 4096 +#define TM_TEST_TRTCM_EBS_DF 4096 static struct rte_meter_srtcm_params sparams = {.cir = TM_TEST_SRTCM_CIR_DF, @@ -46,6 +48,12 @@ static structrte_meter_trtcm_params tparams= .cbs = TM_TEST_TRTCM_CBS_DF, .pbs = TM_TEST_TRTCM_PBS_DF,}; +static struct rte_meter_trtcm_params rfc4115params = + {.cir = TM_TEST_TRTCM_CIR_DF, +.eir = TM_TEST_TRTCM_EIR_DF, +.cbs = TM_TEST_TRTCM_CBS_DF, +.ebs = TM_TEST_TRTCM_EBS_DF,}; + /** * functional test for rte_meter_srtcm_config */ @@ -148,6 +156,45 @@ tm_test_trtcm_config(void) return 0; } +/** + * functional test for rte_meter_trtcm_rfc4115_config + */ +static inline int +tm_test_trtcm_rfc4115_config(void) +{ + struct rte_meter_trtcm_profile tp; + struct rte_meter_trtcm_params rfc4115params1; +#define TRTCM_RFC4115_CFG_MSG "trtcm_rfc4115_config" + + /* invalid parameter test */ + if (rte_meter_trtcm_rfc4115_profile_config(NULL, NULL) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + if (rte_meter_trtcm_rfc4115_profile_config(&tp, NULL) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + if (rte_meter_trtcm_rfc4115_profile_config(NULL, &rfc4115params) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + /* +* cbs and pbs should be none-zero if cir and eir are none-zero +* respectively +*/ + rfc4115params1 = rfc4115params; + rfc4115params1.cbs = 0; + if (rte_meter_trtcm_profile_config(&tp, &rfc4115params1) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + rfc4115params1 = rfc4115params; + rfc4115params1.pbs = 0; + if (rte_meter_trtcm_profile_config(&tp, &rfc4115params1) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + /* usual parameter, should be successful */ + if (rte_meter_trtcm_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_CFG_MSG); + + return 0; +} + /** * functional test for rte_meter_srtcm_color_blind_check */ @@ -265,6 +312,65 @@ tm_test_trtcm_color_blind_check(void) return 0; } +/** + * functional test for rte_meter_trtcm_rfc4115_color_blind_check + */ +static inline int +tm_test_trtcm_rfc4115_color_blind_check(void) +{ +#define TRTCM_RFC4115_BLIND_CHECK_MSG "trtcm_rfc4115_blind_check" + + uint64_t time; + struct rte_meter_trtcm_profile tp; + struct rte_meter_trtcm tm; + uint64_t hz = rte_get_tsc_hz(); + + /* Test green */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_CBS_DF - 1) + != e_RTE_METER_GREEN) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" GREEN"); + + /* Test yellow */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_CBS_DF + 1) + != e_RTE_METER_YELLOW) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW"); + + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_EBS_DF - 1) + != e_RTE_METER_YELLOW) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW"); + + /* Test red */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp
[dpdk-dev] [PATCH v2 0/2] lib/librte_meter: add RFC4115 trTCM meter support
This patch adds support for RFC4115 trTCM meters. Signed-off-by: Eelco Chaudron v2: - Marked all functions with __rte_experimental, and added "EXPERIMENTAL:..." to doxygen comments. - Removed library version change, and merged it with first patch - Do not call rte_meter_trtcm_rfc4115_color_aware_check in rte_meter_trtcm_rfc4115_color_blind_check to avoid error with it being marked as experimental Eelco Chaudron (2): lib/librte_meter: add RFC4115 trTCM meter support test/test_meter: update meter test to include RFC4115 meters lib/librte_meter/rte_meter.c | 40 + lib/librte_meter/rte_meter.h | 236 ++-- lib/librte_meter/rte_meter_version.map |9 + test/test/test_meter.c | 212 + 4 files changed, 481 insertions(+), 16 deletions(-)
Re: [dpdk-dev] [PATCH 3/3] lib/librte_meter: update abi to include new rfc4115 function
On 28 Nov 2018, at 13:51, Thomas Monjalon wrote: > 28/11/2018 13:40, Eelco Chaudron: >> >> On 28 Nov 2018, at 11:09, Thomas Monjalon wrote: >> >>> 28/11/2018 10:27, Eelco Chaudron: >>>> On 28 Nov 2018, at 9:38, David Marchand wrote: >>>>> On Tue, Nov 27, 2018 at 4:22 PM Eelco Chaudron >>>>> wrote: >>>>>> --- a/lib/librte_meter/Makefile >>>>>> +++ b/lib/librte_meter/Makefile >>>>>> -LIBABIVER := 2 >>>>>> +LIBABIVER := 3 >>>>> >>>>> As far as I understood the policy around the EXPERIMENTAL section, >>>>> you >>>>> don't need to bump the library version. >>>> >>>> Thought I would add the new function as none experimental, i.e. next >>>> version, but checkpatch did not allow me to do this. >>>> >>>> Tried to find info on what the right process was, as these functions >>>> are >>>> just another meter implementation using similar existing APIs. If >>>> anyone >>>> has any background on this please point me to it. >>> >>> It is documented here: >>> http://doc.dpdk.org/guides/contributing/versioning.html >>> >>> The case for "similar API" is not handled specifically so far. >>> So you need to introduce it as experimental. >> >> Thanks for the clarification, will update the APIs with >> __rte_experimental in the next iteration. > > You should also add this on top of the doxygen comment: > > * @warning > * @b EXPERIMENTAL: this API may change without prior notice Thanks, done!
Re: [dpdk-dev] [PATCH 3/3] lib/librte_meter: update abi to include new rfc4115 function
On 28 Nov 2018, at 11:09, Thomas Monjalon wrote: 28/11/2018 10:27, Eelco Chaudron: On 28 Nov 2018, at 9:38, David Marchand wrote: On Tue, Nov 27, 2018 at 4:22 PM Eelco Chaudron wrote: --- a/lib/librte_meter/Makefile +++ b/lib/librte_meter/Makefile -LIBABIVER := 2 +LIBABIVER := 3 As far as I understood the policy around the EXPERIMENTAL section, you don't need to bump the library version. Thought I would add the new function as none experimental, i.e. next version, but checkpatch did not allow me to do this. Tried to find info on what the right process was, as these functions are just another meter implementation using similar existing APIs. If anyone has any background on this please point me to it. It is documented here: http://doc.dpdk.org/guides/contributing/versioning.html The case for "similar API" is not handled specifically so far. So you need to introduce it as experimental. Thanks for the clarification, will update the APIs with __rte_experimental in the next iteration. I changed the library version as an existing data structure changed (which in theory should not change the location of the data), but the ABI check popped warnings so I decided to increase the version. It deserves to analyze why the ABI check raises a warning. If it really needs to bump the ABI version, you should justify it in the commit message, and explain what changed in the ABI section of the release notes, plus update the version in the release notes. Will look at it more closely and update it for the next iteration.
Re: [dpdk-dev] [PATCH 3/3] lib/librte_meter: update abi to include new rfc4115 function
On 28 Nov 2018, at 9:38, David Marchand wrote: Hello Eelco, On Tue, Nov 27, 2018 at 4:22 PM Eelco Chaudron wrote: Update the ABI to include the new RFC4115 meter functions --- lib/librte_meter/Makefile |2 +- lib/librte_meter/meson.build |2 +- lib/librte_meter/rte_meter_version.map |9 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/librte_meter/Makefile b/lib/librte_meter/Makefile index 2dc071e8e..79ad79744 100644 --- a/lib/librte_meter/Makefile +++ b/lib/librte_meter/Makefile @@ -16,7 +16,7 @@ LDLIBS += -lrte_eal EXPORT_MAP := rte_meter_version.map -LIBABIVER := 2 +LIBABIVER := 3 # # all source are stored in SRCS-y As far as I understood the policy around the EXPERIMENTAL section, you don't need to bump the library version. Thought I would add the new function as none experimental, i.e. next version, but checkpatch did not allow me to do this. Tried to find info on what the right process was, as these functions are just another meter implementation using similar existing APIs. If anyone has any background on this please point me to it. I changed the library version as an existing data structure changed (which in theory should not change the location of the data), but the ABI check popped warnings so I decided to increase the version. + you should squash this into the first patch. I’ll do this on the next version. Thanks, Eelco
[dpdk-dev] [PATCH 3/3] lib/librte_meter: update abi to include new rfc4115 function
Update the ABI to include the new RFC4115 meter functions --- lib/librte_meter/Makefile |2 +- lib/librte_meter/meson.build |2 +- lib/librte_meter/rte_meter_version.map |9 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/librte_meter/Makefile b/lib/librte_meter/Makefile index 2dc071e8e..79ad79744 100644 --- a/lib/librte_meter/Makefile +++ b/lib/librte_meter/Makefile @@ -16,7 +16,7 @@ LDLIBS += -lrte_eal EXPORT_MAP := rte_meter_version.map -LIBABIVER := 2 +LIBABIVER := 3 # # all source are stored in SRCS-y diff --git a/lib/librte_meter/meson.build b/lib/librte_meter/meson.build index 947bc19e2..422123e20 100644 --- a/lib/librte_meter/meson.build +++ b/lib/librte_meter/meson.build @@ -1,6 +1,6 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation -version = 2 +version = 3 sources = files('rte_meter.c') headers = files('rte_meter.h') diff --git a/lib/librte_meter/rte_meter_version.map b/lib/librte_meter/rte_meter_version.map index cb79f0c2b..4b460d580 100644 --- a/lib/librte_meter/rte_meter_version.map +++ b/lib/librte_meter/rte_meter_version.map @@ -17,3 +17,12 @@ DPDK_18.08 { rte_meter_srtcm_profile_config; rte_meter_trtcm_profile_config; }; + +EXPERIMENTAL { + global: + + rte_meter_trtcm_rfc4115_color_aware_check; + rte_meter_trtcm_rfc4115_color_blind_check; + rte_meter_trtcm_rfc4115_config; + rte_meter_trtcm_rfc4115_profile_config; +};
[dpdk-dev] [PATCH 1/3] lib/librte_meter: add RFC4115 trTCM meter support
This patch adds support for RFC4115 trTCM meters. Signed-off-by: Eelco Chaudron --- lib/librte_meter/rte_meter.c | 40 + lib/librte_meter/rte_meter.h | 193 +++--- 2 files changed, 217 insertions(+), 16 deletions(-) diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c index 473f69aba..aeeac69fe 100644 --- a/lib/librte_meter/rte_meter.c +++ b/lib/librte_meter/rte_meter.c @@ -110,3 +110,43 @@ rte_meter_trtcm_config(struct rte_meter_trtcm *m, return 0; } + +int +rte_meter_trtcm_rfc4115_profile_config(struct rte_meter_trtcm_profile *p, + struct rte_meter_trtcm_params *params) +{ + uint64_t hz = rte_get_tsc_hz(); + + /* Check input parameters */ + if ((p == NULL) || + (params == NULL) || + (params->cir != 0 && params->cbs == 0) || + (params->eir != 0 && params->ebs == 0)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + p->cbs = params->cbs; + p->ebs = params->ebs; + rte_meter_get_tb_params(hz, params->cir, &p->cir_period, + &p->cir_bytes_per_period); + rte_meter_get_tb_params(hz, params->eir, &p->eir_period, + &p->eir_bytes_per_period); + + return 0; +} + +int +rte_meter_trtcm_rfc4115_config(struct rte_meter_trtcm *m, + struct rte_meter_trtcm_profile *p) +{ + /* Check input parameters */ + if ((m == NULL) || (p == NULL)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + m->time_tc = m->time_te = rte_get_tsc_cycles(); + m->tc = p->cbs; + m->te = p->ebs; + + return 0; +} diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h index 58a051583..ff1ca3f77 100644 --- a/lib/librte_meter/rte_meter.h +++ b/lib/librte_meter/rte_meter.h @@ -16,6 +16,7 @@ extern "C" { * Traffic metering algorithms: *1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 2697 *2. Two Rate Three Color Marker (trTCM): defined by IETF RFC 2698 + *3. Two Rate Three Color Marker (trTCM): defined by IETF RFC 4115 * ***/ @@ -43,14 +44,25 @@ struct rte_meter_srtcm_params { uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */ }; -/** trTCM parameters per metered traffic flow. The CIR, PIR, CBS and PBS parameters -only count bytes of IP packets and do not include link specific headers. PIR has to -be greater than or equal to CIR. Both CBS or EBS have to be greater than zero. */ +/** trTCM parameters per metered traffic flow. The CIR, PIR/EIT, CBS and PBS/EBS +parameters only count bytes of IP packets and do not include link specific +headers. + +- For RFC2698 operations PIR has to be greater than or equal to CIR. Both CBS + or EBS have to be greater than zero. +- For RFC4115 operations CBS and EBS need to be greater than zero if CIR and + EIR are none-zero respectively.*/ struct rte_meter_trtcm_params { uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ - uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per second. */ - uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */ - uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */ + union { + uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per second. */ + uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes per second. */ + }; + uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */ + union { + uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */ + uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */ + }; }; /** @@ -98,6 +110,19 @@ rte_meter_srtcm_profile_config(struct rte_meter_srtcm_profile *p, int rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p, struct rte_meter_trtcm_params *params); +/** + * trTCM RFC 4115 profile configuration + * + * @param p + *Pointer to pre-allocated trTCM profile data structure + * @param params + *trTCM profile parameters + * @return + *0 upon success, error code otherwise + */ +int +rte_meter_trtcm_rfc4115_profile_config(struct rte_meter_trtcm_profile *p, + struct rte_meter_trtcm_params *params); /** * srTCM configuration per metered traffic flow @@ -127,6 +152,20 @@ int rte_meter_trtcm_config(struct rte_meter_trtcm *m, struct rte_meter_trtcm_profile *p); +/** + * trTCM RFC 4115 configuration per metered traffic flow + * + * @param m + *Pointer to pre-allocated trTCM data structure + * @param p + *trTCM profile. Needs to be valid. + * @return + *0 upon success, error code otherwise + */ +int +rte_meter_trtcm
[dpdk-dev] [PATCH 0/3] lib/librte_meter: add RFC4115 trTCM meter support
This patch adds support for RFC4115 trTCM meters. Signed-off-by: Eelco Chaudron Eelco Chaudron (3): lib/librte_meter: add RFC4115 trTCM meter support test/test_meter: update meter test to include RFC4115 meters lib/librte_meter: update abi to include new rfc4115 function lib/librte_meter/Makefile |2 lib/librte_meter/meson.build |2 lib/librte_meter/rte_meter.c | 40 ++ lib/librte_meter/rte_meter.h | 193 +++-- lib/librte_meter/rte_meter_version.map |9 + test/test/test_meter.c | 212 6 files changed, 440 insertions(+), 18 deletions(-)
[dpdk-dev] [PATCH 2/3] test/test_meter: update meter test to include RFC4115 meters
Add test cases for RFC4115 meters Signed-off-by: Eelco Chaudron --- test/test/test_meter.c | 212 1 file changed, 212 insertions(+) diff --git a/test/test/test_meter.c b/test/test/test_meter.c index 8bb47e75c..097da0143 100644 --- a/test/test/test_meter.c +++ b/test/test/test_meter.c @@ -32,8 +32,10 @@ #define TM_TEST_TRTCM_CIR_DF 4600 #define TM_TEST_TRTCM_PIR_DF 6900 +#define TM_TEST_TRTCM_EIR_DF 6900 #define TM_TEST_TRTCM_CBS_DF 2048 #define TM_TEST_TRTCM_PBS_DF 4096 +#define TM_TEST_TRTCM_EBS_DF 4096 static struct rte_meter_srtcm_params sparams = {.cir = TM_TEST_SRTCM_CIR_DF, @@ -46,6 +48,12 @@ static structrte_meter_trtcm_params tparams= .cbs = TM_TEST_TRTCM_CBS_DF, .pbs = TM_TEST_TRTCM_PBS_DF,}; +static struct rte_meter_trtcm_params rfc4115params = + {.cir = TM_TEST_TRTCM_CIR_DF, +.eir = TM_TEST_TRTCM_EIR_DF, +.cbs = TM_TEST_TRTCM_CBS_DF, +.ebs = TM_TEST_TRTCM_EBS_DF,}; + /** * functional test for rte_meter_srtcm_config */ @@ -148,6 +156,45 @@ tm_test_trtcm_config(void) return 0; } +/** + * functional test for rte_meter_trtcm_rfc4115_config + */ +static inline int +tm_test_trtcm_rfc4115_config(void) +{ + struct rte_meter_trtcm_profile tp; + struct rte_meter_trtcm_params rfc4115params1; +#define TRTCM_RFC4115_CFG_MSG "trtcm_rfc4115_config" + + /* invalid parameter test */ + if (rte_meter_trtcm_rfc4115_profile_config(NULL, NULL) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + if (rte_meter_trtcm_rfc4115_profile_config(&tp, NULL) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + if (rte_meter_trtcm_rfc4115_profile_config(NULL, &rfc4115params) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + /* +* cbs and pbs should be none-zero if cir and eir are none-zero +* respectively +*/ + rfc4115params1 = rfc4115params; + rfc4115params1.cbs = 0; + if (rte_meter_trtcm_profile_config(&tp, &rfc4115params1) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + rfc4115params1 = rfc4115params; + rfc4115params1.pbs = 0; + if (rte_meter_trtcm_profile_config(&tp, &rfc4115params1) == 0) + melog(TRTCM_RFC4115_CFG_MSG); + + /* usual parameter, should be successful */ + if (rte_meter_trtcm_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_CFG_MSG); + + return 0; +} + /** * functional test for rte_meter_srtcm_color_blind_check */ @@ -265,6 +312,65 @@ tm_test_trtcm_color_blind_check(void) return 0; } +/** + * functional test for rte_meter_trtcm_rfc4115_color_blind_check + */ +static inline int +tm_test_trtcm_rfc4115_color_blind_check(void) +{ +#define TRTCM_RFC4115_BLIND_CHECK_MSG "trtcm_rfc4115_blind_check" + + uint64_t time; + struct rte_meter_trtcm_profile tp; + struct rte_meter_trtcm tm; + uint64_t hz = rte_get_tsc_hz(); + + /* Test green */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_CBS_DF - 1) + != e_RTE_METER_GREEN) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" GREEN"); + + /* Test yellow */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_CBS_DF + 1) + != e_RTE_METER_YELLOW) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW"); + + if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG); + time = rte_get_tsc_cycles() + hz; + if (rte_meter_trtcm_rfc4115_color_blind_check( + &tm, &tp, time, TM_TEST_TRTCM_EBS_DF - 1) + != e_RTE_METER_YELLOW) + melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW"); + + /* Test red */ + if (rte_meter_trtcm_rfc4115_profile_config(&tp
Re: [dpdk-dev] [dpdk-stable] [PATCH 17.11] mem: fix memory initialization time
On 12 Nov 2018, at 12:26, Eelco Chaudron wrote: On 12 Nov 2018, at 12:18, Alejandro Lucero wrote: When using large amount of hugepage based memory, doing all the hugepages mapping can take quite significant time. The problem is hugepages being initially mmaped to virtual addresses which will be tried later for the final hugepage mmaping. This causes the final mapping requiring calling mmap with another hint address which can happen several times, depending on the amount of memory to mmap, and which each mmmap taking more than a second. This patch changes the hint for the initial hugepage mmaping using a starting address which will not collide with the final mmaping. Fixes: 293c0c4b957f ("mem: use address hint for mapping hugepages") Signed-off-by: Alejandro Lucero Thanks Alejandro for sending the patch. This issue was found in an OVS-DPDK environment. I verified/tested the patch. Acked-by: Eelco Chaudron Tested-by: Eelco Chaudron --- lib/librte_eal/linuxapp/eal/eal_memory.c | 15 +++ 1 file changed, 15 insertions(+) diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c index bac969a12..0675809b7 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -421,6 +421,21 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, } #endif +#ifdef RTE_ARCH_64 + /* +* Hugepages are first mmaped individually and then re-mmapped to + * another region for having contiguous physical pages in contiguous +* virtual addresses. Setting here vma_addr for the first hugepage + * mapped to a virtual address which will not collide with the second +* mmaping later. The next hugepages will use increments of this +* initial address. +* +* The final virtual address will be based on baseaddr which is +* 0x1. We use a hint here starting at 0x2, leaving + * another 4GB just in case, plus the total available hugepages memory. +*/ + vma_addr = (char *)0x2 + (hpi->hugepage_sz * hpi->num_pages[0]); +#endif for (i = 0; i < hpi->num_pages[0]; i++) { uint64_t hugepage_sz = hpi->hugepage_sz; -- 2.17.1 Adding OVS dev to this thread, as this issue is introduced in DPDK 17.11.4.
Re: [dpdk-dev] [dpdk-stable] [PATCH 17.11] mem: fix memory initialization time
On 12 Nov 2018, at 12:18, Alejandro Lucero wrote: When using large amount of hugepage based memory, doing all the hugepages mapping can take quite significant time. The problem is hugepages being initially mmaped to virtual addresses which will be tried later for the final hugepage mmaping. This causes the final mapping requiring calling mmap with another hint address which can happen several times, depending on the amount of memory to mmap, and which each mmmap taking more than a second. This patch changes the hint for the initial hugepage mmaping using a starting address which will not collide with the final mmaping. Fixes: 293c0c4b957f ("mem: use address hint for mapping hugepages") Signed-off-by: Alejandro Lucero Thanks Alejandro for sending the patch. This issue was found in an OVS-DPDK environment. I verified/tested the patch. Acked-by: Eelco Chaudron Tested-by: Eelco Chaudron --- lib/librte_eal/linuxapp/eal/eal_memory.c | 15 +++ 1 file changed, 15 insertions(+) diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c index bac969a12..0675809b7 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -421,6 +421,21 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, } #endif +#ifdef RTE_ARCH_64 + /* +* Hugepages are first mmaped individually and then re-mmapped to +* another region for having contiguous physical pages in contiguous +* virtual addresses. Setting here vma_addr for the first hugepage + * mapped to a virtual address which will not collide with the second +* mmaping later. The next hugepages will use increments of this +* initial address. +* +* The final virtual address will be based on baseaddr which is +* 0x1. We use a hint here starting at 0x2, leaving + * another 4GB just in case, plus the total available hugepages memory. +*/ + vma_addr = (char *)0x2 + (hpi->hugepage_sz * hpi->num_pages[0]); +#endif for (i = 0; i < hpi->num_pages[0]; i++) { uint64_t hugepage_sz = hpi->hugepage_sz; -- 2.17.1
Re: [dpdk-dev] RFC 4115 variant for rte_meter_trtcm_color_blind/aware_check()
On 6 Nov 2018, at 11:53, Dumitrescu, Cristian wrote: -Original Message- From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Eelco Chaudron Sent: Tuesday, November 6, 2018 10:32 AM To: dev@dpdk.org Subject: [dpdk-dev] RFC 4115 variant for rte_meter_trtcm_color_blind/aware_check() Hi All, Was wondering if anyone is planning to add an RFC 4115 equivalent of the rte_meter_trtcm_color_blind/aware_check() check APIs? If not I can go ahead and sent a patch to add it, i.e. something like rte_meter_trtcm_rfc4115_color_blind/aware_check(). Cheers, Eelco Hi Eelco, This is somewhere on my ToDo list, but if you're ready to send a patch now, I would be very happy to review it and merge. Thanks for your help! Hi Christian, I have it on my ToDo list also ;) But let me take it of yours, and I’ll investigate what it takes to add it (also from the testing framework side…). //Eelco
[dpdk-dev] RFC 4115 variant for rte_meter_trtcm_color_blind/aware_check()
Hi All, Was wondering if anyone is planning to add an RFC 4115 equivalent of the rte_meter_trtcm_color_blind/aware_check() check APIs? If not I can go ahead and sent a patch to add it, i.e. something like rte_meter_trtcm_rfc4115_color_blind/aware_check(). Cheers, Eelco
Re: [dpdk-dev] [dpdk-stable] [PATCH v4 2/5] bus/pci: use IOVAs check when setting IOVA mode
On 10 Jul 2018, at 19:25, Alejandro Lucero wrote: Although VT-d emulation currently only supports 39 bits, it could be iovas being within that supported range. This patch allows IOVA mode in such a case. Indeed, memory initialization code can be modified for using lower virtual addresses than those used by the kernel for 64 bits processes by default, and therefore memsegs iovas can use 39 bits or less for most system. And this is likely 100% true for VMs. Applicable to v17.11.3 only. Signed-off-by: Alejandro Lucero --- drivers/bus/pci/linux/pci.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c index 74deef3..792c819 100644 --- a/drivers/bus/pci/linux/pci.c +++ b/drivers/bus/pci/linux/pci.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "eal_private.h" #include "eal_filesystem.h" @@ -613,10 +614,12 @@ fclose(fp); mgaw = ((vtd_cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; - if (mgaw < X86_VA_WIDTH) + + if (!rte_eal_check_dma_mask(mgaw)) + return true; + else return false; - return true; } #elif defined(RTE_ARCH_PPC_64) static bool @@ -640,13 +643,17 @@ { struct rte_pci_device *dev = NULL; struct rte_pci_driver *drv = NULL; + int iommu_dma_mask_check_done = 0; FOREACH_DRIVER_ON_PCIBUS(drv) { FOREACH_DEVICE_ON_PCIBUS(dev) { if (!rte_pci_match(drv, dev)) continue; - if (!pci_one_device_iommu_support_va(dev)) - return false; + if (!iommu_dma_mask_check_done) { + if (pci_one_device_iommu_support_va(dev) < 0) + return false; + iommu_dma_mask_check_done = 1; As I do not know enough on what IOMMU hardware can coexist, I leave this patch for others to review. Here is the previous question/answer: Not sure why this change? Why do we only need to check one device on all the buses? Because there is just one emulated IOMMU hardware. The limitation in this case is not in a specific PCI device. And I do not think it is possible to have two different (emulated or not) IOMMU hardware. Yes, you can have more than one controller but being same IOMMU type. If the above is confirmed, you can consider this patch ack’ed. + } } } return true; -- 1.9.1
Re: [dpdk-dev] [dpdk-stable] [PATCH v4 1/5] mem: add function for checking memsegs IOVAs addresses
On 10 Jul 2018, at 19:25, Alejandro Lucero wrote: > A device can suffer addressing limitations. This functions checks > memsegs have iovas within the supported range based on dma mask. > > PMD should use this during initialization if supported devices > suffer addressing limitations, returning an error if this function > returns memsegs out of range. > > Another potential usage is for emulated IOMMU hardware with addressing > limitations. > > Applicable to v17.11.3 only. > > Signed-off-by: Alejandro Lucero > Acked-by: Anatoly Burakov > --- Looks good to me. Acked-by: Eelco Chaudron
Re: [dpdk-dev] [dpdk-stable] [PATCH v3 1/6] mem: add function for checking memsegs IOVAs addresses
On 10 Jul 2018, at 13:43, Alejandro Lucero wrote: On Tue, Jul 10, 2018 at 12:33 PM, Burakov, Anatoly < anatoly.bura...@intel.com> wrote: On 10-Jul-18 12:14 PM, Eelco Chaudron wrote: On 10 Jul 2018, at 12:52, Alejandro Lucero wrote: On Tue, Jul 10, 2018 at 11:06 AM, Eelco Chaudron wrote: On 10 Jul 2018, at 11:34, Alejandro Lucero wrote: On Tue, Jul 10, 2018 at 9:56 AM, Eelco Chaudron wrote: On 4 Jul 2018, at 14:53, Alejandro Lucero wrote: A device can suffer addressing limitations. This functions checks memsegs have iovas within the supported range based on dma mask. PMD should use this during initialization if supported devices suffer addressing limitations, returning an error if this function returns memsegs out of range. Another potential usage is for emulated IOMMU hardware with addressing limitations. Signed-off-by: Alejandro Lucero Acked-by: Anatoly Burakov --- lib/librte_eal/common/eal_common_memory.c | 33 ++ lib/librte_eal/common/include/rte_memory.h | 3 +++ lib/librte_eal/rte_eal_version.map | 1 + 3 files changed, 37 insertions(+) diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c index fc6c44d..f5efebe 100644 --- a/lib/librte_eal/common/eal_common_memory.c +++ b/lib/librte_eal/common/eal_common_memory.c @@ -109,6 +109,39 @@ } } +/* check memseg iovas are within the required range based on dma mask */ +int +rte_eal_check_dma_mask(uint8_t maskbits) +{ + + const struct rte_mem_config *mcfg; + uint64_t mask; + int i; + I think we should add some sanity check to the input maskbits, i.e. [64,0) or [64, 32]? What would be a reasonable lower bound. This is not a user's API, so any invocation will be reviewed, but I guess adding a sanity check here does not harm. Not sure about lower bound but upper should 64, although it does not make sense but it is safe. Lower bound is not so problematic. + /* create dma mask */ + mask = ~((1ULL << maskbits) - 1); + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + if (mcfg->memseg[i].addr == NULL) + break; Looking at some other code, it looks like NULL entries might exists. So should a continue; rather than a break; be used here? I do not think so. memsegs are allocated sequentially, so first with addr as NULL implies no more memsegs. I was referring to the mem walk functions, rte_memseg_list_walk(). Maybe some having more experience with this area can review/comment. Pre-18.05, all memsegs are allocated continuously. Memseg lists and memseg list walk functions are 18.05+. Alejandro, perhaps it would be worth it to tag your patchset with "pre-18.05" to avoid similar confusion in the future? Yes, that will help. I'm sending a new version shortly and I'll make it clear. Thanks, I’ll review the new version if it’s ready before the end of tomorrow CET, as I will be on PTO.
Re: [dpdk-dev] [dpdk-stable] [PATCH v3 4/6] mem: use address hint for mapping hugepages
On 4 Jul 2018, at 14:53, Alejandro Lucero wrote: > Linux kernel uses a really high address as starting address for > serving mmaps calls. If there exists addressing limitations and > IOVA mode is VA, this starting address is likely too high for > those devices. However, it is possible to use a lower address in > the process virtual address space as with 64 bits there is a lot > of available space. > > This patch adds an address hint as starting address for 64 bits > systems. > > Signed-off-by: Alejandro Lucero > Acked-by: Anatoly Burakov Looks good to me! Cheers, Eelco Acked-by: Eelco Chaudron
Re: [dpdk-dev] [dpdk-stable] [PATCH v3 1/6] mem: add function for checking memsegs IOVAs addresses
On 10 Jul 2018, at 12:52, Alejandro Lucero wrote: On Tue, Jul 10, 2018 at 11:06 AM, Eelco Chaudron wrote: On 10 Jul 2018, at 11:34, Alejandro Lucero wrote: On Tue, Jul 10, 2018 at 9:56 AM, Eelco Chaudron wrote: On 4 Jul 2018, at 14:53, Alejandro Lucero wrote: A device can suffer addressing limitations. This functions checks memsegs have iovas within the supported range based on dma mask. PMD should use this during initialization if supported devices suffer addressing limitations, returning an error if this function returns memsegs out of range. Another potential usage is for emulated IOMMU hardware with addressing limitations. Signed-off-by: Alejandro Lucero Acked-by: Anatoly Burakov --- lib/librte_eal/common/eal_common_memory.c | 33 ++ lib/librte_eal/common/include/rte_memory.h | 3 +++ lib/librte_eal/rte_eal_version.map | 1 + 3 files changed, 37 insertions(+) diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c index fc6c44d..f5efebe 100644 --- a/lib/librte_eal/common/eal_common_memory.c +++ b/lib/librte_eal/common/eal_common_memory.c @@ -109,6 +109,39 @@ } } +/* check memseg iovas are within the required range based on dma mask */ +int +rte_eal_check_dma_mask(uint8_t maskbits) +{ + + const struct rte_mem_config *mcfg; + uint64_t mask; + int i; + I think we should add some sanity check to the input maskbits, i.e. [64,0) or [64, 32]? What would be a reasonable lower bound. This is not a user's API, so any invocation will be reviewed, but I guess adding a sanity check here does not harm. Not sure about lower bound but upper should 64, although it does not make sense but it is safe. Lower bound is not so problematic. + /* create dma mask */ + mask = ~((1ULL << maskbits) - 1); + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + if (mcfg->memseg[i].addr == NULL) + break; Looking at some other code, it looks like NULL entries might exists. So should a continue; rather than a break; be used here? I do not think so. memsegs are allocated sequentially, so first with addr as NULL implies no more memsegs. I was referring to the mem walk functions, rte_memseg_list_walk(). Maybe some having more experience with this area can review/comment. + + if (mcfg->memseg[i].iova & mask) { + RTE_LOG(INFO, EAL, + "memseg[%d] iova %"PRIx64" out of range:\n", + i, mcfg->memseg[i].iova); + + RTE_LOG(INFO, EAL, "\tusing dma mask %"PRIx64"\n", + mask); + return -1; + } + } + + return 0; +} + /* return the number of memory channels */ unsigned rte_memory_get_nchannel(void) { diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h index 80a8fc0..b2a0168 100644 --- a/lib/librte_eal/common/include/rte_memory.h +++ b/lib/librte_eal/common/include/rte_memory.h @@ -209,6 +209,9 @@ struct rte_memseg { */ unsigned rte_memory_get_nrank(void); +/* check memsegs iovas are within a range based on dma mask */ +int rte_eal_check_dma_mask(uint8_t maskbits); + /** * Drivers based on uio will not load unless physical * addresses are obtainable. It is only possible to get diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map index f4f46c1..aa6cf87 100644 --- a/lib/librte_eal/rte_eal_version.map +++ b/lib/librte_eal/rte_eal_version.map @@ -184,6 +184,7 @@ DPDK_17.11 { rte_eal_create_uio_dev; rte_bus_get_iommu_class; + rte_eal_check_dma_mask; rte_eal_has_pci; rte_eal_iova_mode; rte_eal_mbuf_default_mempool_ops; -- 1.9.1
Re: [dpdk-dev] [dpdk-stable] [PATCH v3 6/6] net/nfp: support IOVA VA mode
On 4 Jul 2018, at 14:53, Alejandro Lucero wrote: > NFP can handle IOVA as VA. It requires to check those IOVAs > being in the supported range what is done during initialization. > > Signed-off-by: Alejandro Lucero Changes look good to me! Acked-by: Eelco Chaudron
Re: [dpdk-dev] [dpdk-stable] [PATCH v3 5/6] net/nfp: check hugepages IOVAs based on DMA mask
On 4 Jul 2018, at 14:53, Alejandro Lucero wrote: > NFP devices can not handle DMA addresses requiring more than > 40 bits. This patch uses rte_dev_check_dma_mask with 40 bits > and avoids device initialization if memory out of NFP range. > > Signed-off-by: Alejandro Lucero Changes look good to me! Acked-by: Eelco Chaudron
Re: [dpdk-dev] [dpdk-stable] [PATCH v3 3/6] bus/pci: use IOVAs check when setting IOVA mode
On 4 Jul 2018, at 14:53, Alejandro Lucero wrote: Although VT-d emulation currently only supports 39 bits, it could be iovas being within that supported range. This patch allows IOVA mode in such a case. Indeed, memory initialization code can be modified for using lower virtual addresses than those used by the kernel for 64 bits processes by default, and therefore memsegs iovas can use 39 bits or less for most system. And this is likely 100% true for VMs. Signed-off-by: Alejandro Lucero --- drivers/bus/pci/linux/pci.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c index 74deef3..792c819 100644 --- a/drivers/bus/pci/linux/pci.c +++ b/drivers/bus/pci/linux/pci.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "eal_private.h" #include "eal_filesystem.h" @@ -613,10 +614,12 @@ fclose(fp); mgaw = ((vtd_cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; - if (mgaw < X86_VA_WIDTH) + + if (!rte_eal_check_dma_mask(mgaw)) If think in this case we still need to check the X86_VA_WIDTH, i.e. if (mgaw < X86_VA_WIDTH && !rte_eal_check_dma_mask(mgaw)) + return true; + else return false; - return true; } #elif defined(RTE_ARCH_PPC_64) static bool @@ -640,13 +643,17 @@ { struct rte_pci_device *dev = NULL; struct rte_pci_driver *drv = NULL; + int iommu_dma_mask_check_done = 0; FOREACH_DRIVER_ON_PCIBUS(drv) { FOREACH_DEVICE_ON_PCIBUS(dev) { if (!rte_pci_match(drv, dev)) continue; - if (!pci_one_device_iommu_support_va(dev)) - return false; + if (!iommu_dma_mask_check_done) { + if (pci_one_device_iommu_support_va(dev) < 0) + return false; + iommu_dma_mask_check_done = 1; Not sure why this change? Why do we only need to check one device on the bus? In addition, if this is what was intended, rather than a variable you can return true in this case, or did you intended to clear the iommu_dma_mask_check_done on every PCI BUS iteration? + } } } return true; -- 1.9.1
Re: [dpdk-dev] [dpdk-stable] [PATCH v3 1/6] mem: add function for checking memsegs IOVAs addresses
On 10 Jul 2018, at 11:34, Alejandro Lucero wrote: On Tue, Jul 10, 2018 at 9:56 AM, Eelco Chaudron wrote: On 4 Jul 2018, at 14:53, Alejandro Lucero wrote: A device can suffer addressing limitations. This functions checks memsegs have iovas within the supported range based on dma mask. PMD should use this during initialization if supported devices suffer addressing limitations, returning an error if this function returns memsegs out of range. Another potential usage is for emulated IOMMU hardware with addressing limitations. Signed-off-by: Alejandro Lucero Acked-by: Anatoly Burakov --- lib/librte_eal/common/eal_common_memory.c | 33 ++ lib/librte_eal/common/include/rte_memory.h | 3 +++ lib/librte_eal/rte_eal_version.map | 1 + 3 files changed, 37 insertions(+) diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c index fc6c44d..f5efebe 100644 --- a/lib/librte_eal/common/eal_common_memory.c +++ b/lib/librte_eal/common/eal_common_memory.c @@ -109,6 +109,39 @@ } } +/* check memseg iovas are within the required range based on dma mask */ +int +rte_eal_check_dma_mask(uint8_t maskbits) +{ + + const struct rte_mem_config *mcfg; + uint64_t mask; + int i; + I think we should add some sanity check to the input maskbits, i.e. [64,0) or [64, 32]? What would be a reasonable lower bound. This is not a user's API, so any invocation will be reviewed, but I guess adding a sanity check here does not harm. Not sure about lower bound but upper should 64, although it does not make sense but it is safe. Lower bound is not so problematic. + /* create dma mask */ + mask = ~((1ULL << maskbits) - 1); + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + if (mcfg->memseg[i].addr == NULL) + break; Looking at some other code, it looks like NULL entries might exists. So should a continue; rather than a break; be used here? + + if (mcfg->memseg[i].iova & mask) { + RTE_LOG(INFO, EAL, + "memseg[%d] iova %"PRIx64" out of range:\n", + i, mcfg->memseg[i].iova); + + RTE_LOG(INFO, EAL, "\tusing dma mask %"PRIx64"\n", + mask); + return -1; + } + } + + return 0; +} + /* return the number of memory channels */ unsigned rte_memory_get_nchannel(void) { diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h index 80a8fc0..b2a0168 100644 --- a/lib/librte_eal/common/include/rte_memory.h +++ b/lib/librte_eal/common/include/rte_memory.h @@ -209,6 +209,9 @@ struct rte_memseg { */ unsigned rte_memory_get_nrank(void); +/* check memsegs iovas are within a range based on dma mask */ +int rte_eal_check_dma_mask(uint8_t maskbits); + /** * Drivers based on uio will not load unless physical * addresses are obtainable. It is only possible to get diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map index f4f46c1..aa6cf87 100644 --- a/lib/librte_eal/rte_eal_version.map +++ b/lib/librte_eal/rte_eal_version.map @@ -184,6 +184,7 @@ DPDK_17.11 { rte_eal_create_uio_dev; rte_bus_get_iommu_class; + rte_eal_check_dma_mask; rte_eal_has_pci; rte_eal_iova_mode; rte_eal_mbuf_default_mempool_ops; -- 1.9.1
Re: [dpdk-dev] [dpdk-stable] [PATCH v3 2/6] ethdev: add function for checking IOVAs by a device
On 4 Jul 2018, at 14:53, Alejandro Lucero wrote: A PMD should invoke this function for checking memsegs iovas are within the supported range by the device. Signed-off-by: Alejandro Lucero Agree with Andrew here, why not call rte_eal_check_dma_mask() directly in nfp_net_txq_full()? --- lib/librte_ether/rte_ethdev.h | 13 + lib/librte_ether/rte_ethdev_version.map | 1 + 2 files changed, 14 insertions(+) diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h index eba11ca..e51a432 100644 --- a/lib/librte_ether/rte_ethdev.h +++ b/lib/librte_ether/rte_ethdev.h @@ -2799,6 +2799,19 @@ int rte_eth_dev_set_vlan_ether_type(uint16_t port_id, int rte_eth_dev_set_vlan_pvid(uint16_t port_id, uint16_t pvid, int on); /** + * check device dma mask within expected range based on dma mask. + * + * @param maskbits + * mask length in bits + * + */ +static inline int +rte_eth_dev_check_dma_mask(uint8_t maskbits) +{ + return rte_eal_check_dma_mask(maskbits); +} + +/** * * Retrieve a burst of input packets from a receive queue of an Ethernet * device. The retrieved packets are stored in *rte_mbuf* structures whose diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map index e9681ac..0b11b8a 100644 --- a/lib/librte_ether/rte_ethdev_version.map +++ b/lib/librte_ether/rte_ethdev_version.map @@ -191,6 +191,7 @@ DPDK_17.08 { DPDK_17.11 { global: + rte_eth_dev_check_dma_mask; rte_eth_dev_get_sec_ctx; rte_eth_dev_pool_ops_supported; rte_eth_dev_reset; -- 1.9.1
Re: [dpdk-dev] [dpdk-stable] [PATCH v3 1/6] mem: add function for checking memsegs IOVAs addresses
On 4 Jul 2018, at 14:53, Alejandro Lucero wrote: A device can suffer addressing limitations. This functions checks memsegs have iovas within the supported range based on dma mask. PMD should use this during initialization if supported devices suffer addressing limitations, returning an error if this function returns memsegs out of range. Another potential usage is for emulated IOMMU hardware with addressing limitations. Signed-off-by: Alejandro Lucero Acked-by: Anatoly Burakov --- lib/librte_eal/common/eal_common_memory.c | 33 ++ lib/librte_eal/common/include/rte_memory.h | 3 +++ lib/librte_eal/rte_eal_version.map | 1 + 3 files changed, 37 insertions(+) diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c index fc6c44d..f5efebe 100644 --- a/lib/librte_eal/common/eal_common_memory.c +++ b/lib/librte_eal/common/eal_common_memory.c @@ -109,6 +109,39 @@ } } +/* check memseg iovas are within the required range based on dma mask */ +int +rte_eal_check_dma_mask(uint8_t maskbits) +{ + + const struct rte_mem_config *mcfg; + uint64_t mask; + int i; + I think we should add some sanity check to the input maskbits, i.e. [64,0) or [64, 32]? What would be a reasonable lower bound. + /* create dma mask */ + mask = ~((1ULL << maskbits) - 1); + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + if (mcfg->memseg[i].addr == NULL) + break; + + if (mcfg->memseg[i].iova & mask) { + RTE_LOG(INFO, EAL, + "memseg[%d] iova %"PRIx64" out of range:\n", + i, mcfg->memseg[i].iova); + + RTE_LOG(INFO, EAL, "\tusing dma mask %"PRIx64"\n", + mask); + return -1; + } + } + + return 0; +} + /* return the number of memory channels */ unsigned rte_memory_get_nchannel(void) { diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h index 80a8fc0..b2a0168 100644 --- a/lib/librte_eal/common/include/rte_memory.h +++ b/lib/librte_eal/common/include/rte_memory.h @@ -209,6 +209,9 @@ struct rte_memseg { */ unsigned rte_memory_get_nrank(void); +/* check memsegs iovas are within a range based on dma mask */ +int rte_eal_check_dma_mask(uint8_t maskbits); + /** * Drivers based on uio will not load unless physical * addresses are obtainable. It is only possible to get diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map index f4f46c1..aa6cf87 100644 --- a/lib/librte_eal/rte_eal_version.map +++ b/lib/librte_eal/rte_eal_version.map @@ -184,6 +184,7 @@ DPDK_17.11 { rte_eal_create_uio_dev; rte_bus_get_iommu_class; + rte_eal_check_dma_mask; rte_eal_has_pci; rte_eal_iova_mode; rte_eal_mbuf_default_mempool_ops; -- 1.9.1