Re: [PATCH 2/3] vhost: fix virtqueue access lock in datapath

2023-10-27 Thread Eelco Chaudron



On 27 Oct 2023, at 11:22, David Marchand wrote:

> On Fri, Oct 27, 2023 at 11:05 AM Eelco Chaudron  wrote:
>>> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
>>> index 759a78e3e3..4116f79d4f 100644
>>> --- a/lib/vhost/virtio_net.c
>>> +++ b/lib/vhost/virtio_net.c
>>> @@ -1694,6 +1694,17 @@ virtio_dev_rx_packed(struct virtio_net *dev,
>>>   return pkt_idx;
>>>  }
>>>
>>> +static void
>>> +virtio_dev_vring_translate(struct virtio_net *dev, struct vhost_virtqueue 
>>> *vq)
>>> +{
>>
>> Would it be an idea to annotate this function that it needs to be called 
>> with the “read locks” (and that it will free them) to avoid the duplicate:
>>
>> +   vhost_user_iotlb_rd_unlock(vq);
>> +   rte_rwlock_read_unlock(&vq->access_lock);
>
> The "unlock" annotations do not express read/write concerns for locks.
> So that would make the code less readable and potentially hide some issues.
>
> I prefer to keep as is, with clear calls to rd_lock / rd_unlock in
> those functions.

ACK, keeping this as is fine by me.

Acked-by: Eelco Chaudron 

>>> + rte_rwlock_write_lock(&vq->access_lock);
>>> + vhost_user_iotlb_rd_lock(vq);
>>> + if (!vq->access_ok)
>>> + vring_translate(dev, vq);
>>> + vhost_user_iotlb_rd_unlock(vq);
>>> + rte_rwlock_write_unlock(&vq->access_lock);
>>> +}
>>> +
>
> -- 
> David Marchand



Re: [PATCH 3/3] vhost: annotate virtqueue access checks

2023-10-27 Thread Eelco Chaudron



On 23 Oct 2023, at 11:55, David Marchand wrote:

> Modifying vq->access_ok should be done with a write lock taken.
> Annotate vring_translate() and vring_invalidate() and add missing locks.
>
> Signed-off-by: David Marchand 

The changes look good to me.

Acked-by: Eelco Chaudron 



Re: [PATCH 2/3] vhost: fix virtqueue access lock in datapath

2023-10-27 Thread Eelco Chaudron



On 23 Oct 2023, at 11:55, David Marchand wrote:

> Now that a r/w lock is used, the access_ok field should only be updated
> under a write lock.
>
> Since the datapath code only takes a read lock on the virtqueue to check
> access_ok, this lock must be released and a write lock taken before
> calling vring_translate().
>
> Fixes: 03f77d66d966 ("vhost: change virtqueue access lock to a read/write 
> one")
> Cc: sta...@dpdk.org
>
> Signed-off-by: David Marchand 

Only one question, but whatever the outcome is the change looks good to me.

Acked-by: Eelco Chaudron 

> ---
>  lib/vhost/virtio_net.c | 60 +++---
>  1 file changed, 44 insertions(+), 16 deletions(-)
>
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index 759a78e3e3..4116f79d4f 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -1694,6 +1694,17 @@ virtio_dev_rx_packed(struct virtio_net *dev,
>   return pkt_idx;
>  }
>
> +static void
> +virtio_dev_vring_translate(struct virtio_net *dev, struct vhost_virtqueue 
> *vq)
> +{

Would it be an idea to annotate this function that it needs to be called with 
the “read locks” (and that it will free them) to avoid the duplicate:

+   vhost_user_iotlb_rd_unlock(vq);
+   rte_rwlock_read_unlock(&vq->access_lock);

> + rte_rwlock_write_lock(&vq->access_lock);
> + vhost_user_iotlb_rd_lock(vq);
> + if (!vq->access_ok)
> + vring_translate(dev, vq);
> + vhost_user_iotlb_rd_unlock(vq);
> + rte_rwlock_write_unlock(&vq->access_lock);
> +}
> +
>  static __rte_always_inline uint32_t
>  virtio_dev_rx(struct virtio_net *dev, struct vhost_virtqueue *vq,
>   struct rte_mbuf **pkts, uint32_t count)
> @@ -1708,9 +1719,13 @@ virtio_dev_rx(struct virtio_net *dev, struct 
> vhost_virtqueue *vq,
>
>   vhost_user_iotlb_rd_lock(vq);
>
> - if (unlikely(!vq->access_ok))
> - if (unlikely(vring_translate(dev, vq) < 0))
> - goto out;
> + if (unlikely(!vq->access_ok)) {
> + vhost_user_iotlb_rd_unlock(vq);
> + rte_rwlock_read_unlock(&vq->access_lock);
> +
> + virtio_dev_vring_translate(dev, vq);
> + goto out_no_unlock;
> + }
>
>   count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
>   if (count == 0)
> @@ -1729,6 +1744,7 @@ virtio_dev_rx(struct virtio_net *dev, struct 
> vhost_virtqueue *vq,
>  out_access_unlock:
>   rte_rwlock_read_unlock(&vq->access_lock);
>
> +out_no_unlock:
>   return nb_tx;
>  }
>
> @@ -2523,9 +2539,13 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, 
> struct vhost_virtqueue *vq,
>
>   vhost_user_iotlb_rd_lock(vq);
>
> - if (unlikely(!vq->access_ok))
> - if (unlikely(vring_translate(dev, vq) < 0))
> - goto out;
> + if (unlikely(!vq->access_ok)) {
> + vhost_user_iotlb_rd_unlock(vq);
> + rte_rwlock_read_unlock(&vq->access_lock);
> +
> + virtio_dev_vring_translate(dev, vq);
> + goto out_no_unlock;
> + }
>
>   count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
>   if (count == 0)
> @@ -2546,6 +2566,7 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, 
> struct vhost_virtqueue *vq,
>  out_access_unlock:
>   rte_rwlock_write_unlock(&vq->access_lock);
>
> +out_no_unlock:
>   return nb_tx;
>  }
>
> @@ -3576,11 +3597,13 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>
>   vhost_user_iotlb_rd_lock(vq);
>
> - if (unlikely(!vq->access_ok))
> - if (unlikely(vring_translate(dev, vq) < 0)) {
> - count = 0;
> - goto out;
> - }
> + if (unlikely(!vq->access_ok)) {
> + vhost_user_iotlb_rd_unlock(vq);
> + rte_rwlock_read_unlock(&vq->access_lock);
> +
> + virtio_dev_vring_translate(dev, vq);
> + goto out_no_unlock;
> + }
>
>   /*
>* Construct a RARP broadcast packet, and inject it to the "pkts"
> @@ -3641,6 +3664,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>   if (unlikely(rarp_mbuf != NULL))
>   count += 1;
>
> +out_no_unlock:
>   return count;
>  }
>
> @@ -4190,11 +4214,14 @@ rte_vhost_async_try_dequeue_burst(int vid, uint16_t 
> queue_id,
>
>   vhost_user_iotlb_rd_lock(vq);
>
> - if (unlikely(vq->access_ok == 0))
> - if (unlikely(vring_translate(dev, vq) < 0)) {
> -

Re: [PATCH 1/3] vhost: robustify virtqueue access lock asserts

2023-10-27 Thread Eelco Chaudron



On 23 Oct 2023, at 11:55, David Marchand wrote:

> A simple comment in vhost_user_msg_handler() is not that robust.
>
> Add a lock_all_qps property to message handlers so that their
> implementation can add a build check and assert a vq is locked.
>
> Signed-off-by: David Marchand 

This change looks good to me.

Acked-by: Eelco Chaudron 



Re: [PATCH v3] vhost: add IRQ suppression

2023-10-03 Thread Eelco Chaudron



On 29 Sep 2023, at 12:38, Maxime Coquelin wrote:

> Guest notifications offloading, which has been introduced
> in v23.07, aims at offloading syscalls out of the datapath.
>
> This patch optimizes the offloading by not offloading the
> guest notification for a given virtqueue if one is already
> being offloaded by the application.
>
> With a single VDUSE device, we can already see few
> notifications being suppressed when doing throughput
> testing with Iperf3. We can expect to see much more being
> suppressed when the offloading thread is under pressure.
>
> Signed-off-by: Maxime Coquelin 

Thanks for adding this Maxime. I did some tests with OVS and my old determinism 
patchset, and it works perfectly.

I have two small nits, but this change looks good to me.

Acked-by: Eelco Chaudron 

> ---
>
> v3: s/0/false/ (David)
>
>  lib/vhost/vhost.c |  4 
>  lib/vhost/vhost.h | 27 +--
>  2 files changed, 25 insertions(+), 6 deletions(-)
>
> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
> index c03bb9c6eb..7fde412ef3 100644
> --- a/lib/vhost/vhost.c
> +++ b/lib/vhost/vhost.c
> @@ -49,6 +49,8 @@ static const struct vhost_vq_stats_name_off 
> vhost_vq_stat_strings[] = {
>   stats.guest_notifications_offloaded)},
>   {"guest_notifications_error", offsetof(struct vhost_virtqueue,
>   stats.guest_notifications_error)},
> + {"guest_notifications_suppressed", offsetof(struct vhost_virtqueue,
> + stats.guest_notifications_suppressed)},
>   {"iotlb_hits", offsetof(struct vhost_virtqueue, 
> stats.iotlb_hits)},
>   {"iotlb_misses",   offsetof(struct vhost_virtqueue, 
> stats.iotlb_misses)},
>   {"inflight_submitted", offsetof(struct vhost_virtqueue, 
> stats.inflight_submitted)},
> @@ -1517,6 +1519,8 @@ rte_vhost_notify_guest(int vid, uint16_t queue_id)
>
>   rte_rwlock_read_lock(&vq->access_lock);
>
> + __atomic_store_n(&vq->irq_pending, false, __ATOMIC_RELEASE);
> +
>   if (dev->backend_ops->inject_irq(dev, vq)) {
>   if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>   __atomic_fetch_add(&vq->stats.guest_notifications_error,
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> index 9723429b1c..5fc9035a1f 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -156,6 +156,7 @@ struct virtqueue_stats {
>   uint64_t iotlb_misses;
>   uint64_t inflight_submitted;
>   uint64_t inflight_completed;
> + uint64_t guest_notifications_suppressed;
>   /* Counters below are atomic, and should be incremented as such. */
>   uint64_t guest_notifications;
>   uint64_t guest_notifications_offloaded;
> @@ -346,6 +347,8 @@ struct vhost_virtqueue {
>
>   struct vhost_vring_addr ring_addrs;
>   struct virtqueue_stats  stats;
> +
> + bool irq_pending;

nit: Other elements in this structure have the names aligned, not sure if this 
should be done for this item also.

>  } __rte_cache_aligned;
>
>  /* Virtio device status as per Virtio specification */
> @@ -908,12 +911,24 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, 
> uint16_t old)
>  static __rte_always_inline void
>  vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  {
> - if (dev->notify_ops->guest_notify &&
> - dev->notify_ops->guest_notify(dev->vid, vq->index)) {
> - if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> - 
> __atomic_fetch_add(&vq->stats.guest_notifications_offloaded,
> - 1, __ATOMIC_RELAXED);
> - return;
> + bool expected = false;
> +
> + if (dev->notify_ops->guest_notify) {
> + if (__atomic_compare_exchange_n(&vq->irq_pending, &expected, 
> true, 0,
> +   __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
> + if (dev->notify_ops->guest_notify(dev->vid, vq->index)) 
> {
> + if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> + 
> __atomic_fetch_add(&vq->stats.guest_notifications_offloaded,
> + 1, __ATOMIC_RELAXED);
> + return;
> + }
> +
> + /* Offloading failed, fallback to direct IRQ injection 
> */

nit: Some comments end with a dot and some do not, not sure what is the 
preference in DPDK.

> + __atomic_store_n(&vq->irq_pending, false, 
> __ATOMIC_RELEASE);
> + } else {
> + vq->stats.guest_notifications_suppressed++;
> + return;
> + }
>   }
>
>   if (dev->backend_ops->inject_irq(dev, vq)) {
> --
> 2.41.0



Re: [PATCH] net/iavf: fix checksum offloading

2023-08-21 Thread Eelco Chaudron



On 18 Aug 2023, at 11:03, David Marchand wrote:

> The only presence of RTE_MBUF_F_TX_IPV4 can't be used as an indicator
> that a checksum offload has been requested by an application.
> Check that RTE_MBUF_F_TX_IP_CKSUM or others flags have been set.
>
> Fixes: 1e728b01120c ("net/iavf: rework Tx path")
> Cc: sta...@dpdk.org

Thanks for fixing this David! I tested and reviewed this change and it works 
now.

Tested-by: Eelco Chaudron 
Acked-by: Eelco Chaudron 

> Signed-off-by: David Marchand 
> ---
>  drivers/net/iavf/iavf_rxtx.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
> index f7df4665d1..b9e2879764 100644
> --- a/drivers/net/iavf/iavf_rxtx.c
> +++ b/drivers/net/iavf/iavf_rxtx.c
> @@ -2652,6 +2652,9 @@ iavf_build_data_desc_cmd_offset_fields(volatile 
> uint64_t *qw1,
>   offset |= (m->l2_len >> 1)
>   << IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
>
> + if ((m->ol_flags & IAVF_TX_CKSUM_OFFLOAD_MASK) == 0)
> + goto skip_cksum;
> +
>   /* Enable L3 checksum offloading inner */
>   if (m->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) {
>   if (m->ol_flags & RTE_MBUF_F_TX_IPV4) {
> @@ -2702,6 +2705,7 @@ iavf_build_data_desc_cmd_offset_fields(volatile 
> uint64_t *qw1,
>   break;
>   }
>
> +skip_cksum:
>   *qw1 = rte_cpu_to_le_64uint64_t)command <<
>   IAVF_TXD_DATA_QW1_CMD_SHIFT) & IAVF_TXD_DATA_QW1_CMD_MASK) |
>   (((uint64_t)offset << IAVF_TXD_DATA_QW1_OFFSET_SHIFT) &
> -- 
> 2.41.0



Re: [PATCH v3 0/4] vhost: add device op to offload the interrupt kick

2023-06-01 Thread Eelco Chaudron



On 1 Jun 2023, at 22:00, Maxime Coquelin wrote:

> On 5/17/23 11:08, Eelco Chaudron wrote:
>> This series adds an operation callback which gets called every time the
>> library wants to call eventfd_write(). This eventfd_write() call could
>> result in a system call, which could potentially block the PMD thread.
>>
>> The callback function can decide whether it's ok to handle the
>> eventfd_write() now or have the newly introduced function,
>> rte_vhost_notify_guest(), called at a later time.
>>
>> This can be used by 3rd party applications, like OVS, to avoid system
>> calls being called as part of the PMD threads.
>>
>> v3:
>>  - Changed ABI compatibility code to no longer use a boolean
>>to avoid having to disable specific GCC warnings.
>>  - Moved the fd check fix to a separate patch (patch 3/4).
>>  - Fixed some coding style issues.
>>
>> v2: - Used vhost_virtqueue->index to find index for operation.
>>  - Aligned function name to VDUSE RFC patchset.
>>  - Added error and offload statistics counter.
>>  - Mark new API as experimental.
>>  - Change the virtual queue spin lock to read/write spin lock.
>>  - Made shared counters atomic.
>>  - Add versioned rte_vhost_driver_callback_register() for
>>ABI compliance.
>>
>> Eelco Chaudron (4):
>>vhost: change vhost_virtqueue access lock to a read/write one
>>vhost: make the guest_notifications statistic counter atomic
>>vhost: fix invalid call FD handling
>>vhost: add device op to offload the interrupt kick
>>
>>
>>   lib/eal/include/generic/rte_rwlock.h | 17 +
>>   lib/vhost/meson.build|  2 +
>>   lib/vhost/rte_vhost.h| 23 ++-
>>   lib/vhost/socket.c   | 63 +--
>>   lib/vhost/version.map|  9 +++
>>   lib/vhost/vhost.c| 92 +---
>>   lib/vhost/vhost.h| 69 ++---
>>   lib/vhost/vhost_user.c   | 14 ++---
>>   lib/vhost/virtio_net.c   | 90 +--
>>   9 files changed, 278 insertions(+), 101 deletions(-)
>>
>
>
> Applied to dpdk-next-virtio/main.

Thanks Maxime!

//Eelco



Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt kick

2023-06-01 Thread Eelco Chaudron



On 1 Jun 2023, at 10:29, Maxime Coquelin wrote:

> On 6/1/23 10:15, Eelco Chaudron wrote:
>>
>>
>> On 1 Jun 2023, at 4:18, Xia, Chenbo wrote:
>>
>>>> -Original Message-
>>>> From: Maxime Coquelin 
>>>> Sent: Wednesday, May 31, 2023 5:29 PM
>>>> To: Xia, Chenbo ; Thomas Monjalon
>>>> ; Eelco Chaudron ;
>>>> david.march...@redhat.com
>>>> Cc: dev@dpdk.org
>>>> Subject: Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt
>>>> kick
>>>>
>>>>
>>>>
>>>> On 5/31/23 08:19, Xia, Chenbo wrote:
>>>>>> -Original Message-
>>>>>> From: Maxime Coquelin 
>>>>>> Sent: Tuesday, May 30, 2023 11:17 PM
>>>>>> To: Thomas Monjalon ; Eelco Chaudron
>>>>>> ; Xia, Chenbo ;
>>>>>> david.march...@redhat.com
>>>>>> Cc: dev@dpdk.org
>>>>>> Subject: Re: [PATCH v3 4/4] vhost: add device op to offload the
>>>> interrupt
>>>>>> kick
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 5/30/23 15:16, Thomas Monjalon wrote:
>>>>>>> 30/05/2023 15:02, Maxime Coquelin:
>>>>>>>>
>>>>>>>> On 5/17/23 11:09, Eelco Chaudron wrote:
>>>>>>>>> This patch adds an operation callback which gets called every time
>>>> the
>>>>>>>>> library wants to call eventfd_write(). This eventfd_write() call
>>>> could
>>>>>>>>> result in a system call, which could potentially block the PMD
>>>> thread.
>>>>>>>>>
>>>>>>>>> The callback function can decide whether it's ok to handle the
>>>>>>>>> eventfd_write() now or have the newly introduced function,
>>>>>>>>> rte_vhost_notify_guest(), called at a later time.
>>>>>>>>>
>>>>>>>>> This can be used by 3rd party applications, like OVS, to avoid
>>>> system
>>>>>>>>> calls being called as part of the PMD threads.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Eelco Chaudron 
>>>>>>>>> ---
>>>>>>>>>  lib/vhost/meson.build |2 ++
>>>>>>>>>  lib/vhost/rte_vhost.h |   23 +-
>>>>>>>>>  lib/vhost/socket.c|   63
>>>>>> ++---
>>>>>>>>>  lib/vhost/version.map |9 +++
>>>>>>>>>  lib/vhost/vhost.c |   38 ++
>>>>>>>>>  lib/vhost/vhost.h |   58 ---
>>>> ---
>>>>>> ---
>>>>>>>>>  6 files changed, 171 insertions(+), 22 deletions(-)
>>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> The patch looks good to me, but that's the first time we use function
>>>>>>>> versioning in Vhost library, so I'd like another pair of eyes to be
>>>>>> sure
>>>>>>>> I don't miss anything.
>>>>>>>>
>>>>>>>> Reviewed-by: Maxime Coquelin 
>>>>>>>>
>>>>>>>> Thomas, do we need to mention it somewhere in the release note?
>>>>>>>
>>>>>>> If compatibility is kept, I think we don't need to mention it.
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> Thanks Thomas for the information.
>>>>>>
>>>>>> Maxime
>>>>>
>>>>> About release note, except the versioning, there is also one new API
>>>> introduced
>>>>> in this patch, so we still need to mention this in release note
>>>>
>>>> Right, good catch.
>>>> Eelco, let me know what you would put, I'll add it while applying (No
>>>> need for a new revision).
>>>
>>> Btw, the vhost_lib.rst also needs a new item..
>>
>> What about the following?
>>
>> diff --git a/doc/guides/prog_guide/vhost_lib.rst 
>> b/doc/guides/prog_guide/vhost_lib.rst
>> index e8bb8c9b7b..0f964d7a4a 100644
>> --- a/doc/guides/prog_guide/vhost_lib.rst
>> +++ b/doc/guides/prog_guide/vhost_lib.rst
>> @@ -334,6 +334,11 @@ The following is an overview of some key Vhost API 
>> functions:
>> Clean DMA vChannel finished to use. After this function is called,
>> the specified DMA vChannel should no longer be used by the Vhost library.
>>
>> +* ``rte_vhost_notify_guest(int vid, uint16_t queue_id)``
>> +
>> +  Inject the offloaded interrupt received by the 'guest_notify' callback,
>> +  into the vhost device's queue.
>> +
>>   Vhost-user Implementations
>>   --
>>
>> Maxime do you want to add it, or do you want a new rev?
>
> That's fine, I just added it now :) Can you check the next-virtio main
> branch and confirm all is OK for your series?
>
> https://git.dpdk.org/next/dpdk-next-virtio/log/
>

One small spelling issue, the rest looks good:

+* **Added callback API support for interrupt handling in the vhost library.**
+
+  A new callback, guest_notify, is introduced that can be used to handle the
+  interrupt kick outside of the datapath fast path. In addition, a new API,
+  rte_vhost_notify_guest(), is added to raise the interrupt outside of the
+  past path.

Last sentence spells ‘past path’ instead of ‘fast path’.


> Thanks,
> Maxime
>
>>
>>> Thanks,
>>> Chenbo
>>>
>>>>
>>>> Thanks,
>>>> Maxime
>>>>
>>>>> Thanks,
>>>>> Chenbo
>>



Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt kick

2023-06-01 Thread Eelco Chaudron



On 1 Jun 2023, at 4:18, Xia, Chenbo wrote:

>> -Original Message-
>> From: Maxime Coquelin 
>> Sent: Wednesday, May 31, 2023 5:29 PM
>> To: Xia, Chenbo ; Thomas Monjalon
>> ; Eelco Chaudron ;
>> david.march...@redhat.com
>> Cc: dev@dpdk.org
>> Subject: Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt
>> kick
>>
>>
>>
>> On 5/31/23 08:19, Xia, Chenbo wrote:
>>>> -Original Message-
>>>> From: Maxime Coquelin 
>>>> Sent: Tuesday, May 30, 2023 11:17 PM
>>>> To: Thomas Monjalon ; Eelco Chaudron
>>>> ; Xia, Chenbo ;
>>>> david.march...@redhat.com
>>>> Cc: dev@dpdk.org
>>>> Subject: Re: [PATCH v3 4/4] vhost: add device op to offload the
>> interrupt
>>>> kick
>>>>
>>>>
>>>>
>>>> On 5/30/23 15:16, Thomas Monjalon wrote:
>>>>> 30/05/2023 15:02, Maxime Coquelin:
>>>>>>
>>>>>> On 5/17/23 11:09, Eelco Chaudron wrote:
>>>>>>> This patch adds an operation callback which gets called every time
>> the
>>>>>>> library wants to call eventfd_write(). This eventfd_write() call
>> could
>>>>>>> result in a system call, which could potentially block the PMD
>> thread.
>>>>>>>
>>>>>>> The callback function can decide whether it's ok to handle the
>>>>>>> eventfd_write() now or have the newly introduced function,
>>>>>>> rte_vhost_notify_guest(), called at a later time.
>>>>>>>
>>>>>>> This can be used by 3rd party applications, like OVS, to avoid
>> system
>>>>>>> calls being called as part of the PMD threads.
>>>>>>>
>>>>>>> Signed-off-by: Eelco Chaudron 
>>>>>>> ---
>>>>>>> lib/vhost/meson.build |2 ++
>>>>>>> lib/vhost/rte_vhost.h |   23 +-
>>>>>>> lib/vhost/socket.c|   63
>>>> ++---
>>>>>>> lib/vhost/version.map |9 +++
>>>>>>> lib/vhost/vhost.c |   38 ++
>>>>>>> lib/vhost/vhost.h |   58 ---
>> ---
>>>> ---
>>>>>>> 6 files changed, 171 insertions(+), 22 deletions(-)
>>>>>>>
>>>>>>
>>>>>>
>>>>>> The patch looks good to me, but that's the first time we use function
>>>>>> versioning in Vhost library, so I'd like another pair of eyes to be
>>>> sure
>>>>>> I don't miss anything.
>>>>>>
>>>>>> Reviewed-by: Maxime Coquelin 
>>>>>>
>>>>>> Thomas, do we need to mention it somewhere in the release note?
>>>>>
>>>>> If compatibility is kept, I think we don't need to mention it.
>>>>>
>>>>>
>>>>
>>>> Thanks Thomas for the information.
>>>>
>>>> Maxime
>>>
>>> About release note, except the versioning, there is also one new API
>> introduced
>>> in this patch, so we still need to mention this in release note
>>
>> Right, good catch.
>> Eelco, let me know what you would put, I'll add it while applying (No
>> need for a new revision).
>
> Btw, the vhost_lib.rst also needs a new item..

What about the following?

diff --git a/doc/guides/prog_guide/vhost_lib.rst 
b/doc/guides/prog_guide/vhost_lib.rst
index e8bb8c9b7b..0f964d7a4a 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -334,6 +334,11 @@ The following is an overview of some key Vhost API 
functions:
   Clean DMA vChannel finished to use. After this function is called,
   the specified DMA vChannel should no longer be used by the Vhost library.

+* ``rte_vhost_notify_guest(int vid, uint16_t queue_id)``
+
+  Inject the offloaded interrupt received by the 'guest_notify' callback,
+  into the vhost device's queue.
+
 Vhost-user Implementations
 --

Maxime do you want to add it, or do you want a new rev?

> Thanks,
> Chenbo
>
>>
>> Thanks,
>> Maxime
>>
>>> Thanks,
>>> Chenbo



Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt kick

2023-05-31 Thread Eelco Chaudron



On 31 May 2023, at 14:48, Maxime Coquelin wrote:

> On 5/31/23 14:01, David Marchand wrote:
>> Eelco, Maxime,
>>
>> On Wed, May 17, 2023 at 11:09 AM Eelco Chaudron  wrote:
>>> @@ -846,6 +848,11 @@ vhost_user_socket_mem_free(struct vhost_user_socket 
>>> *vsocket)
>>>  vsocket->path = NULL;
>>>  }
>>>
>>> +   if (vsocket && vsocket->malloc_notify_ops) {
>>> +   free(vsocket->malloc_notify_ops);
>>> +   vsocket->malloc_notify_ops = NULL;
>>> +   }
>>> +
>>>  if (vsocket) {
>>>  free(vsocket);
>>>  vsocket = NULL;
>>
>> Nit: we had several cleanups in the tree to remove patterns like if
>> (ptr) free(ptr).
>> Here, this helper could look for vsocket being NULL first thing, then
>> call free() unconditionnally.
>> And resetting the fields to NULL is probably not that useful, since
>> the vsocket is freed at the end.
>> Wdyt of:
>>
>> static void
>> vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>> {
>>  if (vsocket == NULL)
>>  return;
>>
>>  free(vsocket->path);
>>  free(vsocket->malloc_notify_ops);
>>  free(vsocket);
>> }
>>
>>
>
> It is indeed better, I can fix while applying if Eelco agrees.

Looks good to me.

//Eelco



Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt kick

2023-05-31 Thread Eelco Chaudron



On 31 May 2023, at 11:29, Maxime Coquelin wrote:

> On 5/31/23 08:19, Xia, Chenbo wrote:
>>> -Original Message-
>>> From: Maxime Coquelin 
>>> Sent: Tuesday, May 30, 2023 11:17 PM
>>> To: Thomas Monjalon ; Eelco Chaudron
>>> ; Xia, Chenbo ;
>>> david.march...@redhat.com
>>> Cc: dev@dpdk.org
>>> Subject: Re: [PATCH v3 4/4] vhost: add device op to offload the interrupt
>>> kick
>>>
>>>
>>>
>>> On 5/30/23 15:16, Thomas Monjalon wrote:
>>>> 30/05/2023 15:02, Maxime Coquelin:
>>>>>
>>>>> On 5/17/23 11:09, Eelco Chaudron wrote:
>>>>>> This patch adds an operation callback which gets called every time the
>>>>>> library wants to call eventfd_write(). This eventfd_write() call could
>>>>>> result in a system call, which could potentially block the PMD thread.
>>>>>>
>>>>>> The callback function can decide whether it's ok to handle the
>>>>>> eventfd_write() now or have the newly introduced function,
>>>>>> rte_vhost_notify_guest(), called at a later time.
>>>>>>
>>>>>> This can be used by 3rd party applications, like OVS, to avoid system
>>>>>> calls being called as part of the PMD threads.
>>>>>>
>>>>>> Signed-off-by: Eelco Chaudron 
>>>>>> ---
>>>>>> lib/vhost/meson.build |2 ++
>>>>>> lib/vhost/rte_vhost.h |   23 +-
>>>>>> lib/vhost/socket.c|   63
>>> ++---
>>>>>> lib/vhost/version.map |9 +++
>>>>>> lib/vhost/vhost.c |   38 ++
>>>>>> lib/vhost/vhost.h |   58 --
>>> ---
>>>>>> 6 files changed, 171 insertions(+), 22 deletions(-)
>>>>>>
>>>>>
>>>>>
>>>>> The patch looks good to me, but that's the first time we use function
>>>>> versioning in Vhost library, so I'd like another pair of eyes to be
>>> sure
>>>>> I don't miss anything.
>>>>>
>>>>> Reviewed-by: Maxime Coquelin 
>>>>>
>>>>> Thomas, do we need to mention it somewhere in the release note?
>>>>
>>>> If compatibility is kept, I think we don't need to mention it.
>>>>
>>>>
>>>
>>> Thanks Thomas for the information.
>>>
>>> Maxime
>>
>> About release note, except the versioning, there is also one new API 
>> introduced
>> in this patch, so we still need to mention this in release note
>
> Right, good catch.
> Eelco, let me know what you would put, I'll add it while applying (No
> need for a new revision).

Thanks guys! What about the following release note addition:

* **Added callback API support for interrupt handling in the vhost library.**

A new callback, guest_notify, is introduced that can be used to handle the 
interrupt kick outside of the datapath fast path.  In addition, a new API, 
rte_vhost_notify_guest(), is added to raise the interrupt outside of the past 
path.


Please change if you see fit.

Thanks,

Eelco

> Thanks,
> Maxime
>
>> Thanks,
>> Chenbo



Re: [PATCH v3 1/4] vhost: change vhost_virtqueue access lock to a read/write one

2023-05-31 Thread Eelco Chaudron



On 31 May 2023, at 11:27, Maxime Coquelin wrote:

> On 5/31/23 08:37, Xia, Chenbo wrote:
>> Hi Eelco,
>>
>>> -Original Message-
>>> From: Eelco Chaudron 
>>> Sent: Wednesday, May 17, 2023 5:09 PM
>>> To: maxime.coque...@redhat.com; Xia, Chenbo ;
>>> david.march...@redhat.com
>>> Cc: dev@dpdk.org
>>> Subject: [PATCH v3 1/4] vhost: change vhost_virtqueue access lock to a
>>> read/write one
>>>
>>> This change will allow the vhost interrupt datapath handling to be split
>>> between two processed without one of them holding an explicit lock.
>>>
>>> Signed-off-by: Eelco Chaudron 
>>> ---
>>>   lib/eal/include/generic/rte_rwlock.h |   17 ++
>>>   lib/vhost/vhost.c|   46 +
>>>   lib/vhost/vhost.h|4 +-
>>>   lib/vhost/vhost_user.c   |   14 +++--
>>>   lib/vhost/virtio_net.c   |   90 +
>>> -
>>>   5 files changed, 94 insertions(+), 77 deletions(-)
>>>
>>> diff --git a/lib/eal/include/generic/rte_rwlock.h
>>> b/lib/eal/include/generic/rte_rwlock.h
>>> index 71e2d8d5f4..9e083bbc61 100644
>>> --- a/lib/eal/include/generic/rte_rwlock.h
>>> +++ b/lib/eal/include/generic/rte_rwlock.h
>>> @@ -236,6 +236,23 @@ rte_rwlock_write_unlock(rte_rwlock_t *rwl)
>>> __atomic_fetch_sub(&rwl->cnt, RTE_RWLOCK_WRITE, __ATOMIC_RELEASE);
>>>   }
>>>
>>> +/**
>>> + * Test if the write lock is taken.
>>> + *
>>> + * @param rwl
>>> + *   A pointer to a rwlock structure.
>>> + * @return
>>> + *   1 if the write lock is currently taken; 0 otherwise.
>>> + */
>>> +static inline int
>>> +rte_rwlock_write_is_locked(rte_rwlock_t *rwl)
>>> +{
>>> +   if (__atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED) & RTE_RWLOCK_WRITE)
>>> +   return 1;
>>> +
>>> +   return 0;
>>> +}
>>> +
>>
>> Again we need to update release note as it's a new EAL API.
>>
>>>   /**
>>>* Try to execute critical section in a hardware memory transaction, if
>>> it
>>>* fails or not available take a read lock
>>> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
>>> index ef37943817..74bdbfd810 100644
>>> --- a/lib/vhost/vhost.c
>>> +++ b/lib/vhost/vhost.c
>>> @@ -393,9 +393,9 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue
>>> *vq)
>>> else
>>> rte_free(vq->shadow_used_split);
>>>
>>> -   rte_spinlock_lock(&vq->access_lock);
>>> +   rte_rwlock_write_lock(&vq->access_lock);
>>> vhost_free_async_mem(vq);
>>> -   rte_spinlock_unlock(&vq->access_lock);
>>> +   rte_rwlock_write_unlock(&vq->access_lock);
>>> rte_free(vq->batch_copy_elems);
>>> vhost_user_iotlb_destroy(vq);
>>> rte_free(vq->log_cache);
>>> @@ -630,7 +630,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t
>>> vring_idx)
>>>
>>> dev->virtqueue[i] = vq;
>>> init_vring_queue(dev, vq, i);
>>> -   rte_spinlock_init(&vq->access_lock);
>>> +   rte_rwlock_init(&vq->access_lock);
>>> vq->avail_wrap_counter = 1;
>>> vq->used_wrap_counter = 1;
>>> vq->signalled_used_valid = false;
>>> @@ -1305,14 +1305,14 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
>>> if (!vq)
>>> return -1;
>>>
>>> -   rte_spinlock_lock(&vq->access_lock);
>>> +   rte_rwlock_read_lock(&vq->access_lock);
>>>
>>> if (vq_is_packed(dev))
>>> vhost_vring_call_packed(dev, vq);
>>> else
>>> vhost_vring_call_split(dev, vq);
>>>
>>> -   rte_spinlock_unlock(&vq->access_lock);
>>> +   rte_rwlock_read_unlock(&vq->access_lock);
>>
>> Not sure about this. vhost_ring_call_packed/split is changing some field in
>> Vq. Should we use write lock here?
>
> I don't think so, the purpose of the access_lock is not to make the
> datapath threads-safe, but to protect the datapath from metadata changes
> by the control path.

Thanks Chinbo for the review, and see Maxime’s comment above. Does this clarify 
your concern/question?

>>
>> Thanks,
>> Chenbo
>>



Re: [PATCH v3 1/4] vhost: change vhost_virtqueue access lock to a read/write one

2023-05-18 Thread Eelco Chaudron



On 17 May 2023, at 19:33, Maxime Coquelin wrote:

> Hi Eelco,
>
> On 5/17/23 11:08, Eelco Chaudron wrote:
>> This change will allow the vhost interrupt datapath handling to be split
>> between two processed without one of them holding an explicit lock.
>>
>
> As I had a tuned PVP benchmarking setup at hand, I ran a 0.02% loss
> RFC2544 test with and without this patch to ensure moving to RX locks
> would not introduce performance regression.
>
> I can confirm there are no performance regression introduced with this
> patch applied:
> Tested-by: Maxime Coquelin 
>
> And the patch looks good to me:
> Reviewed-by: Maxime Coquelin 

Thanks for running the test and doing the review!

//Eelco

>> Signed-off-by: Eelco Chaudron 
>> ---
>>   lib/eal/include/generic/rte_rwlock.h |   17 ++
>>   lib/vhost/vhost.c|   46 +
>>   lib/vhost/vhost.h|4 +-
>>   lib/vhost/vhost_user.c   |   14 +++--
>>   lib/vhost/virtio_net.c   |   90 
>> +-
>>   5 files changed, 94 insertions(+), 77 deletions(-)
>>



Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick

2023-05-17 Thread Eelco Chaudron



On 16 May 2023, at 13:36, Eelco Chaudron wrote:

> On 16 May 2023, at 12:12, David Marchand wrote:
>
>> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron  wrote:
>>> On 10 May 2023, at 13:44, David Marchand wrote:
>>
>> [snip]
>>
>>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket 
>>>>> *vsocket)
>>>>> vsocket->path = NULL;
>>>>> }
>>>>>
>>>>> +   if (vsocket && vsocket->alloc_notify_ops) {
>>>>> +#pragma GCC diagnostic push
>>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>>> +   free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>>> +#pragma GCC diagnostic pop
>>>>> +   vsocket->notify_ops = NULL;
>>>>> +   }
>>>>
>>>> Rather than select the behavior based on a boolean (and here force the
>>>> compiler to close its eyes), I would instead add a non const pointer
>>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>>> The code can then unconditionnally call free(vsocket->alloc_notify_ops);
>>>
>>> Good idea, I will make the change in v3.
>>
>> Feel free to use a better name for this field :-).
>>
>>>
>>>>> +
>>>>> if (vsocket) {
>>>>> free(vsocket);
>>>>> vsocket = NULL;
>>
>> [snip]
>>
>>>>> +   /*
>>>>> +* Although the ops structure is a const structure, we do need to
>>>>> +* override the guest_notify operation. This is because with the
>>>>> +* previous APIs it was "reserved" and if any garbage value was 
>>>>> passed,
>>>>> +* it could crash the application.
>>>>> +*/
>>>>> +   if (ops && !ops->guest_notify) {
>>>>
>>>> Hum, as described in the comment above, I don't think we should look
>>>> at ops->guest_notify value at all.
>>>> Checking ops != NULL should be enough.
>>>
>>> Not sure I get you here. If the guest_notify passed by the user is NULL, it 
>>> means the previously ‘reserved[1]’ field is NULL, so we do not need to use 
>>> a new structure.
>>>
>>> I guess your comment would be true if we would introduce a new field in the 
>>> data structure, not replacing a reserved one.
>>
>> Hum, I don't understand my comment either o_O'.
>> Too many days off... or maybe my evil twin took over the keyboard.
>>
>>
>>>
>>>>> +   struct rte_vhost_device_ops *new_ops;
>>>>> +
>>>>> +   new_ops = malloc(sizeof(*new_ops));
>>>>
>>>> Strictly speaking, we lose the numa affinity of "ops" by calling malloc.
>>>> I am unclear of the impact though.
>>>
>>> Don’t think there is a portable API that we can use to determine the NUMA 
>>> for the ops memory and then allocate this on the same numa?
>>>
>>> Any thoughts or ideas on how to solve this? I hope most people will 
>>> memset() the ops structure and the reserved[1] part is zero, but it might 
>>> be a problem in the future if more extensions get added.
>>
>> Determinining current numa is doable, via 'ops'
>> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR), like what is done for vq in
>> numa_realloc().
>> The problem is how to allocate on this numa with the libc allocator
>> for which I have no idea...
>> We could go with the dpdk allocator (again, like numa_realloc()).
>>
>>
>> In practice, the passed ops will be probably from a const variable in
>> the program .data section (for which I think fields are set to 0
>> unless explicitly initialised), or a memset() will be called for a
>> dynamic allocation from good citizens.
>> So we can probably live with the current proposal.
>> Plus, this is only for one release, since in 23.11 with the ABI bump,
>> we will drop this compat code.
>>
>> Maxime, Chenbo, what do you think?
>
> Wait for their response, but for now I assume we can just keep the numa 
> unaware malloc().
>
>>
>> [snip]
>>
>>>>
>>>> But putting indentation aside, is this change equivalent?
>>>> -   if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>>> -   (vq->callfd >= 0)) ||
>>>> -   unlikely(!signalled_used_valid)) {
>>>> +   if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>>>> +   unlikely(!signalled_used_valid)) &&
>>>> +   vq->callfd >= 0) {
>>>
>>> They are not equal, but in the past eventfd_write() should also not have 
>>> been called with callfd < 0, guess this was an existing bug ;)
>>
>> I think this should be a separate fix.
>
> ACK, will add a separate patch in this series to fix it.

FYI I sent out the v3 patch.

//Eelco



[PATCH v3 4/4] vhost: add device op to offload the interrupt kick

2023-05-17 Thread Eelco Chaudron
This patch adds an operation callback which gets called every time the
library wants to call eventfd_write(). This eventfd_write() call could
result in a system call, which could potentially block the PMD thread.

The callback function can decide whether it's ok to handle the
eventfd_write() now or have the newly introduced function,
rte_vhost_notify_guest(), called at a later time.

This can be used by 3rd party applications, like OVS, to avoid system
calls being called as part of the PMD threads.

Signed-off-by: Eelco Chaudron 
---
 lib/vhost/meson.build |2 ++
 lib/vhost/rte_vhost.h |   23 +-
 lib/vhost/socket.c|   63 ++---
 lib/vhost/version.map |9 +++
 lib/vhost/vhost.c |   38 ++
 lib/vhost/vhost.h |   58 -
 6 files changed, 171 insertions(+), 22 deletions(-)

diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
index 0d1abf6283..05679447db 100644
--- a/lib/vhost/meson.build
+++ b/lib/vhost/meson.build
@@ -38,3 +38,5 @@ driver_sdk_headers = files(
 'vdpa_driver.h',
 )
 deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
+
+use_function_versioning = true
diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index 58a5d4be92..7a10bc36cf 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
 */
void (*guest_notified)(int vid);
 
-   void *reserved[1]; /**< Reserved for future extension */
+   /**
+* If this callback is registered, notification to the guest can
+* be handled by the front-end calling rte_vhost_notify_guest().
+* If it's not handled, 'false' should be returned. This can be used
+* to remove the "slow" eventfd_write() syscall from the datapath.
+*/
+   bool (*guest_notify)(int vid, uint16_t queue_id);
 };
 
 /**
@@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
 
 int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int 
enable);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
+ *
+ * Inject the offloaded interrupt into the vhost device's queue. For more
+ * details see the 'guest_notify' vhost device operation.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index
+ */
+__rte_experimental
+void rte_vhost_notify_guest(int vid, uint16_t queue_id);
+
 /**
  * Register vhost driver. path could be different for multiple
  * instance support.
diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
index 669c322e12..f2c02075fe 100644
--- a/lib/vhost/socket.c
+++ b/lib/vhost/socket.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 
+#include 
 #include 
 
 #include "fd_man.h"
@@ -59,6 +60,7 @@ struct vhost_user_socket {
struct rte_vdpa_device *vdpa_dev;
 
struct rte_vhost_device_ops const *notify_ops;
+   struct rte_vhost_device_ops *malloc_notify_ops;
 };
 
 struct vhost_user_connection {
@@ -846,6 +848,11 @@ vhost_user_socket_mem_free(struct vhost_user_socket 
*vsocket)
vsocket->path = NULL;
}
 
+   if (vsocket && vsocket->malloc_notify_ops) {
+   free(vsocket->malloc_notify_ops);
+   vsocket->malloc_notify_ops = NULL;
+   }
+
if (vsocket) {
free(vsocket);
vsocket = NULL;
@@ -1099,21 +1106,69 @@ rte_vhost_driver_unregister(const char *path)
 /*
  * Register ops so that we can add/remove device to data core.
  */
-int
-rte_vhost_driver_callback_register(const char *path,
-   struct rte_vhost_device_ops const * const ops)
+static int
+vhost_driver_callback_register(const char *path,
+   struct rte_vhost_device_ops const * const ops,
+   struct rte_vhost_device_ops *malloc_ops)
 {
struct vhost_user_socket *vsocket;
 
pthread_mutex_lock(&vhost_user.mutex);
vsocket = find_vhost_user_socket(path);
-   if (vsocket)
+   if (vsocket) {
vsocket->notify_ops = ops;
+   free(vsocket->malloc_notify_ops);
+   vsocket->malloc_notify_ops = malloc_ops;
+   }
pthread_mutex_unlock(&vhost_user.mutex);
 
return vsocket ? 0 : -1;
 }
 
+int __vsym
+rte_vhost_driver_callback_register_v24(const char *path,
+   struct rte_vhost_device_ops const * const ops)
+{
+   return vhost_driver_callback_register(path, ops, NULL);
+}
+
+int __vsym
+rte_vhost_driver_callback_register_v23(const char *path,
+   struct rte_vhost_device_ops const * const ops)
+{
+   int ret;
+
+   /*
+* Although the ops structure is a const structure, we do need to
+* override the guest_notify operation. This is because with the
+  

[PATCH v3 3/4] vhost: fix invalid call FD handling

2023-05-17 Thread Eelco Chaudron
This patch fixes cases where IRQ injection is tried while
the call FD is not valid, which should not happen.

Fixes: b1cce26af1dc ("vhost: add notification for packed ring")
Fixes: e37ff954405a ("vhost: support virtqueue interrupt/notification 
suppression")

Signed-off-by: Maxime Coquelin 
Signed-off-by: Eelco Chaudron 
---
 lib/vhost/vhost.h |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 37609c7c8d..23a4e2b1a7 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -903,9 +903,9 @@ vhost_vring_call_split(struct virtio_net *dev, struct 
vhost_virtqueue *vq)
"%s: used_event_idx=%d, old=%d, new=%d\n",
__func__, vhost_used_event(vq), old, new);
 
-   if ((vhost_need_event(vhost_used_event(vq), new, old) &&
-   (vq->callfd >= 0)) ||
-   unlikely(!signalled_used_valid)) {
+   if ((vhost_need_event(vhost_used_event(vq), new, old) ||
+   unlikely(!signalled_used_valid)) &&
+   vq->callfd >= 0) {
eventfd_write(vq->callfd, (eventfd_t) 1);
if (dev->flags & VIRTIO_DEV_STATS_ENABLED)

__atomic_fetch_add(&vq->stats.guest_notifications,
@@ -974,7 +974,7 @@ vhost_vring_call_packed(struct virtio_net *dev, struct 
vhost_virtqueue *vq)
if (vhost_need_event(off, new, old))
kick = true;
 kick:
-   if (kick) {
+   if (kick && vq->callfd >= 0) {
eventfd_write(vq->callfd, (eventfd_t)1);
if (dev->notify_ops->guest_notified)
dev->notify_ops->guest_notified(dev->vid);



[PATCH v3 2/4] vhost: make the guest_notifications statistic counter atomic

2023-05-17 Thread Eelco Chaudron
Making the guest_notifications statistic counter atomic, allows
it to be safely incremented while holding the read access_lock.

Signed-off-by: Eelco Chaudron 
---
 lib/vhost/vhost.c |8 
 lib/vhost/vhost.h |9 ++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index 74bdbfd810..8ff6434c93 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -2086,6 +2086,10 @@ rte_vhost_vring_stats_get(int vid, uint16_t queue_id,
 
rte_rwlock_write_lock(&vq->access_lock);
for (i = 0; i < VHOST_NB_VQ_STATS; i++) {
+   /*
+* No need to the read atomic counters as such, due to the
+* above write access_lock preventing them to be updated.
+*/
stats[i].value =
*(uint64_t *)(((char *)vq) + 
vhost_vq_stat_strings[i].offset);
stats[i].id = i;
@@ -2112,6 +2116,10 @@ int rte_vhost_vring_stats_reset(int vid, uint16_t 
queue_id)
vq = dev->virtqueue[queue_id];
 
rte_rwlock_write_lock(&vq->access_lock);
+   /*
+* No need to the reset atomic counters as such, due to the
+* above write access_lock preventing them to be updated.
+*/
memset(&vq->stats, 0, sizeof(vq->stats));
rte_rwlock_write_unlock(&vq->access_lock);
 
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 5c939ef06f..37609c7c8d 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -135,11 +135,12 @@ struct virtqueue_stats {
uint64_t broadcast;
/* Size bins in array as RFC 2819, undersized [0], 64 [1], etc */
uint64_t size_bins[8];
-   uint64_t guest_notifications;
uint64_t iotlb_hits;
uint64_t iotlb_misses;
uint64_t inflight_submitted;
uint64_t inflight_completed;
+   /* Counters below are atomic, and should be incremented as such. */
+   uint64_t guest_notifications;
 };
 
 /**
@@ -907,7 +908,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct 
vhost_virtqueue *vq)
unlikely(!signalled_used_valid)) {
eventfd_write(vq->callfd, (eventfd_t) 1);
if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-   vq->stats.guest_notifications++;
+   
__atomic_fetch_add(&vq->stats.guest_notifications,
+   1, __ATOMIC_RELAXED);
if (dev->notify_ops->guest_notified)
dev->notify_ops->guest_notified(dev->vid);
}
@@ -917,7 +919,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct 
vhost_virtqueue *vq)
&& (vq->callfd >= 0)) {
eventfd_write(vq->callfd, (eventfd_t)1);
if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-   vq->stats.guest_notifications++;
+   
__atomic_fetch_add(&vq->stats.guest_notifications,
+   1, __ATOMIC_RELAXED);
if (dev->notify_ops->guest_notified)
dev->notify_ops->guest_notified(dev->vid);
}



[PATCH v3 1/4] vhost: change vhost_virtqueue access lock to a read/write one

2023-05-17 Thread Eelco Chaudron
This change will allow the vhost interrupt datapath handling to be split
between two processed without one of them holding an explicit lock.

Signed-off-by: Eelco Chaudron 
---
 lib/eal/include/generic/rte_rwlock.h |   17 ++
 lib/vhost/vhost.c|   46 +
 lib/vhost/vhost.h|4 +-
 lib/vhost/vhost_user.c   |   14 +++--
 lib/vhost/virtio_net.c   |   90 +-
 5 files changed, 94 insertions(+), 77 deletions(-)

diff --git a/lib/eal/include/generic/rte_rwlock.h 
b/lib/eal/include/generic/rte_rwlock.h
index 71e2d8d5f4..9e083bbc61 100644
--- a/lib/eal/include/generic/rte_rwlock.h
+++ b/lib/eal/include/generic/rte_rwlock.h
@@ -236,6 +236,23 @@ rte_rwlock_write_unlock(rte_rwlock_t *rwl)
__atomic_fetch_sub(&rwl->cnt, RTE_RWLOCK_WRITE, __ATOMIC_RELEASE);
 }
 
+/**
+ * Test if the write lock is taken.
+ *
+ * @param rwl
+ *   A pointer to a rwlock structure.
+ * @return
+ *   1 if the write lock is currently taken; 0 otherwise.
+ */
+static inline int
+rte_rwlock_write_is_locked(rte_rwlock_t *rwl)
+{
+   if (__atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED) & RTE_RWLOCK_WRITE)
+   return 1;
+
+   return 0;
+}
+
 /**
  * Try to execute critical section in a hardware memory transaction, if it
  * fails or not available take a read lock
diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index ef37943817..74bdbfd810 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -393,9 +393,9 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
else
rte_free(vq->shadow_used_split);
 
-   rte_spinlock_lock(&vq->access_lock);
+   rte_rwlock_write_lock(&vq->access_lock);
vhost_free_async_mem(vq);
-   rte_spinlock_unlock(&vq->access_lock);
+   rte_rwlock_write_unlock(&vq->access_lock);
rte_free(vq->batch_copy_elems);
vhost_user_iotlb_destroy(vq);
rte_free(vq->log_cache);
@@ -630,7 +630,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t 
vring_idx)
 
dev->virtqueue[i] = vq;
init_vring_queue(dev, vq, i);
-   rte_spinlock_init(&vq->access_lock);
+   rte_rwlock_init(&vq->access_lock);
vq->avail_wrap_counter = 1;
vq->used_wrap_counter = 1;
vq->signalled_used_valid = false;
@@ -1305,14 +1305,14 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
if (!vq)
return -1;
 
-   rte_spinlock_lock(&vq->access_lock);
+   rte_rwlock_read_lock(&vq->access_lock);
 
if (vq_is_packed(dev))
vhost_vring_call_packed(dev, vq);
else
vhost_vring_call_split(dev, vq);
 
-   rte_spinlock_unlock(&vq->access_lock);
+   rte_rwlock_read_unlock(&vq->access_lock);
 
return 0;
 }
@@ -1334,7 +1334,7 @@ rte_vhost_vring_call_nonblock(int vid, uint16_t vring_idx)
if (!vq)
return -1;
 
-   if (!rte_spinlock_trylock(&vq->access_lock))
+   if (rte_rwlock_read_trylock(&vq->access_lock))
return -EAGAIN;
 
if (vq_is_packed(dev))
@@ -1342,7 +1342,7 @@ rte_vhost_vring_call_nonblock(int vid, uint16_t vring_idx)
else
vhost_vring_call_split(dev, vq);
 
-   rte_spinlock_unlock(&vq->access_lock);
+   rte_rwlock_read_unlock(&vq->access_lock);
 
return 0;
 }
@@ -1365,7 +1365,7 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id)
if (!vq)
return 0;
 
-   rte_spinlock_lock(&vq->access_lock);
+   rte_rwlock_write_lock(&vq->access_lock);
 
if (unlikely(!vq->enabled || vq->avail == NULL))
goto out;
@@ -1373,7 +1373,7 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id)
ret = *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
 
 out:
-   rte_spinlock_unlock(&vq->access_lock);
+   rte_rwlock_write_unlock(&vq->access_lock);
return ret;
 }
 
@@ -1457,12 +1457,12 @@ rte_vhost_enable_guest_notification(int vid, uint16_t 
queue_id, int enable)
if (!vq)
return -1;
 
-   rte_spinlock_lock(&vq->access_lock);
+   rte_rwlock_write_lock(&vq->access_lock);
 
vq->notif_enable = enable;
ret = vhost_enable_guest_notification(dev, vq, enable);
 
-   rte_spinlock_unlock(&vq->access_lock);
+   rte_rwlock_write_unlock(&vq->access_lock);
 
return ret;
 }
@@ -1520,7 +1520,7 @@ rte_vhost_rx_queue_count(int vid, uint16_t qid)
if (vq == NULL)
return 0;
 
-   rte_spinlock_lock(&vq->access_lock);
+   rte_rwlock_write_lock(&vq->access_lock);
 
if (unlikely(!vq->enabled || vq->avail

[PATCH v3 0/4] vhost: add device op to offload the interrupt kick

2023-05-17 Thread Eelco Chaudron
This series adds an operation callback which gets called every time the
library wants to call eventfd_write(). This eventfd_write() call could
result in a system call, which could potentially block the PMD thread.

The callback function can decide whether it's ok to handle the
eventfd_write() now or have the newly introduced function,
rte_vhost_notify_guest(), called at a later time.

This can be used by 3rd party applications, like OVS, to avoid system
calls being called as part of the PMD threads.

v3:
- Changed ABI compatibility code to no longer use a boolean
  to avoid having to disable specific GCC warnings.
- Moved the fd check fix to a separate patch (patch 3/4).
- Fixed some coding style issues.

v2: - Used vhost_virtqueue->index to find index for operation.
- Aligned function name to VDUSE RFC patchset.
- Added error and offload statistics counter.
- Mark new API as experimental.
- Change the virtual queue spin lock to read/write spin lock.
- Made shared counters atomic.
- Add versioned rte_vhost_driver_callback_register() for
  ABI compliance.

Eelco Chaudron (4):
  vhost: change vhost_virtqueue access lock to a read/write one
  vhost: make the guest_notifications statistic counter atomic
  vhost: fix invalid call FD handling
  vhost: add device op to offload the interrupt kick


 lib/eal/include/generic/rte_rwlock.h | 17 +
 lib/vhost/meson.build|  2 +
 lib/vhost/rte_vhost.h| 23 ++-
 lib/vhost/socket.c   | 63 +--
 lib/vhost/version.map|  9 +++
 lib/vhost/vhost.c| 92 +---
 lib/vhost/vhost.h| 69 ++---
 lib/vhost/vhost_user.c   | 14 ++---
 lib/vhost/virtio_net.c   | 90 +--
 9 files changed, 278 insertions(+), 101 deletions(-)



Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick

2023-05-16 Thread Eelco Chaudron



On 16 May 2023, at 13:45, Maxime Coquelin wrote:

> On 5/16/23 13:36, Eelco Chaudron wrote:
>>
>>
>> On 16 May 2023, at 12:12, David Marchand wrote:
>>
>>> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron  wrote:
>>>> On 10 May 2023, at 13:44, David Marchand wrote:
>>>
>>> [snip]
>>>
>>>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket 
>>>>>> *vsocket)
>>>>>>  vsocket->path = NULL;
>>>>>>  }
>>>>>>
>>>>>> +   if (vsocket && vsocket->alloc_notify_ops) {
>>>>>> +#pragma GCC diagnostic push
>>>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>>>> +   free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>>>> +#pragma GCC diagnostic pop
>>>>>> +   vsocket->notify_ops = NULL;
>>>>>> +   }
>>>>>
>>>>> Rather than select the behavior based on a boolean (and here force the
>>>>> compiler to close its eyes), I would instead add a non const pointer
>>>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>>>> The code can then unconditionnally call free(vsocket->alloc_notify_ops);
>>>>
>>>> Good idea, I will make the change in v3.
>>>
>>> Feel free to use a better name for this field :-).
>>>
>>>>
>>>>>> +
>>>>>>  if (vsocket) {
>>>>>>  free(vsocket);
>>>>>>  vsocket = NULL;
>>>
>>> [snip]
>>>
>>>>>> +   /*
>>>>>> +* Although the ops structure is a const structure, we do need to
>>>>>> +* override the guest_notify operation. This is because with the
>>>>>> +* previous APIs it was "reserved" and if any garbage value was 
>>>>>> passed,
>>>>>> +* it could crash the application.
>>>>>> +*/
>>>>>> +   if (ops && !ops->guest_notify) {
>>>>>
>>>>> Hum, as described in the comment above, I don't think we should look
>>>>> at ops->guest_notify value at all.
>>>>> Checking ops != NULL should be enough.
>>>>
>>>> Not sure I get you here. If the guest_notify passed by the user is NULL, 
>>>> it means the previously ‘reserved[1]’ field is NULL, so we do not need to 
>>>> use a new structure.
>>>>
>>>> I guess your comment would be true if we would introduce a new field in 
>>>> the data structure, not replacing a reserved one.
>>>
>>> Hum, I don't understand my comment either o_O'.
>>> Too many days off... or maybe my evil twin took over the keyboard.
>>>
>>>
>>>>
>>>>>> +   struct rte_vhost_device_ops *new_ops;
>>>>>> +
>>>>>> +   new_ops = malloc(sizeof(*new_ops));
>>>>>
>>>>> Strictly speaking, we lose the numa affinity of "ops" by calling malloc.
>>>>> I am unclear of the impact though.
>>>>
>>>> Don’t think there is a portable API that we can use to determine the NUMA 
>>>> for the ops memory and then allocate this on the same numa?
>>>>
>>>> Any thoughts or ideas on how to solve this? I hope most people will 
>>>> memset() the ops structure and the reserved[1] part is zero, but it might 
>>>> be a problem in the future if more extensions get added.
>>>
>>> Determinining current numa is doable, via 'ops'
>>> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR), like what is done for vq in
>>> numa_realloc().
>>> The problem is how to allocate on this numa with the libc allocator
>>> for which I have no idea...
>>> We could go with the dpdk allocator (again, like numa_realloc()).
>>>
>>>
>>> In practice, the passed ops will be probably from a const variable in
>>> the program .data section (for which I think fields are set to 0
>>> unless explicitly initialised), or a memset() will be called for a
>>> dynamic allocation from good citizens.
>>> So we can probably live with the current proposal.
>>> Plus, this is only for one release, since in 23.11 with the ABI

Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick

2023-05-16 Thread Eelco Chaudron



On 16 May 2023, at 12:12, David Marchand wrote:

> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron  wrote:
>> On 10 May 2023, at 13:44, David Marchand wrote:
>
> [snip]
>
>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket 
>>>> *vsocket)
>>>> vsocket->path = NULL;
>>>> }
>>>>
>>>> +   if (vsocket && vsocket->alloc_notify_ops) {
>>>> +#pragma GCC diagnostic push
>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>> +   free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>> +#pragma GCC diagnostic pop
>>>> +   vsocket->notify_ops = NULL;
>>>> +   }
>>>
>>> Rather than select the behavior based on a boolean (and here force the
>>> compiler to close its eyes), I would instead add a non const pointer
>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>> The code can then unconditionnally call free(vsocket->alloc_notify_ops);
>>
>> Good idea, I will make the change in v3.
>
> Feel free to use a better name for this field :-).
>
>>
>>>> +
>>>> if (vsocket) {
>>>> free(vsocket);
>>>> vsocket = NULL;
>
> [snip]
>
>>>> +   /*
>>>> +* Although the ops structure is a const structure, we do need to
>>>> +* override the guest_notify operation. This is because with the
>>>> +* previous APIs it was "reserved" and if any garbage value was 
>>>> passed,
>>>> +* it could crash the application.
>>>> +*/
>>>> +   if (ops && !ops->guest_notify) {
>>>
>>> Hum, as described in the comment above, I don't think we should look
>>> at ops->guest_notify value at all.
>>> Checking ops != NULL should be enough.
>>
>> Not sure I get you here. If the guest_notify passed by the user is NULL, it 
>> means the previously ‘reserved[1]’ field is NULL, so we do not need to use a 
>> new structure.
>>
>> I guess your comment would be true if we would introduce a new field in the 
>> data structure, not replacing a reserved one.
>
> Hum, I don't understand my comment either o_O'.
> Too many days off... or maybe my evil twin took over the keyboard.
>
>
>>
>>>> +   struct rte_vhost_device_ops *new_ops;
>>>> +
>>>> +   new_ops = malloc(sizeof(*new_ops));
>>>
>>> Strictly speaking, we lose the numa affinity of "ops" by calling malloc.
>>> I am unclear of the impact though.
>>
>> Don’t think there is a portable API that we can use to determine the NUMA 
>> for the ops memory and then allocate this on the same numa?
>>
>> Any thoughts or ideas on how to solve this? I hope most people will memset() 
>> the ops structure and the reserved[1] part is zero, but it might be a 
>> problem in the future if more extensions get added.
>
> Determinining current numa is doable, via 'ops'
> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR), like what is done for vq in
> numa_realloc().
> The problem is how to allocate on this numa with the libc allocator
> for which I have no idea...
> We could go with the dpdk allocator (again, like numa_realloc()).
>
>
> In practice, the passed ops will be probably from a const variable in
> the program .data section (for which I think fields are set to 0
> unless explicitly initialised), or a memset() will be called for a
> dynamic allocation from good citizens.
> So we can probably live with the current proposal.
> Plus, this is only for one release, since in 23.11 with the ABI bump,
> we will drop this compat code.
>
> Maxime, Chenbo, what do you think?

Wait for their response, but for now I assume we can just keep the numa unaware 
malloc().

>
> [snip]
>
>>>
>>> But putting indentation aside, is this change equivalent?
>>> -   if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>> -   (vq->callfd >= 0)) ||
>>> -   unlikely(!signalled_used_valid)) {
>>> +   if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>>> +   unlikely(!signalled_used_valid)) &&
>>> +   vq->callfd >= 0) {
>>
>> They are not equal, but in the past eventfd_write() should also not have 
>> been called with callfd < 0, guess this was an existing bug ;)
>
> I think this should be a separate fix.

ACK, will add a separate patch in this series to fix it.

>
>>
>>>> +   vhost_vring_inject_irq(dev, vq);
>
>
> -- 
> David Marchand



Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick

2023-05-16 Thread Eelco Chaudron



On 10 May 2023, at 13:44, David Marchand wrote:

> On Wed, Apr 5, 2023 at 2:42 PM Eelco Chaudron  wrote:
>>
>> This patch adds an operation callback which gets called every time the
>> library wants to call eventfd_write(). This eventfd_write() call could
>> result in a system call, which could potentially block the PMD thread.
>>
>> The callback function can decide whether it's ok to handle the
>> eventfd_write() now or have the newly introduced function,
>> rte_vhost_notify_guest(), called at a later time.
>>
>> This can be used by 3rd party applications, like OVS, to avoid system
>> calls being called as part of the PMD threads.
>>
>> Signed-off-by: Eelco Chaudron 
>> ---
>>  lib/vhost/meson.build |2 +
>>  lib/vhost/rte_vhost.h |   23 +++-
>>  lib/vhost/socket.c|   72 
>> ++---
>>  lib/vhost/version.map |9 ++
>>  lib/vhost/vhost.c |   38 ++
>>  lib/vhost/vhost.h |   65 +++-
>>  6 files changed, 184 insertions(+), 25 deletions(-)
>>
>> diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
>> index 197a51d936..e93ba6b078 100644
>> --- a/lib/vhost/meson.build
>> +++ b/lib/vhost/meson.build
>> @@ -39,3 +39,5 @@ driver_sdk_headers = files(
>>  'vdpa_driver.h',
>>  )
>>  deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
>> +
>> +use_function_versioning = true
>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>> index 58a5d4be92..7a10bc36cf 100644
>> --- a/lib/vhost/rte_vhost.h
>> +++ b/lib/vhost/rte_vhost.h
>> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
>>  */
>> void (*guest_notified)(int vid);
>>
>> -   void *reserved[1]; /**< Reserved for future extension */
>> +   /**
>> +* If this callback is registered, notification to the guest can
>> +* be handled by the front-end calling rte_vhost_notify_guest().
>> +* If it's not handled, 'false' should be returned. This can be used
>> +* to remove the "slow" eventfd_write() syscall from the datapath.
>> +*/
>> +   bool (*guest_notify)(int vid, uint16_t queue_id);
>>  };
>>
>>  /**
>> @@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t 
>> vring_idx,
>>
>>  int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int 
>> enable);
>>
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior 
>> notice.
>> + *
>> + * Inject the offloaded interrupt into the vhost device's queue. For more
>> + * details see the 'guest_notify' vhost device operation.
>> + *
>> + * @param vid
>> + *  vhost device ID
>> + * @param queue_id
>> + *  virtio queue index
>> + */
>> +__rte_experimental
>> +void rte_vhost_notify_guest(int vid, uint16_t queue_id);
>> +
>>  /**
>>   * Register vhost driver. path could be different for multiple
>>   * instance support.
>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
>> index 669c322e12..787d6bacf8 100644
>> --- a/lib/vhost/socket.c
>> +++ b/lib/vhost/socket.c
>> @@ -15,6 +15,7 @@
>>  #include 
>>  #include 
>>
>> +#include 
>>  #include 
>>
>>  #include "fd_man.h"
>> @@ -43,6 +44,7 @@ struct vhost_user_socket {
>> bool async_copy;
>> bool net_compliant_ol_flags;
>> bool stats_enabled;
>> +   bool alloc_notify_ops;
>>
>> /*
>>  * The "supported_features" indicates the feature bits the
>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket 
>> *vsocket)
>> vsocket->path = NULL;
>> }
>>
>> +   if (vsocket && vsocket->alloc_notify_ops) {
>> +#pragma GCC diagnostic push
>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>> +   free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>> +#pragma GCC diagnostic pop
>> +   vsocket->notify_ops = NULL;
>> +   }
>
> Rather than select the behavior based on a boolean (and here force the
> compiler to close its eyes), I would instead add a non const pointer
> to ops (let's say alloc_notify_ops) in vhost_user_socket.
> The code can then unconditionn

Re: [PATCH v2 0/3] vhost: add device op to offload the interrupt kick

2023-05-08 Thread Eelco Chaudron



On 5 Apr 2023, at 14:40, Eelco Chaudron wrote:

> This series adds an operation callback which gets called every time the
> library wants to call eventfd_write(). This eventfd_write() call could
> result in a system call, which could potentially block the PMD thread.
>
> The callback function can decide whether it's ok to handle the
> eventfd_write() now or have the newly introduced function,
> rte_vhost_notify_guest(), called at a later time.
>
> This can be used by 3rd party applications, like OVS, to avoid system
> calls being called as part of the PMD threads.


Wondering if anyone had a chance to look at this patchset.

Cheers,

Eelco

> v2: - Used vhost_virtqueue->index to find index for operation.
> - Aligned function name to VDUSE RFC patchset.
> - Added error and offload statistics counter.
> - Mark new API as experimental.
> - Change the virtual queue spin lock to read/write spin lock.
> - Made shared counters atomic.
> - Add versioned rte_vhost_driver_callback_register() for
>   ABI compliance.
>
> Eelco Chaudron (3):
>   vhost: Change vhost_virtqueue access lock to a read/write one.
>   vhost: make the guest_notifications statistic counter atomic.
>   vhost: add device op to offload the interrupt kick
>
>
>  lib/eal/include/generic/rte_rwlock.h | 17 +
>  lib/vhost/meson.build|  2 +
>  lib/vhost/rte_vhost.h| 23 ++-
>  lib/vhost/socket.c   | 72 --
>  lib/vhost/version.map|  9 +++
>  lib/vhost/vhost.c| 92 +---
>  lib/vhost/vhost.h| 70 ++---
>  lib/vhost/vhost_user.c   | 14 ++---
>  lib/vhost/virtio_net.c   | 90 +--
>  9 files changed, 288 insertions(+), 101 deletions(-)



[PATCH v2 3/3] vhost: add device op to offload the interrupt kick

2023-04-05 Thread Eelco Chaudron
This patch adds an operation callback which gets called every time the
library wants to call eventfd_write(). This eventfd_write() call could
result in a system call, which could potentially block the PMD thread.

The callback function can decide whether it's ok to handle the
eventfd_write() now or have the newly introduced function,
rte_vhost_notify_guest(), called at a later time.

This can be used by 3rd party applications, like OVS, to avoid system
calls being called as part of the PMD threads.

Signed-off-by: Eelco Chaudron 
---
 lib/vhost/meson.build |2 +
 lib/vhost/rte_vhost.h |   23 +++-
 lib/vhost/socket.c|   72 ++---
 lib/vhost/version.map |9 ++
 lib/vhost/vhost.c |   38 ++
 lib/vhost/vhost.h |   65 +++-
 6 files changed, 184 insertions(+), 25 deletions(-)

diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
index 197a51d936..e93ba6b078 100644
--- a/lib/vhost/meson.build
+++ b/lib/vhost/meson.build
@@ -39,3 +39,5 @@ driver_sdk_headers = files(
 'vdpa_driver.h',
 )
 deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
+
+use_function_versioning = true
diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index 58a5d4be92..7a10bc36cf 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
 */
void (*guest_notified)(int vid);
 
-   void *reserved[1]; /**< Reserved for future extension */
+   /**
+* If this callback is registered, notification to the guest can
+* be handled by the front-end calling rte_vhost_notify_guest().
+* If it's not handled, 'false' should be returned. This can be used
+* to remove the "slow" eventfd_write() syscall from the datapath.
+*/
+   bool (*guest_notify)(int vid, uint16_t queue_id);
 };
 
 /**
@@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
 
 int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int 
enable);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
+ *
+ * Inject the offloaded interrupt into the vhost device's queue. For more
+ * details see the 'guest_notify' vhost device operation.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index
+ */
+__rte_experimental
+void rte_vhost_notify_guest(int vid, uint16_t queue_id);
+
 /**
  * Register vhost driver. path could be different for multiple
  * instance support.
diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
index 669c322e12..787d6bacf8 100644
--- a/lib/vhost/socket.c
+++ b/lib/vhost/socket.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 
+#include 
 #include 
 
 #include "fd_man.h"
@@ -43,6 +44,7 @@ struct vhost_user_socket {
bool async_copy;
bool net_compliant_ol_flags;
bool stats_enabled;
+   bool alloc_notify_ops;
 
/*
 * The "supported_features" indicates the feature bits the
@@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket 
*vsocket)
vsocket->path = NULL;
}
 
+   if (vsocket && vsocket->alloc_notify_ops) {
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+   free((struct rte_vhost_device_ops *)vsocket->notify_ops);
+#pragma GCC diagnostic pop
+   vsocket->notify_ops = NULL;
+   }
+
if (vsocket) {
free(vsocket);
vsocket = NULL;
@@ -1099,21 +1109,75 @@ rte_vhost_driver_unregister(const char *path)
 /*
  * Register ops so that we can add/remove device to data core.
  */
-int
-rte_vhost_driver_callback_register(const char *path,
-   struct rte_vhost_device_ops const * const ops)
+static int
+rte_vhost_driver_callback_register__(const char *path,
+   struct rte_vhost_device_ops const * const ops, bool ops_allocated)
 {
struct vhost_user_socket *vsocket;
 
pthread_mutex_lock(&vhost_user.mutex);
vsocket = find_vhost_user_socket(path);
-   if (vsocket)
+   if (vsocket) {
+   if (vsocket->alloc_notify_ops) {
+   vsocket->alloc_notify_ops = false;
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+   free((struct rte_vhost_device_ops 
*)vsocket->notify_ops);
+#pragma GCC diagnostic pop
+   }
vsocket->notify_ops = ops;
+   if (ops_allocated)
+   vsocket->alloc_notify_ops = true;
+   }
pthread_mutex_unlock(&vhost_user.mutex);
 
return vsocket ? 0 : -1;
 }
 
+int __vsym
+rte_vhost_driver_callback_register_v24(const char *path,
+   str

[PATCH v2 2/3] vhost: make the guest_notifications statistic counter atomic.

2023-04-05 Thread Eelco Chaudron
Making the guest_notifications statistic counter atomic, allows
it to be safely incremented while holding the read access_lock.

Signed-off-by: Eelco Chaudron 
---
 lib/vhost/vhost.c |8 
 lib/vhost/vhost.h |9 ++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index 74bdbfd810..8ff6434c93 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -2086,6 +2086,10 @@ rte_vhost_vring_stats_get(int vid, uint16_t queue_id,
 
rte_rwlock_write_lock(&vq->access_lock);
for (i = 0; i < VHOST_NB_VQ_STATS; i++) {
+   /*
+* No need to the read atomic counters as such, due to the
+* above write access_lock preventing them to be updated.
+*/
stats[i].value =
*(uint64_t *)(((char *)vq) + 
vhost_vq_stat_strings[i].offset);
stats[i].id = i;
@@ -2112,6 +2116,10 @@ int rte_vhost_vring_stats_reset(int vid, uint16_t 
queue_id)
vq = dev->virtqueue[queue_id];
 
rte_rwlock_write_lock(&vq->access_lock);
+   /*
+* No need to the reset atomic counters as such, due to the
+* above write access_lock preventing them to be updated.
+*/
memset(&vq->stats, 0, sizeof(vq->stats));
rte_rwlock_write_unlock(&vq->access_lock);
 
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 5c939ef06f..37609c7c8d 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -135,11 +135,12 @@ struct virtqueue_stats {
uint64_t broadcast;
/* Size bins in array as RFC 2819, undersized [0], 64 [1], etc */
uint64_t size_bins[8];
-   uint64_t guest_notifications;
uint64_t iotlb_hits;
uint64_t iotlb_misses;
uint64_t inflight_submitted;
uint64_t inflight_completed;
+   /* Counters below are atomic, and should be incremented as such. */
+   uint64_t guest_notifications;
 };
 
 /**
@@ -907,7 +908,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct 
vhost_virtqueue *vq)
unlikely(!signalled_used_valid)) {
eventfd_write(vq->callfd, (eventfd_t) 1);
if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-   vq->stats.guest_notifications++;
+   
__atomic_fetch_add(&vq->stats.guest_notifications,
+   1, __ATOMIC_RELAXED);
if (dev->notify_ops->guest_notified)
dev->notify_ops->guest_notified(dev->vid);
}
@@ -917,7 +919,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct 
vhost_virtqueue *vq)
&& (vq->callfd >= 0)) {
eventfd_write(vq->callfd, (eventfd_t)1);
if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-   vq->stats.guest_notifications++;
+   
__atomic_fetch_add(&vq->stats.guest_notifications,
+   1, __ATOMIC_RELAXED);
if (dev->notify_ops->guest_notified)
dev->notify_ops->guest_notified(dev->vid);
}



[PATCH v2 1/3] vhost: Change vhost_virtqueue access lock to a read/write one.

2023-04-05 Thread Eelco Chaudron
This change will allow the vhost interrupt datapath handling to be split
between two processed without one of them holding an explicit lock.

Signed-off-by: Eelco Chaudron 
---
 lib/eal/include/generic/rte_rwlock.h |   17 ++
 lib/vhost/vhost.c|   46 +
 lib/vhost/vhost.h|4 +-
 lib/vhost/vhost_user.c   |   14 +++--
 lib/vhost/virtio_net.c   |   90 +-
 5 files changed, 94 insertions(+), 77 deletions(-)

diff --git a/lib/eal/include/generic/rte_rwlock.h 
b/lib/eal/include/generic/rte_rwlock.h
index d45c22c189..3a5f0e568f 100644
--- a/lib/eal/include/generic/rte_rwlock.h
+++ b/lib/eal/include/generic/rte_rwlock.h
@@ -236,6 +236,23 @@ rte_rwlock_write_unlock(rte_rwlock_t *rwl)
__atomic_fetch_sub(&rwl->cnt, RTE_RWLOCK_WRITE, __ATOMIC_RELEASE);
 }
 
+/**
+ * Test if the write lock is taken.
+ *
+ * @param rwl
+ *   A pointer to a rwlock structure.
+ * @return
+ *   1 if the write lock is currently taken; 0 otherwise.
+ */
+static inline int
+rte_rwlock_write_is_locked(rte_rwlock_t *rwl)
+{
+   if (__atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED) & RTE_RWLOCK_WRITE)
+   return 1;
+
+   return 0;
+}
+
 /**
  * Try to execute critical section in a hardware memory transaction, if it
  * fails or not available take a read lock
diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index ef37943817..74bdbfd810 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -393,9 +393,9 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
else
rte_free(vq->shadow_used_split);
 
-   rte_spinlock_lock(&vq->access_lock);
+   rte_rwlock_write_lock(&vq->access_lock);
vhost_free_async_mem(vq);
-   rte_spinlock_unlock(&vq->access_lock);
+   rte_rwlock_write_unlock(&vq->access_lock);
rte_free(vq->batch_copy_elems);
vhost_user_iotlb_destroy(vq);
rte_free(vq->log_cache);
@@ -630,7 +630,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t 
vring_idx)
 
dev->virtqueue[i] = vq;
init_vring_queue(dev, vq, i);
-   rte_spinlock_init(&vq->access_lock);
+   rte_rwlock_init(&vq->access_lock);
vq->avail_wrap_counter = 1;
vq->used_wrap_counter = 1;
vq->signalled_used_valid = false;
@@ -1305,14 +1305,14 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
if (!vq)
return -1;
 
-   rte_spinlock_lock(&vq->access_lock);
+   rte_rwlock_read_lock(&vq->access_lock);
 
if (vq_is_packed(dev))
vhost_vring_call_packed(dev, vq);
else
vhost_vring_call_split(dev, vq);
 
-   rte_spinlock_unlock(&vq->access_lock);
+   rte_rwlock_read_unlock(&vq->access_lock);
 
return 0;
 }
@@ -1334,7 +1334,7 @@ rte_vhost_vring_call_nonblock(int vid, uint16_t vring_idx)
if (!vq)
return -1;
 
-   if (!rte_spinlock_trylock(&vq->access_lock))
+   if (rte_rwlock_read_trylock(&vq->access_lock))
return -EAGAIN;
 
if (vq_is_packed(dev))
@@ -1342,7 +1342,7 @@ rte_vhost_vring_call_nonblock(int vid, uint16_t vring_idx)
else
vhost_vring_call_split(dev, vq);
 
-   rte_spinlock_unlock(&vq->access_lock);
+   rte_rwlock_read_unlock(&vq->access_lock);
 
return 0;
 }
@@ -1365,7 +1365,7 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id)
if (!vq)
return 0;
 
-   rte_spinlock_lock(&vq->access_lock);
+   rte_rwlock_write_lock(&vq->access_lock);
 
if (unlikely(!vq->enabled || vq->avail == NULL))
goto out;
@@ -1373,7 +1373,7 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id)
ret = *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
 
 out:
-   rte_spinlock_unlock(&vq->access_lock);
+   rte_rwlock_write_unlock(&vq->access_lock);
return ret;
 }
 
@@ -1457,12 +1457,12 @@ rte_vhost_enable_guest_notification(int vid, uint16_t 
queue_id, int enable)
if (!vq)
return -1;
 
-   rte_spinlock_lock(&vq->access_lock);
+   rte_rwlock_write_lock(&vq->access_lock);
 
vq->notif_enable = enable;
ret = vhost_enable_guest_notification(dev, vq, enable);
 
-   rte_spinlock_unlock(&vq->access_lock);
+   rte_rwlock_write_unlock(&vq->access_lock);
 
return ret;
 }
@@ -1520,7 +1520,7 @@ rte_vhost_rx_queue_count(int vid, uint16_t qid)
if (vq == NULL)
return 0;
 
-   rte_spinlock_lock(&vq->access_lock);
+   rte_rwlock_write_lock(&vq->access_lock);
 
if (unlikely(!vq->enabled || vq->avail

[PATCH v2 0/3] vhost: add device op to offload the interrupt kick

2023-04-05 Thread Eelco Chaudron
This series adds an operation callback which gets called every time the
library wants to call eventfd_write(). This eventfd_write() call could
result in a system call, which could potentially block the PMD thread.

The callback function can decide whether it's ok to handle the
eventfd_write() now or have the newly introduced function,
rte_vhost_notify_guest(), called at a later time.

This can be used by 3rd party applications, like OVS, to avoid system
calls being called as part of the PMD threads.

v2: - Used vhost_virtqueue->index to find index for operation.
- Aligned function name to VDUSE RFC patchset.
- Added error and offload statistics counter.
- Mark new API as experimental.
- Change the virtual queue spin lock to read/write spin lock.
- Made shared counters atomic.
- Add versioned rte_vhost_driver_callback_register() for
  ABI compliance.

Eelco Chaudron (3):
  vhost: Change vhost_virtqueue access lock to a read/write one.
  vhost: make the guest_notifications statistic counter atomic.
  vhost: add device op to offload the interrupt kick


 lib/eal/include/generic/rte_rwlock.h | 17 +
 lib/vhost/meson.build|  2 +
 lib/vhost/rte_vhost.h| 23 ++-
 lib/vhost/socket.c   | 72 --
 lib/vhost/version.map|  9 +++
 lib/vhost/vhost.c| 92 +---
 lib/vhost/vhost.h| 70 ++---
 lib/vhost/vhost_user.c   | 14 ++---
 lib/vhost/virtio_net.c   | 90 +--
 9 files changed, 288 insertions(+), 101 deletions(-)



Re: [EXT] [PATCH] vhost: add device op to offload the interrupt kick

2023-04-03 Thread Eelco Chaudron



On 27 Mar 2023, at 18:35, Maxime Coquelin wrote:

> On 3/27/23 18:04, Eelco Chaudron wrote:
>>
>>
>> On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote:
>>
>>> Hi Eelco,
>>>
>>>> +void
>>>> +rte_vhost_notify_guest(int vid, uint16_t queue_id) {
>>>> +  struct virtio_net *dev = get_device(vid);
>>>> +  struct vhost_virtqueue *vq;
>>>> +
>>>> +  if (!dev ||  queue_id >= VHOST_MAX_VRING)
>>>> +  return;
>>>> +
>>>> +  vq = dev->virtqueue[queue_id];
>>>> +  if (!vq)
>>>> +  return;
>>>> +
>>>> +  rte_spinlock_lock(&vq->access_lock);
>>>> +
>>>
>>> Is spin lock needed here before system call ?
>>
>> I assumed access_lock is protecting all the following fields in this 
>> structure, so I need the lock to read the vq->callfd, however, I can/should 
>> move the eventfd_write outside of the lock.
>
> The FD might be closed between the check and the call to eventfd_write
> though, but I agree this is not optimal to call the eventfd_write under
> the spinlock in your case, as you will block the pmd thread if it tries
> to enqueue/dequeue packets on this queue, defeating the purpose of this
> patch.
>
> Maybe the solution is to change to read-write locks for the access_lock
> spinlock. The datapath (rte_vhost_enqueue_burst/rte_vhost_dequeue_burst)
> and this API would use the read version, meaning they won't lock each
> other, and the control path (lib/vhost/vhost_user.c) will use the write
> version.
>
> Does that make sense?

Hi Maxime, I prepped a patch, but not the read/write part yet, 
https://github.com/chaudron/dpdk/commit/d51c93b4ff08b43daed33e3c0fee193a6d039c25#.

I was thinking that even a read/write lock does not work (or maybe we need a 
combination of taking the read and write lock). As we need to update 
statistics, which need protection.
For example here you see the split (with just two locks, but the sys call could 
be called in the read lock):

https://github.com/chaudron/dpdk/blob/d51c93b4ff08b43daed33e3c0fee193a6d039c25/lib/vhost/vhost.c#L1493

The best would be to not have a lock when calling the system call, but that 
does not seem safe. I do not see a clear solution, guess also as I have some 
trouble understanding the lock rules around vq’s.

Do you have some more insights to share? I can ping you offline and discuss 
this.

Cheers,

Eelco

>>
>>>> +  if (vq->callfd >= 0)
>>>> +  eventfd_write(vq->callfd, (eventfd_t)1);
>>>> +
>>>> +  rte_spinlock_unlock(&vq->access_lock);
>>>> +}
>>>> +
>>>
>>> Thanks.
>>



Re: [EXT] [PATCH] vhost: add device op to offload the interrupt kick

2023-03-28 Thread Eelco Chaudron



On 27 Mar 2023, at 18:35, Maxime Coquelin wrote:

> On 3/27/23 18:04, Eelco Chaudron wrote:
>>
>>
>> On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote:
>>
>>> Hi Eelco,
>>>
>>>> +void
>>>> +rte_vhost_notify_guest(int vid, uint16_t queue_id) {
>>>> +  struct virtio_net *dev = get_device(vid);
>>>> +  struct vhost_virtqueue *vq;
>>>> +
>>>> +  if (!dev ||  queue_id >= VHOST_MAX_VRING)
>>>> +  return;
>>>> +
>>>> +  vq = dev->virtqueue[queue_id];
>>>> +  if (!vq)
>>>> +  return;
>>>> +
>>>> +  rte_spinlock_lock(&vq->access_lock);
>>>> +
>>>
>>> Is spin lock needed here before system call ?
>>
>> I assumed access_lock is protecting all the following fields in this 
>> structure, so I need the lock to read the vq->callfd, however, I can/should 
>> move the eventfd_write outside of the lock.
>
> The FD might be closed between the check and the call to eventfd_write
> though, but I agree this is not optimal to call the eventfd_write under
> the spinlock in your case, as you will block the pmd thread if it tries
> to enqueue/dequeue packets on this queue, defeating the purpose of this
> patch.
>
> Maybe the solution is to change to read-write locks for the access_lock
> spinlock. The datapath (rte_vhost_enqueue_burst/rte_vhost_dequeue_burst)
> and this API would use the read version, meaning they won't lock each
> other, and the control path (lib/vhost/vhost_user.c) will use the write
> version.
>
> Does that make sense?

Yes, this makes sense, let me investigate this and get back.

>>>> +  if (vq->callfd >= 0)
>>>> +  eventfd_write(vq->callfd, (eventfd_t)1);
>>>> +
>>>> +  rte_spinlock_unlock(&vq->access_lock);
>>>> +}
>>>> +
>>>
>>> Thanks.
>>



Re: [EXT] [PATCH] vhost: add device op to offload the interrupt kick

2023-03-27 Thread Eelco Chaudron



On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote:

> Hi Eelco,
>
>> +void
>> +rte_vhost_notify_guest(int vid, uint16_t queue_id) {
>> +struct virtio_net *dev = get_device(vid);
>> +struct vhost_virtqueue *vq;
>> +
>> +if (!dev ||  queue_id >= VHOST_MAX_VRING)
>> +return;
>> +
>> +vq = dev->virtqueue[queue_id];
>> +if (!vq)
>> +return;
>> +
>> +rte_spinlock_lock(&vq->access_lock);
>> +
>
> Is spin lock needed here before system call ?

I assumed access_lock is protecting all the following fields in this structure, 
so I need the lock to read the vq->callfd, however, I can/should move the 
eventfd_write outside of the lock.

>> +if (vq->callfd >= 0)
>> +eventfd_write(vq->callfd, (eventfd_t)1);
>> +
>> +rte_spinlock_unlock(&vq->access_lock);
>> +}
>> +
>
> Thanks.



Re: [PATCH] vhost: add device op to offload the interrupt kick

2023-03-27 Thread Eelco Chaudron



On 27 Mar 2023, at 15:21, Maxime Coquelin wrote:

> Hi Eelco,
>
> On 3/27/23 14:51, Eelco Chaudron wrote:
>> This patch adds an operation callback which gets called every time the
>> library wants to call eventfd_write(). This eventfd_write() call could
>> result in a system call, which could potentially block the PMD thread.
>>
>> The callback function can decide whether it's ok to handle the
>> eventfd_write() now or have the newly introduced function,
>> rte_vhost_notify_guest(), called at a later time.
>>
>> This can be used by 3rd party applications, like OVS, to avoid system
>> calls being called as part of the PMD threads.
>
> That's a good idea, please find some comments inline:

Thanks for the review, see inline.
>
>> Signed-off-by: Eelco Chaudron 
>> ---
>>   lib/vhost/rte_vhost.h |   10 +-
>>   lib/vhost/vhost.c |   21 +
>>   lib/vhost/vhost.h |   43 ---
>>   3 files changed, 58 insertions(+), 16 deletions(-)
>>
>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>> index 58a5d4be92..af7a394d0f 100644
>> --- a/lib/vhost/rte_vhost.h
>> +++ b/lib/vhost/rte_vhost.h
>> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
>>   */
>>  void (*guest_notified)(int vid);
>>  -   void *reserved[1]; /**< Reserved for future extension */
>> +/**
>> + * If this callback is registered, notification to the guest can
>> + * be handled by the front-end calling rte_vhost_notify_guest().
>> + * If it's not handled, 'false' should be returned. This can be used
>> + * to remove the "slow" eventfd_write() syscall from the datapath.
>> + */
>> +bool (*guest_notify)(int vid, uint16_t queue_id);
>>   };
>>/**
>> @@ -433,6 +439,8 @@ void rte_vhost_log_used_vring(int vid, uint16_t 
>> vring_idx,
>>int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int 
>> enable);
>>  +void rte_vhost_notify_guest(int vid, uint16_t queue_id);
>
> The new API needs to be tagged as experimental, and also documented. (I
> see rte_vhost_enable_guest_notification is not properly documented, so
> not a good example!)

I used exactly that as an example, so thought it should be good ;)
Will add this in the next revision...

>> +
>>   /**
>>* Register vhost driver. path could be different for multiple
>>* instance support.
>> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
>> index ef37943817..ee090d78ef 100644
>> --- a/lib/vhost/vhost.c
>> +++ b/lib/vhost/vhost.c
>> @@ -1467,6 +1467,27 @@ rte_vhost_enable_guest_notification(int vid, uint16_t 
>> queue_id, int enable)
>>  return ret;
>>   }
>>  +void
>> +rte_vhost_notify_guest(int vid, uint16_t queue_id)
>> +{
>> +struct virtio_net *dev = get_device(vid);
>> +struct vhost_virtqueue *vq;
>> +
>> +if (!dev ||  queue_id >= VHOST_MAX_VRING)
>> +return;
>> +
>> +vq = dev->virtqueue[queue_id];
>> +if (!vq)
>> +return;
>> +
>> +rte_spinlock_lock(&vq->access_lock);
>> +
>> +if (vq->callfd >= 0)
>> +eventfd_write(vq->callfd, (eventfd_t)1);
>
> Maybe we should return an error of callfd is invalid or eventfd_write()
> failed.

See below

>> +
>> +rte_spinlock_unlock(&vq->access_lock);
>> +}
>> +
>>   void
>>   rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
>>   {
>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>> index 8fdab13c70..39ad8260a1 100644
>> --- a/lib/vhost/vhost.h
>> +++ b/lib/vhost/vhost.h
>> @@ -883,6 +883,30 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, 
>> uint16_t old)
>>  return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
>>   }
>>  +static __rte_always_inline void
>> +vhost_vring_kick_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> +{
>> +if (dev->notify_ops->guest_notify) {
>> +uint16_t qid;
>> +for (qid = 0;  qid < dev->nr_vring; qid++) {
>> +if (dev->virtqueue[qid] == vq) {
>> +if (dev->notify_ops->guest_notify(dev->vid,
>> +  qid))
>> +goto done;
>> +break;
>> +}
>> +

[PATCH] vhost: add device op to offload the interrupt kick

2023-03-27 Thread Eelco Chaudron
This patch adds an operation callback which gets called every time the
library wants to call eventfd_write(). This eventfd_write() call could
result in a system call, which could potentially block the PMD thread.

The callback function can decide whether it's ok to handle the
eventfd_write() now or have the newly introduced function,
rte_vhost_notify_guest(), called at a later time.

This can be used by 3rd party applications, like OVS, to avoid system
calls being called as part of the PMD threads.

Signed-off-by: Eelco Chaudron 
---
 lib/vhost/rte_vhost.h |   10 +-
 lib/vhost/vhost.c |   21 +
 lib/vhost/vhost.h |   43 ---
 3 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index 58a5d4be92..af7a394d0f 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
 */
void (*guest_notified)(int vid);
 
-   void *reserved[1]; /**< Reserved for future extension */
+   /**
+* If this callback is registered, notification to the guest can
+* be handled by the front-end calling rte_vhost_notify_guest().
+* If it's not handled, 'false' should be returned. This can be used
+* to remove the "slow" eventfd_write() syscall from the datapath.
+*/
+   bool (*guest_notify)(int vid, uint16_t queue_id);
 };
 
 /**
@@ -433,6 +439,8 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
 
 int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int 
enable);
 
+void rte_vhost_notify_guest(int vid, uint16_t queue_id);
+
 /**
  * Register vhost driver. path could be different for multiple
  * instance support.
diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index ef37943817..ee090d78ef 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -1467,6 +1467,27 @@ rte_vhost_enable_guest_notification(int vid, uint16_t 
queue_id, int enable)
return ret;
 }
 
+void
+rte_vhost_notify_guest(int vid, uint16_t queue_id)
+{
+   struct virtio_net *dev = get_device(vid);
+   struct vhost_virtqueue *vq;
+
+   if (!dev ||  queue_id >= VHOST_MAX_VRING)
+   return;
+
+   vq = dev->virtqueue[queue_id];
+   if (!vq)
+   return;
+
+   rte_spinlock_lock(&vq->access_lock);
+
+   if (vq->callfd >= 0)
+   eventfd_write(vq->callfd, (eventfd_t)1);
+
+   rte_spinlock_unlock(&vq->access_lock);
+}
+
 void
 rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
 {
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 8fdab13c70..39ad8260a1 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -883,6 +883,30 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, 
uint16_t old)
return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
 }
 
+static __rte_always_inline void
+vhost_vring_kick_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+   if (dev->notify_ops->guest_notify) {
+   uint16_t qid;
+   for (qid = 0;  qid < dev->nr_vring; qid++) {
+   if (dev->virtqueue[qid] == vq) {
+   if (dev->notify_ops->guest_notify(dev->vid,
+ qid))
+   goto done;
+   break;
+   }
+   }
+   }
+   eventfd_write(vq->callfd, (eventfd_t) 1);
+
+done:
+   if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+   vq->stats.guest_notifications++;
+   if (dev->notify_ops->guest_notified)
+   dev->notify_ops->guest_notified(dev->vid);
+}
+
+
 static __rte_always_inline void
 vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
@@ -905,21 +929,13 @@ vhost_vring_call_split(struct virtio_net *dev, struct 
vhost_virtqueue *vq)
if ((vhost_need_event(vhost_used_event(vq), new, old) &&
(vq->callfd >= 0)) ||
unlikely(!signalled_used_valid)) {
-   eventfd_write(vq->callfd, (eventfd_t) 1);
-   if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-   vq->stats.guest_notifications++;
-   if (dev->notify_ops->guest_notified)
-   dev->notify_ops->guest_notified(dev->vid);
+   vhost_vring_kick_guest(dev, vq);
}
} else {
/* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0)) {
- 

Re: [dpdk-dev] [RFC v2] meter: fix ABI break due to experimental tag removal

2020-02-03 Thread Eelco Chaudron




On 30 Jan 2020, at 21:20, Thomas Monjalon wrote:





Yes, sorry, this is what I meant.
Given 19.11.0 already has the symbol as experimental,
and that applications like OVS had to accept it as experimental,
why removing experimental tag in 19.11.1?


I think it was mentioned that it was preferred not to suppress the
compiler warning to avoid any accidental use in the future, but the
OVS
maintainer(s) should answer as I might remember wrongly.


Yes this is the reason, OVS compiles with -Werror so we would like to
avoid the warnings. You can not disable them per include, it’s 
global

for all of DPDK.


Yes but anyway OVS must accept the experimental function as the next 
release

will use it with DPDK 19.11.0.


Yes, we do for now, but we would like to remove it ASAP as it opens up 
the code base for the experimental APIs to slip in…




Re: [dpdk-dev] [RFC v2] meter: fix ABI break due to experimental tag removal

2020-01-30 Thread Eelco Chaudron




On 30 Jan 2020, at 17:04, Luca Boccassi wrote:


On Thu, 2020-01-30 at 16:55 +0100, Thomas Monjalon wrote:

30/01/2020 15:21, Luca Boccassi:

On Thu, 2020-01-30 at 15:17 +0100, Thomas Monjalon wrote:

30/01/2020 13:57, Luca Boccassi:

On Thu, 2020-01-30 at 13:33 +0100, Thomas Monjalon wrote:

Hi,

I disagree with the need of this patch.
The symbol was experimental, meaning we can change it.
Removing experimental tag is not an ABI break.


Hi,

This symbol change was requested for backport in 19.11.x, and
experimental or not I'm not too keen on backward incompatible
changes
to the public interface in an _LTS point release_. The
compromise
was
to see if we could support both symbols version, which makes
the
change
backward compatible.

If you prefer not to have this patch in mainline I'm also fine
in
taking it just for the LTS. I agree with you that it is not
required
for mainline releases (although nicer for me if it's a backport
rather
than a new change).


I would like to avoid opening the door for maintaining the
experimental ABI
in the mainline. Please take it directly in the LTS.

The next question is to know whether we really want to have such
patch in LTS.
Anyway, 19.11.0 has this symbol as experimental.
How adding a non-experimental version of the function in 19.11.1
will
change
the ABI status of the whole 19.11 branch?


The problem is not adding the new symbol, but removing the
experimental
one. Changing the version of the symbol was requested by OVS for
inclusion in 19.11.


Yes, sorry, this is what I meant.
Given 19.11.0 already has the symbol as experimental,
and that applications like OVS had to accept it as experimental,
why removing experimental tag in 19.11.1?


I think it was mentioned that it was preferred not to suppress the
compiler warning to avoid any accidental use in the future, but the 
OVS

maintainer(s) should answer as I might remember wrongly.


Yes this is the reason, OVS compiles with -Werror so we would like to 
avoid the warnings. You can not disable them per include, it’s global 
for all of DPDK.


//Eelco



Re: [dpdk-dev] [PATCH] vhost: fix deadlock on port deletion

2020-01-15 Thread Eelco Chaudron



On 14 Jan 2020, at 19:53, Maxime Coquelin wrote:

> If the vhost-user application (e.g. OVS) deletes the vhost-user
> port while Qemu sends a vhost-user request, a deadlock can
> happen if the request handler tries to acquire vhost-user's
> global mutex, which is also locked by the vhost-user port
> deletion API (rte_vhost_driver_unregister).
>
> This patch prevents the deadlock by making
> rte_vhost_driver_unregister() to release the mutex and try
> again if a request is being handled to give a chance to
> the request handler to complete.
>
> Fixes: 8b4b949144b8 ("vhost: fix dead lock on closing in server mode")
> Fixes: 5fbb3941da9f ("vhost: introduce driver features related APIs")
> Cc: sta...@dpdk.org
>
> Signed-off-by: Maxime Coquelin 

Acked-by: Eelco Chaudron 



Re: [dpdk-dev] [PATCH v2] net/i40e: always re-program promiscuous mode on VF interface

2019-12-18 Thread Eelco Chaudron

Thanks!



On 18 Dec 2019, at 3:57, Ye Xiaolong wrote:


On 12/18, Zhang, Xiao wrote:



-Original Message-
From: Eelco Chaudron [mailto:echau...@redhat.com]
Sent: Tuesday, November 19, 2019 9:45 PM
To: Xing, Beilei ; Zhang, Qi Z 


Cc: Zhang, Xiao ; dev@dpdk.org
Subject: [PATCH v2] net/i40e: always re-program promiscuous mode on 
VF

interface

During a kernel PF reset, this event is propagated to the VF.
The DPDK VF PMD will execute the reset task before the PF is done 
with his. This
results in the admin queue message not being responded to leaving 
the port in

"promiscuous" mode.

This patch makes sure the promiscuous mode is configured 
independently of the

current admin state.

Signed-off-by: Eelco Chaudron 


Reviewed-by: Xiao Zhang 


Applied to dpdk-next-net-intel, Thanks.




Re: [dpdk-dev] [PATCH v2] net/i40e: always re-program promiscuous mode on VF interface

2019-12-17 Thread Eelco Chaudron

Trying again?

On 4 Dec 2019, at 16:18, Eelco Chaudron wrote:


Any update on this patch?


On 19 Nov 2019, at 14:45, Eelco Chaudron wrote:


During a kernel PF reset, this event is propagated to the VF.
The DPDK VF PMD will execute the reset task before the PF is done
with his. This results in the admin queue message not being responded
to leaving the port in "promiscuous" mode.

This patch makes sure the promiscuous mode is configured 
independently

of the current admin state.

Signed-off-by: Eelco Chaudron 
---
v1-v2:
  In the earlier patch, we would force the 
vf->promisc_unicast_enabled

  state to false after a reset. Based on the review comments it was
  prefered to not take the current state into account when 
programming.


  v1 patch was called: net/i40e: force promiscuous state after VF 
reset


 drivers/net/i40e/i40e_ethdev_vf.c |   16 
 1 file changed, 16 deletions(-)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c

index 5dba092..43f7ab5 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2162,10 +2162,6 @@ static int eth_i40evf_pci_remove(struct 
rte_pci_device *pci_dev)
 	struct i40e_vf *vf = 
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

int ret;

-   /* If enabled, just return */
-   if (vf->promisc_unicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, 1, vf->promisc_multicast_enabled);
if (ret == 0)
vf->promisc_unicast_enabled = TRUE;
@@ -2181,10 +2177,6 @@ static int eth_i40evf_pci_remove(struct 
rte_pci_device *pci_dev)
 	struct i40e_vf *vf = 
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

int ret;

-   /* If disabled, just return */
-   if (!vf->promisc_unicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, 0, vf->promisc_multicast_enabled);
if (ret == 0)
vf->promisc_unicast_enabled = FALSE;
@@ -2200,10 +2192,6 @@ static int eth_i40evf_pci_remove(struct 
rte_pci_device *pci_dev)
 	struct i40e_vf *vf = 
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

int ret;

-   /* If enabled, just return */
-   if (vf->promisc_multicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 1);
if (ret == 0)
vf->promisc_multicast_enabled = TRUE;
@@ -2219,10 +2207,6 @@ static int eth_i40evf_pci_remove(struct 
rte_pci_device *pci_dev)
 	struct i40e_vf *vf = 
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

int ret;

-   /* If enabled, just return */
-   if (!vf->promisc_multicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 0);
if (ret == 0)
vf->promisc_multicast_enabled = FALSE;




[dpdk-dev] [PATCH] meter: move RFC4115 trTCM APIs as none experimental

2019-12-17 Thread Eelco Chaudron
Moved RFC4115 APIs to none experimental as they have been there
since 19.02. Also, these APIs are the same as the none RFC4115 APIs.

Signed-off-by: Eelco Chaudron 
---
 lib/librte_meter/rte_meter.h   |6 --
 lib/librte_meter/rte_meter_version.map |4 ++--
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h
index d69b118..62c8c1e 100644
--- a/lib/librte_meter/rte_meter.h
+++ b/lib/librte_meter/rte_meter.h
@@ -140,7 +140,6 @@ struct rte_meter_trtcm_rfc4115_params {
  * @return
  *0 upon success, error code otherwise
  */
-__rte_experimental
 int
 rte_meter_trtcm_rfc4115_profile_config(
struct rte_meter_trtcm_rfc4115_profile *p,
@@ -187,7 +186,6 @@ struct rte_meter_trtcm_rfc4115_params {
  * @return
  *0 upon success, error code otherwise
  */
-__rte_experimental
 int
 rte_meter_trtcm_rfc4115_config(struct rte_meter_trtcm_rfc4115 *m,
struct rte_meter_trtcm_rfc4115_profile *p);
@@ -295,7 +293,6 @@ struct rte_meter_trtcm_rfc4115_params {
  * @return
  *Color assigned to the current IP packet
  */
-__rte_experimental
 static inline enum rte_color
 rte_meter_trtcm_rfc4115_color_blind_check(
struct rte_meter_trtcm_rfc4115 *m,
@@ -322,7 +319,6 @@ struct rte_meter_trtcm_rfc4115_params {
  * @return
  *Color assigned to the current IP packet
  */
-__rte_experimental
 static inline enum rte_color
 rte_meter_trtcm_rfc4115_color_aware_check(
struct rte_meter_trtcm_rfc4115 *m,
@@ -582,7 +578,6 @@ struct rte_meter_trtcm_rfc4115 {
return RTE_COLOR_GREEN;
 }
 
-__rte_experimental
 static inline enum rte_color
 rte_meter_trtcm_rfc4115_color_blind_check(
struct rte_meter_trtcm_rfc4115 *m,
@@ -626,7 +621,6 @@ struct rte_meter_trtcm_rfc4115 {
return RTE_COLOR_RED;
 }
 
-__rte_experimental
 static inline enum rte_color
 rte_meter_trtcm_rfc4115_color_aware_check(
struct rte_meter_trtcm_rfc4115 *m,
diff --git a/lib/librte_meter/rte_meter_version.map 
b/lib/librte_meter/rte_meter_version.map
index 46410b0..fc12cc0 100644
--- a/lib/librte_meter/rte_meter_version.map
+++ b/lib/librte_meter/rte_meter_version.map
@@ -13,11 +13,11 @@ DPDK_20.0 {
local: *;
 };
 
-EXPERIMENTAL {
+DPDK_20.0.1 {
global:
 
rte_meter_trtcm_rfc4115_color_aware_check;
rte_meter_trtcm_rfc4115_color_blind_check;
rte_meter_trtcm_rfc4115_config;
rte_meter_trtcm_rfc4115_profile_config;
-};
+} DPDK_20.0;



Re: [dpdk-dev] [PATCH v2] net/i40e: always re-program promiscuous mode on VF interface

2019-12-04 Thread Eelco Chaudron

Any update on this patch?


On 19 Nov 2019, at 14:45, Eelco Chaudron wrote:


During a kernel PF reset, this event is propagated to the VF.
The DPDK VF PMD will execute the reset task before the PF is done
with his. This results in the admin queue message not being responded
to leaving the port in "promiscuous" mode.

This patch makes sure the promiscuous mode is configured independently
of the current admin state.

Signed-off-by: Eelco Chaudron 
---
v1-v2:
  In the earlier patch, we would force the vf->promisc_unicast_enabled
  state to false after a reset. Based on the review comments it was
  prefered to not take the current state into account when 
programming.


  v1 patch was called: net/i40e: force promiscuous state after VF 
reset


 drivers/net/i40e/i40e_ethdev_vf.c |   16 
 1 file changed, 16 deletions(-)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c

index 5dba092..43f7ab5 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2162,10 +2162,6 @@ static int eth_i40evf_pci_remove(struct 
rte_pci_device *pci_dev)
 	struct i40e_vf *vf = 
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

int ret;

-   /* If enabled, just return */
-   if (vf->promisc_unicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, 1, vf->promisc_multicast_enabled);
if (ret == 0)
vf->promisc_unicast_enabled = TRUE;
@@ -2181,10 +2177,6 @@ static int eth_i40evf_pci_remove(struct 
rte_pci_device *pci_dev)
 	struct i40e_vf *vf = 
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

int ret;

-   /* If disabled, just return */
-   if (!vf->promisc_unicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, 0, vf->promisc_multicast_enabled);
if (ret == 0)
vf->promisc_unicast_enabled = FALSE;
@@ -2200,10 +2192,6 @@ static int eth_i40evf_pci_remove(struct 
rte_pci_device *pci_dev)
 	struct i40e_vf *vf = 
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

int ret;

-   /* If enabled, just return */
-   if (vf->promisc_multicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 1);
if (ret == 0)
vf->promisc_multicast_enabled = TRUE;
@@ -2219,10 +2207,6 @@ static int eth_i40evf_pci_remove(struct 
rte_pci_device *pci_dev)
 	struct i40e_vf *vf = 
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

int ret;

-   /* If enabled, just return */
-   if (!vf->promisc_multicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 0);
if (ret == 0)
vf->promisc_multicast_enabled = FALSE;




Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset

2019-11-19 Thread Eelco Chaudron




On 13 Nov 2019, at 2:14, Zhang, Xiao wrote:


-Original Message-
From: Eelco Chaudron [mailto:echau...@redhat.com]
Sent: Tuesday, November 12, 2019 7:09 PM
To: Zhang, Xiao 
Cc: Zhang, Qi Z ; dev@dpdk.org; Xing, Beilei

Subject: Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state 
after VF

reset



On 12 Nov 2019, at 1:52, Zhang, Xiao wrote:


Hi Eelco,

Seems you missed this mail.


Not sure why I missed this email, as it does not show up in my email 
client :( See

below…




Hi Eelco,

I think you may need add more detailed message in the commit 
log

or comments.


My interpretation of the request was that Beilei wanted to know
why
disabling promiscuous mode in HW was failing. Beilei can you
comment, is the additional description from Xiao enough?


Yes, promisc_unicast_enabled flag is not cleared during vf reset
because fail to disable promiscuous mode, So I think we need to
root
cause why fail to  disable promiscuous mode first.
This patch looks like a workaround but not a fix.



This was debugged together with Xiao and from what I understand is
that DPDK fails to reset promiscuous mode in hardware as PF and VF
operations are not synced between kernel and DPDK.

Xiao told me this could not be fixed in another way, Xiao can you
comment?



Checked again, the root cause is not synced issue between kernel 
and

DPDK
during reset.


What is the root cause, so I can update the patch description for the
next revision.


Kernel PF do reset once VF mac changed and send reset event to VF,
VF do reset once received even if reset task not done by PF, so admin 
q message will not get response.


Thanks, added this to the v2 commit message.



Suggest to remove the checking and setting of 
promisc_unicast_enabled

flag,
since this flag is only used when enable/disable promiscuous mode.
Considering the un-synced issue, it will be more clean if remove 
the

flag.
Also same with flag promisc_multicast_enabled.


So if I understand it correctly remove the following code:

2203/* If enabled, just return */
2204if (vf->promisc_multicast_enabled)
2205return 0;

and

/* If enabled, just return */
2223if (!vf->promisc_multicast_enabled)
2224return 0;

Or to remove the flags from the i40e_vf structure (and relative  
code):


1051bool promisc_unicast_enabled;
1052bool promisc_multicast_enabled;

Let me know and I craft up a patch…


You can remove promisc_unicast_enabled/promisc_multicast_enabled and 
related code.


Removing the flags did not work, as they are needed for programming one 
or the other, so I kept them.
In the end, I removed the “If enabled” checks above, see the v2 
patch with the name
“[PATCH v2] net/i40e: always re-program promiscuous mode on VF 
interface”




[dpdk-dev] [PATCH v2] net/i40e: always re-program promiscuous mode on VF interface

2019-11-19 Thread Eelco Chaudron
During a kernel PF reset, this event is propagated to the VF.
The DPDK VF PMD will execute the reset task before the PF is done
with his. This results in the admin queue message not being responded
to leaving the port in "promiscuous" mode.

This patch makes sure the promiscuous mode is configured independently
of the current admin state.

Signed-off-by: Eelco Chaudron 
---
v1-v2:
  In the earlier patch, we would force the vf->promisc_unicast_enabled
  state to false after a reset. Based on the review comments it was
  prefered to not take the current state into account when programming.

  v1 patch was called: net/i40e: force promiscuous state after VF reset

 drivers/net/i40e/i40e_ethdev_vf.c |   16 
 1 file changed, 16 deletions(-)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c
index 5dba092..43f7ab5 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2162,10 +2162,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device 
*pci_dev)
struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
int ret;
 
-   /* If enabled, just return */
-   if (vf->promisc_unicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, 1, vf->promisc_multicast_enabled);
if (ret == 0)
vf->promisc_unicast_enabled = TRUE;
@@ -2181,10 +2177,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device 
*pci_dev)
struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
int ret;
 
-   /* If disabled, just return */
-   if (!vf->promisc_unicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, 0, vf->promisc_multicast_enabled);
if (ret == 0)
vf->promisc_unicast_enabled = FALSE;
@@ -2200,10 +2192,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device 
*pci_dev)
struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
int ret;
 
-   /* If enabled, just return */
-   if (vf->promisc_multicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 1);
if (ret == 0)
vf->promisc_multicast_enabled = TRUE;
@@ -2219,10 +2207,6 @@ static int eth_i40evf_pci_remove(struct rte_pci_device 
*pci_dev)
struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
int ret;
 
-   /* If enabled, just return */
-   if (!vf->promisc_multicast_enabled)
-   return 0;
-
ret = i40evf_config_promisc(dev, vf->promisc_unicast_enabled, 0);
if (ret == 0)
vf->promisc_multicast_enabled = FALSE;



Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset

2019-11-12 Thread Eelco Chaudron




On 12 Nov 2019, at 1:52, Zhang, Xiao wrote:


Hi Eelco,

Seems you missed this mail.


Not sure why I missed this email, as it does not show up in my email 
client :(

See below…




Hi Eelco,

I think you may need add more detailed message in the commit log
or comments.


My interpretation of the request was that Beilei wanted to know 
why

disabling promiscuous mode in HW was failing. Beilei can you
comment, is the additional description from Xiao enough?


Yes, promisc_unicast_enabled flag is not cleared during vf reset
because fail to disable promiscuous mode, So I think we need to 
root

cause why fail to  disable promiscuous mode first.
This patch looks like a workaround but not a fix.



This was debugged together with Xiao and from what I understand is
that DPDK fails to reset promiscuous mode in hardware as PF and VF
operations are not synced between kernel and DPDK.

Xiao told me this could not be fixed in another way, Xiao can you
comment?



Checked again, the root cause is not synced issue between kernel and 
DPDK

during reset.


What is the root cause, so I can update the patch description for the 
next revision.


Suggest to remove the checking and setting of promisc_unicast_enabled 
flag,

since this flag is only used when enable/disable promiscuous mode.
Considering the un-synced issue, it will be more clean if remove the 
flag.

Also same with flag promisc_multicast_enabled.


So if I understand it correctly remove the following code:

2203/* If enabled, just return */
2204if (vf->promisc_multicast_enabled)
2205return 0;

and

/* If enabled, just return */
2223if (!vf->promisc_multicast_enabled)
2224return 0;

Or to remove the flags from the i40e_vf structure (and relative  code):

1051bool promisc_unicast_enabled;
1052bool promisc_multicast_enabled;

Let me know and I craft up a patch…


Cheers,


Eelco



Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset

2019-11-11 Thread Eelco Chaudron




On 1 Nov 2019, at 9:12, Eelco Chaudron wrote:





+   /*
+* Even though the device reset is successful disabling
promiscuous
+* mode might not always succeed, causing enabling it after
reset
to


I think we need to root cause why fail to disable promiscuous 
mode

and
try to fix it first.


I’ve copied in Xiao who helped me identify the issue in your
driver.

The issue is because the change from kernel pf was not synced to 
DPDK

vf during
the closing period of reset, so we get this failure. Xiao can you 
add

more details?



The root cause is when kernel pf generate DPDK vf reset event, the
flag
vf->promisc_unicast_enabled will not be cleared but promiscuous 
mode

is
disabled. When trying to enable promiscuous mode next time by 
calling

i40evf_dev_promiscuous_enable won't work because it will check the
vf->promisc_unicast_enabled flag first.

static int
i40evf_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
...
/* If enabled, just return */
if (vf->promisc_unicast_enabled)
return 0;
...
}

Hi Eelco,

I think you may need add more detailed message in the commit log or
comments.


My interpretation of the request was that Beilei wanted to know why
disabling promiscuous mode in HW was failing. Beilei can you 
comment, is

the additional description from Xiao enough?


Yes, promisc_unicast_enabled flag is not cleared during vf reset 
because fail to disable promiscuous mode,
So I think we need to root cause why fail to  disable promiscuous 
mode first.

This patch looks like a workaround but not a fix.



This was debugged together with Xiao and from what I understand is 
that DPDK fails to reset promiscuous mode in hardware as PF and VF 
operations are not synced between kernel and DPDK.


Xiao told me this could not be fixed in another way, Xiao can you 
comment?




Xiao any update you can add to this so it’s more clear for Beilei?

Thanks,

Eelco



Re: [dpdk-dev] [PATCH] net/af_xdp: enforce an MTU of 1500

2019-11-07 Thread Eelco Chaudron




On 7 Nov 2019, at 14:27, Ciara Loftus wrote:


Packets larger than this will fail to be transmitted by AF_XDP,
so limit the MTU to 1500.

Signed-off-by: Ciara Loftus 
---
 drivers/net/af_xdp/rte_eth_af_xdp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c 
b/drivers/net/af_xdp/rte_eth_af_xdp.c

index 2b1245ee4..62c801500 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -595,7 +595,7 @@ eth_dev_info(struct rte_eth_dev *dev, struct 
rte_eth_dev_info *dev_info)

dev_info->max_tx_queues = internals->queue_cnt;

dev_info->min_mtu = RTE_ETHER_MIN_MTU;
-	dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - 
ETH_AF_XDP_DATA_HEADROOM;

+   dev_info->max_mtu = 1500;


Why is this limited to 1500 in the AF_XDP PMD? For native AF_XDP in OVS 
the limit is mainly by the driver/XDP page limit:
	min(PAGE_SIZE, ETH_AF_XDP_FRAME_SIZE ) - ETH_XDP_DATA_HEADROOM - 
XDP_PACKET_HEADROOM




dev_info->default_rxportconf.nb_queues = 1;
dev_info->default_txportconf.nb_queues = 1;
@@ -1328,6 +1328,8 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device 
*dev)

return -1;
}

+   eth_dev_mtu_set(eth_dev, 1500);
+
rte_eth_dev_probing_finish(eth_dev);

return 0;
--
2.17.1




Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset

2019-11-01 Thread Eelco Chaudron




On 1 Nov 2019, at 3:38, Xing, Beilei wrote:


-Original Message-
From: Eelco Chaudron [mailto:echau...@redhat.com]
Sent: Tuesday, October 29, 2019 3:47 PM
To: Zhang, Xiao ; Xing, Beilei 


Cc: Zhang, Qi Z ; dev@dpdk.org
Subject: Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state 
after VF

reset



On 29 Oct 2019, at 8:39, Zhang, Xiao wrote:


-Original Message-
From: Eelco Chaudron 
Sent: Friday, October 25, 2019 5:24 PM
To: Xing, Beilei ; Zhang, Xiao

Cc: Zhang, Qi Z ; dev@dpdk.org
Subject: Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state
after VF reset



On 17 Oct 2019, at 8:34, Xing, Beilei wrote:


On 17 Sep 2019, at 9:40, Eelco Chaudron wrote:

Even though the device reset is successful, disabling 
promiscuous
mode might not always succeed, causing enabling it after reset 
to

fail.
This would happen when the kernel driver requires a reset of the
VF.

This patch resets the internal state, so next time promiscuous
mode
is configured it will be enabled.

Signed-off-by: Eelco Chaudron 
---
 drivers/net/i40e/i40e_ethdev_vf.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c
b/drivers/net/i40e/i40e_ethdev_vf.c
index 551f6fa..e0f99a4 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2276,11 +2276,21 @@ static int eth_i40evf_pci_remove(struct
rte_pci_device *pci_dev)  i40evf_dev_reset(struct rte_eth_dev
*dev)
{
int ret;
+   struct i40e_vf *vf =
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

ret = i40evf_dev_uninit(dev);
if (ret)
return ret;

+   /*
+* Even though the device reset is successful disabling
promiscuous
+* mode might not always succeed, causing enabling it after
reset
to


I think we need to root cause why fail to disable promiscuous mode
and
try to fix it first.


I’ve copied in Xiao who helped me identify the issue in your
driver.

The issue is because the change from kernel pf was not synced to 
DPDK

vf during
the closing period of reset, so we get this failure. Xiao can you 
add

more details?



The root cause is when kernel pf generate DPDK vf reset event, the
flag
vf->promisc_unicast_enabled will not be cleared but promiscuous mode
is
disabled. When trying to enable promiscuous mode next time by 
calling

i40evf_dev_promiscuous_enable won't work because it will check the
vf->promisc_unicast_enabled flag first.

static int
i40evf_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
...
/* If enabled, just return */
if (vf->promisc_unicast_enabled)
return 0;
...
}

Hi Eelco,

I think you may need add more detailed message in the commit log or
comments.


My interpretation of the request was that Beilei wanted to know why
disabling promiscuous mode in HW was failing. Beilei can you comment, 
is

the additional description from Xiao enough?


Yes, promisc_unicast_enabled flag is not cleared during vf reset 
because fail to disable promiscuous mode,
So I think we need to root cause why fail to  disable promiscuous mode 
first.

This patch looks like a workaround but not a fix.



This was debugged together with Xiao and from what I understand is that 
DPDK fails to reset promiscuous mode in hardware as PF and VF operations 
are not synced between kernel and DPDK.


Xiao told me this could not be fixed in another way, Xiao can you 
comment?







+* fail. This would happen when the kernel driver requires a
reset
+* of the VF.
+*/
+   if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+   vf->promisc_unicast_enabled = FALSE;
+
ret = i40evf_dev_init(dev);

return ret;
--
1.8.3.1




Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset

2019-10-29 Thread Eelco Chaudron




On 29 Oct 2019, at 8:39, Zhang, Xiao wrote:


-Original Message-
From: Eelco Chaudron 
Sent: Friday, October 25, 2019 5:24 PM
To: Xing, Beilei ; Zhang, Xiao 


Cc: Zhang, Qi Z ; dev@dpdk.org
Subject: Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state 
after VF

reset



On 17 Oct 2019, at 8:34, Xing, Beilei wrote:


On 17 Sep 2019, at 9:40, Eelco Chaudron wrote:


Even though the device reset is successful, disabling promiscuous
mode might not always succeed, causing enabling it after reset to
fail.
This would happen when the kernel driver requires a reset of the 
VF.


This patch resets the internal state, so next time promiscuous 
mode

is configured it will be enabled.

Signed-off-by: Eelco Chaudron 
---
 drivers/net/i40e/i40e_ethdev_vf.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c
b/drivers/net/i40e/i40e_ethdev_vf.c
index 551f6fa..e0f99a4 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2276,11 +2276,21 @@ static int eth_i40evf_pci_remove(struct
rte_pci_device *pci_dev)  i40evf_dev_reset(struct rte_eth_dev 
*dev)

{
int ret;
+   struct i40e_vf *vf =
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

ret = i40evf_dev_uninit(dev);
if (ret)
return ret;

+   /*
+* Even though the device reset is successful disabling
promiscuous
+	 * mode might not always succeed, causing enabling it after 
reset

to


I think we need to root cause why fail to disable promiscuous mode 
and

try to fix it first.


I’ve copied in Xiao who helped me identify the issue in your 
driver.


The issue is because the change from kernel pf was not synced to DPDK 
vf during
the closing period of reset, so we get this failure. Xiao can you add 
more details?




The root cause is when kernel pf generate DPDK vf reset event, the 
flag
vf->promisc_unicast_enabled will not be cleared but promiscuous mode 
is

disabled. When trying to enable promiscuous mode next time by calling
i40evf_dev_promiscuous_enable won't work because it will check the
vf->promisc_unicast_enabled flag first.

static int
i40evf_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
...
/* If enabled, just return */
if (vf->promisc_unicast_enabled)
return 0;
...
}

Hi Eelco,

I think you may need add more detailed message in the commit log or 
comments.


My interpretation of the request was that Beilei wanted to know why 
disabling promiscuous mode in HW was failing. Beilei can you comment, is 
the additional description from Xiao enough?




+	 * fail. This would happen when the kernel driver requires a 
reset

+* of the VF.
+*/
+   if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+   vf->promisc_unicast_enabled = FALSE;
+
ret = i40evf_dev_init(dev);

return ret;
--
1.8.3.1




Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset

2019-10-25 Thread Eelco Chaudron




On 17 Oct 2019, at 8:34, Xing, Beilei wrote:


On 17 Sep 2019, at 9:40, Eelco Chaudron wrote:

Even though the device reset is successful, disabling promiscuous 
mode

might not always succeed, causing enabling it after reset to fail.
This would happen when the kernel driver requires a reset of the VF.

This patch resets the internal state, so next time promiscuous mode 
is

configured it will be enabled.

Signed-off-by: Eelco Chaudron 
---
 drivers/net/i40e/i40e_ethdev_vf.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c
b/drivers/net/i40e/i40e_ethdev_vf.c
index 551f6fa..e0f99a4 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2276,11 +2276,21 @@ static int eth_i40evf_pci_remove(struct
rte_pci_device *pci_dev)  i40evf_dev_reset(struct rte_eth_dev *dev)  
{

int ret;
+   struct i40e_vf *vf =
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);

ret = i40evf_dev_uninit(dev);
if (ret)
return ret;

+   /*
+	 * Even though the device reset is successful disabling 
promiscuous
+	 * mode might not always succeed, causing enabling it after reset 
to


I think we need to root cause why fail to disable promiscuous mode and 
try to fix it first.


I’ve copied in Xiao who helped me identify the issue in your driver.

The issue is because the change from kernel pf was not synced to DPDK vf 
during the closing period of reset, so we get this failure. Xiao can you 
add more details?




+* fail. This would happen when the kernel driver requires a reset
+* of the VF.
+*/
+   if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+   vf->promisc_unicast_enabled = FALSE;
+
ret = i40evf_dev_init(dev);

return ret;
--
1.8.3.1




Re: [dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset

2019-10-15 Thread Eelco Chaudron

Any update on this patch?

Thanks,

Eelco


On 17 Sep 2019, at 9:40, Eelco Chaudron wrote:


Even though the device reset is successful, disabling promiscuous mode
might not always succeed, causing enabling it after reset to
fail. This would happen when the kernel driver requires a reset of the
VF.

This patch resets the internal state, so next time promiscuous mode is
configured it will be enabled.

Signed-off-by: Eelco Chaudron 
---
 drivers/net/i40e/i40e_ethdev_vf.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c

index 551f6fa..e0f99a4 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2276,11 +2276,21 @@ static int eth_i40evf_pci_remove(struct 
rte_pci_device *pci_dev)

 i40evf_dev_reset(struct rte_eth_dev *dev)
 {
int ret;
+	struct i40e_vf *vf = 
I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);


ret = i40evf_dev_uninit(dev);
if (ret)
return ret;

+   /*
+* Even though the device reset is successful disabling promiscuous
+* mode might not always succeed, causing enabling it after reset to
+* fail. This would happen when the kernel driver requires a reset
+* of the VF.
+*/
+   if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+   vf->promisc_unicast_enabled = FALSE;
+
ret = i40evf_dev_init(dev);

return ret;
--
1.8.3.1


[dpdk-dev] [PATCH] net/i40e: force promiscuous state after VF reset

2019-09-17 Thread Eelco Chaudron
Even though the device reset is successful, disabling promiscuous mode
might not always succeed, causing enabling it after reset to
fail. This would happen when the kernel driver requires a reset of the
VF.

This patch resets the internal state, so next time promiscuous mode is
configured it will be enabled.

Signed-off-by: Eelco Chaudron 
---
 drivers/net/i40e/i40e_ethdev_vf.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c
index 551f6fa..e0f99a4 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2276,11 +2276,21 @@ static int eth_i40evf_pci_remove(struct rte_pci_device 
*pci_dev)
 i40evf_dev_reset(struct rte_eth_dev *dev)
 {
int ret;
+   struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
 
ret = i40evf_dev_uninit(dev);
if (ret)
return ret;
 
+   /*
+* Even though the device reset is successful disabling promiscuous
+* mode might not always succeed, causing enabling it after reset to
+* fail. This would happen when the kernel driver requires a reset
+* of the VF.
+*/
+   if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+   vf->promisc_unicast_enabled = FALSE;
+
ret = i40evf_dev_init(dev);
 
return ret;
-- 
1.8.3.1



Re: [dpdk-dev] [PATCH] net/i40e: downgrade unnecessary error log

2019-09-13 Thread Eelco Chaudron

Forgot to include the stable alias to get this in 18.11, sorry Kevin

//Eelco

On 12 Sep 2019, at 14:40, Ye Xiaolong wrote:


On 09/12, David Marchand wrote:
On Thu, Sep 12, 2019 at 12:38 PM Eelco Chaudron  
wrote:


When receiving the unsupported AQ messages, it's taken as an
error. It's not appropriate and triggers too much unnecessary print.

This commit is similar to commit e130425300b0 ("net/i40e: downgrade
unnecessary error log") which made the same change for the PF 
instance.


Signed-off-by: Eelco Chaudron 
---
 drivers/net/i40e/i40e_ethdev_vf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c

index c77b30c54..7b1d485c6 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -1411,7 +1411,7 @@ i40evf_handle_aq_msg(struct rte_eth_dev *dev)
}
break;
default:
-   PMD_DRV_LOG(ERR, "Request %u is not 
supported yet",
+   PMD_DRV_LOG(DEBUG, "Request %u is not 
supported yet",

aq_opc);
break;
}
--
2.18.1



Reviewed-by: David Marchand 


--
David Marchand


Reviewed-by: Xiaolong Ye 

Applied to dpdk-next-net-intel.


Re: [dpdk-dev] [PATCH] net/i40e: downgrade unnecessary error log

2019-09-12 Thread Eelco Chaudron

On 12 Sep 2019, at 14:40, Ye Xiaolong wrote:


On 09/12, David Marchand wrote:
On Thu, Sep 12, 2019 at 12:38 PM Eelco Chaudron  
wrote:


When receiving the unsupported AQ messages, it's taken as an
error. It's not appropriate and triggers too much unnecessary print.

This commit is similar to commit e130425300b0 ("net/i40e: downgrade
unnecessary error log") which made the same change for the PF 
instance.


Signed-off-by: Eelco Chaudron 
---
 drivers/net/i40e/i40e_ethdev_vf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c

index c77b30c54..7b1d485c6 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -1411,7 +1411,7 @@ i40evf_handle_aq_msg(struct rte_eth_dev *dev)
}
break;
default:
-   PMD_DRV_LOG(ERR, "Request %u is not 
supported yet",
+   PMD_DRV_LOG(DEBUG, "Request %u is not 
supported yet",

aq_opc);
break;
}
--
2.18.1



Reviewed-by: David Marchand 


--
David Marchand


Reviewed-by: Xiaolong Ye 

Applied to dpdk-next-net-intel.


Thanks David/Ye


[dpdk-dev] [PATCH] net/i40e: downgrade unnecessary error log

2019-09-12 Thread Eelco Chaudron
When receiving the unsupported AQ messages, it's taken as an
error. It's not appropriate and triggers too much unnecessary print.

This commit is similar to commit e130425300b0 ("net/i40e: downgrade
unnecessary error log") which made the same change for the PF instance.

Signed-off-by: Eelco Chaudron 
---
 drivers/net/i40e/i40e_ethdev_vf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c
index c77b30c54..7b1d485c6 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -1411,7 +1411,7 @@ i40evf_handle_aq_msg(struct rte_eth_dev *dev)
}
break;
default:
-   PMD_DRV_LOG(ERR, "Request %u is not supported yet",
+   PMD_DRV_LOG(DEBUG, "Request %u is not supported yet",
aq_opc);
break;
}
-- 
2.18.1



[dpdk-dev] [PATCH] vhost: add device opr when notification to guest is sent

2019-08-28 Thread Eelco Chaudron
This patch adds an operation callback which gets called every time
the library is waking up the guest trough an eventfd_write() call.

This can be used by 3rd party application, like OVS, to track the
number of times interrupts where generated. This might be of
interest to find out system-call were called in the fast path.

Signed-off-by: Eelco Chaudron 
---
 lib/librte_vhost/rte_vhost.h | 10 +-
 lib/librte_vhost/vhost.h | 15 ---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 7fb1729..878e339 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -172,7 +172,15 @@ struct vhost_device_ops {
int (*new_connection)(int vid);
void (*destroy_connection)(int vid);
 
-   void *reserved[2]; /**< Reserved for future extension */
+   /**
+* This callback gets called each time a guest gets notified
+* about waiting packets. This is the interrupt handling trough
+* the eventfd_write(callfd), which can be used for counting these
+* "slow" syscalls.
+*/
+   void (*guest_notified)(int vid);
+
+   void *reserved[1]; /**< Reserved for future extension */
 };
 
 /**
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 884befa..5131a97 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -543,13 +543,19 @@ void *vhost_alloc_copy_ind_table(struct virtio_net *dev,
 
if ((vhost_need_event(vhost_used_event(vq), new, old) &&
(vq->callfd >= 0)) ||
-   unlikely(!signalled_used_valid))
+   unlikely(!signalled_used_valid)) {
eventfd_write(vq->callfd, (eventfd_t) 1);
+   if (dev->notify_ops->guest_notified)
+   dev->notify_ops->guest_notified(dev->vid);
+   }
} else {
/* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-   && (vq->callfd >= 0))
+   && (vq->callfd >= 0)) {
eventfd_write(vq->callfd, (eventfd_t)1);
+   if (dev->notify_ops->guest_notified)
+   dev->notify_ops->guest_notified(dev->vid);
+   }
}
 }
 
@@ -600,8 +606,11 @@ void *vhost_alloc_copy_ind_table(struct virtio_net *dev,
if (vhost_need_event(off, new, old))
kick = true;
 kick:
-   if (kick)
+   if (kick) {
eventfd_write(vq->callfd, (eventfd_t)1);
+   if (dev->notify_ops->guest_notified)
+   dev->notify_ops->guest_notified(dev->vid);
+   }
 }
 
 static __rte_always_inline void
-- 
1.8.3.1



Re: [dpdk-dev] i40e driver with VFs and their changing MAC

2019-07-03 Thread Eelco Chaudron

Intel, any feedback on the below?

Thanks,

Eelco

On 24 Jun 2019, at 17:39, Eelco Chaudron wrote:


Hi,

I’m running into an issue where I have a VF used by OVS-DPDK and 
from the kernel side change the MAC.


I used the following command to change the mac:

ip link set p5p2 vf 0 mac 52:54:00:92:d3:30

In the kernel log I see the following:

[ 3420.075736] i40e :05:00.1: Bring down and up the VF interface 
to make this change effective.


This seems to bring down the VF, which seems to make the DPDK side go 
crazy:
2019-06-24T15:16:55.052Z|1|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|2|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|3|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|4|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|5|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|6|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|7|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|8|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|9|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00010|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00011|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00012|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00013|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00014|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00015|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00016|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00017|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00018|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00019|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00020|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00021|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00022|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00023|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00024|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00025|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00026|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00027|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00028|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00029|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00030|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00031|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00032|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00033|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00034|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00035|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00036|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00037|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00038|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00039|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00040|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00041|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00042|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00043|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00044|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00045|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00046|dpdk|ERR|i40evf_handle_aq_msg(): 
Request 0 is not supported yet
2019-06-24T15:16:55.052Z|00047|dpdk|ERR

[dpdk-dev] i40e driver with VFs and their changing MAC

2019-06-24 Thread Eelco Chaudron

Hi,

I’m running into an issue where I have a VF used by OVS-DPDK and from 
the kernel side change the MAC.


I used the following command to change the mac:

ip link set p5p2 vf 0 mac 52:54:00:92:d3:30

In the kernel log I see the following:

[ 3420.075736] i40e :05:00.1: Bring down and up the VF interface to 
make this change effective.


This seems to bring down the VF, which seems to make the DPDK side go 
crazy:
2019-06-24T15:16:55.052Z|1|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|2|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|3|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|4|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|5|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|6|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|7|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|8|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|9|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00010|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00011|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00012|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00013|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00014|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00015|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00016|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00017|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00018|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00019|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00020|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00021|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00022|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00023|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00024|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00025|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00026|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00027|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00028|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00029|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00030|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00031|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00032|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00033|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00034|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00035|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00036|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00037|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00038|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00039|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00040|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00041|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00042|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00043|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00044|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00045|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00046|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00047|dpdk|ERR|i40evf_handle_aq_msg(): Request 
0 is not supported yet
2019-06-24T15:16:55.052Z|00048|dpdk|ERR|i40evf_handle_aq_msg(

Re: [dpdk-dev] [dpdk-stable] [PATCH 2/2] net/ice: fix dropped packets statistics name

2019-06-03 Thread Eelco Chaudron
Looks good to me…

Acked-by: Eelco Chaudron 

On 3 Jun 2019, at 10:31, David Marchand wrote:

> Copy/paste from i40e, let's align with the fix on i40e.
>
> Fixes: a37bde56314d ("net/ice: support statistics")
> Cc: sta...@dpdk.org
>
> Signed-off-by: David Marchand 
> ---
>  drivers/net/ice/ice_ethdev.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c
> index bdbceb4..6cd1c74 100644
> --- a/drivers/net/ice/ice_ethdev.c
> +++ b/drivers/net/ice/ice_ethdev.c
> @@ -153,13 +153,13 @@ struct ice_xstats_name_off {
>   {"rx_unicast_packets", offsetof(struct ice_eth_stats, rx_unicast)},
>   {"rx_multicast_packets", offsetof(struct ice_eth_stats, rx_multicast)},
>   {"rx_broadcast_packets", offsetof(struct ice_eth_stats, rx_broadcast)},
> - {"rx_dropped", offsetof(struct ice_eth_stats, rx_discards)},
> + {"rx_dropped_packets", offsetof(struct ice_eth_stats, rx_discards)},
>   {"rx_unknown_protocol_packets", offsetof(struct ice_eth_stats,
>   rx_unknown_protocol)},
>   {"tx_unicast_packets", offsetof(struct ice_eth_stats, tx_unicast)},
>   {"tx_multicast_packets", offsetof(struct ice_eth_stats, tx_multicast)},
>   {"tx_broadcast_packets", offsetof(struct ice_eth_stats, tx_broadcast)},
> - {"tx_dropped", offsetof(struct ice_eth_stats, tx_discards)},
> + {"tx_dropped_packets", offsetof(struct ice_eth_stats, tx_discards)},
>  };
>
>  #define ICE_NB_ETH_XSTATS (sizeof(ice_stats_strings) / \
> -- 
> 1.8.3.1


Re: [dpdk-dev] [dpdk-stable] [PATCH 1/2] net/i40e: fix dropped packets statistics name

2019-06-03 Thread Eelco Chaudron

Looks good to me…

Acked-by: Eelco Chaudron 

On 3 Jun 2019, at 10:31, David Marchand wrote:


i40e and i40evf currently use two different names for the statistic on
dropped packets on the rx and tx sides.
Let's prefer i40evf so that all statistics are suffixed with _packets.

This also avoids a statistic name conflict in OVS.

Fixes: f4a91c38b4ad ("i40e: add extended stats")
Cc: sta...@dpdk.org

Signed-off-by: David Marchand 
---
 drivers/net/i40e/i40e_ethdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/i40e/i40e_ethdev.c 
b/drivers/net/i40e/i40e_ethdev.c

index 7fa9e1b..2384d4d 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -526,13 +526,13 @@ struct rte_i40e_xstats_name_off {
{"rx_unicast_packets", offsetof(struct i40e_eth_stats, rx_unicast)},
 	{"rx_multicast_packets", offsetof(struct i40e_eth_stats, 
rx_multicast)},
 	{"rx_broadcast_packets", offsetof(struct i40e_eth_stats, 
rx_broadcast)},

-   {"rx_dropped", offsetof(struct i40e_eth_stats, rx_discards)},
+	{"rx_dropped_packets", offsetof(struct i40e_eth_stats, 
rx_discards)},

{"rx_unknown_protocol_packets", offsetof(struct i40e_eth_stats,
rx_unknown_protocol)},
{"tx_unicast_packets", offsetof(struct i40e_eth_stats, tx_unicast)},
 	{"tx_multicast_packets", offsetof(struct i40e_eth_stats, 
tx_multicast)},
 	{"tx_broadcast_packets", offsetof(struct i40e_eth_stats, 
tx_broadcast)},

-   {"tx_dropped", offsetof(struct i40e_eth_stats, tx_discards)},
+	{"tx_dropped_packets", offsetof(struct i40e_eth_stats, 
tx_discards)},

 };

 #define I40E_NB_ETH_XSTATS (sizeof(rte_i40e_stats_strings) / \
--
1.8.3.1


[dpdk-dev] [PATCH v3] lib/librte_meter: fix divide by zero for RFC4115 meter

2019-03-19 Thread Eelco Chaudron
RFC 4115 allows a meter with either cir and/or eir configured.
When only one is configured a divide by zero would occur.

Fixes: 655796d2b5fb ("meter: support RFC4115 trTCM")
Cc: echau...@redhat.com

Signed-off-by: Eelco Chaudron 

---
 v3 - Rather than using a 0 check, set up profile data such that no
  check at runtime is needed.
 v2 - Removed configuration change that got included by accident

 lib/librte_meter/rte_meter.c |   10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c
index e55f9be65..45679444e 100644
--- a/lib/librte_meter/rte_meter.c
+++ b/lib/librte_meter/rte_meter.c
@@ -19,7 +19,15 @@
 static void
 rte_meter_get_tb_params(uint64_t hz, uint64_t rate, uint64_t *tb_period, 
uint64_t *tb_bytes_per_period)
 {
-   double period = ((double) hz) / ((double) rate);
+   double period;
+
+   if (rate == 0) {
+   *tb_bytes_per_period = 0;
+   *tb_period = RTE_METER_TB_PERIOD_MIN;
+   return;
+   }
+
+   period = ((double) hz) / ((double) rate);
 
if (period >= RTE_METER_TB_PERIOD_MIN) {
*tb_bytes_per_period = 1;



Re: [dpdk-dev] [PATCH v2] lib/librte_meter: fix divide by zero for RFC4115 meter

2019-03-19 Thread Eelco Chaudron




On 28 Feb 2019, at 19:50, Dumitrescu, Cristian wrote:


Hi Eelco,

Sorry for my delayed reply


No problem I was OOO also so hence my late reply ;)


-Original Message-
From: Eelco Chaudron [mailto:echau...@redhat.com]
Sent: Friday, February 1, 2019 4:07 PM
To: Dumitrescu, Cristian 
Cc: dev@dpdk.org
Subject: [PATCH v2] lib/librte_meter: fix divide by zero for RFC4115 
meter


RFC 4115 allows a meter with either cir and/or eir configured.
When only one is configured a divide by zero would occur.

Fixes: 655796d2b5fb ("meter: support RFC4115 trTCM")
Cc: echau...@redhat.com

Signed-off-by: Eelco Chaudron 
---

 v2 - Removed configuration change that got included by accident

 lib/librte_meter/rte_meter.h |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/librte_meter/rte_meter.h 
b/lib/librte_meter/rte_meter.h

index 005e4..56d85ecf0 100644
--- a/lib/librte_meter/rte_meter.h
+++ b/lib/librte_meter/rte_meter.h
@@ -597,8 +597,8 @@ rte_meter_trtcm_rfc4115_color_blind_check(
/* Bucket update */
time_diff_tc = time - m->time_tc;
time_diff_te = time - m->time_te;
-   n_periods_tc = time_diff_tc / p->cir_period;
-   n_periods_te = time_diff_te / p->eir_period;
+	n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 
0;
+	n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 
0;

m->time_tc += n_periods_tc * p->cir_period;
m->time_te += n_periods_te * p->eir_period;

@@ -641,8 +641,8 @@ rte_meter_trtcm_rfc4115_color_aware_check(
/* Bucket update */
time_diff_tc = time - m->time_tc;
time_diff_te = time - m->time_te;
-   n_periods_tc = time_diff_tc / p->cir_period;
-   n_periods_te = time_diff_te / p->eir_period;
+	n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 
0;
+	n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 
0;

m->time_tc += n_periods_tc * p->cir_period;
m->time_te += n_periods_te * p->eir_period;



Yes, this is indeed an issue, good catch!

For performance reasons, we'd like to avoid a test on the fast path 
(rte_meter_xyz_check) and replace it with more work done on the 
configuration stage (rte_meter_profile_xyz_config), if possible.


We can intercept the null CIR or EIR cases very early in 
rte_meter_trtcm_rfc4115_profile_config() and deal with them separately 
by setting the CIR/EIR period and bytes_per_period to some neutral 
values and skipping the call to rte_meter_get_tb_params(): period = 
RTE_METER_TB_PERIOD_MIN, bytes_per_period = 0.


Excellent idea, will sent out a v3 soon, after some testing…

//Eelco


[dpdk-dev] [PATCH v2] lib/librte_meter: fix divide by zero for RFC4115 meter

2019-02-01 Thread Eelco Chaudron
RFC 4115 allows a meter with either cir and/or eir configured.
When only one is configured a divide by zero would occur.

Fixes: 655796d2b5fb ("meter: support RFC4115 trTCM")
Cc: echau...@redhat.com

Signed-off-by: Eelco Chaudron 
---

 v2 - Removed configuration change that got included by accident

 lib/librte_meter/rte_meter.h |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h
index 005e4..56d85ecf0 100644
--- a/lib/librte_meter/rte_meter.h
+++ b/lib/librte_meter/rte_meter.h
@@ -597,8 +597,8 @@ rte_meter_trtcm_rfc4115_color_blind_check(
/* Bucket update */
time_diff_tc = time - m->time_tc;
time_diff_te = time - m->time_te;
-   n_periods_tc = time_diff_tc / p->cir_period;
-   n_periods_te = time_diff_te / p->eir_period;
+   n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 0;
+   n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 0;
m->time_tc += n_periods_tc * p->cir_period;
m->time_te += n_periods_te * p->eir_period;
 
@@ -641,8 +641,8 @@ rte_meter_trtcm_rfc4115_color_aware_check(
/* Bucket update */
time_diff_tc = time - m->time_tc;
time_diff_te = time - m->time_te;
-   n_periods_tc = time_diff_tc / p->cir_period;
-   n_periods_te = time_diff_te / p->eir_period;
+   n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 0;
+   n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 0;
m->time_tc += n_periods_tc * p->cir_period;
m->time_te += n_periods_te * p->eir_period;
 



[dpdk-dev] [PATCH] lib/librte_meter: fix divide by zero for RFC4115 meter

2019-02-01 Thread Eelco Chaudron
RFC 4115 allows a meter with either cir and/or eir configured.
When only one is configured a divide by zero would occur.

Fixes: 655796d2b5fb ("meter: support RFC4115 trTCM")
Cc: echau...@redhat.com

Signed-off-by: Eelco Chaudron 
---
 config/common_linuxapp   |4 ++--
 lib/librte_meter/rte_meter.h |8 
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/config/common_linuxapp b/config/common_linuxapp
index 6c1c8d0f4..186323993 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -7,9 +7,9 @@ CONFIG_RTE_EXEC_ENV="linuxapp"
 CONFIG_RTE_EXEC_ENV_LINUXAPP=y
 
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
-CONFIG_RTE_EAL_IGB_UIO=y
+CONFIG_RTE_EAL_IGB_UIO=n
 CONFIG_RTE_EAL_VFIO=y
-CONFIG_RTE_KNI_KMOD=y
+CONFIG_RTE_KNI_KMOD=n
 CONFIG_RTE_LIBRTE_KNI=y
 CONFIG_RTE_LIBRTE_PMD_KNI=y
 CONFIG_RTE_LIBRTE_VHOST=y
diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h
index 005e4..56d85ecf0 100644
--- a/lib/librte_meter/rte_meter.h
+++ b/lib/librte_meter/rte_meter.h
@@ -597,8 +597,8 @@ rte_meter_trtcm_rfc4115_color_blind_check(
/* Bucket update */
time_diff_tc = time - m->time_tc;
time_diff_te = time - m->time_te;
-   n_periods_tc = time_diff_tc / p->cir_period;
-   n_periods_te = time_diff_te / p->eir_period;
+   n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 0;
+   n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 0;
m->time_tc += n_periods_tc * p->cir_period;
m->time_te += n_periods_te * p->eir_period;
 
@@ -641,8 +641,8 @@ rte_meter_trtcm_rfc4115_color_aware_check(
/* Bucket update */
time_diff_tc = time - m->time_tc;
time_diff_te = time - m->time_te;
-   n_periods_tc = time_diff_tc / p->cir_period;
-   n_periods_te = time_diff_te / p->eir_period;
+   n_periods_tc = p->cir_period != 0 ? time_diff_tc / p->cir_period : 0;
+   n_periods_te = p->eir_period != 0 ? time_diff_te / p->eir_period : 0;
m->time_tc += n_periods_tc * p->cir_period;
m->time_te += n_periods_te * p->eir_period;
 



Re: [dpdk-dev] VF of a X520 card does not process VLAN packets

2019-01-22 Thread Eelco Chaudron

Hi Konstantin,

Thanks for your reply…

Yes adding the VLAN to the VF from the kernel side will work, however, 
this is a different behaviour than the XL710.


The XL710 will allow all VLANs to pass through once the PMD takes 
control of the VF.


The real use case is using this VF interface trough Open vSwitch which 
needs to see all packets regardless of the VLAN.
It does not make sense to create 4K VLANs through the kernel to get this 
working.


Looking forward to your reply…

Cheers,

Eelco



On 18 Jan 2019, at 12:25, Ananyev, Konstantin wrote:


Hi
It’s been while since I looked at it last time,
but shouldn’t you assign desired vlan tag to the VF first:
ip link set  vf  vlan 
?
Konstantin

From: Eelco Chaudron [mailto:echau...@redhat.com]
Sent: Thursday, January 3, 2019 4:42 PM
To: Lu, Wenzhuo ; Ananyev, Konstantin 


Cc: dev 
Subject: Re: [dpdk-dev] VF of a X520 card does not process VLAN 
packets



Hi maintainers, any feedback on the below?

Thanks,

Eelco

On 18 Dec 2018, at 12:06, Eelco Chaudron wrote:

Hi,

I’m assigning a VF of an X520 card for DPDK/testpmd/OVS but it's not 
accepting tagged VLAN packets (it does accept tag 0). Is this a known 
bug/limitation of the 82599ES chipset?


This is how I've tested it:

$ echo 1 > /sys/bus/pci/devices/:05:00.0/sriov_numvfs
$driverctl -v set-override :05:10.0 vfio-pci
$./testpmd -c 7 -n 4 --socket-mem 2048,0 -w :05:10.0 -- \
--burst 64 -i --rxq=1 --txq=1 --rxd=4096 --txd=1024 \
—coremask=6 --auto-start --port-topology=chained
EAL: Detected 28 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: Multi-process socket /var/run/dpdk/rte/mp_socket
...
testpmd>

I’m sending broadcast ARP packets:

Ethernet II, Src: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00), Dst: 
ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff)

Destination: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff)
Address: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff)
 ..1.     = LG bit: Locally administered address 
(this is NOT the factory default)
 ...1     = IG bit: Group address 
(multicast/broadcast)

Source: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00)
Address: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00)
 ..0.     = LG bit: Globally unique address 
(factory default)

 ...0     = IG bit: Individual address (unicast)
Type: 802.1Q Virtual LAN (0x8100)
802.1Q Virtual LAN, PRI: 0, CFI: 0, ID: 10
000.    = Priority: Best Effort (default) (0)
...0    = CFI: Canonical (0)
   1010 = ID: 10
Type: ARP (0x0806)
Padding: 2e2f303132333435363738393a3b
Trailer: 3c3d3e3f404142434445464748494a4b4c4d4e4f50515253...
Address Resolution Protocol (request/gratuitous ARP)
Hardware type: Ethernet (1)
Protocol type: IP (0x0800)
Hardware size: 6
Protocol size: 4
Opcode: request (1)
[Is gratuitous: True]
Sender MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00)
Sender IP address: 0.0.0.0 (0.0.0.0)
Target MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00)
Target IP address: 0.0.0.0 (0.0.0.0)

No traffic is received, if you replace tag 10 with 0, packets are 
received.


Note that this is a simple reproducer with testpmd, but it's reported 
as part of an OVS integration. This works fine on the physical port, 
or with a VF on an XL710 card.


Any input appreciated.

Thanks,

Eelco





Re: [dpdk-dev] [RFC v1 00/11] scapy/python extension

2019-01-10 Thread Eelco Chaudron

Hi Xueming,

I was wondering if this patch went anywhere, could not find anything in 
the list archive?


I like the idea, as it’s useful for quick unit testing, without 
relying on Trex or trafgen.


Cheers,

Eelco



quick guide document:
https://github.com/steevenlee/dpdk/blob/master_scapy/doc/guides/howto/scapy.rst
github branch:
https://github.com/steevenlee/dpdk/tree/master_scapy


Xueming Li (11):
  lib/cmdline: support backspace key
  lib/cmdline: init parse result memeory
  lib/cmdline: add echo support in batch loading from file
  app/testpmd: support command echo in CLI batch loading
  test: update batch loading test
  lib/python: add embedded python lib
  app/testpmd: add python command
  app/testpmd: add pktgen forwarding engine
  app/testpmd: add pktgen engine scapy commands
  test/expect: add expect test scripts
  doc/scapy: add scapy how-to guide

 app/test-pmd/Makefile|6 +
 app/test-pmd/cmdline.c   |   80 ++-
 app/test-pmd/pktgen.c| 1092 
++

 app/test-pmd/testpmd.c   |1 +
 app/test-pmd/testpmd.h   |5 +
 config/common_base   |6 +
 doc/guides/howto/scapy.rst   |  300 
 lib/Makefile |2 +
 lib/librte_cmdline/cmdline_parse.c   |2 +
 lib/librte_cmdline/cmdline_rdline.c  |1 +
 lib/librte_cmdline/cmdline_socket.c  |5 +-
 lib/librte_cmdline/cmdline_socket.h  |3 +-
 lib/librte_cmdline/cmdline_vt100.c   |1 +
 lib/librte_cmdline/cmdline_vt100.h   |1 +
 lib/librte_eal/common/include/rte_log.h  |1 +
 lib/librte_python/Makefile   |   60 ++
 lib/librte_python/rte_python.c   |  387 +++
 lib/librte_python/rte_python.h   |   71 ++
 lib/librte_python/rte_python_version.map |   12 +
 mk/rte.app.mk|1 +
 test/expect/init.exp |   28 +
 test/expect/rx.exp   |  134 
 test/test/test_cmdline_lib.c |   10 +-
 23 files changed, 2199 insertions(+), 10 deletions(-)
 create mode 100644 app/test-pmd/pktgen.c
 create mode 100644 doc/guides/howto/scapy.rst
 create mode 100644 lib/librte_python/Makefile
 create mode 100644 lib/librte_python/rte_python.c
 create mode 100644 lib/librte_python/rte_python.h
 create mode 100644 lib/librte_python/rte_python_version.map
 create mode 100644 test/expect/init.exp
 create mode 100644 test/expect/rx.exp

--
2.13.3


Re: [dpdk-dev] [PATCH v4 0/2] lib/librte_meter: add RFC4115 trTCM meter support

2019-01-10 Thread Eelco Chaudron
Thanks Thomas!

On 10 Jan 2019, at 0:22, Thomas Monjalon wrote:

> 07/01/2019 16:58, Dumitrescu, Cristian:
>> From: Eelco Chaudron [mailto:echau...@redhat.com]
>>>
>>> This patch adds support for RFC4115 trTCM meters.
>>>
>>>
>>> Signed-off-by: Eelco Chaudron 
>>>
>>> v4:
>>>  - Fixed typo
>>>  - Cleaned up code to be aligned with RFC diagram
>>>
>>
>> Series Acked-by: Cristian Dumitrescu 
>>
>> Applied to next-qos tree, thanks!
>
> #include "rte_compat.h" is missing.
> Fixing in-tree


Re: [dpdk-dev] [PATCH v4 1/2] lib/librte_meter: add RFC4115 trTCM meter support

2019-01-06 Thread Eelco Chaudron




On 4 Jan 2019, at 20:42, Stephen Hemminger wrote:


On Fri,  4 Jan 2019 13:59:42 +
Eelco Chaudron  wrote:


This patch adds support for RFC4115 trTCM meters.

Signed-off-by: Eelco Chaudron 




Fix comment formatting.


Hi Stephen, I left these warnings in on purpose, to match the existing 
comment style in the file.


These specific warnings are from a structure copied from one right above 
which also has the same style and line lengths.


As they are warning I though I should leave it as is, Christian what are 
your thoughts on this?




### [dpdk-dev] [PATCH v4 1/2] lib/librte_meter: add RFC4115 trTCM 
meter support


WARNING:BLOCK_COMMENT_STYLE: Block comments use * on subsequent lines
#172: FILE: lib/librte_meter/rte_meter.h:59:
+/** trTCM parameters per metered traffic flow. The CIR, EIR, CBS and 
EBS
+parameters only count bytes of IP packets and do not include link 
specific


WARNING:BLOCK_COMMENT_STYLE: Block comments use a trailing */ on a 
separate line

#174: FILE: lib/librte_meter/rte_meter.h:61:
+none-zero respectively.*/

WARNING:LONG_LINE_COMMENT: line over 80 characters
#176: FILE: lib/librte_meter/rte_meter.h:63:
+	uint64_t cir; /**< Committed Information Rate (CIR). Measured in 
bytes per second. */


WARNING:LONG_LINE_COMMENT: line over 80 characters
#177: FILE: lib/librte_meter/rte_meter.h:64:
+	uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes 
per second. */


WARNING:LONG_LINE_COMMENT: line over 80 characters
#355: FILE: lib/librte_meter/rte_meter.h:406:
+	/**< Number of bytes currently available in the excess(E) token 
bucket */


total: 0 errors, 5 warnings, 356 lines checked


Re: [dpdk-dev] [PATCH v3 1/2] lib/librte_meter: add RFC4115 trTCM meter support

2019-01-04 Thread Eelco Chaudron




On 21 Dec 2018, at 17:15, Dumitrescu, Cristian wrote:


Hi Eelco,





+/** trTCM parameters per metered traffic flow. The CIR, EIT, CBS and 
EBS


Small typo here: EIT to be replaced by EIR.

+parameters only count bytes of IP packets and do not include link 
specific
+headers. The CBS and EBS need to be greater than zero if CIR and EIR 
are

+none-zero respectively.*/
+struct rte_meter_trtcm_rfc4115_params {
+   uint64_t cir; /**< Committed Information Rate (CIR). Measured in
bytes per second. */
+   uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes
per second. */
+   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes.
*/
+   uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */
+};
+





+static inline enum rte_meter_color __rte_experimental
+rte_meter_trtcm_rfc4115_color_blind_check(
+   struct rte_meter_trtcm_rfc4115 *m,
+   struct rte_meter_trtcm_rfc4115_profile *p,
+   uint64_t time,
+   uint32_t pkt_len)
+{
+	uint64_t time_diff_tc, time_diff_te, n_periods_tc, n_periods_te, 
tc,

te;
+
+   /* Bucket update */
+   time_diff_tc = time - m->time_tc;
+   time_diff_te = time - m->time_te;
+   n_periods_tc = time_diff_tc / p->cir_period;
+   n_periods_te = time_diff_te / p->eir_period;
+   m->time_tc += n_periods_tc * p->cir_period;
+   m->time_te += n_periods_te * p->eir_period;
+
+   tc = m->tc + n_periods_tc * p->cir_bytes_per_period;
+   if (tc > p->cbs)
+   tc = p->cbs;
+
+   te = m->te + n_periods_te * p->eir_bytes_per_period;
+   if (te > p->ebs)
+   te = p->ebs;
+
+   /* Color logic */
+   if (tc >= pkt_len) {
+   m->tc = tc - pkt_len;
+   m->te = te;
+   return e_RTE_METER_GREEN;
+   } else if (te >= pkt_len) {
+   m->tc = tc;
+   m->te = te - pkt_len;
+   return e_RTE_METER_YELLOW;
+   }
+
+   /* If we end up here the color is RED */
+   m->tc = tc;
+   m->te = te;
+   return e_RTE_METER_RED;
+}
+


Since the branch (tc >= pkt_len) == TRUE always returns, I suggest we 
remove the following "else", as it is redundant:


/* Color logic */
if (tc >= pkt_len) {
m->tc = tc - pkt_len;
m->te = te;
return e_RTE_METER_GREEN;
}

if (te >= pkt_len) {
m->tc = tc;
m->te = te - pkt_len;
return e_RTE_METER_YELLOW;
}

/* If we end up here the color is RED */
m->tc = tc;
m->te = te;
return e_RTE_METER_RED;



+static inline enum rte_meter_color __rte_experimental
+rte_meter_trtcm_rfc4115_color_aware_check(
+   struct rte_meter_trtcm_rfc4115 *m,
+   struct rte_meter_trtcm_rfc4115_profile *p,
+   uint64_t time,
+   uint32_t pkt_len,
+   enum rte_meter_color pkt_color)
+{
+	uint64_t time_diff_tc, time_diff_te, n_periods_tc, n_periods_te, 
tc,

te;
+
+   /* Bucket update */
+   time_diff_tc = time - m->time_tc;
+   time_diff_te = time - m->time_te;
+   n_periods_tc = time_diff_tc / p->cir_period;
+   n_periods_te = time_diff_te / p->eir_period;
+   m->time_tc += n_periods_tc * p->cir_period;
+   m->time_te += n_periods_te * p->eir_period;
+
+   tc = m->tc + n_periods_tc * p->cir_bytes_per_period;
+   if (tc > p->cbs)
+   tc = p->cbs;
+
+   te = m->te + n_periods_te * p->eir_bytes_per_period;
+   if (te > p->ebs)
+   te = p->ebs;
+
+   /* Color logic */
+   if (pkt_color == e_RTE_METER_GREEN) {
+   if (tc >= pkt_len) {
+   m->tc = tc - pkt_len;
+   m->te = te;
+   return e_RTE_METER_GREEN;
+   } else if (te >= pkt_len) {
+   m->tc = tc;
+   m->te = te - pkt_len;
+   return e_RTE_METER_YELLOW;
+   }
+   } else if (pkt_color == e_RTE_METER_YELLOW && te >= pkt_len) {
+   m->tc = tc;
+   m->te = te - pkt_len;
+   return e_RTE_METER_YELLOW;
+   }
+
+   /* If we end up here the color is RED */
+   m->tc = tc;
+   m->te = te;
+   return e_RTE_METER_RED;
+}
+
+


I suggest we follow the logic from the diagram in the RFC rather than 
the logic in the text preceding the diagram. Although the two 
descriptions are equivalent (after a bit of thinking), the diagram 
seems more optimal to me:


/* Color logic */
if ((pkt_color == e_RTE_METER_GREEN) && (tc >= pkt_len)) {
m->tc = tc - pkt_len;
m->te = te;
return e_RTE_METER_GREEN;
}

if ((pkt_color != e_RTE_METER_RED) && (te >= pkt_len)) {
m->tc = tc;
m->te = te - pkt_len;
return e_RTE_METER_YELLOW;
}

/* If we end up here

[dpdk-dev] [PATCH v4 2/2] test/test_meter: update meter test to include RFC4115 meters

2019-01-04 Thread Eelco Chaudron
Add test cases for RFC4115 meters

Signed-off-by: Eelco Chaudron 
---
 test/test/test_meter.c |  212 
 1 file changed, 212 insertions(+)

diff --git a/test/test/test_meter.c b/test/test/test_meter.c
index 8bb47e75c..f935faa53 100644
--- a/test/test/test_meter.c
+++ b/test/test/test_meter.c
@@ -32,8 +32,10 @@
 
 #define TM_TEST_TRTCM_CIR_DF 4600
 #define TM_TEST_TRTCM_PIR_DF 6900
+#define TM_TEST_TRTCM_EIR_DF 6900
 #define TM_TEST_TRTCM_CBS_DF 2048
 #define TM_TEST_TRTCM_PBS_DF 4096
+#define TM_TEST_TRTCM_EBS_DF 4096
 
 static struct rte_meter_srtcm_params sparams =
{.cir = TM_TEST_SRTCM_CIR_DF,
@@ -46,6 +48,12 @@ static structrte_meter_trtcm_params tparams=
 .cbs = TM_TEST_TRTCM_CBS_DF,
 .pbs = TM_TEST_TRTCM_PBS_DF,};
 
+static struct rte_meter_trtcm_rfc4115_params rfc4115params =
+   {.cir = TM_TEST_TRTCM_CIR_DF,
+.eir = TM_TEST_TRTCM_EIR_DF,
+.cbs = TM_TEST_TRTCM_CBS_DF,
+.ebs = TM_TEST_TRTCM_EBS_DF,};
+
 /**
  * functional test for rte_meter_srtcm_config
  */
@@ -148,6 +156,45 @@ tm_test_trtcm_config(void)
return 0;
 }
 
+/**
+ * functional test for rte_meter_trtcm_rfc4115_config
+ */
+static inline int
+tm_test_trtcm_rfc4115_config(void)
+{
+   struct rte_meter_trtcm_rfc4115_profile tp;
+   struct  rte_meter_trtcm_rfc4115_params rfc4115params1;
+#define TRTCM_RFC4115_CFG_MSG "trtcm_rfc4115_config"
+
+   /* invalid parameter test */
+   if (rte_meter_trtcm_rfc4115_profile_config(NULL, NULL) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, NULL) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+   if (rte_meter_trtcm_rfc4115_profile_config(NULL, &rfc4115params) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   /*
+* cbs and pbs should be none-zero if cir and eir are none-zero
+* respectively
+*/
+   rfc4115params1 = rfc4115params;
+   rfc4115params1.cbs = 0;
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params1) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   rfc4115params1 = rfc4115params;
+   rfc4115params1.ebs = 0;
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params1) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   /* usual parameter, should be successful */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   return 0;
+}
+
 /**
  * functional test for rte_meter_srtcm_color_blind_check
  */
@@ -265,6 +312,65 @@ tm_test_trtcm_color_blind_check(void)
return 0;
 }
 
+/**
+ * functional test for rte_meter_trtcm_rfc4115_color_blind_check
+ */
+static inline int
+tm_test_trtcm_rfc4115_color_blind_check(void)
+{
+#define TRTCM_RFC4115_BLIND_CHECK_MSG "trtcm_rfc4115_blind_check"
+
+   uint64_t time;
+   struct rte_meter_trtcm_rfc4115_profile tp;
+   struct rte_meter_trtcm_rfc4115 tm;
+   uint64_t hz = rte_get_tsc_hz();
+
+   /* Test green */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_CBS_DF - 1)
+   != e_RTE_METER_GREEN)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" GREEN");
+
+   /* Test yellow */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_CBS_DF + 1)
+   != e_RTE_METER_YELLOW)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW");
+
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_EBS_DF - 1)
+   != e_RTE_METER_YELLOW)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW");
+
+   /* 

[dpdk-dev] [PATCH v4 0/2] lib/librte_meter: add RFC4115 trTCM meter support

2019-01-04 Thread Eelco Chaudron
This patch adds support for RFC4115 trTCM meters.


Signed-off-by: Eelco Chaudron 

v4:
 - Fixed typo
 - Cleaned up code to be aligned with RFC diagram

v3:
 - Gave the rfc4115 meter it's own data structures

v2:
 - Marked all functions with __rte_experimental, and added
   "EXPERIMENTAL:..." to doxygen comments.
 - Removed library version change, and merged it with first patch
 - Do not call rte_meter_trtcm_rfc4115_color_aware_check in
   rte_meter_trtcm_rfc4115_color_blind_check to avoid error with
   it being marked as experimental

Eelco Chaudron (2):
  lib/librte_meter: add RFC4115 trTCM meter support
  test/test_meter: update meter test to include RFC4115 meters


 lib/librte_meter/rte_meter.c   |   42 ++
 lib/librte_meter/rte_meter.h   |  233 
 lib/librte_meter/rte_meter_version.map |9 +
 test/test/test_meter.c |  212 +
 4 files changed, 494 insertions(+), 2 deletions(-)



[dpdk-dev] [PATCH v4 1/2] lib/librte_meter: add RFC4115 trTCM meter support

2019-01-04 Thread Eelco Chaudron
This patch adds support for RFC4115 trTCM meters.

Signed-off-by: Eelco Chaudron 
---
 lib/librte_meter/rte_meter.c   |   42 ++
 lib/librte_meter/rte_meter.h   |  233 
 lib/librte_meter/rte_meter_version.map |9 +
 3 files changed, 282 insertions(+), 2 deletions(-)

diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c
index 473f69aba..e55f9be65 100644
--- a/lib/librte_meter/rte_meter.c
+++ b/lib/librte_meter/rte_meter.c
@@ -110,3 +110,45 @@ rte_meter_trtcm_config(struct rte_meter_trtcm *m,
 
return 0;
 }
+
+int __rte_experimental
+rte_meter_trtcm_rfc4115_profile_config(
+   struct rte_meter_trtcm_rfc4115_profile *p,
+   struct rte_meter_trtcm_rfc4115_params *params)
+{
+   uint64_t hz = rte_get_tsc_hz();
+
+   /* Check input parameters */
+   if ((p == NULL) ||
+   (params == NULL) ||
+   (params->cir != 0 && params->cbs == 0) ||
+   (params->eir != 0 && params->ebs == 0))
+   return -EINVAL;
+
+   /* Initialize trTCM run-time structure */
+   p->cbs = params->cbs;
+   p->ebs = params->ebs;
+   rte_meter_get_tb_params(hz, params->cir, &p->cir_period,
+   &p->cir_bytes_per_period);
+   rte_meter_get_tb_params(hz, params->eir, &p->eir_period,
+   &p->eir_bytes_per_period);
+
+   return 0;
+}
+
+int __rte_experimental
+rte_meter_trtcm_rfc4115_config(
+   struct rte_meter_trtcm_rfc4115 *m,
+   struct rte_meter_trtcm_rfc4115_profile *p)
+{
+   /* Check input parameters */
+   if ((m == NULL) || (p == NULL))
+   return -EINVAL;
+
+   /* Initialize trTCM run-time structure */
+   m->time_tc = m->time_te = rte_get_tsc_cycles();
+   m->tc = p->cbs;
+   m->te = p->ebs;
+
+   return 0;
+}
diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h
index 58a051583..c7eee896a 100644
--- a/lib/librte_meter/rte_meter.h
+++ b/lib/librte_meter/rte_meter.h
@@ -1,3 +1,4 @@
+
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
  */
@@ -16,6 +17,7 @@ extern "C" {
  * Traffic metering algorithms:
  *1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 2697
  *2. Two Rate Three Color Marker (trTCM): defined by IETF RFC 2698
+ *3. Two Rate Three Color Marker (trTCM): defined by IETF RFC 4115
  *
  ***/
 
@@ -49,10 +51,21 @@ be greater than or equal to CIR. Both CBS or EBS have to be 
greater than zero. *
 struct rte_meter_trtcm_params {
uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes 
per second. */
uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per 
second. */
-   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */
+   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */
uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */
 };
 
+/** trTCM parameters per metered traffic flow. The CIR, EIR, CBS and EBS
+parameters only count bytes of IP packets and do not include link specific
+headers. The CBS and EBS need to be greater than zero if CIR and EIR are
+none-zero respectively.*/
+struct rte_meter_trtcm_rfc4115_params {
+   uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes 
per second. */
+   uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes per 
second. */
+   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */
+   uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */
+};
+
 /**
  * Internal data structure storing the srTCM configuration profile. Typically
  * shared by multiple srTCM objects.
@@ -65,12 +78,24 @@ struct rte_meter_srtcm_profile;
  */
 struct rte_meter_trtcm_profile;
 
+/**
+ * Internal data structure storing the trTCM RFC4115 configuration profile.
+ * Typically shared by multiple trTCM objects.
+ */
+struct rte_meter_trtcm_rfc4115_profile;
+
 /** Internal data structure storing the srTCM run-time context per metered 
traffic flow. */
 struct rte_meter_srtcm;
 
 /** Internal data structure storing the trTCM run-time context per metered 
traffic flow. */
 struct rte_meter_trtcm;
 
+/**
+ * Internal data structure storing the trTCM RFC4115 run-time context per
+ * metered traffic flow.
+ */
+struct rte_meter_trtcm_rfc4115;
+
 /**
  * srTCM profile configuration
  *
@@ -98,6 +123,23 @@ rte_meter_srtcm_profile_config(struct 
rte_meter_srtcm_profile *p,
 int
 rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p,
struct rte_meter_trtcm_params *params);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * trTCM RFC 4115 profile configuration
+ *
+ * @param p
+ *Pointer to pre-allocated trTCM profile data structure
+ * @

Re: [dpdk-dev] VF of a X520 card does not process VLAN packets

2019-01-03 Thread Eelco Chaudron

Hi maintainers, any feedback on the below?

Thanks,

Eelco


On 18 Dec 2018, at 12:06, Eelco Chaudron wrote:


Hi,

I’m assigning a VF of an X520 card for DPDK/testpmd/OVS but it's not 
accepting tagged VLAN packets (it does accept tag 0).  Is this a known 
bug/limitation of the 82599ES chipset?


This is how I've tested it:

$ echo 1 > /sys/bus/pci/devices/\:05\:00.0/sriov_numvfs
$driverctl -v set-override :05:10.0 vfio-pci
$./testpmd -c 7 -n 4 --socket-mem 2048,0 -w :05:10.0 -- \
  --burst 64 -i --rxq=1 --txq=1 --rxd=4096 --txd=1024 \
  —coremask=6 --auto-start --port-topology=chained
EAL: Detected 28 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: Multi-process socket /var/run/dpdk/rte/mp_socket
...
testpmd>


I’m sending broadcast ARP packets:

Ethernet II, Src: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00), Dst: 
ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff)

Destination: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff)
Address: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff)
 ..1.     = LG bit: Locally administered 
address (this is NOT the factory default)
 ...1     = IG bit: Group address 
(multicast/broadcast)

Source: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00)
Address: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00)
 ..0.     = LG bit: Globally unique 
address (factory default)
 ...0     = IG bit: Individual address 
(unicast)

Type: 802.1Q Virtual LAN (0x8100)
802.1Q Virtual LAN, PRI: 0, CFI: 0, ID: 10
000.    = Priority: Best Effort (default) (0)
...0    = CFI: Canonical (0)
   1010 = ID: 10
Type: ARP (0x0806)
Padding: 2e2f303132333435363738393a3b
Trailer: 3c3d3e3f404142434445464748494a4b4c4d4e4f50515253...
Address Resolution Protocol (request/gratuitous ARP)
Hardware type: Ethernet (1)
Protocol type: IP (0x0800)
Hardware size: 6
Protocol size: 4
Opcode: request (1)
[Is gratuitous: True]
Sender MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00)
Sender IP address: 0.0.0.0 (0.0.0.0)
Target MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00)
Target IP address: 0.0.0.0 (0.0.0.0)

No traffic is received, if you replace tag 10 with 0, packets are 
received.


Note that this is a simple reproducer with testpmd, but it's reported 
as part of an OVS integration. This works fine on the physical port, 
or with a VF on an XL710 card.


Any input appreciated.

Thanks,

Eelco


[dpdk-dev] [PATCH v3 2/2] test/test_meter: update meter test to include RFC4115 meters

2018-12-18 Thread Eelco Chaudron
Add test cases for RFC4115 meters

Signed-off-by: Eelco Chaudron 
---
 test/test/test_meter.c |  212 
 1 file changed, 212 insertions(+)

diff --git a/test/test/test_meter.c b/test/test/test_meter.c
index 8bb47e75c..f935faa53 100644
--- a/test/test/test_meter.c
+++ b/test/test/test_meter.c
@@ -32,8 +32,10 @@
 
 #define TM_TEST_TRTCM_CIR_DF 4600
 #define TM_TEST_TRTCM_PIR_DF 6900
+#define TM_TEST_TRTCM_EIR_DF 6900
 #define TM_TEST_TRTCM_CBS_DF 2048
 #define TM_TEST_TRTCM_PBS_DF 4096
+#define TM_TEST_TRTCM_EBS_DF 4096
 
 static struct rte_meter_srtcm_params sparams =
{.cir = TM_TEST_SRTCM_CIR_DF,
@@ -46,6 +48,12 @@ static structrte_meter_trtcm_params tparams=
 .cbs = TM_TEST_TRTCM_CBS_DF,
 .pbs = TM_TEST_TRTCM_PBS_DF,};
 
+static struct rte_meter_trtcm_rfc4115_params rfc4115params =
+   {.cir = TM_TEST_TRTCM_CIR_DF,
+.eir = TM_TEST_TRTCM_EIR_DF,
+.cbs = TM_TEST_TRTCM_CBS_DF,
+.ebs = TM_TEST_TRTCM_EBS_DF,};
+
 /**
  * functional test for rte_meter_srtcm_config
  */
@@ -148,6 +156,45 @@ tm_test_trtcm_config(void)
return 0;
 }
 
+/**
+ * functional test for rte_meter_trtcm_rfc4115_config
+ */
+static inline int
+tm_test_trtcm_rfc4115_config(void)
+{
+   struct rte_meter_trtcm_rfc4115_profile tp;
+   struct  rte_meter_trtcm_rfc4115_params rfc4115params1;
+#define TRTCM_RFC4115_CFG_MSG "trtcm_rfc4115_config"
+
+   /* invalid parameter test */
+   if (rte_meter_trtcm_rfc4115_profile_config(NULL, NULL) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, NULL) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+   if (rte_meter_trtcm_rfc4115_profile_config(NULL, &rfc4115params) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   /*
+* cbs and pbs should be none-zero if cir and eir are none-zero
+* respectively
+*/
+   rfc4115params1 = rfc4115params;
+   rfc4115params1.cbs = 0;
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params1) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   rfc4115params1 = rfc4115params;
+   rfc4115params1.ebs = 0;
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params1) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   /* usual parameter, should be successful */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   return 0;
+}
+
 /**
  * functional test for rte_meter_srtcm_color_blind_check
  */
@@ -265,6 +312,65 @@ tm_test_trtcm_color_blind_check(void)
return 0;
 }
 
+/**
+ * functional test for rte_meter_trtcm_rfc4115_color_blind_check
+ */
+static inline int
+tm_test_trtcm_rfc4115_color_blind_check(void)
+{
+#define TRTCM_RFC4115_BLIND_CHECK_MSG "trtcm_rfc4115_blind_check"
+
+   uint64_t time;
+   struct rte_meter_trtcm_rfc4115_profile tp;
+   struct rte_meter_trtcm_rfc4115 tm;
+   uint64_t hz = rte_get_tsc_hz();
+
+   /* Test green */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_CBS_DF - 1)
+   != e_RTE_METER_GREEN)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" GREEN");
+
+   /* Test yellow */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_CBS_DF + 1)
+   != e_RTE_METER_YELLOW)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW");
+
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_EBS_DF - 1)
+   != e_RTE_METER_YELLOW)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW");
+
+   /* 

[dpdk-dev] [PATCH v3 1/2] lib/librte_meter: add RFC4115 trTCM meter support

2018-12-18 Thread Eelco Chaudron
This patch adds support for RFC4115 trTCM meters.

Signed-off-by: Eelco Chaudron 
---
 lib/librte_meter/rte_meter.c   |   42 ++
 lib/librte_meter/rte_meter.h   |  235 
 lib/librte_meter/rte_meter_version.map |9 +
 3 files changed, 284 insertions(+), 2 deletions(-)

diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c
index 473f69aba..e55f9be65 100644
--- a/lib/librte_meter/rte_meter.c
+++ b/lib/librte_meter/rte_meter.c
@@ -110,3 +110,45 @@ rte_meter_trtcm_config(struct rte_meter_trtcm *m,
 
return 0;
 }
+
+int __rte_experimental
+rte_meter_trtcm_rfc4115_profile_config(
+   struct rte_meter_trtcm_rfc4115_profile *p,
+   struct rte_meter_trtcm_rfc4115_params *params)
+{
+   uint64_t hz = rte_get_tsc_hz();
+
+   /* Check input parameters */
+   if ((p == NULL) ||
+   (params == NULL) ||
+   (params->cir != 0 && params->cbs == 0) ||
+   (params->eir != 0 && params->ebs == 0))
+   return -EINVAL;
+
+   /* Initialize trTCM run-time structure */
+   p->cbs = params->cbs;
+   p->ebs = params->ebs;
+   rte_meter_get_tb_params(hz, params->cir, &p->cir_period,
+   &p->cir_bytes_per_period);
+   rte_meter_get_tb_params(hz, params->eir, &p->eir_period,
+   &p->eir_bytes_per_period);
+
+   return 0;
+}
+
+int __rte_experimental
+rte_meter_trtcm_rfc4115_config(
+   struct rte_meter_trtcm_rfc4115 *m,
+   struct rte_meter_trtcm_rfc4115_profile *p)
+{
+   /* Check input parameters */
+   if ((m == NULL) || (p == NULL))
+   return -EINVAL;
+
+   /* Initialize trTCM run-time structure */
+   m->time_tc = m->time_te = rte_get_tsc_cycles();
+   m->tc = p->cbs;
+   m->te = p->ebs;
+
+   return 0;
+}
diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h
index 58a051583..e39919da0 100644
--- a/lib/librte_meter/rte_meter.h
+++ b/lib/librte_meter/rte_meter.h
@@ -16,6 +16,7 @@ extern "C" {
  * Traffic metering algorithms:
  *1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 2697
  *2. Two Rate Three Color Marker (trTCM): defined by IETF RFC 2698
+ *3. Two Rate Three Color Marker (trTCM): defined by IETF RFC 4115
  *
  ***/
 
@@ -49,10 +50,21 @@ be greater than or equal to CIR. Both CBS or EBS have to be 
greater than zero. *
 struct rte_meter_trtcm_params {
uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes 
per second. */
uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per 
second. */
-   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */
+   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */
uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */
 };
 
+/** trTCM parameters per metered traffic flow. The CIR, EIT, CBS and EBS
+parameters only count bytes of IP packets and do not include link specific
+headers. The CBS and EBS need to be greater than zero if CIR and EIR are
+none-zero respectively.*/
+struct rte_meter_trtcm_rfc4115_params {
+   uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes 
per second. */
+   uint64_t eir; /**< Excess Information Rate (EIR). Measured in bytes per 
second. */
+   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */
+   uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */
+};
+
 /**
  * Internal data structure storing the srTCM configuration profile. Typically
  * shared by multiple srTCM objects.
@@ -65,12 +77,24 @@ struct rte_meter_srtcm_profile;
  */
 struct rte_meter_trtcm_profile;
 
+/**
+ * Internal data structure storing the trTCM RFC4115 configuration profile.
+ * Typically shared by multiple trTCM objects.
+ */
+struct rte_meter_trtcm_rfc4115_profile;
+
 /** Internal data structure storing the srTCM run-time context per metered 
traffic flow. */
 struct rte_meter_srtcm;
 
 /** Internal data structure storing the trTCM run-time context per metered 
traffic flow. */
 struct rte_meter_trtcm;
 
+/**
+ * Internal data structure storing the trTCM RFC4115 run-time context per
+ * metered traffic flow.
+ */
+struct rte_meter_trtcm_rfc4115;
+
 /**
  * srTCM profile configuration
  *
@@ -98,6 +122,23 @@ rte_meter_srtcm_profile_config(struct 
rte_meter_srtcm_profile *p,
 int
 rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p,
struct rte_meter_trtcm_params *params);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * trTCM RFC 4115 profile configuration
+ *
+ * @param p
+ *Pointer to pre-allocated trTCM profile data structure
+ * @param params
+ *trTCM profile parameters
+ * @return
+ *0 upon success, error code otherwise
+ */
+

[dpdk-dev] [PATCH v3 0/2] lib/librte_meter: add RFC4115 trTCM meter support

2018-12-18 Thread Eelco Chaudron
This patch adds support for RFC4115 trTCM meters.


Signed-off-by: Eelco Chaudron 

v3:
 - Gave the rfc4115 meter it's own data structures

v2:
 - Marked all functions with __rte_experimental, and added
   "EXPERIMENTAL:..." to doxygen comments.
 - Removed library version change, and merged it with first patch
 - Do not call rte_meter_trtcm_rfc4115_color_aware_check in
   rte_meter_trtcm_rfc4115_color_blind_check to avoid error with
   it being marked as experimental

Eelco Chaudron (2):
  lib/librte_meter: add RFC4115 trTCM meter support
  test/test_meter: update meter test to include RFC4115 meters


 lib/librte_meter/rte_meter.c   |   42 ++
 lib/librte_meter/rte_meter.h   |  235 
 lib/librte_meter/rte_meter_version.map |9 +
 test/test/test_meter.c |  212 +
 4 files changed, 496 insertions(+), 2 deletions(-)



[dpdk-dev] VF of a X520 card does not process VLAN packets

2018-12-18 Thread Eelco Chaudron

Hi,

I’m assigning a VF of an X520 card for DPDK/testpmd/OVS but it's not 
accepting tagged VLAN packets (it does accept tag 0).  Is this a known 
bug/limitation of the 82599ES chipset?


This is how I've tested it:

$ echo 1 > /sys/bus/pci/devices/\:05\:00.0/sriov_numvfs
$driverctl -v set-override :05:10.0 vfio-pci
$./testpmd -c 7 -n 4 --socket-mem 2048,0 -w :05:10.0 -- \
  --burst 64 -i --rxq=1 --txq=1 --rxd=4096 --txd=1024 \
  —coremask=6 --auto-start --port-topology=chained
EAL: Detected 28 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: Multi-process socket /var/run/dpdk/rte/mp_socket
...
testpmd>


I’m sending broadcast ARP packets:

Ethernet II, Src: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00), Dst: 
ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff)

Destination: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff)
Address: ff:ff:ff:ff:ff:ff (ff:ff:ff:ff:ff:ff)
 ..1.     = LG bit: Locally administered 
address (this is NOT the factory default)
 ...1     = IG bit: Group address 
(multicast/broadcast)

Source: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00)
Address: 04:f4:bc:43:e1:00 (04:f4:bc:43:e1:00)
 ..0.     = LG bit: Globally unique address 
(factory default)
 ...0     = IG bit: Individual address 
(unicast)

Type: 802.1Q Virtual LAN (0x8100)
802.1Q Virtual LAN, PRI: 0, CFI: 0, ID: 10
000.    = Priority: Best Effort (default) (0)
...0    = CFI: Canonical (0)
   1010 = ID: 10
Type: ARP (0x0806)
Padding: 2e2f303132333435363738393a3b
Trailer: 3c3d3e3f404142434445464748494a4b4c4d4e4f50515253...
Address Resolution Protocol (request/gratuitous ARP)
Hardware type: Ethernet (1)
Protocol type: IP (0x0800)
Hardware size: 6
Protocol size: 4
Opcode: request (1)
[Is gratuitous: True]
Sender MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00)
Sender IP address: 0.0.0.0 (0.0.0.0)
Target MAC address: 00:00:00:00:00:00 (00:00:00:00:00:00)
Target IP address: 0.0.0.0 (0.0.0.0)

No traffic is received, if you replace tag 10 with 0, packets are 
received.


Note that this is a simple reproducer with testpmd, but it's reported as 
part of an OVS integration. This works fine on the physical port, or 
with a VF on an XL710 card.


Any input appreciated.

Thanks,

Eelco


Re: [dpdk-dev] [PATCH v2 1/2] lib/librte_meter: add RFC4115 trTCM meter support

2018-12-17 Thread Eelco Chaudron




On 17 Dec 2018, at 12:23, Dumitrescu, Cristian wrote:


Hi Eelco,


-Original Message-
From: Eelco Chaudron [mailto:echau...@redhat.com]
Sent: Thursday, November 29, 2018 11:29 AM
To: Dumitrescu, Cristian 
Cc: dev@dpdk.org
Subject: [PATCH v2 1/2] lib/librte_meter: add RFC4115 trTCM meter 
support


This patch adds support for RFC4115 trTCM meters.

Signed-off-by: Eelco Chaudron 
---
 lib/librte_meter/rte_meter.c   |   40 +
 lib/librte_meter/rte_meter.h   |  236
++--
 lib/librte_meter/rte_meter_version.map |9 +
 3 files changed, 269 insertions(+), 16 deletions(-)

diff --git a/lib/librte_meter/rte_meter.c 
b/lib/librte_meter/rte_meter.c

index 473f69aba..817680dbb 100644
--- a/lib/librte_meter/rte_meter.c
+++ b/lib/librte_meter/rte_meter.c
@@ -110,3 +110,43 @@ rte_meter_trtcm_config(struct rte_meter_trtcm
*m,

return 0;
 }
+
+int __rte_experimental
+rte_meter_trtcm_rfc4115_profile_config(struct 
rte_meter_trtcm_profile

*p,
+   struct rte_meter_trtcm_params *params)
+{
+   uint64_t hz = rte_get_tsc_hz();
+
+   /* Check input parameters */
+   if ((p == NULL) ||
+   (params == NULL) ||
+   (params->cir != 0 && params->cbs == 0) ||
+   (params->eir != 0 && params->ebs == 0))
+   return -EINVAL;
+
+   /* Initialize trTCM run-time structure */
+   p->cbs = params->cbs;
+   p->ebs = params->ebs;
+   rte_meter_get_tb_params(hz, params->cir, &p->cir_period,
+   &p->cir_bytes_per_period);
+   rte_meter_get_tb_params(hz, params->eir, &p->eir_period,
+   &p->eir_bytes_per_period);
+
+   return 0;
+}
+
+int __rte_experimental
+rte_meter_trtcm_rfc4115_config(struct rte_meter_trtcm *m,
+   struct rte_meter_trtcm_profile *p)
+{
+   /* Check input parameters */
+   if ((m == NULL) || (p == NULL))
+   return -EINVAL;
+
+   /* Initialize trTCM run-time structure */
+   m->time_tc = m->time_te = rte_get_tsc_cycles();
+   m->tc = p->cbs;
+   m->te = p->ebs;
+
+   return 0;
+}
diff --git a/lib/librte_meter/rte_meter.h 
b/lib/librte_meter/rte_meter.h

index 58a051583..c3f88d0cb 100644
--- a/lib/librte_meter/rte_meter.h
+++ b/lib/librte_meter/rte_meter.h
@@ -16,6 +16,7 @@ extern "C" {
  * Traffic metering algorithms:
  *1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 
2697
  *2. Two Rate Three Color Marker (trTCM): defined by IETF RFC 
2698
+ *3. Two Rate Three Color Marker (trTCM): defined by IETF RFC 
4115

  *
  ***/

@@ -43,14 +44,25 @@ struct rte_meter_srtcm_params {
uint64_t ebs; /**< Excess Burst Size (EBS).  Measured in bytes. */
 };

-/** trTCM parameters per metered traffic flow. The CIR, PIR, CBS and 
PBS

parameters
-only count bytes of IP packets and do not include link specific 
headers. PIR

has to
-be greater than or equal to CIR. Both CBS or EBS have to be greater 
than

zero. */
+/** trTCM parameters per metered traffic flow. The CIR, PIR/EIT, CBS 
and

PBS/EBS
+parameters only count bytes of IP packets and do not include link 
specific

+headers.
+
+- For RFC2698 operations PIR has to be greater than or equal to CIR. 
Both

CBS
+  or EBS have to be greater than zero.
+- For RFC4115 operations CBS and EBS need to be greater than zero if 
CIR

and
+  EIR are none-zero respectively.*/
 struct rte_meter_trtcm_params {
uint64_t cir; /**< Committed Information Rate (CIR). Measured in
bytes per second. */
-   uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes
per second. */
-   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */
-   uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */
+   union {
+   uint64_t pir; /**< Peak Information Rate (PIR). Measured in
bytes per second. */
+   uint64_t eir; /**< Excess Information Rate (EIR). Measured in
bytes per second. */
+   };
+   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes.
*/
+   union {
+   uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes.
*/
+   uint64_t ebs; /**< Excess Burst Size (EBS). Measured in
bytes. */
+   };
 };



In order to prevent complicating the data structures and/or creating 
false dependencies between algs, please create a separate data 
structure struct rte_meter_trtcm_rfc4115_params.


This is also inline with the naming conventions from 
librte_ethdev/rte_mtr.h


Thanks for the feedback, as this was the thing I wanted to clarify…
I’ll try to send a V2 later this week (or next year ;).



 /**
@@ -98,6 +110,22 @@ rte_meter_srtcm_profile_config(struct
rte_meter_srtcm_profile *p,
 int
 rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p,
struct rte_meter_trtcm_params *p

[dpdk-dev] [PATCH v2 1/2] lib/librte_meter: add RFC4115 trTCM meter support

2018-11-29 Thread Eelco Chaudron
This patch adds support for RFC4115 trTCM meters.

Signed-off-by: Eelco Chaudron 
---
 lib/librte_meter/rte_meter.c   |   40 +
 lib/librte_meter/rte_meter.h   |  236 ++--
 lib/librte_meter/rte_meter_version.map |9 +
 3 files changed, 269 insertions(+), 16 deletions(-)

diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c
index 473f69aba..817680dbb 100644
--- a/lib/librte_meter/rte_meter.c
+++ b/lib/librte_meter/rte_meter.c
@@ -110,3 +110,43 @@ rte_meter_trtcm_config(struct rte_meter_trtcm *m,
 
return 0;
 }
+
+int __rte_experimental
+rte_meter_trtcm_rfc4115_profile_config(struct rte_meter_trtcm_profile *p,
+   struct rte_meter_trtcm_params *params)
+{
+   uint64_t hz = rte_get_tsc_hz();
+
+   /* Check input parameters */
+   if ((p == NULL) ||
+   (params == NULL) ||
+   (params->cir != 0 && params->cbs == 0) ||
+   (params->eir != 0 && params->ebs == 0))
+   return -EINVAL;
+
+   /* Initialize trTCM run-time structure */
+   p->cbs = params->cbs;
+   p->ebs = params->ebs;
+   rte_meter_get_tb_params(hz, params->cir, &p->cir_period,
+   &p->cir_bytes_per_period);
+   rte_meter_get_tb_params(hz, params->eir, &p->eir_period,
+   &p->eir_bytes_per_period);
+
+   return 0;
+}
+
+int __rte_experimental
+rte_meter_trtcm_rfc4115_config(struct rte_meter_trtcm *m,
+   struct rte_meter_trtcm_profile *p)
+{
+   /* Check input parameters */
+   if ((m == NULL) || (p == NULL))
+   return -EINVAL;
+
+   /* Initialize trTCM run-time structure */
+   m->time_tc = m->time_te = rte_get_tsc_cycles();
+   m->tc = p->cbs;
+   m->te = p->ebs;
+
+   return 0;
+}
diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h
index 58a051583..c3f88d0cb 100644
--- a/lib/librte_meter/rte_meter.h
+++ b/lib/librte_meter/rte_meter.h
@@ -16,6 +16,7 @@ extern "C" {
  * Traffic metering algorithms:
  *1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 2697
  *2. Two Rate Three Color Marker (trTCM): defined by IETF RFC 2698
+ *3. Two Rate Three Color Marker (trTCM): defined by IETF RFC 4115
  *
  ***/
 
@@ -43,14 +44,25 @@ struct rte_meter_srtcm_params {
uint64_t ebs; /**< Excess Burst Size (EBS).  Measured in bytes. */
 };
 
-/** trTCM parameters per metered traffic flow. The CIR, PIR, CBS and PBS 
parameters
-only count bytes of IP packets and do not include link specific headers. PIR 
has to
-be greater than or equal to CIR. Both CBS or EBS have to be greater than zero. 
*/
+/** trTCM parameters per metered traffic flow. The CIR, PIR/EIT, CBS and 
PBS/EBS
+parameters only count bytes of IP packets and do not include link specific
+headers.
+
+- For RFC2698 operations PIR has to be greater than or equal to CIR. Both CBS
+  or EBS have to be greater than zero.
+- For RFC4115 operations CBS and EBS need to be greater than zero if CIR and
+  EIR are none-zero respectively.*/
 struct rte_meter_trtcm_params {
uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes 
per second. */
-   uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per 
second. */
-   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */
-   uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */
+   union {
+   uint64_t pir; /**< Peak Information Rate (PIR). Measured in 
bytes per second. */
+   uint64_t eir; /**< Excess Information Rate (EIR). Measured in 
bytes per second. */
+   };
+   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */
+   union {
+   uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */
+   uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. 
*/
+   };
 };
 
 /**
@@ -98,6 +110,22 @@ rte_meter_srtcm_profile_config(struct 
rte_meter_srtcm_profile *p,
 int
 rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p,
struct rte_meter_trtcm_params *params);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * trTCM RFC 4115 profile configuration
+ *
+ * @param p
+ *Pointer to pre-allocated trTCM profile data structure
+ * @param params
+ *trTCM profile parameters
+ * @return
+ *0 upon success, error code otherwise
+ */
+int __rte_experimental
+rte_meter_trtcm_rfc4115_profile_config(struct rte_meter_trtcm_profile *p,
+   struct rte_meter_trtcm_params *params);
 
 /**
  * srTCM configuration per metered traffic flow
@@ -127,6 +155,23 @@ int
 rte_meter_trtcm_config(struct rte_meter_trtcm *m,
struct rte_meter_trtcm_profile *p);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice

[dpdk-dev] [PATCH v2 2/2] test/test_meter: update meter test to include RFC4115 meters

2018-11-29 Thread Eelco Chaudron
Add test cases for RFC4115 meters

Signed-off-by: Eelco Chaudron 
---
 test/test/test_meter.c |  212 
 1 file changed, 212 insertions(+)

diff --git a/test/test/test_meter.c b/test/test/test_meter.c
index 8bb47e75c..097da0143 100644
--- a/test/test/test_meter.c
+++ b/test/test/test_meter.c
@@ -32,8 +32,10 @@
 
 #define TM_TEST_TRTCM_CIR_DF 4600
 #define TM_TEST_TRTCM_PIR_DF 6900
+#define TM_TEST_TRTCM_EIR_DF 6900
 #define TM_TEST_TRTCM_CBS_DF 2048
 #define TM_TEST_TRTCM_PBS_DF 4096
+#define TM_TEST_TRTCM_EBS_DF 4096
 
 static struct rte_meter_srtcm_params sparams =
{.cir = TM_TEST_SRTCM_CIR_DF,
@@ -46,6 +48,12 @@ static structrte_meter_trtcm_params tparams=
 .cbs = TM_TEST_TRTCM_CBS_DF,
 .pbs = TM_TEST_TRTCM_PBS_DF,};
 
+static struct rte_meter_trtcm_params rfc4115params =
+   {.cir = TM_TEST_TRTCM_CIR_DF,
+.eir = TM_TEST_TRTCM_EIR_DF,
+.cbs = TM_TEST_TRTCM_CBS_DF,
+.ebs = TM_TEST_TRTCM_EBS_DF,};
+
 /**
  * functional test for rte_meter_srtcm_config
  */
@@ -148,6 +156,45 @@ tm_test_trtcm_config(void)
return 0;
 }
 
+/**
+ * functional test for rte_meter_trtcm_rfc4115_config
+ */
+static inline int
+tm_test_trtcm_rfc4115_config(void)
+{
+   struct rte_meter_trtcm_profile tp;
+   struct  rte_meter_trtcm_params rfc4115params1;
+#define TRTCM_RFC4115_CFG_MSG "trtcm_rfc4115_config"
+
+   /* invalid parameter test */
+   if (rte_meter_trtcm_rfc4115_profile_config(NULL, NULL) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, NULL) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+   if (rte_meter_trtcm_rfc4115_profile_config(NULL, &rfc4115params) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   /*
+* cbs and pbs should be none-zero if cir and eir are none-zero
+* respectively
+*/
+   rfc4115params1 = rfc4115params;
+   rfc4115params1.cbs = 0;
+   if (rte_meter_trtcm_profile_config(&tp, &rfc4115params1) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   rfc4115params1 = rfc4115params;
+   rfc4115params1.pbs = 0;
+   if (rte_meter_trtcm_profile_config(&tp, &rfc4115params1) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   /* usual parameter, should be successful */
+   if (rte_meter_trtcm_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   return 0;
+}
+
 /**
  * functional test for rte_meter_srtcm_color_blind_check
  */
@@ -265,6 +312,65 @@ tm_test_trtcm_color_blind_check(void)
return 0;
 }
 
+/**
+ * functional test for rte_meter_trtcm_rfc4115_color_blind_check
+ */
+static inline int
+tm_test_trtcm_rfc4115_color_blind_check(void)
+{
+#define TRTCM_RFC4115_BLIND_CHECK_MSG "trtcm_rfc4115_blind_check"
+
+   uint64_t time;
+   struct rte_meter_trtcm_profile tp;
+   struct rte_meter_trtcm tm;
+   uint64_t hz = rte_get_tsc_hz();
+
+   /* Test green */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_CBS_DF - 1)
+   != e_RTE_METER_GREEN)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" GREEN");
+
+   /* Test yellow */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_CBS_DF + 1)
+   != e_RTE_METER_YELLOW)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW");
+
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_EBS_DF - 1)
+   != e_RTE_METER_YELLOW)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW");
+
+   /* Test red */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp

[dpdk-dev] [PATCH v2 0/2] lib/librte_meter: add RFC4115 trTCM meter support

2018-11-29 Thread Eelco Chaudron
This patch adds support for RFC4115 trTCM meters.

Signed-off-by: Eelco Chaudron 

v2:
 - Marked all functions with __rte_experimental, and added
   "EXPERIMENTAL:..." to doxygen comments.
 - Removed library version change, and merged it with first patch
 - Do not call rte_meter_trtcm_rfc4115_color_aware_check in
   rte_meter_trtcm_rfc4115_color_blind_check to avoid error with
   it being marked as experimental

Eelco Chaudron (2):
  lib/librte_meter: add RFC4115 trTCM meter support
  test/test_meter: update meter test to include RFC4115 meters


 lib/librte_meter/rte_meter.c   |   40 +
 lib/librte_meter/rte_meter.h   |  236 ++--
 lib/librte_meter/rte_meter_version.map |9 +
 test/test/test_meter.c |  212 +
 4 files changed, 481 insertions(+), 16 deletions(-)



Re: [dpdk-dev] [PATCH 3/3] lib/librte_meter: update abi to include new rfc4115 function

2018-11-28 Thread Eelco Chaudron



On 28 Nov 2018, at 13:51, Thomas Monjalon wrote:

> 28/11/2018 13:40, Eelco Chaudron:
>>
>> On 28 Nov 2018, at 11:09, Thomas Monjalon wrote:
>>
>>> 28/11/2018 10:27, Eelco Chaudron:
>>>> On 28 Nov 2018, at 9:38, David Marchand wrote:
>>>>> On Tue, Nov 27, 2018 at 4:22 PM Eelco Chaudron 
>>>>> wrote:
>>>>>> --- a/lib/librte_meter/Makefile
>>>>>> +++ b/lib/librte_meter/Makefile
>>>>>> -LIBABIVER := 2
>>>>>> +LIBABIVER := 3
>>>>>
>>>>> As far as I understood the policy around the EXPERIMENTAL section,
>>>>> you
>>>>> don't need to bump the library version.
>>>>
>>>> Thought I would add the new function as none experimental, i.e. next
>>>> version, but checkpatch did not allow me to do this.
>>>>
>>>> Tried to find info on what the right process was, as these functions
>>>> are
>>>> just another meter implementation using similar existing APIs. If
>>>> anyone
>>>> has any background on this please point me to it.
>>>
>>> It is documented here:
>>> http://doc.dpdk.org/guides/contributing/versioning.html
>>>
>>> The case for "similar API" is not handled specifically so far.
>>> So you need to introduce it as experimental.
>>
>> Thanks for the clarification, will update the APIs with
>> __rte_experimental in the next iteration.
>
> You should also add this on top of the doxygen comment:
>
>  * @warning
>  * @b EXPERIMENTAL: this API may change without prior notice

Thanks, done!


Re: [dpdk-dev] [PATCH 3/3] lib/librte_meter: update abi to include new rfc4115 function

2018-11-28 Thread Eelco Chaudron




On 28 Nov 2018, at 11:09, Thomas Monjalon wrote:


28/11/2018 10:27, Eelco Chaudron:

On 28 Nov 2018, at 9:38, David Marchand wrote:

On Tue, Nov 27, 2018 at 4:22 PM Eelco Chaudron 
wrote:

--- a/lib/librte_meter/Makefile
+++ b/lib/librte_meter/Makefile
-LIBABIVER := 2
+LIBABIVER := 3


As far as I understood the policy around the EXPERIMENTAL section, 
you

don't need to bump the library version.


Thought I would add the new function as none experimental, i.e. next
version, but checkpatch did not allow me to do this.

Tried to find info on what the right process was, as these functions 
are
just another meter implementation using similar existing APIs. If 
anyone

has any background on this please point me to it.


It is documented here:
http://doc.dpdk.org/guides/contributing/versioning.html

The case for "similar API" is not handled specifically so far.
So you need to introduce it as experimental.


Thanks for the clarification, will update the APIs with 
__rte_experimental in the next iteration.



I changed the library version as an existing data structure changed
(which in theory should not change the location of the data), but the
ABI check popped warnings so I decided to increase the version.


It deserves to analyze why the ABI check raises a warning.
If it really needs to bump the ABI version, you should justify it
in the commit message, and explain what changed in the ABI section
of the release notes, plus update the version in the release notes.


Will look at it more closely and update it for the next iteration.


Re: [dpdk-dev] [PATCH 3/3] lib/librte_meter: update abi to include new rfc4115 function

2018-11-28 Thread Eelco Chaudron




On 28 Nov 2018, at 9:38, David Marchand wrote:


Hello Eelco,

On Tue, Nov 27, 2018 at 4:22 PM Eelco Chaudron  
wrote:



Update the ABI to include the new RFC4115 meter functions
---
 lib/librte_meter/Makefile  |2 +-
 lib/librte_meter/meson.build   |2 +-
 lib/librte_meter/rte_meter_version.map |9 +
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/lib/librte_meter/Makefile b/lib/librte_meter/Makefile
index 2dc071e8e..79ad79744 100644
--- a/lib/librte_meter/Makefile
+++ b/lib/librte_meter/Makefile
@@ -16,7 +16,7 @@ LDLIBS += -lrte_eal

 EXPORT_MAP := rte_meter_version.map

-LIBABIVER := 2
+LIBABIVER := 3

 #
 # all source are stored in SRCS-y



As far as I understood the policy around the EXPERIMENTAL section, you
don't need to bump the library version.


Thought I would add the new function as none experimental, i.e. next 
version, but checkpatch did not allow me to do this.


Tried to find info on what the right process was, as these functions are 
just another meter implementation using similar existing APIs. If anyone 
has any background on this please point me to it.


I changed the library version as an existing data structure changed 
(which in theory should not change the location of the data), but the 
ABI check popped warnings so I decided to increase the version.




+ you should squash this into the first patch.


I’ll do this on the next version.

Thanks,

Eelco


[dpdk-dev] [PATCH 3/3] lib/librte_meter: update abi to include new rfc4115 function

2018-11-27 Thread Eelco Chaudron
Update the ABI to include the new RFC4115 meter functions
---
 lib/librte_meter/Makefile  |2 +-
 lib/librte_meter/meson.build   |2 +-
 lib/librte_meter/rte_meter_version.map |9 +
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/lib/librte_meter/Makefile b/lib/librte_meter/Makefile
index 2dc071e8e..79ad79744 100644
--- a/lib/librte_meter/Makefile
+++ b/lib/librte_meter/Makefile
@@ -16,7 +16,7 @@ LDLIBS += -lrte_eal
 
 EXPORT_MAP := rte_meter_version.map
 
-LIBABIVER := 2
+LIBABIVER := 3
 
 #
 # all source are stored in SRCS-y
diff --git a/lib/librte_meter/meson.build b/lib/librte_meter/meson.build
index 947bc19e2..422123e20 100644
--- a/lib/librte_meter/meson.build
+++ b/lib/librte_meter/meson.build
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
-version = 2
+version = 3
 sources = files('rte_meter.c')
 headers = files('rte_meter.h')
diff --git a/lib/librte_meter/rte_meter_version.map 
b/lib/librte_meter/rte_meter_version.map
index cb79f0c2b..4b460d580 100644
--- a/lib/librte_meter/rte_meter_version.map
+++ b/lib/librte_meter/rte_meter_version.map
@@ -17,3 +17,12 @@ DPDK_18.08 {
rte_meter_srtcm_profile_config;
rte_meter_trtcm_profile_config;
 };
+
+EXPERIMENTAL {
+   global:
+
+   rte_meter_trtcm_rfc4115_color_aware_check;
+   rte_meter_trtcm_rfc4115_color_blind_check;
+   rte_meter_trtcm_rfc4115_config;
+   rte_meter_trtcm_rfc4115_profile_config;
+};



[dpdk-dev] [PATCH 1/3] lib/librte_meter: add RFC4115 trTCM meter support

2018-11-27 Thread Eelco Chaudron
This patch adds support for RFC4115 trTCM meters.

Signed-off-by: Eelco Chaudron 
---
 lib/librte_meter/rte_meter.c |   40 +
 lib/librte_meter/rte_meter.h |  193 +++---
 2 files changed, 217 insertions(+), 16 deletions(-)

diff --git a/lib/librte_meter/rte_meter.c b/lib/librte_meter/rte_meter.c
index 473f69aba..aeeac69fe 100644
--- a/lib/librte_meter/rte_meter.c
+++ b/lib/librte_meter/rte_meter.c
@@ -110,3 +110,43 @@ rte_meter_trtcm_config(struct rte_meter_trtcm *m,
 
return 0;
 }
+
+int
+rte_meter_trtcm_rfc4115_profile_config(struct rte_meter_trtcm_profile *p,
+   struct rte_meter_trtcm_params *params)
+{
+   uint64_t hz = rte_get_tsc_hz();
+
+   /* Check input parameters */
+   if ((p == NULL) ||
+   (params == NULL) ||
+   (params->cir != 0 && params->cbs == 0) ||
+   (params->eir != 0 && params->ebs == 0))
+   return -EINVAL;
+
+   /* Initialize trTCM run-time structure */
+   p->cbs = params->cbs;
+   p->ebs = params->ebs;
+   rte_meter_get_tb_params(hz, params->cir, &p->cir_period,
+   &p->cir_bytes_per_period);
+   rte_meter_get_tb_params(hz, params->eir, &p->eir_period,
+   &p->eir_bytes_per_period);
+
+   return 0;
+}
+
+int
+rte_meter_trtcm_rfc4115_config(struct rte_meter_trtcm *m,
+   struct rte_meter_trtcm_profile *p)
+{
+   /* Check input parameters */
+   if ((m == NULL) || (p == NULL))
+   return -EINVAL;
+
+   /* Initialize trTCM run-time structure */
+   m->time_tc = m->time_te = rte_get_tsc_cycles();
+   m->tc = p->cbs;
+   m->te = p->ebs;
+
+   return 0;
+}
diff --git a/lib/librte_meter/rte_meter.h b/lib/librte_meter/rte_meter.h
index 58a051583..ff1ca3f77 100644
--- a/lib/librte_meter/rte_meter.h
+++ b/lib/librte_meter/rte_meter.h
@@ -16,6 +16,7 @@ extern "C" {
  * Traffic metering algorithms:
  *1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 2697
  *2. Two Rate Three Color Marker (trTCM): defined by IETF RFC 2698
+ *3. Two Rate Three Color Marker (trTCM): defined by IETF RFC 4115
  *
  ***/
 
@@ -43,14 +44,25 @@ struct rte_meter_srtcm_params {
uint64_t ebs; /**< Excess Burst Size (EBS).  Measured in bytes. */
 };
 
-/** trTCM parameters per metered traffic flow. The CIR, PIR, CBS and PBS 
parameters
-only count bytes of IP packets and do not include link specific headers. PIR 
has to
-be greater than or equal to CIR. Both CBS or EBS have to be greater than zero. 
*/
+/** trTCM parameters per metered traffic flow. The CIR, PIR/EIT, CBS and 
PBS/EBS
+parameters only count bytes of IP packets and do not include link specific
+headers.
+
+- For RFC2698 operations PIR has to be greater than or equal to CIR. Both CBS
+  or EBS have to be greater than zero.
+- For RFC4115 operations CBS and EBS need to be greater than zero if CIR and
+  EIR are none-zero respectively.*/
 struct rte_meter_trtcm_params {
uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes 
per second. */
-   uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per 
second. */
-   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */
-   uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */
+   union {
+   uint64_t pir; /**< Peak Information Rate (PIR). Measured in 
bytes per second. */
+   uint64_t eir; /**< Excess Information Rate (EIR). Measured in 
bytes per second. */
+   };
+   uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */
+   union {
+   uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */
+   uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. 
*/
+   };
 };
 
 /**
@@ -98,6 +110,19 @@ rte_meter_srtcm_profile_config(struct 
rte_meter_srtcm_profile *p,
 int
 rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p,
struct rte_meter_trtcm_params *params);
+/**
+ * trTCM RFC 4115 profile configuration
+ *
+ * @param p
+ *Pointer to pre-allocated trTCM profile data structure
+ * @param params
+ *trTCM profile parameters
+ * @return
+ *0 upon success, error code otherwise
+ */
+int
+rte_meter_trtcm_rfc4115_profile_config(struct rte_meter_trtcm_profile *p,
+   struct rte_meter_trtcm_params *params);
 
 /**
  * srTCM configuration per metered traffic flow
@@ -127,6 +152,20 @@ int
 rte_meter_trtcm_config(struct rte_meter_trtcm *m,
struct rte_meter_trtcm_profile *p);
 
+/**
+ * trTCM RFC 4115 configuration per metered traffic flow
+ *
+ * @param m
+ *Pointer to pre-allocated trTCM data structure
+ * @param p
+ *trTCM profile. Needs to be valid.
+ * @return
+ *0 upon success, error code otherwise
+ */
+int
+rte_meter_trtcm

[dpdk-dev] [PATCH 0/3] lib/librte_meter: add RFC4115 trTCM meter support

2018-11-27 Thread Eelco Chaudron
This patch adds support for RFC4115 trTCM meters.


Signed-off-by: Eelco Chaudron 

Eelco Chaudron (3):
  lib/librte_meter: add RFC4115 trTCM meter support
  test/test_meter: update meter test to include RFC4115 meters
  lib/librte_meter: update abi to include new rfc4115 function


 lib/librte_meter/Makefile  |2 
 lib/librte_meter/meson.build   |2 
 lib/librte_meter/rte_meter.c   |   40 ++
 lib/librte_meter/rte_meter.h   |  193 +++--
 lib/librte_meter/rte_meter_version.map |9 +
 test/test/test_meter.c |  212 
 6 files changed, 440 insertions(+), 18 deletions(-)



[dpdk-dev] [PATCH 2/3] test/test_meter: update meter test to include RFC4115 meters

2018-11-27 Thread Eelco Chaudron
Add test cases for RFC4115 meters

Signed-off-by: Eelco Chaudron 
---
 test/test/test_meter.c |  212 
 1 file changed, 212 insertions(+)

diff --git a/test/test/test_meter.c b/test/test/test_meter.c
index 8bb47e75c..097da0143 100644
--- a/test/test/test_meter.c
+++ b/test/test/test_meter.c
@@ -32,8 +32,10 @@
 
 #define TM_TEST_TRTCM_CIR_DF 4600
 #define TM_TEST_TRTCM_PIR_DF 6900
+#define TM_TEST_TRTCM_EIR_DF 6900
 #define TM_TEST_TRTCM_CBS_DF 2048
 #define TM_TEST_TRTCM_PBS_DF 4096
+#define TM_TEST_TRTCM_EBS_DF 4096
 
 static struct rte_meter_srtcm_params sparams =
{.cir = TM_TEST_SRTCM_CIR_DF,
@@ -46,6 +48,12 @@ static structrte_meter_trtcm_params tparams=
 .cbs = TM_TEST_TRTCM_CBS_DF,
 .pbs = TM_TEST_TRTCM_PBS_DF,};
 
+static struct rte_meter_trtcm_params rfc4115params =
+   {.cir = TM_TEST_TRTCM_CIR_DF,
+.eir = TM_TEST_TRTCM_EIR_DF,
+.cbs = TM_TEST_TRTCM_CBS_DF,
+.ebs = TM_TEST_TRTCM_EBS_DF,};
+
 /**
  * functional test for rte_meter_srtcm_config
  */
@@ -148,6 +156,45 @@ tm_test_trtcm_config(void)
return 0;
 }
 
+/**
+ * functional test for rte_meter_trtcm_rfc4115_config
+ */
+static inline int
+tm_test_trtcm_rfc4115_config(void)
+{
+   struct rte_meter_trtcm_profile tp;
+   struct  rte_meter_trtcm_params rfc4115params1;
+#define TRTCM_RFC4115_CFG_MSG "trtcm_rfc4115_config"
+
+   /* invalid parameter test */
+   if (rte_meter_trtcm_rfc4115_profile_config(NULL, NULL) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, NULL) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+   if (rte_meter_trtcm_rfc4115_profile_config(NULL, &rfc4115params) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   /*
+* cbs and pbs should be none-zero if cir and eir are none-zero
+* respectively
+*/
+   rfc4115params1 = rfc4115params;
+   rfc4115params1.cbs = 0;
+   if (rte_meter_trtcm_profile_config(&tp, &rfc4115params1) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   rfc4115params1 = rfc4115params;
+   rfc4115params1.pbs = 0;
+   if (rte_meter_trtcm_profile_config(&tp, &rfc4115params1) == 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   /* usual parameter, should be successful */
+   if (rte_meter_trtcm_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_CFG_MSG);
+
+   return 0;
+}
+
 /**
  * functional test for rte_meter_srtcm_color_blind_check
  */
@@ -265,6 +312,65 @@ tm_test_trtcm_color_blind_check(void)
return 0;
 }
 
+/**
+ * functional test for rte_meter_trtcm_rfc4115_color_blind_check
+ */
+static inline int
+tm_test_trtcm_rfc4115_color_blind_check(void)
+{
+#define TRTCM_RFC4115_BLIND_CHECK_MSG "trtcm_rfc4115_blind_check"
+
+   uint64_t time;
+   struct rte_meter_trtcm_profile tp;
+   struct rte_meter_trtcm tm;
+   uint64_t hz = rte_get_tsc_hz();
+
+   /* Test green */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_CBS_DF - 1)
+   != e_RTE_METER_GREEN)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" GREEN");
+
+   /* Test yellow */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_CBS_DF + 1)
+   != e_RTE_METER_YELLOW)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW");
+
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp, &rfc4115params) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   if (rte_meter_trtcm_rfc4115_config(&tm, &tp) != 0)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG);
+   time = rte_get_tsc_cycles() + hz;
+   if (rte_meter_trtcm_rfc4115_color_blind_check(
+   &tm, &tp, time, TM_TEST_TRTCM_EBS_DF - 1)
+   != e_RTE_METER_YELLOW)
+   melog(TRTCM_RFC4115_BLIND_CHECK_MSG" YELLOW");
+
+   /* Test red */
+   if (rte_meter_trtcm_rfc4115_profile_config(&tp

Re: [dpdk-dev] [dpdk-stable] [PATCH 17.11] mem: fix memory initialization time

2018-11-14 Thread Eelco Chaudron




On 12 Nov 2018, at 12:26, Eelco Chaudron wrote:


On 12 Nov 2018, at 12:18, Alejandro Lucero wrote:


When using large amount of hugepage based memory, doing all the
hugepages mapping can take quite significant time.

The problem is hugepages being initially mmaped to virtual addresses
which will be tried later for the final hugepage mmaping. This causes
the final mapping requiring calling mmap with another hint address 
which
can happen several times, depending on the amount of memory to mmap, 
and

which each mmmap taking more than a second.

This patch changes the hint for the initial hugepage mmaping using
a starting address which will not collide with the final mmaping.

Fixes: 293c0c4b957f ("mem: use address hint for mapping hugepages")

Signed-off-by: Alejandro Lucero 


Thanks Alejandro for sending the patch. This issue was found in an 
OVS-DPDK environment.

I verified/tested the patch.

Acked-by: Eelco Chaudron 
Tested-by: Eelco Chaudron 

---
 lib/librte_eal/linuxapp/eal/eal_memory.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
b/lib/librte_eal/linuxapp/eal/eal_memory.c

index bac969a12..0675809b7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -421,6 +421,21 @@ map_all_hugepages(struct hugepage_file 
*hugepg_tbl, struct hugepage_info *hpi,

}
 #endif

+#ifdef RTE_ARCH_64
+   /*
+* Hugepages are first mmaped individually and then re-mmapped to
+	 * another region for having contiguous physical pages in 
contiguous

+* virtual addresses. Setting here vma_addr for the first hugepage
+	 * mapped to a virtual address which will not collide with the 
second

+* mmaping later. The next hugepages will use increments of this
+* initial address.
+*
+* The final virtual address will be based on baseaddr which is
+* 0x1. We use a hint here starting at 0x2, leaving
+	 * another 4GB just in case, plus the total available hugepages 
memory.

+*/
+	vma_addr = (char *)0x2 + (hpi->hugepage_sz * 
hpi->num_pages[0]);

+#endif
for (i = 0; i < hpi->num_pages[0]; i++) {
uint64_t hugepage_sz = hpi->hugepage_sz;

--
2.17.1


Adding OVS dev to this thread, as this issue is introduced in DPDK 
17.11.4.




Re: [dpdk-dev] [dpdk-stable] [PATCH 17.11] mem: fix memory initialization time

2018-11-12 Thread Eelco Chaudron




On 12 Nov 2018, at 12:18, Alejandro Lucero wrote:


When using large amount of hugepage based memory, doing all the
hugepages mapping can take quite significant time.

The problem is hugepages being initially mmaped to virtual addresses
which will be tried later for the final hugepage mmaping. This causes
the final mapping requiring calling mmap with another hint address 
which
can happen several times, depending on the amount of memory to mmap, 
and

which each mmmap taking more than a second.

This patch changes the hint for the initial hugepage mmaping using
a starting address which will not collide with the final mmaping.

Fixes: 293c0c4b957f ("mem: use address hint for mapping hugepages")

Signed-off-by: Alejandro Lucero 


Thanks Alejandro for sending the patch. This issue was found in an 
OVS-DPDK environment.

I verified/tested the patch.

Acked-by: Eelco Chaudron 
Tested-by: Eelco Chaudron 

---
 lib/librte_eal/linuxapp/eal/eal_memory.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
b/lib/librte_eal/linuxapp/eal/eal_memory.c

index bac969a12..0675809b7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -421,6 +421,21 @@ map_all_hugepages(struct hugepage_file 
*hugepg_tbl, struct hugepage_info *hpi,

}
 #endif

+#ifdef RTE_ARCH_64
+   /*
+* Hugepages are first mmaped individually and then re-mmapped to
+* another region for having contiguous physical pages in contiguous
+* virtual addresses. Setting here vma_addr for the first hugepage
+	 * mapped to a virtual address which will not collide with the 
second

+* mmaping later. The next hugepages will use increments of this
+* initial address.
+*
+* The final virtual address will be based on baseaddr which is
+* 0x1. We use a hint here starting at 0x2, leaving
+	 * another 4GB just in case, plus the total available hugepages 
memory.

+*/
+	vma_addr = (char *)0x2 + (hpi->hugepage_sz * 
hpi->num_pages[0]);

+#endif
for (i = 0; i < hpi->num_pages[0]; i++) {
uint64_t hugepage_sz = hpi->hugepage_sz;

--
2.17.1


Re: [dpdk-dev] RFC 4115 variant for rte_meter_trtcm_color_blind/aware_check()

2018-11-06 Thread Eelco Chaudron

On 6 Nov 2018, at 11:53, Dumitrescu, Cristian wrote:


-Original Message-
From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Eelco Chaudron
Sent: Tuesday, November 6, 2018 10:32 AM
To: dev@dpdk.org
Subject: [dpdk-dev] RFC 4115 variant for
rte_meter_trtcm_color_blind/aware_check()

Hi All,

Was wondering if anyone is planning to add an RFC 4115 equivalent of 
the

rte_meter_trtcm_color_blind/aware_check() check APIs?

If not I can go ahead and sent a patch to add it, i.e. something like
rte_meter_trtcm_rfc4115_color_blind/aware_check().

Cheers,

Eelco


Hi Eelco,

This is somewhere on my ToDo list, but if you're ready to send a patch 
now, I would be very happy to review it and merge. Thanks for your 
help!


Hi Christian,

I have it on my ToDo list also ;) But let me take it of yours, and 
I’ll investigate what it takes to add it (also from the testing 
framework side…).


//Eelco


[dpdk-dev] RFC 4115 variant for rte_meter_trtcm_color_blind/aware_check()

2018-11-06 Thread Eelco Chaudron

Hi All,

Was wondering if anyone is planning to add an RFC 4115 equivalent of the 
rte_meter_trtcm_color_blind/aware_check() check APIs?


If not I can go ahead and sent a patch to add it, i.e. something like 
rte_meter_trtcm_rfc4115_color_blind/aware_check().


Cheers,

Eelco


Re: [dpdk-dev] [dpdk-stable] [PATCH v4 2/5] bus/pci: use IOVAs check when setting IOVA mode

2018-07-11 Thread Eelco Chaudron




On 10 Jul 2018, at 19:25, Alejandro Lucero wrote:


Although VT-d emulation currently only supports 39 bits, it could
be iovas being within that supported range. This patch allows
IOVA mode in such a case.

Indeed, memory initialization code can be modified for using lower
virtual addresses than those used by the kernel for 64 bits processes
by default, and therefore memsegs iovas can use 39 bits or less for
most system. And this is likely 100% true for VMs.

Applicable to v17.11.3 only.

Signed-off-by: Alejandro Lucero 
---
 drivers/bus/pci/linux/pci.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 74deef3..792c819 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "eal_private.h"
 #include "eal_filesystem.h"
@@ -613,10 +614,12 @@
fclose(fp);

 	mgaw = ((vtd_cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 
1;

-   if (mgaw < X86_VA_WIDTH)
+
+   if (!rte_eal_check_dma_mask(mgaw))
+   return true;
+   else
return false;

-   return true;
 }
 #elif defined(RTE_ARCH_PPC_64)
 static bool
@@ -640,13 +643,17 @@
 {
struct rte_pci_device *dev = NULL;
struct rte_pci_driver *drv = NULL;
+   int iommu_dma_mask_check_done = 0;

FOREACH_DRIVER_ON_PCIBUS(drv) {
FOREACH_DEVICE_ON_PCIBUS(dev) {
if (!rte_pci_match(drv, dev))
continue;
-   if (!pci_one_device_iommu_support_va(dev))
-   return false;
+   if (!iommu_dma_mask_check_done) {
+   if (pci_one_device_iommu_support_va(dev) < 0)
+   return false;
+   iommu_dma_mask_check_done  = 1;


As I do not know enough on what IOMMU hardware can coexist, I leave this 
patch for others to review.

Here is the previous question/answer:

Not sure why this change? Why do we only need to check one device on 
all the buses?


Because there is just one emulated IOMMU hardware. The limitation in 
this case is not in a specific PCI device. And I do not think it is 
possible to have two different (emulated or not) IOMMU hardware. Yes, 
you can have more than one controller but being same IOMMU type.


If the above is confirmed, you can consider this patch ack’ed.


+   }
}
}
return true;
--
1.9.1


Re: [dpdk-dev] [dpdk-stable] [PATCH v4 1/5] mem: add function for checking memsegs IOVAs addresses

2018-07-11 Thread Eelco Chaudron



On 10 Jul 2018, at 19:25, Alejandro Lucero wrote:

> A device can suffer addressing limitations. This functions checks
> memsegs have iovas within the supported range based on dma mask.
>
> PMD should use this during initialization if supported devices
> suffer addressing limitations, returning an error if this function
> returns memsegs out of range.
>
> Another potential usage is for emulated IOMMU hardware with addressing
> limitations.
>
> Applicable to v17.11.3 only.
>
> Signed-off-by: Alejandro Lucero 
> Acked-by: Anatoly Burakov 
> ---
Looks good to me.

Acked-by: Eelco Chaudron 



Re: [dpdk-dev] [dpdk-stable] [PATCH v3 1/6] mem: add function for checking memsegs IOVAs addresses

2018-07-10 Thread Eelco Chaudron




On 10 Jul 2018, at 13:43, Alejandro Lucero wrote:


On Tue, Jul 10, 2018 at 12:33 PM, Burakov, Anatoly <
anatoly.bura...@intel.com> wrote:


On 10-Jul-18 12:14 PM, Eelco Chaudron wrote:




On 10 Jul 2018, at 12:52, Alejandro Lucero wrote:

On Tue, Jul 10, 2018 at 11:06 AM, Eelco Chaudron 


wrote:




On 10 Jul 2018, at 11:34, Alejandro Lucero wrote:

On Tue, Jul 10, 2018 at 9:56 AM, Eelco Chaudron 




wrote:




On 4 Jul 2018, at 14:53, Alejandro Lucero wrote:

A device can suffer addressing limitations. This functions 
checks


memsegs have iovas within the supported range based on dma mask.


PMD should use this during initialization if supported devices
suffer addressing limitations, returning an error if this 
function

returns memsegs out of range.

Another potential usage is for emulated IOMMU hardware with
addressing
limitations.

Signed-off-by: Alejandro Lucero 


Acked-by: Anatoly Burakov 
---
 lib/librte_eal/common/eal_common_memory.c  | 33
++
 lib/librte_eal/common/include/rte_memory.h |  3 +++
 lib/librte_eal/rte_eal_version.map |  1 +
 3 files changed, 37 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c
b/lib/librte_eal/common/eal_common_memory.c
index fc6c44d..f5efebe 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -109,6 +109,39 @@
}
 }

+/* check memseg iovas are within the required range based on 
dma

mask
*/
+int
+rte_eal_check_dma_mask(uint8_t maskbits)
+{
+
+   const struct rte_mem_config *mcfg;
+   uint64_t mask;
+   int i;
+


I think we should add some sanity check to the input maskbits, 
i.e.

[64,0)
or [64, 32]? What would be a reasonable lower bound.


This is not a user's API, so any invocation will be reviewed, 
but I

guess


adding a sanity check here does not harm.

Not sure about lower bound but upper should 64, although it does 
not

make
sense but it is safe. Lower bound is not so problematic.



+   /* create dma mask */


+   mask = ~((1ULL << maskbits) - 1);

+
+   /* get pointer to global configuration */
+   mcfg = rte_eal_get_configuration()->mem_config;
+
+   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+   if (mcfg->memseg[i].addr == NULL)
+   break;


Looking at some other code, it looks like NULL entries might 
exists.

So
should a continue; rather than a break; be used here?


I do not think so. memsegs are allocated sequentially, so first 
with

addr
as NULL implies no more memsegs.



I was referring to the mem walk functions, rte_memseg_list_walk(). 
Maybe

some having more experience with this area can review/comment.



Pre-18.05, all memsegs are allocated continuously. Memseg lists and 
memseg

list walk functions are 18.05+.

Alejandro, perhaps it would be worth it to tag your patchset with
"pre-18.05" to avoid similar confusion in the future?


Yes, that will help. I'm sending a new version shortly and I'll make 
it

clear.


Thanks, I’ll review the new version if it’s ready before the end of 
tomorrow CET, as I will be on PTO.





Re: [dpdk-dev] [dpdk-stable] [PATCH v3 4/6] mem: use address hint for mapping hugepages

2018-07-10 Thread Eelco Chaudron



On 4 Jul 2018, at 14:53, Alejandro Lucero wrote:

> Linux kernel uses a really high address as starting address for
> serving mmaps calls. If there exists addressing limitations and
> IOVA mode is VA, this starting address is likely too high for
> those devices. However, it is possible to use a lower address in
> the process virtual address space as with 64 bits there is a lot
> of available space.
>
> This patch adds an address hint as starting address for 64 bits
> systems.
>
> Signed-off-by: Alejandro Lucero 
> Acked-by: Anatoly Burakov 

Looks good to me!

Cheers,

Eelco

Acked-by: Eelco Chaudron 


Re: [dpdk-dev] [dpdk-stable] [PATCH v3 1/6] mem: add function for checking memsegs IOVAs addresses

2018-07-10 Thread Eelco Chaudron




On 10 Jul 2018, at 12:52, Alejandro Lucero wrote:


On Tue, Jul 10, 2018 at 11:06 AM, Eelco Chaudron 
wrote:




On 10 Jul 2018, at 11:34, Alejandro Lucero wrote:

On Tue, Jul 10, 2018 at 9:56 AM, Eelco Chaudron 

wrote:




On 4 Jul 2018, at 14:53, Alejandro Lucero wrote:

A device can suffer addressing limitations. This functions checks


memsegs have iovas within the supported range based on dma mask.

PMD should use this during initialization if supported devices
suffer addressing limitations, returning an error if this function
returns memsegs out of range.

Another potential usage is for emulated IOMMU hardware with 
addressing

limitations.

Signed-off-by: Alejandro Lucero 
Acked-by: Anatoly Burakov 
---
 lib/librte_eal/common/eal_common_memory.c  | 33
++
 lib/librte_eal/common/include/rte_memory.h |  3 +++
 lib/librte_eal/rte_eal_version.map |  1 +
 3 files changed, 37 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c
b/lib/librte_eal/common/eal_common_memory.c
index fc6c44d..f5efebe 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -109,6 +109,39 @@
}
 }

+/* check memseg iovas are within the required range based on dma 
mask

*/
+int
+rte_eal_check_dma_mask(uint8_t maskbits)
+{
+
+   const struct rte_mem_config *mcfg;
+   uint64_t mask;
+   int i;
+



I think we should add some sanity check to the input maskbits, i.e.
[64,0)
or [64, 32]? What would be a reasonable lower bound.


This is not a user's API, so any invocation will be reviewed, but I 
guess

adding a sanity check here does not harm.

Not sure about lower bound but upper should 64, although it does not 
make

sense but it is safe. Lower bound is not so problematic.




+   /* create dma mask */


+   mask = ~((1ULL << maskbits) - 1);
+
+   /* get pointer to global configuration */
+   mcfg = rte_eal_get_configuration()->mem_config;
+
+   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+   if (mcfg->memseg[i].addr == NULL)
+   break;



Looking at some other code, it looks like NULL entries might exists. 
So

should a continue; rather than a break; be used here?


I do not think so. memsegs are allocated sequentially, so first with 
addr

as NULL implies no more memsegs.


I was referring to the mem walk functions, rte_memseg_list_walk(). Maybe 
some having more experience with this area can review/comment.







+

+   if (mcfg->memseg[i].iova & mask) {
+   RTE_LOG(INFO, EAL,
+   "memseg[%d] iova %"PRIx64" out of
range:\n",
+   i, mcfg->memseg[i].iova);
+
+   RTE_LOG(INFO, EAL, "\tusing dma mask
%"PRIx64"\n",
+   mask);
+   return -1;
+   }
+   }
+
+   return 0;
+}
+
 /* return the number of memory channels */
 unsigned rte_memory_get_nchannel(void)
 {
diff --git a/lib/librte_eal/common/include/rte_memory.h
b/lib/librte_eal/common/include/rte_memory.h
index 80a8fc0..b2a0168 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -209,6 +209,9 @@ struct rte_memseg {
  */
 unsigned rte_memory_get_nrank(void);

+/* check memsegs iovas are within a range based on dma mask */
+int rte_eal_check_dma_mask(uint8_t maskbits);
+
 /**
  * Drivers based on uio will not load unless physical
  * addresses are obtainable. It is only possible to get
diff --git a/lib/librte_eal/rte_eal_version.map
b/lib/librte_eal/rte_eal_version.map
index f4f46c1..aa6cf87 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -184,6 +184,7 @@ DPDK_17.11 {

rte_eal_create_uio_dev;
rte_bus_get_iommu_class;
+   rte_eal_check_dma_mask;
rte_eal_has_pci;
rte_eal_iova_mode;
rte_eal_mbuf_default_mempool_ops;
--
1.9.1












Re: [dpdk-dev] [dpdk-stable] [PATCH v3 6/6] net/nfp: support IOVA VA mode

2018-07-10 Thread Eelco Chaudron



On 4 Jul 2018, at 14:53, Alejandro Lucero wrote:

> NFP can handle IOVA as VA. It requires to check those IOVAs
> being in the supported range what is done during initialization.
>
> Signed-off-by: Alejandro Lucero 

Changes look good to me!

Acked-by: Eelco Chaudron 





Re: [dpdk-dev] [dpdk-stable] [PATCH v3 5/6] net/nfp: check hugepages IOVAs based on DMA mask

2018-07-10 Thread Eelco Chaudron



On 4 Jul 2018, at 14:53, Alejandro Lucero wrote:

> NFP devices can not handle DMA addresses requiring more than
> 40 bits. This patch uses rte_dev_check_dma_mask with 40 bits
> and avoids device initialization if memory out of NFP range.
>
> Signed-off-by: Alejandro Lucero 

Changes look good to me!

Acked-by: Eelco Chaudron 


Re: [dpdk-dev] [dpdk-stable] [PATCH v3 3/6] bus/pci: use IOVAs check when setting IOVA mode

2018-07-10 Thread Eelco Chaudron




On 4 Jul 2018, at 14:53, Alejandro Lucero wrote:


Although VT-d emulation currently only supports 39 bits, it could
be iovas being within that supported range. This patch allows
IOVA mode in such a case.

Indeed, memory initialization code can be modified for using lower
virtual addresses than those used by the kernel for 64 bits processes
by default, and therefore memsegs iovas can use 39 bits or less for
most system. And this is likely 100% true for VMs.

Signed-off-by: Alejandro Lucero 
---
 drivers/bus/pci/linux/pci.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 74deef3..792c819 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "eal_private.h"
 #include "eal_filesystem.h"
@@ -613,10 +614,12 @@
fclose(fp);

 	mgaw = ((vtd_cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 
1;

-   if (mgaw < X86_VA_WIDTH)
+
+   if (!rte_eal_check_dma_mask(mgaw))


If think in this case we still need to check the X86_VA_WIDTH, i.e.
if (mgaw < X86_VA_WIDTH && !rte_eal_check_dma_mask(mgaw))



+   return true;
+   else
return false;

-   return true;
 }
 #elif defined(RTE_ARCH_PPC_64)
 static bool
@@ -640,13 +643,17 @@
 {
struct rte_pci_device *dev = NULL;
struct rte_pci_driver *drv = NULL;
+   int iommu_dma_mask_check_done = 0;

FOREACH_DRIVER_ON_PCIBUS(drv) {
FOREACH_DEVICE_ON_PCIBUS(dev) {
if (!rte_pci_match(drv, dev))
continue;
-   if (!pci_one_device_iommu_support_va(dev))
-   return false;
+   if (!iommu_dma_mask_check_done) {
+   if (pci_one_device_iommu_support_va(dev) < 0)
+   return false;
+   iommu_dma_mask_check_done  = 1;


Not sure why this change? Why do we only need to check one device on the 
bus?


In addition, if this is what was intended, rather than a variable you 
can return true in this case, or did you intended to clear the 
iommu_dma_mask_check_done on every PCI BUS iteration?



+   }
}
}
return true;
--
1.9.1


Re: [dpdk-dev] [dpdk-stable] [PATCH v3 1/6] mem: add function for checking memsegs IOVAs addresses

2018-07-10 Thread Eelco Chaudron




On 10 Jul 2018, at 11:34, Alejandro Lucero wrote:

On Tue, Jul 10, 2018 at 9:56 AM, Eelco Chaudron  
wrote:





On 4 Jul 2018, at 14:53, Alejandro Lucero wrote:

A device can suffer addressing limitations. This functions checks

memsegs have iovas within the supported range based on dma mask.

PMD should use this during initialization if supported devices
suffer addressing limitations, returning an error if this function
returns memsegs out of range.

Another potential usage is for emulated IOMMU hardware with 
addressing

limitations.

Signed-off-by: Alejandro Lucero 
Acked-by: Anatoly Burakov 
---
 lib/librte_eal/common/eal_common_memory.c  | 33
++
 lib/librte_eal/common/include/rte_memory.h |  3 +++
 lib/librte_eal/rte_eal_version.map |  1 +
 3 files changed, 37 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c
b/lib/librte_eal/common/eal_common_memory.c
index fc6c44d..f5efebe 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -109,6 +109,39 @@
}
 }

+/* check memseg iovas are within the required range based on dma 
mask */

+int
+rte_eal_check_dma_mask(uint8_t maskbits)
+{
+
+   const struct rte_mem_config *mcfg;
+   uint64_t mask;
+   int i;
+



I think we should add some sanity check to the input maskbits, i.e. 
[64,0)

or [64, 32]? What would be a reasonable lower bound.


This is not a user's API, so any invocation will be reviewed, but I 
guess

adding a sanity check here does not harm.

Not sure about lower bound but upper should 64, although it does not 
make

sense but it is safe. Lower bound is not so problematic.




+   /* create dma mask */

+   mask = ~((1ULL << maskbits) - 1);
+
+   /* get pointer to global configuration */
+   mcfg = rte_eal_get_configuration()->mem_config;
+
+   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+   if (mcfg->memseg[i].addr == NULL)
+   break;


Looking at some other code, it looks like NULL entries might exists. So 
should a continue; rather than a break; be used here?



+
+   if (mcfg->memseg[i].iova & mask) {
+   RTE_LOG(INFO, EAL,
+   "memseg[%d] iova %"PRIx64" out of
range:\n",
+   i, mcfg->memseg[i].iova);
+
+   RTE_LOG(INFO, EAL, "\tusing dma mask 
%"PRIx64"\n",

+   mask);
+   return -1;
+   }
+   }
+
+   return 0;
+}
+
 /* return the number of memory channels */
 unsigned rte_memory_get_nchannel(void)
 {
diff --git a/lib/librte_eal/common/include/rte_memory.h
b/lib/librte_eal/common/include/rte_memory.h
index 80a8fc0..b2a0168 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -209,6 +209,9 @@ struct rte_memseg {
  */
 unsigned rte_memory_get_nrank(void);

+/* check memsegs iovas are within a range based on dma mask */
+int rte_eal_check_dma_mask(uint8_t maskbits);
+
 /**
  * Drivers based on uio will not load unless physical
  * addresses are obtainable. It is only possible to get
diff --git a/lib/librte_eal/rte_eal_version.map
b/lib/librte_eal/rte_eal_version.map
index f4f46c1..aa6cf87 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -184,6 +184,7 @@ DPDK_17.11 {

rte_eal_create_uio_dev;
rte_bus_get_iommu_class;
+   rte_eal_check_dma_mask;
rte_eal_has_pci;
rte_eal_iova_mode;
rte_eal_mbuf_default_mempool_ops;
--
1.9.1








Re: [dpdk-dev] [dpdk-stable] [PATCH v3 2/6] ethdev: add function for checking IOVAs by a device

2018-07-10 Thread Eelco Chaudron




On 4 Jul 2018, at 14:53, Alejandro Lucero wrote:

A PMD should invoke this function for checking memsegs iovas are 
within

the supported range by the device.

Signed-off-by: Alejandro Lucero 


Agree with Andrew here, why not call rte_eal_check_dma_mask() directly 
in nfp_net_txq_full()?



---
 lib/librte_ether/rte_ethdev.h   | 13 +
 lib/librte_ether/rte_ethdev_version.map |  1 +
 2 files changed, 14 insertions(+)

diff --git a/lib/librte_ether/rte_ethdev.h 
b/lib/librte_ether/rte_ethdev.h

index eba11ca..e51a432 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -2799,6 +2799,19 @@ int rte_eth_dev_set_vlan_ether_type(uint16_t 
port_id,
 int rte_eth_dev_set_vlan_pvid(uint16_t port_id, uint16_t pvid, int 
on);


 /**
+ * check device dma mask within expected range based on dma mask.
+ *
+ * @param maskbits
+ *  mask length in bits
+ *
+ */
+static inline int
+rte_eth_dev_check_dma_mask(uint8_t maskbits)
+{
+   return rte_eal_check_dma_mask(maskbits);
+}
+
+/**
  *
  * Retrieve a burst of input packets from a receive queue of an 
Ethernet
  * device. The retrieved packets are stored in *rte_mbuf* structures 
whose
diff --git a/lib/librte_ether/rte_ethdev_version.map 
b/lib/librte_ether/rte_ethdev_version.map

index e9681ac..0b11b8a 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -191,6 +191,7 @@ DPDK_17.08 {
 DPDK_17.11 {
global:

+   rte_eth_dev_check_dma_mask;
rte_eth_dev_get_sec_ctx;
rte_eth_dev_pool_ops_supported;
rte_eth_dev_reset;
--
1.9.1


Re: [dpdk-dev] [dpdk-stable] [PATCH v3 1/6] mem: add function for checking memsegs IOVAs addresses

2018-07-10 Thread Eelco Chaudron




On 4 Jul 2018, at 14:53, Alejandro Lucero wrote:


A device can suffer addressing limitations. This functions checks
memsegs have iovas within the supported range based on dma mask.

PMD should use this during initialization if supported devices
suffer addressing limitations, returning an error if this function
returns memsegs out of range.

Another potential usage is for emulated IOMMU hardware with addressing
limitations.

Signed-off-by: Alejandro Lucero 
Acked-by: Anatoly Burakov 
---
 lib/librte_eal/common/eal_common_memory.c  | 33 
++

 lib/librte_eal/common/include/rte_memory.h |  3 +++
 lib/librte_eal/rte_eal_version.map |  1 +
 3 files changed, 37 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c 
b/lib/librte_eal/common/eal_common_memory.c

index fc6c44d..f5efebe 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -109,6 +109,39 @@
}
 }

+/* check memseg iovas are within the required range based on dma mask 
*/

+int
+rte_eal_check_dma_mask(uint8_t maskbits)
+{
+
+   const struct rte_mem_config *mcfg;
+   uint64_t mask;
+   int i;
+


I think we should add some sanity check to the input maskbits, i.e. 
[64,0) or [64, 32]? What would be a reasonable lower bound.



+   /* create dma mask */
+   mask = ~((1ULL << maskbits) - 1);
+
+   /* get pointer to global configuration */
+   mcfg = rte_eal_get_configuration()->mem_config;
+
+   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+   if (mcfg->memseg[i].addr == NULL)
+   break;
+
+   if (mcfg->memseg[i].iova & mask) {
+   RTE_LOG(INFO, EAL,
+   "memseg[%d] iova %"PRIx64" out of range:\n",
+   i, mcfg->memseg[i].iova);
+
+   RTE_LOG(INFO, EAL, "\tusing dma mask %"PRIx64"\n",
+   mask);
+   return -1;
+   }
+   }
+
+   return 0;
+}
+
 /* return the number of memory channels */
 unsigned rte_memory_get_nchannel(void)
 {
diff --git a/lib/librte_eal/common/include/rte_memory.h 
b/lib/librte_eal/common/include/rte_memory.h

index 80a8fc0..b2a0168 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -209,6 +209,9 @@ struct rte_memseg {
  */
 unsigned rte_memory_get_nrank(void);

+/* check memsegs iovas are within a range based on dma mask */
+int rte_eal_check_dma_mask(uint8_t maskbits);
+
 /**
  * Drivers based on uio will not load unless physical
  * addresses are obtainable. It is only possible to get
diff --git a/lib/librte_eal/rte_eal_version.map 
b/lib/librte_eal/rte_eal_version.map

index f4f46c1..aa6cf87 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -184,6 +184,7 @@ DPDK_17.11 {

rte_eal_create_uio_dev;
rte_bus_get_iommu_class;
+   rte_eal_check_dma_mask;
rte_eal_has_pci;
rte_eal_iova_mode;
rte_eal_mbuf_default_mempool_ops;
--
1.9.1


  1   2   >