Re: release schedule change proposal

2021-11-19 Thread Flavio Leitner
Thanks Kevin for bringing this up.
See below.

On Mon, Nov 15, 2021 at 1:06 PM Kevin Traynor  wrote:

> On 15/11/2021 14:58, Thomas Monjalon wrote:
> > For the last 5 years, DPDK was doing 4 releases per year,
> > in February, May, August and November (the LTS one):
> >   .02   .05   .08   .11 (LTS)
> >
> > This schedule has multiple issues:
> >   - clash with China's Spring Festival
> >   - too many rushes, impacting maintainers & testers
> >   - not much buffer, impacting proposal period
> >
> > I propose to switch to a new schedule with 3 releases per year:
> >   .03  .07  .11 (LTS)
> >
> > New LTS branch would start at the same time of the year as before.
> > There would be one less intermediate release during spring/summer:
> > .05 and .08 intermediate releases would become a single .07.
> > I think it has almost no impact for the users.
> > This change could be done starting next year.
> >
> > In details, this is how we could extend some milestones:
> >
> >   ideal schedule so far (in 13 weeks):
> >   proposal deadline: 4
> >   rc1 - API freeze: 5
> >   rc2 - PMD features freeze: 2
> >   rc3 - app features freeze: 1
> >   rc4 - last chance to fix: 1
> >   release: 0
> >
> >   proposed schedule (in 17 weeks):
> >   proposal deadline: 4
> >   rc1 - API freeze: 7
> >   rc2 - PMD features freeze: 3
> >   rc3 - app features freeze: 1
> >   rc4 - more fixes: 1
> >   rc5 - last chance buffer: 1
> >   release: 0
> >
> > Opinions?
> >
> >
>
> Someone else might comment if they spot something, but to me it looks ok
> for the RH distro and the OVS project.
>

That is my impression as well.


>
> RH distro is also using DPDK .11, whose release date is not changing.
> (+cc Timothy/Flavio)
>

My concern is whether patches would be delayed in getting merged because of this
change, but I don't think that will be the case.


> For OVS project, it only integrates DPDK .11 release too and aims to do
> that by EOY to make the next OVS release. DPDK stable releases are
> integrated into older OVS branches when available. I don't think older
> OVS branch releases have a strict release schedule and having the latest
> stable DPDK release is not a blocker anyway. (+cc Ilya/Ian/ovs-discuss)
>

I agree.

fbl


Re: [dpdk-dev] [PATCH v4 3/3] vhost: fix offload flags in Rx path

2021-05-04 Thread Flavio Leitner
>   vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
>   vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
>   vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> + vsocket->net_compliant_ol_flags = flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
>  
>   if (vsocket->async_copy &&
>   (flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
> index c9b6379f73..9abfc0bfe7 100644
> --- a/lib/vhost/vhost.c
> +++ b/lib/vhost/vhost.c
> @@ -752,7 +752,7 @@ vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
>  }
>  
>  void
> -vhost_set_builtin_virtio_net(int vid, bool enable)
> +vhost_setup_virtio_net(int vid, bool enable, bool compliant_ol_flags)
>  {
>   struct virtio_net *dev = get_device(vid);
>  
> @@ -763,6 +763,10 @@ vhost_set_builtin_virtio_net(int vid, bool enable)
>   dev->flags |= VIRTIO_DEV_BUILTIN_VIRTIO_NET;
>   else
>   dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET;
> + if (!compliant_ol_flags)
> + dev->flags |= VIRTIO_DEV_LEGACY_OL_FLAGS;
> + else
> + dev->flags &= ~VIRTIO_DEV_LEGACY_OL_FLAGS;
>  }
>  
>  void
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> index b303635645..8078ddff79 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -27,15 +27,17 @@
>  #include "rte_vhost_async.h"
>  
>  /* Used to indicate that the device is running on a data core */
> -#define VIRTIO_DEV_RUNNING 1
> +#define VIRTIO_DEV_RUNNING ((uint32_t)1 << 0)
>  /* Used to indicate that the device is ready to operate */
> -#define VIRTIO_DEV_READY 2
> +#define VIRTIO_DEV_READY ((uint32_t)1 << 1)
>  /* Used to indicate that the built-in vhost net device backend is enabled */
> -#define VIRTIO_DEV_BUILTIN_VIRTIO_NET 4
> +#define VIRTIO_DEV_BUILTIN_VIRTIO_NET ((uint32_t)1 << 2)
>  /* Used to indicate that the device has its own data path and configured */
> -#define VIRTIO_DEV_VDPA_CONFIGURED 8
> +#define VIRTIO_DEV_VDPA_CONFIGURED ((uint32_t)1 << 3)
>  /* Used to indicate that the feature negotiation failed */
> -#define VIRTIO_DEV_FEATURES_FAILED 16
> +#define VIRTIO_DEV_FEATURES_FAILED ((uint32_t)1 << 4)
> +/* Used to indicate that the virtio_net tx code should fill TX ol_flags */
> +#define VIRTIO_DEV_LEGACY_OL_FLAGS ((uint32_t)1 << 5)
>  
>  /* Backend value set by guest. */
>  #define VIRTIO_DEV_STOPPED -1
> @@ -683,7 +685,7 @@ int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx);
>  void vhost_attach_vdpa_device(int vid, struct rte_vdpa_device *dev);
>  
>  void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
> -void vhost_set_builtin_virtio_net(int vid, bool enable);
> +void vhost_setup_virtio_net(int vid, bool enable, bool legacy_ol_flags);
>  void vhost_enable_extbuf(int vid);
>  void vhost_enable_linearbuf(int vid);
>  int vhost_enable_guest_notification(struct virtio_net *dev,
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index 1a34867f3c..8e36f4c340 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -8,6 +8,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -2303,15 +2304,12 @@ parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
>  }
>  
>  static __rte_always_inline void
> -vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
> +vhost_dequeue_offload_legacy(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
>  {
>   uint16_t l4_proto = 0;
>   void *l4_hdr = NULL;
>   struct rte_tcp_hdr *tcp_hdr = NULL;
>  
> - if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
> - return;
> -
>   parse_ethernet(m, &l4_proto, &l4_hdr);
>   if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
>   if (hdr->csum_start == (m->l2_len + m->l3_len)) {
> @@ -2356,6 +2354,94 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
>   }
>  }
>  
> +static __rte_always_inline void
> +vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m,
> + bool legacy_ol_flags)
> +{
> + struct rte_net_hdr_lens hdr_lens;
> + int l4_supported = 0;
> + uint32_t ptype;
> +
> + if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
> + return;
> +
> + if (legacy_ol_flags) {
> + vhost_dequeue_offload_legacy(hdr, m);
> + return;
> + }
> +
> + 
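
As a rough, illustrative sketch only (not the patch body), the idea of the
non-legacy path is to classify the packet and mark RX-side checksum status
instead of requesting TX offloads on behalf of the application. Something
along these lines, where the flag choices are assumptions:

#include <stdint.h>
#include <linux/virtio_net.h>
#include <rte_mbuf.h>
#include <rte_net.h>

/* Illustrative only: report RX checksum status from the virtio-net header
 * instead of setting PKT_TX_* requests on the mbuf. */
static void
mark_rx_cksum_status(const struct virtio_net_hdr *hdr, struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;

	m->packet_type = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);

	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		/* Checksum still has to be completed: mark it as not computed. */
		m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID) {
		/* Sender vouches for the checksum. */
		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
	}
}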

Re: [dpdk-dev] [PATCH v2 4/4] vhost: fix offload flags in Rx path

2021-04-29 Thread Flavio Leitner
On Thu, Apr 29, 2021 at 10:04:38AM +0200, David Marchand wrote:
> The vhost library currently configures Tx offloading (PKT_TX_*) on any
> packet received from a guest virtio device which asks for some offloading.
> 
> This is problematic, as Tx offloading is something that the application
> must ask for: the application needs to configure devices
> to support every used offloads (ip, tcp checksumming, tso..), and the
> various l2/l3/l4 lengths must be set following any processing that
> happened in the application itself.
> 
> On the other hand, the received packets are not marked wrt current
> packet l3/l4 checksumming info.
> 
> Copy virtio rx processing to fix those offload flags but accepting
> VIRTIO_NET_HDR_GSO_ECN and VIRTIO_NET_HDR_GSO_UDP too.
> 
> The vhost example has been updated accordingly: TSO is applied to any
> packet marked LRO.
> 
> Fixes: 859b480d5afd ("vhost: add guest offload setting")
> 
> Signed-off-by: David Marchand 
> ---
> Changes since v1:
> - updated vhost example,
> - restored VIRTIO_NET_HDR_GSO_ECN and VIRTIO_NET_HDR_GSO_UDP support,
> - restored log on buggy offload request,
> 
> ---
>  examples/vhost/main.c  |  42 +++--
>  lib/vhost/virtio_net.c | 139 +
>  2 files changed, 78 insertions(+), 103 deletions(-)
> 
[...]

> - if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
> + /* GSO request, save required information in mbuf */
> + if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
> + /* Check unsupported modes */
> + if (hdr->gso_size == 0)
> + return -EINVAL;
> +
>   switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
>   case VIRTIO_NET_HDR_GSO_TCPV4:
>   case VIRTIO_NET_HDR_GSO_TCPV6:
> - tcp_hdr = l4_hdr;
> - m->ol_flags |= PKT_TX_TCP_SEG;
> - m->tso_segsz = hdr->gso_size;
> - m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
> - break;
>   case VIRTIO_NET_HDR_GSO_UDP:
> - m->ol_flags |= PKT_TX_UDP_SEG;
> + m->ol_flags |= PKT_RX_LRO | PKT_RX_L4_CKSUM_NONE;

My understanding of the virtio 1.1 spec is that GSO can be
used independently of CSUM. Nothing prevents sending a fully
checksummed TSO packet.

Anyway, that's unusual and not the goal of this patch.
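
For illustration only (not from the patch), a virtio-net header describing
such a fully checksummed TSO packet could look like the sketch below; the
helper name and field values are hypothetical:

#include <stdint.h>
#include <string.h>
#include <linux/virtio_net.h>

/* Hypothetical: request TCP segmentation while leaving the checksum flags
 * clear, i.e. the TCP checksum in the payload is assumed already valid. */
static void
fill_checksummed_tso_hdr(struct virtio_net_hdr *hdr, uint16_t mss)
{
	memset(hdr, 0, sizeof(*hdr));
	hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; /* segmentation requested */
	hdr->gso_size = mss;                      /* MSS of the resulting segments */
	/* hdr->flags stays 0: no VIRTIO_NET_HDR_F_NEEDS_CSUM. */
}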

Acked-by: Flavio Leitner 

fbl


> + /* Update mss lengths in mbuf */
>   m->tso_segsz = hdr->gso_size;
> - m->l4_len = sizeof(struct rte_udp_hdr);
>   break;
>   default:
>   VHOST_LOG_DATA(WARNING,
>   "unsupported gso type %u.\n", hdr->gso_type);
> - break;
> + return -EINVAL;
>   }
>   }
> +
> + return 0;
>  }
>  
>  static __rte_noinline void
> @@ -2084,8 +2054,11 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>   prev->data_len = mbuf_offset;
>   m->pkt_len+= mbuf_offset;
>  
> - if (hdr)
> - vhost_dequeue_offload(hdr, m);
> + if (hdr && vhost_dequeue_offload(hdr, m) < 0) {
> + VHOST_LOG_DATA(ERR, "Packet with invalid offloads.\n");
> + error = -1;
> + goto out;
> + }
>  
>  out:
>  
> -- 
> 2.23.0
> 

-- 
fbl


Re: [dpdk-dev] [PATCH v2 2/4] net/virtio: do not touch Tx offload flags

2021-04-29 Thread Flavio Leitner
On Thu, Apr 29, 2021 at 10:04:36AM +0200, David Marchand wrote:
> Tx offload flags are of the application responsibility.
> Leave the mbuf alone and use a local storage for implicit tcp checksum
> offloading in case of TSO.
> 
> Signed-off-by: David Marchand 
> Reviewed-by: Maxime Coquelin 
> ---

Acked-by: Flavio Leitner 



Re: [dpdk-dev] [PATCH 2/5] net/tap: do not touch Tx offload flags

2021-04-09 Thread Flavio Leitner
On Fri, Apr 09, 2021 at 03:30:18PM +0200, Olivier Matz wrote:
> On Thu, Apr 08, 2021 at 09:58:35AM -0300, Flavio Leitner wrote:
> > On Thu, Apr 08, 2021 at 02:05:21PM +0200, Olivier Matz wrote:
> > > On Thu, Apr 08, 2021 at 08:21:58AM -0300, Flavio Leitner wrote:
> > > > On Thu, Apr 08, 2021 at 09:41:59AM +0200, Olivier Matz wrote:
> > > > > On Wed, Apr 07, 2021 at 05:15:39PM -0300, Flavio Leitner wrote:
> > > > > > On Thu, Apr 01, 2021 at 11:52:40AM +0200, David Marchand wrote:
> > > > > > > Tx offload flags are of the application responsibility.
> > > > > > > Leave the mbuf alone and check for TSO where needed.
> > > > > > > 
> > > > > > > Signed-off-by: David Marchand 
> > > > > > > ---
> > > > > > 
> > > > > > The patch looks good, but maybe a better approach would be
> > > > > > to change the documentation to require the TCP_CKSUM flag
> > > > > > when TCP_SEG is used, otherwise this flag adjusting needs
> > > > > > to be replicated every time TCP_SEG is used.
> > > > > > 
> > > > > > The above could break existing applications, so perhaps doing
> > > > > > something like below would be better and backwards compatible?
> > > > > > Then we can remove those places tweaking the flags completely.
> > > > > 
> > > > > As a first step, I suggest to document that:
> > > > > - applications must set TCP_CKSUM when setting TCP_SEG
> > > > 
> > > > That's what I suggest above.
> > > > 
> > > > > - pmds must suppose that TCP_CKSUM is set when TCP_SEG is set
> > > > 
> > > > But that keeps the problem of implying the TCP_CKSUM flag in
> > > > various places.
> > > 
> > > Yes. What I propose is just a first step: better document what is the
> > > current expected behavior, before doing something else.
> > > 
> > > This is clearer than what we have today, and I think it does not break
> > > > > anything. This will guide apps in the correct direction, facilitating
> > > > > an eventual future PMD change.
[...]
> > I see your point. Going back to improving the documentation as a
> > first step, what would be the next steps? Are we going to wait a few
> > releases and then remove the flag tweaking code assuming that PMDs
> > and apps are ok?
> 
> After this documentation step, in few releases, we could relax the
> constraint on PMD: applications will be expected to set TCP_CKSUM when
> TCP_SEG is set, so no need for the PMD to force TCP_CKSUM to 1 if
> TCP_SEG is set. The documentation will be updated again.
> 
> This plan can be described in the deprecation notice, and later in the
> release note.
> 
> How does it sound?

Works for me.
Thanks,
-- 
fbl


Re: [dpdk-dev] [PATCH 5/5] vhost: fix offload flags in Rx path

2021-04-08 Thread Flavio Leitner
On Thu, Apr 01, 2021 at 11:52:43AM +0200, David Marchand wrote:
> The vhost library currently configures Tx offloading (PKT_TX_*) on any
> packet received from a guest virtio device which asks for some offloading.
> 
> This is problematic, as Tx offloading is something that the application
> must ask for: the application needs to configure devices
> to support every used offloads (ip, tcp checksumming, tso..), and the
> various l2/l3/l4 lengths must be set following any processing that
> happened in the application itself.
> 
> On the other hand, the received packets are not marked wrt current
> packet l3/l4 checksumming info.
> 
> Copy virtio rx processing to fix those offload flags.
> 
> The vhost example needs a reworking as it was built with the assumption
> that mbuf TSO configuration is set up by the vhost library.
> This is not done in this patch for now so TSO activation is forcibly
> refused.
> 
> Fixes: 859b480d5afd ("vhost: add guest offload setting")

There is a change in behavior: before, ECN was ignored, and now it is invalid.
I think that's the right way to go, but I am not sure if virtio blocks
the negotiation of that feature.

Reviewed-by: Flavio Leitner 

fbl


Re: [dpdk-dev] [PATCH 4/5] net/virtio: refactor Tx offload helper

2021-04-08 Thread Flavio Leitner
On Thu, Apr 01, 2021 at 11:52:42AM +0200, David Marchand wrote:
> Purely cosmetic but it is rather odd to have an "offload" helper that
> checks if it actually must do something.
> We already have the same checks in most callers, so move this branch
> in them.
> 
> Signed-off-by: David Marchand 
> ---

Reviewed-by: Flavio Leitner 



Re: [dpdk-dev] [PATCH 2/5] net/tap: do not touch Tx offload flags

2021-04-08 Thread Flavio Leitner
On Thu, Apr 08, 2021 at 02:05:21PM +0200, Olivier Matz wrote:
> On Thu, Apr 08, 2021 at 08:21:58AM -0300, Flavio Leitner wrote:
> > On Thu, Apr 08, 2021 at 09:41:59AM +0200, Olivier Matz wrote:
> > > On Wed, Apr 07, 2021 at 05:15:39PM -0300, Flavio Leitner wrote:
> > > > On Thu, Apr 01, 2021 at 11:52:40AM +0200, David Marchand wrote:
> > > > > Tx offload flags are of the application responsibility.
> > > > > Leave the mbuf alone and check for TSO where needed.
> > > > > 
> > > > > Signed-off-by: David Marchand 
> > > > > ---
> > > > 
> > > > The patch looks good, but maybe a better approach would be
> > > > to change the documentation to require the TCP_CKSUM flag
> > > > when TCP_SEG is used, otherwise this flag adjusting needs
> > > > to be replicated every time TCP_SEG is used.
> > > > 
> > > > The above could break existing applications, so perhaps doing
> > > > something like below would be better and backwards compatible?
> > > > Then we can remove those places tweaking the flags completely.
> > > 
> > > As a first step, I suggest to document that:
> > > - applications must set TCP_CKSUM when setting TCP_SEG
> > 
> > That's what I suggest above.
> > 
> > > - pmds must suppose that TCP_CKSUM is set when TCP_SEG is set
> > 
> > But that keeps the problem of implying the TCP_CKSUM flag in
> > various places.
> 
> Yes. What I propose is just a first step: better document what is the
> current expected behavior, before doing something else.
> 
> > > This is clearer that what we have today, and I think it does not break
> > > anything. This will guide apps in the correct direction, facilitating
> > > an eventual future PMD change.
> > > 
> > > > diff --git a/lib/librte_mbuf/rte_mbuf_core.h b/lib/librte_mbuf/rte_mbuf_core.h
> > > > index c17dc95c5..6a0c2cdd9 100644
> > > > --- a/lib/librte_mbuf/rte_mbuf_core.h
> > > > +++ b/lib/librte_mbuf/rte_mbuf_core.h
> > > > @@ -298,7 +298,7 @@ extern "C" {
> > > >   *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag
> > > >   *  - fill the mbuf offload information: l2_len, l3_len, l4_len, tso_segsz
> > > >   */
> > > > -#define PKT_TX_TCP_SEG   (1ULL << 50)
> > > > +#define PKT_TX_TCP_SEG   ((1ULL << 50) | PKT_TX_TCP_CKSUM)
> > > >  
> > > >  /** TX IEEE1588 packet to timestamp. */
> > > >  #define PKT_TX_IEEE1588_TMST (1ULL << 51)
> > > 
> > > I'm afraid some applications or drivers use extended bit manipulations
> > > to do the conversion from/to another domain (like hardware descriptors
> > > or application-specific flags). They may expect this constant to be a
> > > unique flag.
> > 
> > Interesting, do you have an example? Because each flag still has a
> > separate meaning.
> 
> Honnestly no, I don't have any good example, just a (maybe unfounded) doubt.
> 
> I have in mind operations that are done with tables or vector
> instructions inside the drivers, but this is mainly done for Rx, not Tx.
> You can look at Tx functions like mlx5_set_cksum_table() or
> nix_xmit_pkts_vector(), or Rx functions like desc_to_olflags_v() or
> enic_noscatter_vec_recv_pkts() to see what kind of stuff I'm talking
> about.

I see your point. Going back to improving the documentation as a
first step, what would be the next steps? Are we going to wait a few
releases and then remove the flag tweaking code, assuming that PMDs
and apps are ok?

Thanks,
-- 
fbl


Re: [dpdk-dev] [PATCH 2/5] net/tap: do not touch Tx offload flags

2021-04-08 Thread Flavio Leitner
On Thu, Apr 08, 2021 at 09:41:59AM +0200, Olivier Matz wrote:
> On Wed, Apr 07, 2021 at 05:15:39PM -0300, Flavio Leitner wrote:
> > On Thu, Apr 01, 2021 at 11:52:40AM +0200, David Marchand wrote:
> > > Tx offload flags are of the application responsibility.
> > > Leave the mbuf alone and check for TSO where needed.
> > > 
> > > Signed-off-by: David Marchand 
> > > ---
> > 
> > The patch looks good, but maybe a better approach would be
> > to change the documentation to require the TCP_CKSUM flag
> > when TCP_SEG is used, otherwise this flag adjusting needs
> > to be replicated every time TCP_SEG is used.
> > 
> > The above could break existing applications, so perhaps doing
> > something like below would be better and backwards compatible?
> > Then we can remove those places tweaking the flags completely.
> 
> As a first step, I suggest to document that:
> - applications must set TCP_CKSUM when setting TCP_SEG

That's what I suggest above.

> - pmds must suppose that TCP_CKSUM is set when TCP_SEG is set

But that keeps the problem of implying the TCP_CKSUM flag in
various places.

> This is clearer than what we have today, and I think it does not break
> anything. This will guide apps in the correct direction, facilitating
> an eventual future PMD change.
> 
> > diff --git a/lib/librte_mbuf/rte_mbuf_core.h b/lib/librte_mbuf/rte_mbuf_core.h
> > index c17dc95c5..6a0c2cdd9 100644
> > --- a/lib/librte_mbuf/rte_mbuf_core.h
> > +++ b/lib/librte_mbuf/rte_mbuf_core.h
> > @@ -298,7 +298,7 @@ extern "C" {
> >   *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag
> >   *  - fill the mbuf offload information: l2_len, l3_len, l4_len, tso_segsz
> >   */
> > -#define PKT_TX_TCP_SEG   (1ULL << 50)
> > +#define PKT_TX_TCP_SEG   ((1ULL << 50) | PKT_TX_TCP_CKSUM)
> >  
> >  /** TX IEEE1588 packet to timestamp. */
> >  #define PKT_TX_IEEE1588_TMST (1ULL << 51)
> 
> I'm afraid some applications or drivers use extended bit manipulations
> to do the conversion from/to another domain (like hardware descriptors
> or application-specific flags). They may expect this constant to be a
> unique flag.

Interesting, do you have an example? Because each flag still has a
separate meaning.

-- 
fbl


Re: [dpdk-dev] [PATCH 2/5] net/tap: do not touch Tx offload flags

2021-04-07 Thread Flavio Leitner
On Thu, Apr 01, 2021 at 11:52:40AM +0200, David Marchand wrote:
> Tx offload flags are of the application responsibility.
> Leave the mbuf alone and check for TSO where needed.
> 
> Signed-off-by: David Marchand 
> ---

The patch looks good, but maybe a better approach would be
to change the documentation to require the TCP_CKSUM flag
when TCP_SEG is used, otherwise this flag adjusting needs
to be replicated every time TCP_SEG is used.

The above could break existing applications, so perhaps doing
something like below would be better and backwards compatible?
Then we can remove those places tweaking the flags completely.

diff --git a/lib/librte_mbuf/rte_mbuf_core.h b/lib/librte_mbuf/rte_mbuf_core.h
index c17dc95c5..6a0c2cdd9 100644
--- a/lib/librte_mbuf/rte_mbuf_core.h
+++ b/lib/librte_mbuf/rte_mbuf_core.h
@@ -298,7 +298,7 @@ extern "C" {
  *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag
  *  - fill the mbuf offload information: l2_len, l3_len, l4_len, tso_segsz
  */
-#define PKT_TX_TCP_SEG   (1ULL << 50)
+#define PKT_TX_TCP_SEG   ((1ULL << 50) | PKT_TX_TCP_CKSUM)
 
 /** TX IEEE1588 packet to timestamp. */
 #define PKT_TX_IEEE1588_TMST (1ULL << 51)
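
For illustration, this is roughly what the documented contract would ask of
an application preparing an IPv4/TCP mbuf for TSO; the helper name and the
no-options header sizes are assumptions:

#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_tcp.h>

/* Hypothetical application-side helper: set TCP_CKSUM together with TCP_SEG
 * and fill the offload lengths the PMD relies on. */
static void
prepare_ipv4_tso(struct rte_mbuf *m, uint16_t mss)
{
	m->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
		       PKT_TX_TCP_CKSUM | PKT_TX_TCP_SEG;
	m->l2_len = sizeof(struct rte_ether_hdr); /* no VLAN assumed */
	m->l3_len = sizeof(struct rte_ipv4_hdr);  /* no IP options assumed */
	m->l4_len = sizeof(struct rte_tcp_hdr);   /* no TCP options assumed */
	m->tso_segsz = mss;
}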

Thanks,
fbl


Re: [dpdk-dev] [PATCH 1/5] mbuf: mark old offload flag as deprecated

2021-04-07 Thread Flavio Leitner
On Thu, Apr 01, 2021 at 11:52:39AM +0200, David Marchand wrote:
> PKT_RX_EIP_CKSUM_BAD has been declared deprecated quite some time ago,
> but there was no warning to applications still using it.
> Fix this by marking as deprecated with the newly introduced
> RTE_DEPRECATED.
> 
> Signed-off-by: David Marchand 
> ---

Reviewed-by: Flavio Leitner 



Re: [dpdk-dev] [PATCH v2] vhost: fix mbuf alloc failure

2020-05-04 Thread Flavio Leitner
On Mon, May 04, 2020 at 10:41:17PM +0530, Sivaprasad Tummala wrote:
> vhost buffer allocation is successful for packets that fit
> into a linear buffer. If it fails, vhost library is expected
> to drop the current packet and skip to the next.
> 
> The patch fixes the error scenario by skipping to next packet.
> Note: Drop counters are not currently supported.
> 
> Fixes: c3ff0ac70acb ("vhost: improve performance by supporting large buffer")
> Cc: sta...@dpdk.org
> Cc: f...@sysclose.org
> 
> ---
> v2:
>  * fixed review comments - Maxime Coquelin
>  * fixed mbuf alloc errors for packed virtqueues - Maxime Coquelin
>  * fixed mbuf copy errors - Flavio Leitner
> 
> Signed-off-by: Sivaprasad Tummala 
> ---
>  lib/librte_vhost/virtio_net.c | 50 ++-
>  1 file changed, 37 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 1fc30c681..764c514fd 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1674,6 +1674,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  {
>   uint16_t i;
>   uint16_t free_entries;
> + uint16_t dropped = 0;
>  
>   if (unlikely(dev->dequeue_zero_copy)) {
>   struct zcopy_mbuf *zmbuf, *next;
> @@ -1737,13 +1738,31 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>   update_shadow_used_ring_split(vq, head_idx, 0);
>  
>   pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
> - if (unlikely(pkts[i] == NULL))
> + if (unlikely(pkts[i] == NULL)) {
> + /*
> +  * mbuf allocation fails for jumbo packets when external
> +  * buffer allocation is not allowed and linear buffer
> +  * is required. Drop this packet.
> +  */
> +#ifdef RTE_LIBRTE_VHOST_DEBUG
> + VHOST_LOG_DATA(ERR,
> + "Failed to allocate memory for mbuf. Packet 
> dropped!\n");
> +#endif

That message is useful to spot missing packets that happen
once in a while, so we should be able to see it even in production
without debug enabled. However, we can't let it flood the log.

I am not sure if librte eal has this functionality, but if not, you
could limit it by using a static bool:

static bool allocerr_warned = false;

if (!allocerr_warned) {
        VHOST_LOG_DATA(ERR,
                "Failed to allocate memory for mbuf. Packet dropped!\n");
        allocerr_warned = true;
}



> + dropped += 1;
> + i++;
>   break;
> + }
>  
>   err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
>   mbuf_pool);
>   if (unlikely(err)) {
>   rte_pktmbuf_free(pkts[i]);
> +#ifdef RTE_LIBRTE_VHOST_DEBUG
> + VHOST_LOG_DATA(ERR,
> + "Failed to copy desc to mbuf. Packet 
> dropped!\n");
> +#endif

Same here.


> + dropped += 1;
> + i++;
>   break;
>   }
>  
> @@ -1753,6 +1772,8 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>   zmbuf = get_zmbuf(vq);
>   if (!zmbuf) {
>   rte_pktmbuf_free(pkts[i]);
> + dropped += 1;
> + i++;
>   break;
>   }
>   zmbuf->mbuf = pkts[i];
> @@ -1782,7 +1803,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>   }
>   }
>  
> - return i;
> + return (i - dropped);
>  }
>  
>  static __rte_always_inline int
> @@ -1946,21 +1967,24 @@ virtio_dev_tx_single_packed(struct virtio_net *dev,
>   struct rte_mbuf **pkts)
>  {
>  
> - uint16_t buf_id, desc_count;
> + uint16_t buf_id, desc_count = 0;
> + int ret;
>  
> - if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
> - &desc_count))
> - return -1;
> + ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
> + &desc_count);
>  
> - if (virtio_net_is_inorder(dev))
> - vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
> -  

Re: [dpdk-dev] [PATCH v1] vhost: fix mbuf allocation failures

2020-04-29 Thread Flavio Leitner
On Wed, Apr 29, 2020 at 10:43:01AM +0200, Maxime Coquelin wrote:
> Hi Sivaprasad,
> 
> On 4/28/20 11:52 AM, Sivaprasad Tummala wrote:
> > vhost buffer allocation is successful for packets that fit
> > into a linear buffer. If it fails, vhost library is expected
> > to drop the current buffer descriptor and skip to the next.
> > 
> > The patch fixes the error scenario by skipping to next descriptor.
> > Note: Drop counters are not currently supported.

In that case shouldn't we continue to process the ring?

Also, don't we have the same issue with copy_desc_to_mbuf() 
and get_zmbuf()?

fbl

> Fixes tag is missing here, and sta...@dpdk.org should be cc'ed if
> necessary.
> 
> > Signed-off-by: Sivaprasad Tummala 
> > ---
> >  lib/librte_vhost/virtio_net.c | 16 ++--
> >  1 file changed, 14 insertions(+), 2 deletions(-)
> > 
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > index 37c47c7dc..b0d3a85c2 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1688,6 +1688,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
> >  {
> 
> You only fix split ring path, but not packed ring.
> 
> > uint16_t i;
> > uint16_t free_entries;
> > +   uint16_t dropped = 0;
> >  
> > if (unlikely(dev->dequeue_zero_copy)) {
> > struct zcopy_mbuf *zmbuf, *next;
> > @@ -1751,8 +1752,19 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > update_shadow_used_ring_split(vq, head_idx, 0);
> >  
> > pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
> > -   if (unlikely(pkts[i] == NULL))
> > +   if (unlikely(pkts[i] == NULL)) {
> > +   /*
> > +* mbuf allocation fails for jumbo packets with
> > +* linear buffer flag set. Drop this packet and
> > +* proceed with the next available descriptor to
> > +* avoid HOL blocking
> > +*/
> > +   VHOST_LOG_DATA(WARNING,
> > +   "Failed to allocate memory for mbuf. Packet dropped!\n");
> 
> I think we need a better logging, otherwise it is going to flood the log
> file quite rapidly if issue happens. Either some rate-limited logging or
> warn-once would be better.
> 
> The warning message could be also improved, because when using linear
> buffers, one would expect that the size of the mbufs could handle a
> jumbo frame. So it should differentiate two
> 
> > +   dropped += 1;
> > +   i++;
> > break;
> > +   }
> >  
> > err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
> > mbuf_pool);
> > @@ -1796,7 +1808,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > }
> > }
> >  
> > -   return i;
> > +   return (i - dropped);
> >  }
> >  
> >  static __rte_always_inline int
> > 
> 

-- 
fbl



Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers

2019-10-29 Thread Flavio Leitner
On Tue, 29 Oct 2019 10:02:57 +0100
David Marchand  wrote:

> On Tue, Oct 15, 2019 at 9:00 PM Flavio Leitner 
> wrote:
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > index 5b85b832d..da69ab1db 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1289,6 +1289,93 @@ get_zmbuf(struct vhost_virtqueue *vq)
> > return NULL;
> >  }
> >
> > +static void
> > +virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
> > +{
> > +   rte_free(opaque);
> > +}
> > +
> > +static int
> > +virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
> > +{
> > +   struct rte_mbuf_ext_shared_info *shinfo = NULL;
> > +   uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
> > +   uint16_t buf_len;
> > +   rte_iova_t iova;
> > +   void *buf;
> > +
> > +   /* Try to use pkt buffer to store shinfo to reduce the amount of memory
> > +* required, otherwise store shinfo in the new buffer.
> > +*/
> > +   if (rte_pktmbuf_tailroom(pkt) >= sizeof(*shinfo))
> > +   shinfo = rte_pktmbuf_mtod(pkt,
> > + struct rte_mbuf_ext_shared_info *);
> > +   else {
> > +   total_len += sizeof(*shinfo) + sizeof(uintptr_t);
> > +   total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
> > +   }
> > +
> > +   if (unlikely(total_len > UINT16_MAX))
> > +   return -ENOSPC;
> > +
> > +   buf_len = total_len;
> > +   buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);  
> 
> Using rte_malloc() means that the allocation can end up on any numa
> node. This external buffer might end up on a different node than the
> mbuf (which resides on mp->socket_id node).

Only if there is no memory in the local socket. It seems better than a
failure to me.
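
For reference only, a hypothetical variant that pins the allocation to the
mbuf pool's NUMA node; this is not what the patch does, just an illustration
of the alternative being discussed:

#include <rte_malloc.h>
#include <rte_mbuf.h>

/* Hypothetical: allocate the external buffer on the same socket as the
 * mbuf's mempool instead of the caller's preferred socket. */
static void *
extbuf_alloc_on_pool_socket(struct rte_mbuf *pkt, size_t buf_len)
{
	return rte_malloc_socket(NULL, buf_len, RTE_CACHE_LINE_SIZE,
				 pkt->pool->socket_id);
}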

fbl


Re: [dpdk-dev] [PATCH] vhost: fix IPv4 csum calculation

2019-10-24 Thread Flavio Leitner
On Thu, 24 Oct 2019 16:32:43 +0200
Maxime Coquelin  wrote:

> Hi Flavio,
> 
> On 10/24/19 4:28 PM, Flavio Leitner wrote:
> > Currently the IPv4 header checksum is calculated including its
> > current value, which can be a valid checksum or just garbage.
> > In any case, if the original value is not zero, then the result
> > is always wrong.
> > 
> > The IPv4 checksum is defined in RFC791, page 14 says:
> >   Header Checksum:  16 bits
> > 
> >   The checksum algorithm is:
> >   The checksum field is the 16 bit one's complement of the one's
> >   complement sum of all 16 bit words in the header.  For purposes of
> >   computing the checksum, the value of the checksum field is zero.
> > 
> > Thus force the csum field to always be zero.  
> 
> I think we need Fixes tag:
> 
> Fixes: b08b8cfeb2ae ("vhost: fix IP checksum")
> Cc: sta...@dpdk.org
> 
> Also adding stable in Cc.
> 
> 
> > Signed-off-by: Flavio Leitner 
> > ---
> >  lib/librte_vhost/virtio_net.c | 1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index eae7825f04..cde7498c76 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -445,6 +445,7 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf,
> > struct virtio_net_hdr *net_hdr) 
> > ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct
> > rte_ipv4_hdr *, m_buf->l2_len);
> > +   ipv4_hdr->hdr_checksum = 0;
> > ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
> > }
> >  
> >   
> 
> Other than that:
> 
> Reviewed-by: Maxime Coquelin 
> 
> I'll add the Fixes tag while applying.

Not sure if you will send to stable, so I did v2 just in case.
Feel free to drop either one if you took care of those items already.
Thanks!
fbl




[dpdk-dev] [PATCH v2] vhost: fix IPv4 csum calculation

2019-10-24 Thread Flavio Leitner
Currently the IPv4 header checksum is calculated including its
current value, which can be a valid checksum or just garbage.
In any case, if the original value is not zero, then the result
is always wrong.

The IPv4 checksum is defined in RFC 791; page 14 says:
  Header Checksum:  16 bits

  The checksum algorithm is:
  The checksum field is the 16 bit one's complement of the one's
  complement sum of all 16 bit words in the header.  For purposes of
  computing the checksum, the value of the checksum field is zero.

Thus force the csum field to always be zero.

Fixes: b08b8cfeb2ae ("vhost: fix IP checksum")
Cc: sta...@dpdk.org

Signed-off-by: Flavio Leitner 
---
 lib/librte_vhost/virtio_net.c | 1 +
 1 file changed, 1 insertion(+)

v2:
  CC stable
  Added 'Fixes:' line.

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index eae7825f04..cde7498c76 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -445,6 +445,7 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
 
ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
   m_buf->l2_len);
+   ipv4_hdr->hdr_checksum = 0;
ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
}
 
-- 
2.20.1



[dpdk-dev] [PATCH] vhost: fix IPv4 csum calculation

2019-10-24 Thread Flavio Leitner
Currently the IPv4 header checksum is calculated including its
current value, which can be a valid checksum or just garbage.
In any case, if the original value is not zero, then the result
is always wrong.

The IPv4 checksum is defined in RFC 791; page 14 says:
  Header Checksum:  16 bits

  The checksum algorithm is:
  The checksum field is the 16 bit one's complement of the one's
  complement sum of all 16 bit words in the header.  For purposes of
  computing the checksum, the value of the checksum field is zero.

Thus force the csum field to always be zero.

Signed-off-by: Flavio Leitner 
---
 lib/librte_vhost/virtio_net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index eae7825f04..cde7498c76 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -445,6 +445,7 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
 
ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
   m_buf->l2_len);
+   ipv4_hdr->hdr_checksum = 0;
ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
}
 
-- 
2.20.1



Re: [dpdk-dev] [PATCH] vhost: return error message for mbuf allocation failure

2019-10-16 Thread Flavio Leitner
On Wed, 16 Oct 2019 16:31:07 +0200
Ilya Maximets  wrote:

> mbuf allocation failure is a hard failure that highlights some
> significant issues with memory pool size or a mbuf leak.
> 
> We still have the message for subsequent chained mbufs, but not
> for the first one.  It was removed while introducing extbuf
> support for large buffers.  But it was useful for catching
> mempool issues and needs to be returned back.
> 
> Cc: Flavio Leitner 
> 
> Fixes: 5005bcda7123 ("vhost: add support for large buffers")
> Signed-off-by: Ilya Maximets 


Reviewed-by: Flavio Leitner 
Thanks Ilya
fbl

> ---
>  lib/librte_vhost/virtio_net.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_vhost/virtio_net.c
> b/lib/librte_vhost/virtio_net.c index 66f0c7206..f8af4e0b3 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1354,8 +1354,11 @@ virtio_dev_pktmbuf_alloc(struct virtio_net
> *dev, struct rte_mempool *mp, {
>   struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
>  
> - if (unlikely(pkt == NULL))
> + if (unlikely(pkt == NULL)) {
> + RTE_LOG(ERR, VHOST_DATA,
> + "Failed to allocate memory for mbuf.\n");
>   return NULL;
> + }
>  
>   if (rte_pktmbuf_tailroom(pkt) >= data_len)
>   return pkt;



Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers

2019-10-16 Thread Flavio Leitner
On Wed, 16 Oct 2019 16:08:54 +0200
Ilya Maximets  wrote:

> On 16.10.2019 16:02, Flavio Leitner wrote:
> > On Wed, 16 Oct 2019 15:46:15 +0200
> > Maxime Coquelin  wrote:
> >   
> >> On 10/16/19 3:32 PM, Ilya Maximets wrote:  
> >>> On 16.10.2019 13:13, Maxime Coquelin wrote:  
> >>>>
> >>>>
> >>>> On 10/15/19 8:59 PM, Flavio Leitner wrote:  
> >>>>> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> >>>>> If the data fits into a buffer, then all data is copied and a
> >>>>> single linear buffer is returned. Otherwise it allocates
> >>>>> additional mbufs and chains them together to return a multiple
> >>>>> segments mbuf.
> >>>>>
> >>>>> While that covers most use cases, it forces applications that
> >>>>> need to work with larger data sizes to support multiple segments
> >>>>> mbufs. The non-linear characteristic brings complexity and
> >>>>> performance implications to the application.
> >>>>>
> >>>>> To resolve the issue, add support to attach external buffer
> >>>>> to a pktmbuf and let the host provide during registration if
> >>>>> attaching an external buffer to pktmbuf is supported and if
> >>>>> only linear buffer are supported.
> >>>>>
> >>>>> Signed-off-by: Flavio Leitner 
> >>>>> ---
> >>>>>    doc/guides/prog_guide/vhost_lib.rst |  35 +
> >>>>>    lib/librte_vhost/rte_vhost.h    |   4 +
> >>>>>    lib/librte_vhost/socket.c   |  22 ++
> >>>>>    lib/librte_vhost/vhost.c    |  22 ++
> >>>>>    lib/librte_vhost/vhost.h    |   4 +
> >>>>>    lib/librte_vhost/virtio_net.c   | 109
> >>>>>  6 files changed, 182 insertions(+),
> >>>>> 14 deletions(-)
> >>>>>
> >>>>> - Changelog:
> >>>>>     v5:
> >>>>>   - fixed to destroy mutex if incompatible flags
> >>>>>     v4:
> >>>>>   - allow to use pktmbuf if there is exact space
> >>>>>   - removed log message if the buffer is too big
> >>>>>   - fixed the length to include align padding
> >>>>>   - free allocated buf if shinfo fails
> >>>>>     v3:
> >>>>>   - prevent the new features to be used with zero copy
> >>>>>   - fixed sizeof() usage
> >>>>>   - fixed log msg indentation
> >>>>>   - removed/replaced asserts
> >>>>>   - used the correct virt2iova function
> >>>>>   - fixed the patch's title
> >>>>>   - OvS PoC code:
> >>>>>     https://github.com/fleitner/ovs/tree/rte_malloc-v3
> >>>>>     v2:
> >>>>>   - Used rte_malloc() instead of another mempool as
> >>>>> suggested by Shahaf.
> >>>>>   - Added the documentation section.
> >>>>>   - Using driver registration to negotiate the features.
> >>>>>   - OvS PoC code:
> >>>>>
> >>>>> https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> >>>>> 
> >>>>
> >>>> Applied to dpdk-next-virtio/master.  
> >>>
> >>> Thanks Maxime,
> >>>
> >>> But can we return back the mbuf allocation failure message?  
> >>
> >> Good point, I missed it.
> >>  
> >>> I mean:
> >>>
> >>> diff --git a/lib/librte_vhost/virtio_net.c
> >>> b/lib/librte_vhost/virtio_net.c index 66f0c7206..f8af4e0b3 100644
> >>> --- a/lib/librte_vhost/virtio_net.c
> >>> +++ b/lib/librte_vhost/virtio_net.c
> >>> @@ -1354,8 +1354,11 @@ virtio_dev_pktmbuf_alloc(struct virtio_net
> >>> *dev, struct rte_mempool *mp,
> >>>   {
> >>>   struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> >>>   
> >>> -    if (unlikely(pkt == NULL))
> >>> +    if (unlikely(pkt == NULL)) {
> >>> +    RTE_LOG(ERR, VHOST_DATA,
> >>> +    "Failed to allocate memory for mbuf.\n");
> >>>   return NULL;
> >>> +    }
>

Re: [dpdk-dev] [PATCH v5] vhost: add support for large buffers

2019-10-16 Thread Flavio Leitner
On Wed, 16 Oct 2019 15:46:15 +0200
Maxime Coquelin  wrote:

> On 10/16/19 3:32 PM, Ilya Maximets wrote:
> > On 16.10.2019 13:13, Maxime Coquelin wrote:  
> >>
> >>
> >> On 10/15/19 8:59 PM, Flavio Leitner wrote:  
> >>> The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> >>> If the data fits into a buffer, then all data is copied and a
> >>> single linear buffer is returned. Otherwise it allocates
> >>> additional mbufs and chains them together to return a multiple
> >>> segments mbuf.
> >>>
> >>> While that covers most use cases, it forces applications that
> >>> need to work with larger data sizes to support multiple segments
> >>> mbufs. The non-linear characteristic brings complexity and
> >>> performance implications to the application.
> >>>
> >>> To resolve the issue, add support to attach external buffer
> >>> to a pktmbuf and let the host provide during registration if
> >>> attaching an external buffer to pktmbuf is supported and if
> >>> only linear buffer are supported.
> >>>
> >>> Signed-off-by: Flavio Leitner 
> >>> ---
> >>>   doc/guides/prog_guide/vhost_lib.rst |  35 +
> >>>   lib/librte_vhost/rte_vhost.h    |   4 +
> >>>   lib/librte_vhost/socket.c   |  22 ++
> >>>   lib/librte_vhost/vhost.c    |  22 ++
> >>>   lib/librte_vhost/vhost.h    |   4 +
> >>>   lib/librte_vhost/virtio_net.c   | 109
> >>>  6 files changed, 182 insertions(+),
> >>> 14 deletions(-)
> >>>
> >>> - Changelog:
> >>>    v5:
> >>>  - fixed to destroy mutex if incompatible flags
> >>>    v4:
> >>>  - allow to use pktmbuf if there is exact space
> >>>  - removed log message if the buffer is too big
> >>>  - fixed the length to include align padding
> >>>  - free allocated buf if shinfo fails
> >>>    v3:
> >>>  - prevent the new features to be used with zero copy
> >>>  - fixed sizeof() usage
> >>>  - fixed log msg indentation
> >>>  - removed/replaced asserts
> >>>  - used the correct virt2iova function
> >>>  - fixed the patch's title
> >>>  - OvS PoC code:
> >>>    https://github.com/fleitner/ovs/tree/rte_malloc-v3
> >>>    v2:
> >>>  - Used rte_malloc() instead of another mempool as suggested
> >>> by Shahaf.
> >>>  - Added the documentation section.
> >>>  - Using driver registration to negotiate the features.
> >>>  - OvS PoC code:
> >>>   
> >>> https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> >>>  
> >>
> >> Applied to dpdk-next-virtio/master.  
> > 
> > Thanks Maxime,
> > 
> > But can we return back the mbuf allocation failure message?  
> 
> Good point, I missed it.
> 
> > I mean:
> > 
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index 66f0c7206..f8af4e0b3 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1354,8 +1354,11 @@ virtio_dev_pktmbuf_alloc(struct virtio_net
> > *dev, struct rte_mempool *mp,
> >  {
> >  struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> >  
> > -    if (unlikely(pkt == NULL))
> > +    if (unlikely(pkt == NULL)) {
> > +    RTE_LOG(ERR, VHOST_DATA,
> > +    "Failed to allocate memory for mbuf.\n");
> >  return NULL;
> > +    }
> >  
> >  if (rte_pktmbuf_tailroom(pkt) >= data_len)
> >  return pkt;
> > ---
> > 
> > It's a hard failure that highlights some significant issues with
> > memory pool size or a mbuf leak.  
> 
> I agree, we need this error message.

If it runs out of memory and there are many packets coming, then it
will flood the logs. Maybe we could use something to report in a more
moderate way, for example:

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index da69ab1db..b1572b3a4 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1354,8 +1354,14 @@ virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct 
rte_mempool *mp,
 {
struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
 
-   if (unlikely(pkt == NULL))
+   if (unlikely(pkt == NULL)) {
+   static int rate_lim = 0;
+
+   if (rate_lim++ % 1000 == 0)
+   RTE_LOG...
+
return NULL;
+   }
 
if (rte_pktmbuf_tailroom(pkt) >= data_len)
return pkt;


Or track this at the mempool level, keep stats of failed requests, and
report that in OvS. That would cover other points too.
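
As a rough sketch of that mempool-level idea (the helper, log type and
threshold are assumptions, not an existing facility):

#include <rte_mempool.h>
#include <rte_log.h>

/* Hypothetical periodic check run by the application: report pressure based
 * on how many mbufs are left instead of logging from the datapath. */
static void
report_mbuf_pool_pressure(struct rte_mempool *mp)
{
	unsigned int avail = rte_mempool_avail_count(mp);

	if (avail == 0)
		RTE_LOG(WARNING, USER1,
			"mempool %s is exhausted, dequeue may drop packets\n",
			mp->name);
}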

fbl


[dpdk-dev] [PATCH v5] vhost: add support for large buffers

2019-10-15 Thread Flavio Leitner
The rte_vhost_dequeue_burst supports two ways of dequeuing data.
If the data fits into a buffer, then all data is copied and a
single linear buffer is returned. Otherwise it allocates
additional mbufs and chains them together to return a multi-segment
mbuf.

While that covers most use cases, it forces applications that
need to work with larger data sizes to support multi-segment
mbufs. The non-linear characteristic brings complexity and
performance implications to the application.

To resolve the issue, add support to attach an external buffer
to a pktmbuf, and let the host indicate during registration whether
attaching an external buffer to a pktmbuf is supported and whether
only linear buffers are supported.

Signed-off-by: Flavio Leitner 
---
 doc/guides/prog_guide/vhost_lib.rst |  35 +
 lib/librte_vhost/rte_vhost.h|   4 +
 lib/librte_vhost/socket.c   |  22 ++
 lib/librte_vhost/vhost.c|  22 ++
 lib/librte_vhost/vhost.h|   4 +
 lib/librte_vhost/virtio_net.c   | 109 
 6 files changed, 182 insertions(+), 14 deletions(-)

- Changelog:
  v5:
- fixed to destroy mutex if incompatible flags
  v4:
- allow to use pktmbuf if there is exact space
- removed log message if the buffer is too big
- fixed the length to include align padding
- free allocated buf if shinfo fails
  v3:
- prevent the new features to be used with zero copy
- fixed sizeof() usage
- fixed log msg indentation
- removed/replaced asserts
- used the correct virt2iova function
- fixed the patch's title
- OvS PoC code:
  https://github.com/fleitner/ovs/tree/rte_malloc-v3
  v2:
- Used rte_malloc() instead of another mempool as suggested by Shahaf.
- Added the documentation section.
- Using driver registration to negotiate the features.
- OvS PoC code:
  
https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7

diff --git a/doc/guides/prog_guide/vhost_lib.rst 
b/doc/guides/prog_guide/vhost_lib.rst
index fc3ee4353..07e40e3c5 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -117,6 +117,41 @@ The following is an overview of some key Vhost API 
functions:
 Enabling this flag should only be done when the calling application does
 not pre-fault the guest shared memory, otherwise migration would fail.
 
+  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
+
+Enabling this flag forces vhost dequeue function to only provide linear
+pktmbuf (no multi-segmented pktmbuf).
+
+The vhost library by default provides a single pktmbuf for a given
+packet, but if for some reason the data doesn't fit into a single
+pktmbuf (e.g., TSO is enabled), the library will allocate additional
+pktmbufs from the same mempool and chain them together to create a
+multi-segmented pktmbuf.
+
+However, the vhost application needs to support multi-segmented format.
+If the vhost application does not support that format and requires large
+buffers to be dequeued, this flag should be enabled to force only linear
+buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
+
+It is disabled by default.
+
+  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
+
+Enabling this flag allows vhost dequeue function to allocate and attach
+an external buffer to a pktmbuf if the pktmbuf doesn't provide enough
+space to store all data.
+
+This is useful when the vhost application wants to support large packets
+but doesn't want to increase the default mempool object size nor to
+support multi-segmented mbufs (non-linear). In this case, a fresh buffer
+is allocated using rte_malloc() which gets attached to a pktmbuf using
+rte_pktmbuf_attach_extbuf().
+
+See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-segmented
+mbufs for applications that don't support chained mbufs.
+
+It is disabled by default.
+
 * ``rte_vhost_driver_set_features(path, features)``
 
   This function sets the feature bits the vhost-user driver supports. The
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b821b5df4 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -30,6 +30,10 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY   (1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT   (1ULL << 3)
 #define RTE_VHOST_USER_POSTCOPY_SUPPORT(1ULL << 4)
+/* support mbuf with external buffer attached */
+#define RTE_VHOST_USER_EXTBUF_SUPPORT  (1ULL << 5)
+/* support only linear buffers (no chained mbufs) */
+#define RTE_VHOST_USER_LINEARBUF_SUPPORT   (1ULL << 6)
 
 /** Protocol features. */
 #ifndef VHOST_USER_PROTOCOL_F_MQ
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 274988c4d..a8d433c49 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib
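
As a hedged, simplified sketch of the external-buffer mechanism described
above (not the patch body itself; the patch additionally tries to store the
shared info in the pktmbuf tailroom and validates the total length):

#include <rte_malloc.h>
#include <rte_mbuf.h>

static void
extbuf_free_cb(void *addr __rte_unused, void *opaque)
{
	rte_free(opaque);
}

/* Simplified: allocate a large buffer with rte_malloc() and attach it to the
 * pktmbuf so the dequeue path can return a single linear mbuf. */
static int
attach_large_extbuf(struct rte_mbuf *pkt, uint16_t buf_len)
{
	struct rte_mbuf_ext_shared_info *shinfo;
	void *buf;

	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
	if (buf == NULL)
		return -1;

	/* Keep the refcount/free-callback bookkeeping inside the buffer. */
	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
						    extbuf_free_cb, buf);
	if (shinfo == NULL) {
		rte_free(buf);
		return -1;
	}

	rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf),
				  buf_len, shinfo);
	rte_pktmbuf_reset_headroom(pkt);
	return 0;
}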

Re: [dpdk-dev] [PATCH v4] vhost: add support for large buffers

2019-10-15 Thread Flavio Leitner
On Tue, 15 Oct 2019 19:41:52 +0200
Ilya Maximets  wrote:

> Hi.
> Not a full review.  Few comments inline.
> 
> Best regards, Ilya Maximets.

Thanks for reviewing Ilya, see below.

[...]
> > @@ -870,6 +878,8 @@ rte_vhost_driver_register(const char *path,
> > uint64_t flags) goto out_free;
> > }
> > vsocket->dequeue_zero_copy = flags &
> > RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
> > +   vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
> > +   vsocket->linearbuf = flags &
> > RTE_VHOST_USER_LINEARBUF_SUPPORT; 
> > /*
> >  * Set the supported features correctly for the builtin
> > vhost-user @@ -894,6 +904,18 @@ rte_vhost_driver_register(const
> > char *path, uint64_t flags)
> >  * not compatible with postcopy.
> >  */
> > if (vsocket->dequeue_zero_copy) {
> > +   if (vsocket->extbuf) {
> > +   RTE_LOG(ERR, VHOST_CONFIG,
> > +   "error: zero copy is incompatible with
> > external buffers\n");
> > +   ret = -1;
> > +   goto out_free;  
> 
> There should be 'out_mutex'.

Good catch, going to fix that.

> 
> > +   }
> > +   if (vsocket->linearbuf) {
> > +   RTE_LOG(ERR, VHOST_CONFIG,
> > +   "error: zero copy is incompatible with
> > linear buffers\n");
> > +   ret = -1;
> > +   goto out_free;  
> 
> Ditto.

Yeah

[...]
> > +/*
> > + * Allocate a host supported pktmbuf.
> > + */
> > +static __rte_always_inline struct rte_mbuf *
> > +virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
> > +uint32_t data_len)
> > +{
> > +   struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> > +
> > +   if (unlikely(pkt == NULL))
> > +   return NULL;
> > +
> > +   if (rte_pktmbuf_tailroom(pkt) >= data_len)
> > +   return pkt;
> > +
> > +   /* attach an external buffer if supported */
> > +   if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
> > +   return pkt;
> > +
> > +   /* check if chained buffers are allowed */
> > +   if (!dev->linearbuf)
> > +   return pkt;  
> 
> I guess, checking of the 'linearbuf' should go before checking the
> 'extbuf'. The usecase is that allocation of several buffers from the
> memory pool is, probably, faster than rte_malloc() + memory
> attaching.  So, if the 'linearbuf' is not requested, it might be
> faster to use chained mbufs.

The speed seems to depend on the length of the packet, and it is not
just the allocation but the overall overhead of dealing with multiple
segments. If the preferred way is chained buffers, then just don't
pass any of the new flags, because that's the default behavior today.
 
> BTW, I'm not sure if we really need 2 separate options for this.
> i.e. 1. +linear +extbuf --> extbuf allocated
>  2. +linear -extbuf --> buffer dropped (is this case useful at all?)
>  3. -linear +extbuf --> chained mbufs might be preferred (see above)
>  4. -linear -extbuf --> chained mbufs
> 
> Case 4 is a default case. 

Yes, in this case if the packet length fits into the pktmbuf, a single
linear mbuf is returned otherwise it uses chained mbufs.

>  Case 1 is our target case for supporting
> large buffers. 

Right, here it enables external buffers, but we don't want chained
mbufs.

> Case 3 might makes no much sense as the result is
> equal to case 4.

Well, case #3 either uses a single mbuf from the mempool or uses an
external buffer for big packets, while in case #4 there is no support
for big packets at all.

> Case 2 might be not interesting for as at all,

Case #2 says the app only supports what fits into a single mbuf, and
neither external buffers nor chained buffers are supported.

> because it will lead to random packet drops depending on their size.

Either that or wrong processing.

> But, if only cases 1 and 4 are valid and interesting to us, we could
> easily merge both flags.

For the OvS use case today, yes, but I created different flags because
they seemed like distinct useful features to me, and changing flags later
can make the API very confusing.
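
For illustration, a minimal sketch of case 1 registration (+linear +extbuf);
the helper name and socket path handling are placeholders:

#include <rte_vhost.h>

/* Hypothetical: register a vhost-user socket asking for linear mbufs only,
 * with an external buffer attached when a packet does not fit. */
static int
register_linear_extbuf(const char *path)
{
	uint64_t flags = RTE_VHOST_USER_EXTBUF_SUPPORT |
			 RTE_VHOST_USER_LINEARBUF_SUPPORT;

	return rte_vhost_driver_register(path, flags);
}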

Thanks!
fbl


[dpdk-dev] [PATCH v4] vhost: add support for large buffers

2019-10-15 Thread Flavio Leitner
The rte_vhost_dequeue_burst supports two ways of dequeuing data.
If the data fits into a buffer, then all data is copied and a
single linear buffer is returned. Otherwise it allocates
additional mbufs and chains them together to return a multi-segment
mbuf.

While that covers most use cases, it forces applications that
need to work with larger data sizes to support multi-segment
mbufs. The non-linear characteristic brings complexity and
performance implications to the application.

To resolve the issue, add support to attach an external buffer
to a pktmbuf, and let the host indicate during registration whether
attaching an external buffer to a pktmbuf is supported and whether
only linear buffers are supported.

Signed-off-by: Flavio Leitner 
---
 doc/guides/prog_guide/vhost_lib.rst |  35 +
 lib/librte_vhost/rte_vhost.h|   4 +
 lib/librte_vhost/socket.c   |  22 ++
 lib/librte_vhost/vhost.c|  22 ++
 lib/librte_vhost/vhost.h|   4 +
 lib/librte_vhost/virtio_net.c   | 109 
 6 files changed, 182 insertions(+), 14 deletions(-)


- Changelog:
  v4:
- allow to use pktmbuf if there is exact space
- removed log message if the buffer is too big
- fixed the length to include align padding
- free allocated buf if shinfo fails
  v3:
- prevent the new features to be used with zero copy
- fixed sizeof() usage
- fixed log msg indentation
- removed/replaced asserts
- used the correct virt2iova function
- fixed the patch's title
- OvS PoC code:
  https://github.com/fleitner/ovs/tree/rte_malloc-v3
  v2:
- Used rte_malloc() instead of another mempool as suggested by Shahaf.
- Added the documentation section.
- Using driver registration to negotiate the features.
- OvS PoC code:
  
https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7



diff --git a/doc/guides/prog_guide/vhost_lib.rst 
b/doc/guides/prog_guide/vhost_lib.rst
index fc3ee4353..07e40e3c5 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -117,6 +117,41 @@ The following is an overview of some key Vhost API 
functions:
 Enabling this flag should only be done when the calling application does
 not pre-fault the guest shared memory, otherwise migration would fail.
 
+  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
+
+Enabling this flag forces vhost dequeue function to only provide linear
+pktmbuf (no multi-segmented pktmbuf).
+
+The vhost library by default provides a single pktmbuf for a given
+packet, but if for some reason the data doesn't fit into a single
+pktmbuf (e.g., TSO is enabled), the library will allocate additional
+pktmbufs from the same mempool and chain them together to create a
+multi-segmented pktmbuf.
+
+However, the vhost application needs to support multi-segmented format.
+If the vhost application does not support that format and requires large
+buffers to be dequeued, this flag should be enabled to force only linear
+buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
+
+It is disabled by default.
+
+  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
+
+Enabling this flag allows vhost dequeue function to allocate and attach
+an external buffer to a pktmbuf if the pktmbuf doesn't provide enough
+space to store all data.
+
+This is useful when the vhost application wants to support large packets
+but doesn't want to increase the default mempool object size nor to
+support multi-segmented mbufs (non-linear). In this case, a fresh buffer
+is allocated using rte_malloc() which gets attached to a pktmbuf using
+rte_pktmbuf_attach_extbuf().
+
+See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-segmented
+mbufs for applications that don't support chained mbufs.
+
+It is disabled by default.
+
 * ``rte_vhost_driver_set_features(path, features)``
 
   This function sets the feature bits the vhost-user driver supports. The
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b821b5df4 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -30,6 +30,10 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY   (1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT   (1ULL << 3)
 #define RTE_VHOST_USER_POSTCOPY_SUPPORT(1ULL << 4)
+/* support mbuf with external buffer attached */
+#define RTE_VHOST_USER_EXTBUF_SUPPORT  (1ULL << 5)
+/* support only linear buffers (no chained mbufs) */
+#define RTE_VHOST_USER_LINEARBUF_SUPPORT   (1ULL << 6)
 
 /** Protocol features. */
 #ifndef VHOST_USER_PROTOCOL_F_MQ
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 274988c4d..e546be2a8 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -40,6 +40,8 @@ struct vhost_user_so

[dpdk-dev] [PATCH v3] vhost: add support for large buffers

2019-10-11 Thread Flavio Leitner
The rte_vhost_dequeue_burst supports two ways of dequeuing data.
If the data fits into a buffer, then all data is copied and a
single linear buffer is returned. Otherwise it allocates
additional mbufs and chains them together to return a
multi-segment mbuf.

While that covers most use cases, it forces applications that
need to work with larger data sizes to support multi-segment
mbufs. The non-linear characteristic brings complexity and
performance implications to the application.

To resolve the issue, add support for attaching an external buffer
to a pktmbuf, and let the host indicate during registration whether
attaching an external buffer to a pktmbuf is supported and whether
only linear buffers are supported.

Signed-off-by: Flavio Leitner 
---
 doc/guides/prog_guide/vhost_lib.rst |  35 +
 lib/librte_vhost/rte_vhost.h|   4 ++
 lib/librte_vhost/socket.c   |  22 ++
 lib/librte_vhost/vhost.c|  22 ++
 lib/librte_vhost/vhost.h|   4 ++
 lib/librte_vhost/virtio_net.c   | 108 
 6 files changed, 181 insertions(+), 14 deletions(-)

- Changelog:
  V3:
- prevent the new features from being used with zero copy
- fixed sizeof() usage
- fixed log msg indentation
- removed/replaced asserts
- used the correct virt2iova function
- fixed the patch's title
- OvS PoC code:
  https://github.com/fleitner/ovs/tree/rte_malloc-v3
  V2:
- Used rte_malloc() instead of another mempool as suggested by Shahaf.
- Added the documentation section.
- Using driver registration to negotiate the features.
- OvS PoC code:
  
https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7


diff --git a/doc/guides/prog_guide/vhost_lib.rst 
b/doc/guides/prog_guide/vhost_lib.rst
index fc3ee4353..07e40e3c5 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -117,6 +117,41 @@ The following is an overview of some key Vhost API 
functions:
 Enabling this flag should only be done when the calling application does
 not pre-fault the guest shared memory, otherwise migration would fail.
 
+  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
+
+Enabling this flag forces vhost dequeue function to only provide linear
+pktmbuf (no multi-segmented pktmbuf).
+
+The vhost library by default provides a single pktmbuf for a given
+packet, but if for some reason the data doesn't fit into a single
+pktmbuf (e.g., TSO is enabled), the library will allocate additional
+pktmbufs from the same mempool and chain them together to create a
+multi-segmented pktmbuf.
+
+However, the vhost application needs to support the multi-segmented format.
+If the vhost application does not support that format and requires large
+buffers to be dequeued, this flag should be enabled to force only linear
+buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
+
+It is disabled by default.
+
+  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
+
+Enabling this flag allows vhost dequeue function to allocate and attach
+an external buffer to a pktmbuf if the pktmbuf doesn't provide enough
+space to store all data.
+
+This is useful when the vhost application wants to support large packets
+but doesn't want to increase the default mempool object size nor to
+support multi-segmented mbufs (non-linear). In this case, a fresh buffer
+is allocated using rte_malloc() which gets attached to a pktmbuf using
+rte_pktmbuf_attach_extbuf().
+
+See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-segmented
+mbufs for applications that don't support chained mbufs.
+
+It is disabled by default.
+
 * ``rte_vhost_driver_set_features(path, features)``
 
   This function sets the feature bits the vhost-user driver supports. The
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b821b5df4 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -30,6 +30,10 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY   (1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT   (1ULL << 3)
 #define RTE_VHOST_USER_POSTCOPY_SUPPORT(1ULL << 4)
+/* support mbuf with external buffer attached */
+#define RTE_VHOST_USER_EXTBUF_SUPPORT  (1ULL << 5)
+/* support only linear buffers (no chained mbufs) */
+#define RTE_VHOST_USER_LINEARBUF_SUPPORT   (1ULL << 6)
 
 /** Protocol features. */
 #ifndef VHOST_USER_PROTOCOL_F_MQ
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 274988c4d..e546be2a8 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -40,6 +40,8 @@ struct vhost_user_socket {
bool dequeue_zero_copy;
bool iommu_support;
bool use_builtin_virtio_net;
+   bool extbuf;
+   bool linearbuf;
 
/*
 * The &quo

Re: [dpdk-dev] [PATCH v2] vhost: add support for large buffers.

2019-10-10 Thread Flavio Leitner


Hi Tiwei,

Thanks for the review. I will follow up with an improved patch.
fbl

On Thu, 10 Oct 2019 13:12:57 +0800
Tiwei Bie  wrote:

> [PATCH v2] vhost: add support for large buffers.
> 
> There is a warning reported by devtools/check-git-log.sh
> The '.' at the end of the title should be dropped.
> 
> On Fri, Oct 04, 2019 at 05:10:08PM -0300, Flavio Leitner wrote:
> > The rte_vhost_dequeue_burst supports two ways of dequeuing data.
> > If the data fits into a buffer, then all data is copied and a
> > single linear buffer is returned. Otherwise it allocates
> > additional mbufs and chains them together to return a multiple
> > segments mbuf.
> > 
> > While that covers most use cases, it forces applications that
> > need to work with larger data sizes to support multiple segments
> > mbufs. The non-linear characteristic brings complexity and
> > performance implications to the application.
> > 
> > To resolve the issue, let the host provide during registration
> > if attaching an external buffer to pktmbuf is supported and if
> > only linear buffer are supported.
> > 
> > Signed-off-by: Flavio Leitner 
> > ---
> >  doc/guides/prog_guide/vhost_lib.rst |  35 ++
> >  lib/librte_vhost/rte_vhost.h|   4 ++
> >  lib/librte_vhost/socket.c   |  10 +++
> >  lib/librte_vhost/vhost.c|  22 ++
> >  lib/librte_vhost/vhost.h|   4 ++
> >  lib/librte_vhost/virtio_net.c   | 103
> > ++-- 6 files changed, 172 insertions(+), 6
> > deletions(-)
> > 
> > - Changelog:
> >   V2:
> > - Used rte_malloc() instead of another mempool as suggested by
> > Shahaf.
> > - Added the documentation section.
> > - Using driver registration to negotiate the features.
> > - OvS PoC code:
> >   
> > https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7
> > 
> > diff --git a/doc/guides/prog_guide/vhost_lib.rst
> > b/doc/guides/prog_guide/vhost_lib.rst index fc3ee4353..07e40e3c5
> > 100644 --- a/doc/guides/prog_guide/vhost_lib.rst
> > +++ b/doc/guides/prog_guide/vhost_lib.rst
> > @@ -117,6 +117,41 @@ The following is an overview of some key Vhost
> > API functions: Enabling this flag should only be done when the
> > calling application does not pre-fault the guest shared memory,
> > otherwise migration would fail. 
> > +  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
> > +
> > +Enabling this flag forces vhost dequeue function to only
> > provide linear
> > +pktmbuf (no multi-segmented pktmbuf).
> > +
> > +The vhost library by default provides a single pktmbuf for
> > given a
> > +packet, but if for some reason the data doesn't fit into a
> > single
> > +pktmbuf (e.g., TSO is enabled), the library will allocate
> > additional
> > +pktmbufs from the same mempool and chain them together to
> > create a
> > +multi-segmented pktmbuf.
> > +
> > +However, the vhost application needs to support
> > multi-segmented format.
> > +If the vhost application does not support that format and
> > requires large
> > +buffers to be dequeue, this flag should be enabled to force
> > only linear
> > +buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
> > +
> > +It is disabled by default.
> > +
> > +  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
> > +
> > +Enabling this flag allows vhost dequeue function to allocate
> > and attach
> > +an external buffer to a pktmbuf if the pkmbuf doesn't provide
> > enough
> > +space to store all data.
> > +
> > +This is useful when the vhost application wants to support
> > large packets
> > +but doesn't want to increase the default mempool object size
> > nor to
> > +support multi-segmented mbufs (non-linear). In this case, a
> > fresh buffer
> > +is allocated using rte_malloc() which gets attached to a
> > pktmbuf using
> > +rte_pktmbuf_attach_extbuf().
> > +
> > +See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable
> > multi-segmented
> > +mbufs for application that doesn't support chained mbufs.
> > +
> > +It is disabled by default.
> > +
> >  * ``rte_vhost_driver_set_features(path, features)``
> >  
> >This function sets the feature bits the vhost-user driver
> > supports. The diff --git a/lib/librte_vhost/rte_vhost.h
> > b/lib/librte_vhost/rte_vhost.h index 19474bca0..b821

[dpdk-dev] [PATCH v2] vhost: add support for large buffers.

2019-10-04 Thread Flavio Leitner
The rte_vhost_dequeue_burst supports two ways of dequeuing data.
If the data fits into a buffer, then all data is copied and a
single linear buffer is returned. Otherwise it allocates
additional mbufs and chains them together to return a
multi-segment mbuf.

While that covers most use cases, it forces applications that
need to work with larger data sizes to support multi-segment
mbufs. The non-linear characteristic brings complexity and
performance implications to the application.

To resolve the issue, let the host indicate during registration
whether attaching an external buffer to a pktmbuf is supported and
whether only linear buffers are supported.

Signed-off-by: Flavio Leitner 
---
 doc/guides/prog_guide/vhost_lib.rst |  35 ++
 lib/librte_vhost/rte_vhost.h|   4 ++
 lib/librte_vhost/socket.c   |  10 +++
 lib/librte_vhost/vhost.c|  22 ++
 lib/librte_vhost/vhost.h|   4 ++
 lib/librte_vhost/virtio_net.c   | 103 ++--
 6 files changed, 172 insertions(+), 6 deletions(-)

- Changelog:
  V2:
- Used rte_malloc() instead of another mempool as suggested by Shahaf.
- Added the documentation section.
- Using driver registration to negotiate the features.
- OvS PoC code:
  
https://github.com/fleitner/ovs/commit/8fc197c40b1d4fda331686a7b919e9e2b670dda7

diff --git a/doc/guides/prog_guide/vhost_lib.rst 
b/doc/guides/prog_guide/vhost_lib.rst
index fc3ee4353..07e40e3c5 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -117,6 +117,41 @@ The following is an overview of some key Vhost API 
functions:
 Enabling this flag should only be done when the calling application does
 not pre-fault the guest shared memory, otherwise migration would fail.
 
+  - ``RTE_VHOST_USER_LINEARBUF_SUPPORT``
+
+Enabling this flag forces vhost dequeue function to only provide linear
+pktmbuf (no multi-segmented pktmbuf).
+
+The vhost library by default provides a single pktmbuf for a given
+packet, but if for some reason the data doesn't fit into a single
+pktmbuf (e.g., TSO is enabled), the library will allocate additional
+pktmbufs from the same mempool and chain them together to create a
+multi-segmented pktmbuf.
+
+However, the vhost application needs to support the multi-segmented format.
+If the vhost application does not support that format and requires large
+buffers to be dequeued, this flag should be enabled to force only linear
+buffers (see RTE_VHOST_USER_EXTBUF_SUPPORT) or drop the packet.
+
+It is disabled by default.
+
+  - ``RTE_VHOST_USER_EXTBUF_SUPPORT``
+
+Enabling this flag allows vhost dequeue function to allocate and attach
+an external buffer to a pktmbuf if the pktmbuf doesn't provide enough
+space to store all data.
+
+This is useful when the vhost application wants to support large packets
+but doesn't want to increase the default mempool object size nor to
+support multi-segmented mbufs (non-linear). In this case, a fresh buffer
+is allocated using rte_malloc() which gets attached to a pktmbuf using
+rte_pktmbuf_attach_extbuf().
+
+See RTE_VHOST_USER_LINEARBUF_SUPPORT as well to disable multi-segmented
+mbufs for applications that don't support chained mbufs.
+
+It is disabled by default.
+
 * ``rte_vhost_driver_set_features(path, features)``
 
   This function sets the feature bits the vhost-user driver supports. The
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b821b5df4 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -30,6 +30,10 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY   (1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT   (1ULL << 3)
 #define RTE_VHOST_USER_POSTCOPY_SUPPORT(1ULL << 4)
+/* support mbuf with external buffer attached */
+#define RTE_VHOST_USER_EXTBUF_SUPPORT  (1ULL << 5)
+/* support only linear buffers (no chained mbufs) */
+#define RTE_VHOST_USER_LINEARBUF_SUPPORT   (1ULL << 6)
 
 /** Protocol features. */
 #ifndef VHOST_USER_PROTOCOL_F_MQ
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 274988c4d..0ba610fda 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -40,6 +40,8 @@ struct vhost_user_socket {
bool dequeue_zero_copy;
bool iommu_support;
bool use_builtin_virtio_net;
+   bool extbuf;
+   bool linearbuf;
 
/*
 * The "supported_features" indicates the feature bits the
@@ -232,6 +234,12 @@ vhost_user_add_connection(int fd, struct vhost_user_socket 
*vsocket)
if (vsocket->dequeue_zero_copy)
vhost_enable_dequeue_zero_copy(vid);
 
+   if (vsocket->extbuf)
+   vhost_enable_extbuf(vid);
+
+   if (vsocket->linearbuf)
+   vhost_enab

Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs

2019-10-03 Thread Flavio Leitner
On Thu, 3 Oct 2019 18:57:32 +0200
Ilya Maximets  wrote:

> On 02.10.2019 20:15, Flavio Leitner wrote:
> > On Wed, 2 Oct 2019 17:50:41 +
> > Shahaf Shuler  wrote:
> >   
> >> Wednesday, October 2, 2019 3:59 PM, Flavio Leitner:  
> >>> Obrembski MichalX ; Stokes Ian
> >>> 
> >>> Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large linear
> >>> mbufs
> >>>
> >>>
> >>> Hi Shahaf,
> >>>
> >>> Thanks for looking into this, see my inline comments.
> >>>
> >>> On Wed, 2 Oct 2019 09:00:11 +
> >>> Shahaf Shuler  wrote:
> >>>  
> >>>> Wednesday, October 2, 2019 11:05 AM, David Marchand:  
> >>>>> Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large
> >>>>> linear mbufs
> >>>>>
> >>>>> Hello Shahaf,
> >>>>>
> >>>>> On Wed, Oct 2, 2019 at 6:46 AM Shahaf Shuler
> >>>>>  wrote:  
> >>>>>> 
> >>
> >> [...]
> >>  
> >>>>>
> >>>>> I am missing some piece here.
> >>>>> Which pool would the PMD take those external buffers from?  
> >>>>
> >>>> The mbuf is always taken from the single mempool associated w/
> >>>> the rxq. The buffer for the mbuf may be allocated (in case virtio
> >>>> payload is bigger than current mbuf size) from DPDK hugepages or
> >>>> any other system memory and be attached to the mbuf.
> >>>>
> >>>> You can see example implementation of it in mlx5 PMD (checkout
> >>>> rte_pktmbuf_attach_extbuf call)  
> >>>
> >>> Thanks, I wasn't aware of external buffers.
> >>>
> >>> I see that attaching external buffers of the correct size would be
> >>> more efficient in terms of saving memory/avoiding sparsing.
> >>>
> >>> However, we still need to be prepared to the worse case scenario
> >>> (all packets 64K), so that doesn't help with the total memory
> >>> required.  
> >>
> >> Am not sure why.
> >> The allocation can be per demand. That is - only when you
> >> encounter a large buffer.
> >>
> >> Having buffer allocated in advance will benefit only from removing
> >> the cost of the rte_*malloc. However on such big buffers, and
> >> further more w/ device offloads like TSO, am not sure that is an
> >> issue.  
> > 
> > Now I see what you're saying. I was thinking we had to reserve the
> > memory before, like mempool does, then get the buffers as needed.
> > 
> > OK, I can give a try with rte_*malloc and see how it goes.  
> 
> This way we actually could have a nice API.  For example, by
> introducing some new flag RTE_VHOST_USER_NO_CHAINED_MBUFS (there
> might be better name) which could be passed to driver_register().
> On receive, depending on this flag, function will create chained
> mbufs or allocate new contiguous memory chunk and attach it as
> an external buffer if the data could not be stored in a single
> mbuf from the registered memory pool.
> 
> Supporting external memory in mbufs will require some additional
> work from the OVS side (e.g. better work with ol_flags), but
> we'll have to do it anyway for upgrade to DPDK 19.11.

Agreed. Looks like rte_malloc is fast enough after all. I have a PoC
running iperf3 from a VM to another bare-metal host using vhost-user
client mode with TSO enabled:
[...]
[  5] 140.00-141.00 sec  4.60 GBytes  39.5 Gbits/sec0   1.26 MBytes
[  5] 141.00-142.00 sec  4.65 GBytes  39.9 Gbits/sec0   1.26 MBytes
[  5] 142.00-143.00 sec  4.65 GBytes  40.0 Gbits/sec0   1.26 MBytes
[  5] 143.00-144.00 sec  4.65 GBytes  39.9 Gbits/sec9   1.04 MBytes
[  5] 144.00-145.00 sec  4.59 GBytes  39.4 Gbits/sec0   1.16 MBytes
[  5] 145.00-146.00 sec  4.58 GBytes  39.3 Gbits/sec0   1.26 MBytes
[  5] 146.00-147.00 sec  4.48 GBytes  38.5 Gbits/sec  700973 KBytes
[...]
(The physical link is 40Gbps)

I will clean that, test more and post the patches soon.
Thanks!
fbl
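
For reference, a rough sketch of the allocate-and-attach path a PoC like
the one above could use, assuming DPDK's external buffer helpers
(rte_malloc, rte_pktmbuf_ext_shinfo_init_helper, rte_pktmbuf_attach_extbuf);
the function and variable names are illustrative and this is not the
actual patch:

#include <rte_malloc.h>
#include <rte_mbuf.h>

/* Called when the last mbuf referencing the external buffer is freed. */
static void
extbuf_free_cb(void *addr __rte_unused, void *opaque)
{
	rte_free(opaque);
}

/* Allocate a pktmbuf and, if the packet does not fit its data room,
 * attach a large buffer allocated with rte_malloc() as external storage.
 * Returns NULL on failure. */
static struct rte_mbuf *
alloc_linear_pktmbuf(struct rte_mempool *mp, uint32_t pkt_len)
{
	struct rte_mbuf_ext_shared_info *shinfo;
	struct rte_mbuf *pkt;
	uint32_t total_len;
	uint16_t buf_len;
	rte_iova_t iova;
	void *buf;

	pkt = rte_pktmbuf_alloc(mp);
	if (pkt == NULL)
		return NULL;

	/* The mempool mbuf is big enough: use it as is. */
	if (pkt_len <= rte_pktmbuf_tailroom(pkt))
		return pkt;

	/* Room for headroom, data and the shared info kept in the buffer. */
	total_len = RTE_PKTMBUF_HEADROOM + pkt_len + sizeof(*shinfo);
	total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
	if (total_len > UINT16_MAX)
		goto fail;

	buf_len = total_len;
	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
	if (buf == NULL)
		goto fail;

	/* The helper places the shared info at the tail of the buffer. */
	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
						    extbuf_free_cb, buf);
	if (shinfo == NULL) {
		rte_free(buf);
		goto fail;
	}

	iova = rte_malloc_virt2iova(buf);
	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
	rte_pktmbuf_reset_headroom(pkt);
	return pkt;

fail:
	rte_pktmbuf_free(pkt);
	return NULL;
}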


Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs

2019-10-02 Thread Flavio Leitner
On Wed, 2 Oct 2019 17:50:41 +
Shahaf Shuler  wrote:

> Wednesday, October 2, 2019 3:59 PM, Flavio Leitner:
> > Obrembski MichalX ; Stokes Ian
> > 
> > Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large linear
> > mbufs
> > 
> > 
> > Hi Shahaf,
> > 
> > Thanks for looking into this, see my inline comments.
> > 
> > On Wed, 2 Oct 2019 09:00:11 +
> > Shahaf Shuler  wrote:
> >   
> > > Wednesday, October 2, 2019 11:05 AM, David Marchand:  
> > > > Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large
> > > > linear mbufs
> > > >
> > > > Hello Shahaf,
> > > >
> > > > On Wed, Oct 2, 2019 at 6:46 AM Shahaf Shuler
> > > >  wrote:  
> > > > >  
> 
> [...]
> 
> > > >
> > > > I am missing some piece here.
> > > > Which pool would the PMD take those external buffers from?  
> > >
> > > The mbuf is always taken from the single mempool associated w/ the
> > > rxq. The buffer for the mbuf may be allocated (in case virtio
> > > payload is bigger than current mbuf size) from DPDK hugepages or
> > > any other system memory and be attached to the mbuf.
> > >
> > > You can see example implementation of it in mlx5 PMD (checkout
> > > rte_pktmbuf_attach_extbuf call)  
> > 
> > Thanks, I wasn't aware of external buffers.
> > 
> > I see that attaching external buffers of the correct size would be
> > more efficient in terms of saving memory/avoiding sparsing.
> > 
> > However, we still need to be prepared to the worse case scenario
> > (all packets 64K), so that doesn't help with the total memory
> > required.  
> 
> Am not sure why. 
> The allocation can be per demand. That is - only when you encounter a
> large buffer. 
> 
> Having buffer allocated in advance will benefit only from removing
> the cost of the rte_*malloc. However on such big buffers, and further
> more w/ device offloads like TSO, am not sure that is an issue. 

Now I see what you're saying. I was thinking we had to reserve the
memory up front, like a mempool does, and then get the buffers as needed.

OK, I can give it a try with rte_*malloc and see how it goes.

> > The current patch pushes the decision to the application which
> > knows better the workload. If more memory is available, it can
> > optionally use large buffers, otherwise just don't pass that. Or
> > even decide whether to share the same 64K mempool between multiple
> > vhost ports or use one mempool per port.
> > 
> > Perhaps I missed something, but managing memory with mempool still
> > require us to have buffers of 64K regardless if the data consumes
> > less space. Otherwise the application or the PMD will have to
> > manage memory itself.
> > 
> > If we let the PMD manages the memory, what happens if a port/queue
> > is closed and one or more buffers are still in use (switching)? I
> > don't see how to solve this cleanly.  
> 
> Closing of the dev should return EBUSY till all buffers are free. 
> What is the use case of closing a port while still having packet
> pending on other port of the switch? And why we cannot wait for them
> to complete transmission?

The vswitch gets the request from outside and the assumption is that
the command will succeed. AFAIK, there is no retry mechanism.

Thanks Shahaf!
fbl


Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs

2019-10-02 Thread Flavio Leitner


Hi Shahaf,

Thanks for looking into this, see my inline comments.

On Wed, 2 Oct 2019 09:00:11 +
Shahaf Shuler  wrote:

> Wednesday, October 2, 2019 11:05 AM, David Marchand:
> > Subject: Re: [dpdk-dev] [PATCH] vhost: add support to large linear
> > mbufs
> > 
> > Hello Shahaf,
> > 
> > On Wed, Oct 2, 2019 at 6:46 AM Shahaf Shuler 
> > wrote:  
> > >
> > > Wednesday, October 2, 2019 1:20 AM, Flavio Leitner:  
> > > > Subject: [dpdk-dev] [PATCH] vhost: add support to large linear
> > > > mbufs
> > > >
> > > > The rte_vhost_dequeue_burst supports two ways of dequeuing
> > > > data. If the data fits into a buffer, then all data is copied
> > > > and a single linear buffer is returned. Otherwise it allocates
> > > > additional mbufs and chains them together to return a multiple
> > > > segments mbuf.
> > > >
> > > > While that covers most use cases, it forces applications that
> > > > need to work with larger data sizes to support multiple
> > > > segments mbufs. The non-linear characteristic brings complexity
> > > > and performance implications to the application.
> > > >
> > > > To resolve the issue, change the API so that the application can
> > > > optionally provide a second mempool containing larger mbufs. If
> > > > that is not provided (NULL), the behavior remains as before the
> > > > change. Otherwise, the data size is checked and the
> > > > corresponding mempool is used to return linear mbufs.  
> > >
> > > I understand the motivation.
> > > However, providing a static pool w/ large buffers is not so
> > > efficient in terms  
> > of memory footprint. You will need to prepare to worst case (all
> > packet are large) w/ max size of 64KB.  
> > > Also, the two mempools are quite restrictive as the memory fill
> > > of the  
> > mbufs might be very sparse. E.g. mempool1 mbuf.size = 1.5K ,
> > mempool2 mbuf.size = 64K, packet size 4KB.  
> > >
> > > Instead, how about using the mbuf external buffer feature?
> > > The flow will be:
> > > 1. vhost PMD always receive a single mempool (like today) 2. on
> > > dequeue, PMD looks on the virtio packet size. If smaller than the
> > > mbuf size use the mbuf as is (like today) 3. otherwise, allocate
> > > a new buffer (inside the PMD) and link it to the mbuf as external
> > > buffer (rte_pktmbuf_attach_extbuf)  
> > 
> > I am missing some piece here.
> > Which pool would the PMD take those external buffers from?  
> 
> The mbuf is always taken from the single mempool associated w/ the
> rxq. The buffer for the mbuf may be allocated (in case virtio payload
> is bigger than current mbuf size) from DPDK hugepages or any other
> system memory and be attached to the mbuf.
> 
> You can see example implementation of it in mlx5 PMD (checkout
> rte_pktmbuf_attach_extbuf call)

Thanks, I wasn't aware of external buffers.

I see that attaching external buffers of the correct size would be more
efficient in terms of saving memory and avoiding sparse memory usage.

However, we still need to be prepared for the worst-case scenario (all
packets being 64K), so that doesn't help with the total memory required.

The current patch pushes the decision to the application, which knows
the workload better. If more memory is available, it can optionally use
large buffers, otherwise it just doesn't pass them. Or it can even decide
whether to share the same 64K mempool between multiple vhost ports or
use one mempool per port.
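
To make that concrete, a rough application-side sketch of the two-mempool
idea; the 6-argument rte_vhost_dequeue_burst() is the API change proposed
by this patch, not the upstream API, and the pool sizes and names are
only examples:

#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_vhost.h>

#define MAX_PKT_BURST 32

/* One pool with default-sized mbufs and one with large mbufs for
 * TSO-sized packets; counts and cache sizes are arbitrary examples. */
static struct rte_mempool *pool_small, *pool_large;

static int
setup_pools(int socket_id)
{
	pool_small = rte_pktmbuf_pool_create("mbuf_2k", 16384, 256, 0,
					     RTE_MBUF_DEFAULT_BUF_SIZE,
					     socket_id);
	pool_large = rte_pktmbuf_pool_create("mbuf_64k", 2048, 32, 0,
					     UINT16_MAX, socket_id);
	return (pool_small != NULL && pool_large != NULL) ? 0 : -1;
}

static uint16_t
dequeue_burst(int vid, uint16_t queue_id, struct rte_mbuf **pkts)
{
	/* Passing NULL as the large pool keeps today's behavior; passing
	 * pool_large lets the library return linear mbufs for big frames. */
	return rte_vhost_dequeue_burst(vid, queue_id, pool_small, pool_large,
				       pkts, MAX_PKT_BURST);
}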

Perhaps I missed something, but managing memory with a mempool still
requires us to have 64K buffers regardless of whether the data consumes
less space. Otherwise the application or the PMD will have to manage
memory itself.

If we let the PMD manage the memory, what happens if a port/queue
is closed and one or more buffers are still in use (switching)? I don't
see how to solve this cleanly.

fbl

> 
> > 
> > If it is from an additional mempool passed to the vhost pmd, I
> > can't see the difference with Flavio proposal.
> > 
> >   
> > > The pros of this approach is that you have full flexibility on
> > > the memory  
> > allocation, and therefore a lower footprint.  
> > > The cons is the OVS will need to know how to handle mbuf w/
> > > external  
> > buffers (not too complex IMO).
> > 
> > 
> > --
> > David Marchand  



Re: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs

2019-10-01 Thread Flavio Leitner
On Tue,  1 Oct 2019 19:19:35 -0300
Flavio Leitner  wrote:

> The rte_vhost_dequeue_burst supports two ways of dequeuing data. If
> the data fits into a buffer, then all data is copied and a single
> linear buffer is returned. Otherwise it allocates additional mbufs
> and chains them together to return a multiple segments mbuf.
> 
> While that covers most use cases, it forces applications that need
> to work with larger data sizes to support multiple segments mbufs.
> The non-linear characteristic brings complexity and performance
> implications to the application.
> 
> To resolve the issue, change the API so that the application can
> optionally provide a second mempool containing larger mbufs. If that
> is not provided (NULL), the behavior remains as before the change.
> Otherwise, the data size is checked and the corresponding mempool
> is used to return linear mbufs.
> 
> Signed-off-by: Flavio Leitner 

The use case behind the patch is TSO support in DPDK-accelerated Open
vSwitch (customers are using ovs-dpdk to send big packets too!). The
effort has been going on for many months now. There is a recent
proposal on the OvS dev mailing list using the API as it is today:
https://mail.openvswitch.org/pipermail/ovs-dev/2019-September/362573.html

There is also a reply from me in that thread with the patches
implementing a Proof-Of-Concept with the proposed API change:
https://mail.openvswitch.org/pipermail/ovs-dev/2019-September/362881.html

I opted to improve the current API instead of adding a new one, but I
have no strong opinion either way.

I know it's a bit late for v19.11, but since the TSO support effort has
been going on for months now and OvS uses only LTS releases, I wonder if
this can be part of v19.11.

Thanks,
fbl


[dpdk-dev] [PATCH] vhost: add support to large linear mbufs

2019-10-01 Thread Flavio Leitner
The rte_vhost_dequeue_burst supports two ways of dequeuing data. If
the data fits into a buffer, then all data is copied and a single
linear buffer is returned. Otherwise it allocates additional mbufs
and chains them together to return a multi-segment mbuf.

While that covers most use cases, it forces applications that need
to work with larger data sizes to support multi-segment mbufs.
The non-linear characteristic brings complexity and performance
implications to the application.

To resolve the issue, change the API so that the application can
optionally provide a second mempool containing larger mbufs. If that
is not provided (NULL), the behavior remains as before the change.
Otherwise, the data size is checked and the corresponding mempool
is used to return linear mbufs.

Signed-off-by: Flavio Leitner 
---
 drivers/net/vhost/rte_eth_vhost.c |  4 +--
 examples/tep_termination/main.c   |  2 +-
 examples/vhost/main.c |  2 +-
 lib/librte_vhost/rte_vhost.h  |  5 ++-
 lib/librte_vhost/virtio_net.c | 57 +++
 5 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index 46f01a7f4..ce7f68a5b 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -393,8 +393,8 @@ eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t 
nb_bufs)
 VHOST_MAX_PKT_BURST);
 
nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
- r->mb_pool, &bufs[nb_rx],
- num);
+ r->mb_pool, NULL,
+ &bufs[nb_rx], num);
 
nb_rx += nb_pkts;
nb_receive -= nb_pkts;
diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c
index ab956ad7c..3ebf0fa6e 100644
--- a/examples/tep_termination/main.c
+++ b/examples/tep_termination/main.c
@@ -697,7 +697,7 @@ switch_worker(__rte_unused void *arg)
if (likely(!vdev->remove)) {
/* Handle guest TX*/
tx_count = rte_vhost_dequeue_burst(vdev->vid,
-   VIRTIO_TXQ, mbuf_pool,
+   VIRTIO_TXQ, mbuf_pool, NULL,
pkts_burst, MAX_PKT_BURST);
/* If this is the first received packet we need 
to learn the MAC */
if (unlikely(vdev->ready == 
DEVICE_MAC_LEARNING) && tx_count) {
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index ab649bf14..e9b306af3 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -1092,7 +1092,7 @@ drain_virtio_tx(struct vhost_dev *vdev)
pkts, MAX_PKT_BURST);
} else {
count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
-   mbuf_pool, pkts, MAX_PKT_BURST);
+   mbuf_pool, NULL, pkts, MAX_PKT_BURST);
}
 
/* setup VMDq for the first packet */
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 19474bca0..b05fd8e2a 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -593,6 +593,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
  *  virtio queue index in mq case
  * @param mbuf_pool
  *  mbuf_pool where host mbuf is allocated.
+ * @param mbuf_pool_large
+ *  mbuf_pool where larger host mbuf is allocated.
  * @param pkts
  *  array to contain packets to be dequeued
  * @param count
@@ -601,7 +603,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
  *  num of packets dequeued
  */
 uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
-   struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
+   struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
+   struct rte_mbuf **pkts, uint16_t count);
 
 /**
  * Get guest mem table: a list of memory regions.
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5b85b832d..da9d77732 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1291,10 +1291,12 @@ get_zmbuf(struct vhost_virtqueue *vq)
 
 static __rte_noinline uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
-   struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+   struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
+   struct rte_mbuf **pkts, uint16_t count)
 {
uint16_t i;
uint16_t free_entries;
+   uint16_t mbuf_avail;
 
if (unlikely(dev->d

Re: [dpdk-dev] kernel binding of devices + hotplug

2018-04-18 Thread Flavio Leitner
On Wed, Apr 18, 2018 at 11:17:47AM -0700, Stephen Hemminger wrote:
> On Wed, 18 Apr 2018 11:11:01 -0300
> Flavio Leitner  wrote:
> 
> > On Sun, Apr 15, 2018 at 01:48:36AM +, Stephen Hemminger wrote:
> > > My vote is to work with udev and not try to replace it.
> > > 
> > > Driverctl works well. Just not for bifurcated driver  
> > 
> > I second that.  We also have other system configs to care about like
> > kernel parameters and hugepage configuration which I think follow the
> > same idea that they are system wide configs and should not be managed
> > by DPDK itself.
> 
> Maybe teach driverctl (and udev) to handle bifurcated drivers.

I don't know the challenges of teaching driverctl to handle bifurcated
drivers, but I would agree that it should be our first place to look.

> Unfortunately, vendors are very fractured on how network devices are managed.

You mean distros? hw vendors? all vendors? :)

Perhaps if the community focuses on something, then they might follow at
some point.

-- 
Flavio


Re: [dpdk-dev] kernel binding of devices + hotplug

2018-04-18 Thread Flavio Leitner
On Sun, Apr 15, 2018 at 01:48:36AM +, Stephen Hemminger wrote:
> My vote is to work with udev and not try to replace it.
> 
> Driverctl works well. Just not for bifurcated driver

I second that.  We also have other system configs to care about, like
kernel parameters and hugepage configuration, which I think follow the
same idea: they are system-wide configs and should not be managed
by DPDK itself.

fbl


> 
> On Fri, Apr 13, 2018, 9:31 AM Thomas Monjalon  wrote:
> 
> > It's time to think (again) how we bind devices with kernel modules.
> > We need to decide how we want to manage hotplugged devices with DPDK.
> >
> > A bit of history first.
> > There was some code in DPDK for bind/unbind, but it has been removed
> > in DPDK 1.7 - http://dpdk.org/commit/5d8751b83
> > Copy of the commit message (in 2014):
> > "
> > The bind/unbind operations should not be handled by the eal.
> > These operations should be either done outside of dpdk or
> > inside the PMDs themselves as these are their problems.
> > "
> >
> > The question raised at this time (4 years ago) is still under discussion.
> > Should we manage binding inside or outside DPDK?
> > Should it be controlled in the application or in the OS base?
> >
> > As you know, we use dpdk-devbind.py.
> > This tool lacks two major features:
> > - persistent configuration
> > - hotplug
> >
> > If we consider that the DPDK applications should be able to apply its own
> > policy to choose the devices to bind, then we need to implement binding
> > in the PMD (with EAL helpers).
> >
> > On the other hand, if we consider that it is the system responsibility,
> > then we could choose systemd/udev and driverctl.
> >
> > The debate is launched!
> >
> > Please find more details in the references below.
> >
> > Announce of driverctl:
> > http://dpdk.org/ml/archives/dev/2015-December/029500.html
> > Repository of driverctl:
> > https://gitlab.com/driverctl/driverctl
> >
> > Discussion about binding script and driverctl:
> > http://dpdk.org/ml/archives/dev/2018-April/095687.html
> >
> > Patch to implement binding in DPDK (for hotplug):
> > http://dpdk.org/ml/archives/dev/2018-April/095714.html
> >
> > Discussion in the same hotplug series:
> > http://dpdk.org/ml/archives/dev/2018-April/097058.html
> >
> >
> >
> >

-- 
Flavio


Re: [dpdk-dev] [RFC] Wireless Base Band Device (bbdev)

2017-10-04 Thread Flavio Leitner
On Tue, 03 Oct 2017 17:17:53 +0200
Thomas Monjalon  wrote:

> 03/10/2017 16:29, Mokhtar, Amr:
> > From: Thomas Monjalon [mailto:tho...@monjalon.net]  
> > > 25/08/2017 15:46, Amr Mokhtar:  
> > > > +int
> > > > +rte_bbdev_configure(uint8_t dev_id, uint16_t num_queues,
> > > > +   const struct rte_bbdev_conf *conf);  
> > > 
> > > I am not convinced by the "configure all" function in ethdev.
> > > We break the ABI each time we add a new feature to configure.
> > > And it does not really help to have all configurations in one struct.
> > > Would you mind to split the struct rte_bbdev_conf and split the function
> > > accordingly?  
> > 
> > There is nothing to split tbh. The only parameter it has is the socket_id.
> > And in fact, it's optional, can be null. The only config we need is 
> > num_queues.  
> 
> Indeed, there is nothing in this struct.
> If you need only to allocate queues, you just have to rename this function.
> 
> > I don't see in the near future that we may need to add more config params.
> > As a side, in the time of the implementation we were trying to avoid any
> > diversions from the current design ideology of ethdev and cryptodev.  
> 
> There is no ideology in ethdev, just some mistakes ;)
> 
> > Can we leave it for consideration with future releases?  
> 
> No it should be addressed from the beginning.
> 
> When you will need to add something more to configure port-wise,
> you should add a new function instead of breaking the ABI
> of the global conf struct.
> That's why the configure option should be more specialized.
> 
> Distro people were complaining about ABI breakage last week.
> This is exactly an example of how to avoid it from the beginning.

Exactly, and fixing it in the future is unlikely to happen or could be
more difficult, and even if it happens, it will cause another breakage.
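
As a hedged illustration of that point (not bbdev code; all names below
are made up): growing a public conf struct changes its size for existing
binaries, while adding a new function does not:

#include <stdint.h>

/* ABI-breaking pattern: every new knob grows a struct that applications
 * allocate and pass by pointer, so old binaries and a newer library
 * disagree on sizeof(struct my_dev_conf). */
struct my_dev_conf {
	int socket_id;
	/* adding "int new_knob;" here would break the ABI */
};

int my_dev_configure(uint8_t dev_id, uint16_t num_queues,
		     const struct my_dev_conf *conf);

/* ABI-friendly pattern: keep the configure call minimal and add new
 * entry points later instead of new struct fields. */
int my_dev_queues_alloc(uint8_t dev_id, uint16_t num_queues);
int my_dev_set_new_knob(uint8_t dev_id, int value);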

-- 
Flavio


[dpdk-dev] [PATCH] eal: check cpu flags at init

2016-09-27 Thread Flavio Leitner
On Mon, Sep 26, 2016 at 11:43:37AM -0400, Aaron Conole wrote:
> My only concern is whether this change would be considered ABI
> breaking.  I wouldn't think so, since it doesn't seem as though an
> application would want to call this explicitly (and is spelled out as
> such), but I can't be sure that it isn't already included in the
> standard application API, and therefore needs to go through the change
> process.

I didn't want to change the original behavior more than needed.

I think another patch would be necessary to change the whole EAL
initialization because there are a bunch of rte_panic() calls there
which aren't friendly to callers either.

-- 
fbl



[dpdk-dev] [PATCH] eal: check cpu flags at init

2016-09-23 Thread Flavio Leitner
An application might be linked to DPDK but not really use it,
so move the cpu flag check to the EAL initialization instead.

Signed-off-by: Flavio Leitner 
---
 lib/librte_eal/bsdapp/eal/eal.c | 3 +++
 lib/librte_eal/common/eal_common_cpuflags.c | 6 --
 lib/librte_eal/linuxapp/eal/eal.c   | 3 +++
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index a0c8f8c..c4b22af 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -496,6 +496,9 @@ rte_eal_init(int argc, char **argv)
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
char thread_name[RTE_MAX_THREAD_NAME_LEN];

+   /* checks if the machine is adequate */
+   rte_cpu_check_supported();
+
if (!rte_atomic32_test_and_set(&run_once))
return -1;

diff --git a/lib/librte_eal/common/eal_common_cpuflags.c 
b/lib/librte_eal/common/eal_common_cpuflags.c
index ecb1240..b5f76f7 100644
--- a/lib/librte_eal/common/eal_common_cpuflags.c
+++ b/lib/librte_eal/common/eal_common_cpuflags.c
@@ -39,14 +39,8 @@
 /**
  * Checks if the machine is adequate for running the binary. If it is not, the
  * program exits with status 1.
- * The function attribute forces this function to be called before main(). But
- * with ICC, the check is generated by the compiler.
  */
-#ifndef __INTEL_COMPILER
-void __attribute__ ((__constructor__))
-#else
 void
-#endif
 rte_cpu_check_supported(void)
 {
/* This is generated at compile-time by the build system */
diff --git a/lib/librte_eal/linuxapp/eal/eal.c 
b/lib/librte_eal/linuxapp/eal/eal.c
index d5b81a3..4e88cfc 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -740,6 +740,9 @@ rte_eal_init(int argc, char **argv)
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
char thread_name[RTE_MAX_THREAD_NAME_LEN];

+   /* checks if the machine is adequate */
+   rte_cpu_check_supported();
+
if (!rte_atomic32_test_and_set(&run_once))
return -1;

-- 
2.7.4




[dpdk-dev] [RFC] vhost-user public struct refactor (was Re: [PATCH RFC 2/4] vhost: make buf vector for scatter RX) local.

2016-04-06 Thread Flavio Leitner
On Tue, Apr 05, 2016 at 01:47:33PM +0800, Yuanhan Liu wrote:
> On Fri, Feb 19, 2016 at 03:06:50PM +0800, Yuanhan Liu wrote:
> > On Fri, Feb 19, 2016 at 09:32:41AM +0300, Ilya Maximets wrote:
> > > Array of buf_vector's is just an array for temporary storing information
> > > about available descriptors. It used only locally in virtio_dev_merge_rx()
> > > and there is no reason for that array to be shared.
> > > 
> > > Fix that by allocating local buf_vec inside virtio_dev_merge_rx().
> > > 
> > > Signed-off-by: Ilya Maximets 
> > > ---
> > >  lib/librte_vhost/rte_virtio_net.h |  1 -
> > >  lib/librte_vhost/vhost_rxtx.c | 45 
> > > ---
> > >  2 files changed, 23 insertions(+), 23 deletions(-)
> > > 
> > > diff --git a/lib/librte_vhost/rte_virtio_net.h 
> > > b/lib/librte_vhost/rte_virtio_net.h
> > > index 10dcb90..ae1e4fb 100644
> > > --- a/lib/librte_vhost/rte_virtio_net.h
> > > +++ b/lib/librte_vhost/rte_virtio_net.h
> > > @@ -91,7 +91,6 @@ struct vhost_virtqueue {
> > >   int kickfd; /**< Currently unused 
> > > as polling mode is enabled. */
> > >   int enabled;
> > >   uint64_treserved[16];   /**< Reserve some 
> > > spaces for future extension. */
> > > - struct buf_vector   buf_vec[BUF_VECTOR_MAX];/**< for 
> > > scatter RX. */
> > >  } __rte_cache_aligned;
> > 
> > I like this kind of cleanup, however, it breaks ABI.
> 
> So, I was considering to add vhost-user Tx delayed-copy (or zero copy)
> support recently, which comes to yet another ABI violation, as we need
> add a new field to virtio_memory_regions struct to do guest phys addr
> to host phys addr translation. You may ask, however, that why do we need
> expose virtio_memory_regions struct to users at all?
> 
> You are right, we don't have to. And here is the thing: we exposed way
> too many fields (or even structures) than necessary. Say, vhost_virtqueue
> struct should NOT be exposed to user at all: application just need to
> tell the right queue id to locate a specific queue, and that's all.
> The structure should be defined in an internal header file. With that,
> we could do any changes to it we want, without worrying about that we
> may offense the painful ABI rules.
> 
> Similar changes could be done to virtio_net struct as well, just exposing
> very few fields that are necessary and moving all others to an internal
> structure.
> 
> Huawei then suggested a more radical yet much cleaner one: just exposing
> a virtio_net handle to application, just like the way kernel exposes an
> fd to user for locating a specific file. However, it's more than an ABI
> change; it's also an API change: some fields are referenced by applications,
> such as flags, virt_qp_nb. We could expose some new functions to access
> them though.
> 
> I'd vote for this one, as it sounds very clean to me. This would also
> solve the block issue of this patch. Though it would break OVS, I'm thinking
> that'd be okay, as OVS has dependence on DPDK version: what we need to
> do is just to send few patches to OVS, and let it points to next release,
> say DPDK v16.07. Flavio, please correct me if I'm wrong.

There is a plan to use the vhost PMD, so from the OVS point of view the
virtio stuff would be hidden because the vhost PMD would look just like
a regular ethernet device, right?

I think we are waiting for 16.04 to be released with that so we can
start pushing changes to OVS as well.

-- 
fbl



[dpdk-dev] [PATCH v7 4/8] vhost: rxtx: use queue id instead of constant ring index

2015-11-17 Thread Flavio Leitner
On Tue, Nov 17, 2015 at 10:23:38AM +0200, Michael S. Tsirkin wrote:
> On Mon, Nov 16, 2015 at 02:20:57PM -0800, Flavio Leitner wrote:
> > On Wed, Oct 28, 2015 at 11:12:25PM +0200, Michael S. Tsirkin wrote:
> > > On Wed, Oct 28, 2015 at 06:30:41PM -0200, Flavio Leitner wrote:
> > > > On Sat, Oct 24, 2015 at 08:47:10PM +0300, Michael S. Tsirkin wrote:
> > > > > On Sat, Oct 24, 2015 at 12:34:08AM -0200, Flavio Leitner wrote:
> > > > > > On Thu, Oct 22, 2015 at 02:32:31PM +0300, Michael S. Tsirkin wrote:
> > > > > > > On Thu, Oct 22, 2015 at 05:49:55PM +0800, Yuanhan Liu wrote:
> > > > > > > > On Wed, Oct 21, 2015 at 05:26:18PM +0300, Michael S. Tsirkin 
> > > > > > > > wrote:
> > > > > > > > > On Wed, Oct 21, 2015 at 08:48:15PM +0800, Yuanhan Liu wrote:
> > > > > > > > > > > Please note that for virtio devices, guest is supposed to
> > > > > > > > > > > control the placement of incoming packets in RX queues.
> > > > > > > > > > 
> > > > > > > > > > I may not follow you.
> > > > > > > > > > 
> > > > > > > > > > Enqueuing packets to a RX queue is done at vhost lib, 
> > > > > > > > > > outside the
> > > > > > > > > > guest, how could the guest take the control here?
> > > > > > > > > > 
> > > > > > > > > > --yliu
> > > > > > > > > 
> > > > > > > > > vhost should do what guest told it to.
> > > > > > > > > 
> > > > > > > > > See virtio spec:
> > > > > > > > >   5.1.6.5.5 Automatic receive steering in multiqueue mode
> > > > > > > > 
> > > > > > > > Spec says:
> > > > > > > > 
> > > > > > > > After the driver transmitted a packet of a flow on 
> > > > > > > > transmitqX,
> > > > > > > > the device SHOULD cause incoming packets for that flow to be
> > > > > > > > steered to receiveqX.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Michael, I still have no idea how vhost could know the flow even
> > > > > > > > after discussion with Huawei. Could you be more specific about
> > > > > > > > this? Say, how could guest know that? And how could guest tell
> > > > > > > > vhost which RX is gonna to use?
> > > > > > > > 
> > > > > > > > Thanks.
> > > > > > > > 
> > > > > > > > --yliu
> > > > > > > 
> > > > > > > I don't really understand the question.
> > > > > > > 
> > > > > > > When guests transmits a packet, it makes a decision
> > > > > > > about the flow to use, and maps that to a tx/rx pair of queues.
> > > > > > > 
> > > > > > > It sends packets out on the tx queue and expects device to
> > > > > > > return packets from the same flow on the rx queue.
> > > > > > 
> > > > > > Why? I can understand that there should be a mapping between
> > > > > > flows and queues in a way that there is no re-ordering, but
> > > > > > I can't see the relation of receiving a flow with a TX queue.
> > > > > > 
> > > > > > fbl
> > > > > 
> > > > > That's the way virtio chose to program the rx steering logic.
> > > > > 
> > > > > It's low overhead (no special commands), and
> > > > > works well for TCP when user is an endpoint since rx and tx
> > > > > for tcp are generally tied (because of ack handling).
> > 
> > It is low overhead for the control plane, but not for the data plane.
> 
> Well, there's zero data plane overhead within the guest.
> You can't go lower :)

I agree, but I am talking about vhost-user or whatever means we use to
provide packets to the virtio backend. That will have to distribute
the packets according to the guest's mapping, which is not zero overhead.


> > > > > We can discuss other ways, e.g. special commands for guests to
> > > > > program s

[dpdk-dev] [PATCH v7 4/8] vhost: rxtx: use queue id instead of constant ring index

2015-11-16 Thread Flavio Leitner
On Wed, Oct 28, 2015 at 11:12:25PM +0200, Michael S. Tsirkin wrote:
> On Wed, Oct 28, 2015 at 06:30:41PM -0200, Flavio Leitner wrote:
> > On Sat, Oct 24, 2015 at 08:47:10PM +0300, Michael S. Tsirkin wrote:
> > > On Sat, Oct 24, 2015 at 12:34:08AM -0200, Flavio Leitner wrote:
> > > > On Thu, Oct 22, 2015 at 02:32:31PM +0300, Michael S. Tsirkin wrote:
> > > > > On Thu, Oct 22, 2015 at 05:49:55PM +0800, Yuanhan Liu wrote:
> > > > > > On Wed, Oct 21, 2015 at 05:26:18PM +0300, Michael S. Tsirkin wrote:
> > > > > > > On Wed, Oct 21, 2015 at 08:48:15PM +0800, Yuanhan Liu wrote:
> > > > > > > > > Please note that for virtio devices, guest is supposed to
> > > > > > > > > control the placement of incoming packets in RX queues.
> > > > > > > > 
> > > > > > > > I may not follow you.
> > > > > > > > 
> > > > > > > > Enqueuing packets to a RX queue is done at vhost lib, outside 
> > > > > > > > the
> > > > > > > > guest, how could the guest take the control here?
> > > > > > > > 
> > > > > > > > --yliu
> > > > > > > 
> > > > > > > vhost should do what guest told it to.
> > > > > > > 
> > > > > > > See virtio spec:
> > > > > > >   5.1.6.5.5 Automatic receive steering in multiqueue mode
> > > > > > 
> > > > > > Spec says:
> > > > > > 
> > > > > > After the driver transmitted a packet of a flow on transmitqX,
> > > > > > the device SHOULD cause incoming packets for that flow to be
> > > > > > steered to receiveqX.
> > > > > > 
> > > > > > 
> > > > > > Michael, I still have no idea how vhost could know the flow even
> > > > > > after discussion with Huawei. Could you be more specific about
> > > > > > this? Say, how could guest know that? And how could guest tell
> > > > > > vhost which RX is gonna to use?
> > > > > > 
> > > > > > Thanks.
> > > > > > 
> > > > > > --yliu
> > > > > 
> > > > > I don't really understand the question.
> > > > > 
> > > > > When guests transmits a packet, it makes a decision
> > > > > about the flow to use, and maps that to a tx/rx pair of queues.
> > > > > 
> > > > > It sends packets out on the tx queue and expects device to
> > > > > return packets from the same flow on the rx queue.
> > > > 
> > > > Why? I can understand that there should be a mapping between
> > > > flows and queues in a way that there is no re-ordering, but
> > > > I can't see the relation of receiving a flow with a TX queue.
> > > > 
> > > > fbl
> > > 
> > > That's the way virtio chose to program the rx steering logic.
> > > 
> > > It's low overhead (no special commands), and
> > > works well for TCP when user is an endpoint since rx and tx
> > > for tcp are generally tied (because of ack handling).

It is low overhead for the control plane, but not for the data plane.

> > > We can discuss other ways, e.g. special commands for guests to
> > > program steering.
> > > We'd have to first see some data showing the current scheme
> > > is problematic somehow.

The issue is that the spec assumes the packets are coming in
a serialized way and the distribution will be made by vhost-user,
but that isn't necessarily true.


> > The issue is that the restriction imposes operations to be done in the
> > data path.  For instance, Open vSwitch has N number of threads to manage
> > X RX queues. We distribute them in round-robin fashion.  So, the thread
> > polling one RX queue will do all the packet processing and push it to the
> > TX queue of the other device (vhost-user or not) using the same 'id'.
> > 
> > Doing so we can avoid locking between threads and TX queues and any other
> > extra computation while still keeping the packet ordering/distribution fine.
> > 
> > However, if vhost-user has to send packets according with guest mapping,
> > it will require locking between queues and additional operations to select
> > the appropriate queue.  Those actions will cause performance issues.
> 
> You only need 

[dpdk-dev] [PATCH v3] vhost: Fix wrong handling of virtqueue array index

2015-10-29 Thread Flavio Leitner
On Wed, Oct 28, 2015 at 12:47:40PM +0900, Tetsuya Mukawa wrote:
> The patch fixes wrong handling of virtqueue array index when
> GET_VRING_BASE message comes.
> 
> Signed-off-by: Tetsuya Mukawa 
> ---

vhost-user segfaults without this patch with MQ enabled.
LGTM

Acked-by: Flavio Leitner 




[dpdk-dev] [PATCH v7 4/8] vhost: rxtx: use queue id instead of constant ring index

2015-10-28 Thread Flavio Leitner
On Sat, Oct 24, 2015 at 08:47:10PM +0300, Michael S. Tsirkin wrote:
> On Sat, Oct 24, 2015 at 12:34:08AM -0200, Flavio Leitner wrote:
> > On Thu, Oct 22, 2015 at 02:32:31PM +0300, Michael S. Tsirkin wrote:
> > > On Thu, Oct 22, 2015 at 05:49:55PM +0800, Yuanhan Liu wrote:
> > > > On Wed, Oct 21, 2015 at 05:26:18PM +0300, Michael S. Tsirkin wrote:
> > > > > On Wed, Oct 21, 2015 at 08:48:15PM +0800, Yuanhan Liu wrote:
> > > > > > > Please note that for virtio devices, guest is supposed to
> > > > > > > control the placement of incoming packets in RX queues.
> > > > > > 
> > > > > > I may not follow you.
> > > > > > 
> > > > > > Enqueuing packets to a RX queue is done at vhost lib, outside the
> > > > > > guest, how could the guest take the control here?
> > > > > > 
> > > > > > --yliu
> > > > > 
> > > > > vhost should do what guest told it to.
> > > > > 
> > > > > See virtio spec:
> > > > >   5.1.6.5.5 Automatic receive steering in multiqueue mode
> > > > 
> > > > Spec says:
> > > > 
> > > > After the driver transmitted a packet of a flow on transmitqX,
> > > > the device SHOULD cause incoming packets for that flow to be
> > > > steered to receiveqX.
> > > > 
> > > > 
> > > > Michael, I still have no idea how vhost could know the flow even
> > > > after discussion with Huawei. Could you be more specific about
> > > > this? Say, how could guest know that? And how could guest tell
> > > > vhost which RX is gonna to use?
> > > > 
> > > > Thanks.
> > > > 
> > > > --yliu
> > > 
> > > I don't really understand the question.
> > > 
> > > When guests transmits a packet, it makes a decision
> > > about the flow to use, and maps that to a tx/rx pair of queues.
> > > 
> > > It sends packets out on the tx queue and expects device to
> > > return packets from the same flow on the rx queue.
> > 
> > Why? I can understand that there should be a mapping between
> > flows and queues in a way that there is no re-ordering, but
> > I can't see the relation of receiving a flow with a TX queue.
> > 
> > fbl
> 
> That's the way virtio chose to program the rx steering logic.
> 
> It's low overhead (no special commands), and
> works well for TCP when user is an endpoint since rx and tx
> for tcp are generally tied (because of ack handling).
> 
> We can discuss other ways, e.g. special commands for guests to
> program steering.
> We'd have to first see some data showing the current scheme
> is problematic somehow.

The issue is that the restriction imposes operations to be done in the
data path.  For instance, Open vSwitch has N threads to manage
X RX queues. We distribute them in round-robin fashion.  So, the thread
polling one RX queue will do all the packet processing and push the
packets to the TX queue of the other device (vhost-user or not) using
the same 'id'.

Doing so we can avoid locking between threads and TX queues and any other
extra computation while still keeping the packet ordering/distribution fine.
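
A minimal sketch of that model (illustrative names, not OVS code): each
thread polls its round-robin-assigned RX queues and transmits on the TX
queue with the same id, so no TX-side lock is needed:

#include <rte_ethdev.h>

#define BURST 32

/* Hypothetical per-thread context filled at startup with the RX queue
 * ids assigned to this thread in round-robin fashion. */
struct pmd_ctx {
	volatile int stop;
	uint16_t rx_port, tx_port;
	uint16_t n_rxq;
	uint16_t rxq_ids[8];
};

static int
pmd_thread_main(void *arg)
{
	struct pmd_ctx *ctx = arg;
	struct rte_mbuf *pkts[BURST];
	uint16_t i, n, qid;

	while (!ctx->stop) {
		for (i = 0; i < ctx->n_rxq; i++) {
			qid = ctx->rxq_ids[i];
			n = rte_eth_rx_burst(ctx->rx_port, qid, pkts, BURST);
			if (n == 0)
				continue;
			/* ... packet processing happens here ... */
			/* TX on the queue with the same id: no lock needed
			 * because no other thread uses this TX queue.
			 * (mbufs not sent are ignored here for brevity) */
			rte_eth_tx_burst(ctx->tx_port, qid, pkts, n);
		}
	}
	return 0;
}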

However, if vhost-user has to send packets according to the guest's mapping,
it will require locking between queues and additional operations to select
the appropriate queue.  Those actions will cause performance issues.

I see no real benefit from enforcing the guest mapping outside the guest
that would justify all the computation cost, so my suggestion is to change
the spec to suggest that behavior, but not to require it for compliance.

Does that make sense?

Thanks,
fbl



[dpdk-dev] [PATCH v7 4/8] vhost: rxtx: use queue id instead of constant ring index

2015-10-24 Thread Flavio Leitner
On Thu, Oct 22, 2015 at 02:32:31PM +0300, Michael S. Tsirkin wrote:
> On Thu, Oct 22, 2015 at 05:49:55PM +0800, Yuanhan Liu wrote:
> > On Wed, Oct 21, 2015 at 05:26:18PM +0300, Michael S. Tsirkin wrote:
> > > On Wed, Oct 21, 2015 at 08:48:15PM +0800, Yuanhan Liu wrote:
> > > > > Please note that for virtio devices, guest is supposed to
> > > > > control the placement of incoming packets in RX queues.
> > > > 
> > > > I may not follow you.
> > > > 
> > > > Enqueuing packets to a RX queue is done at vhost lib, outside the
> > > > guest, how could the guest take the control here?
> > > > 
> > > > --yliu
> > > 
> > > vhost should do what guest told it to.
> > > 
> > > See virtio spec:
> > >   5.1.6.5.5 Automatic receive steering in multiqueue mode
> > 
> > Spec says:
> > 
> > After the driver transmitted a packet of a flow on transmitqX,
> > the device SHOULD cause incoming packets for that flow to be
> > steered to receiveqX.
> > 
> > 
> > Michael, I still have no idea how vhost could know the flow even
> > after discussion with Huawei. Could you be more specific about
> > this? Say, how could guest know that? And how could guest tell
> > vhost which RX is gonna to use?
> > 
> > Thanks.
> > 
> > --yliu
> 
> I don't really understand the question.
> 
> When guests transmits a packet, it makes a decision
> about the flow to use, and maps that to a tx/rx pair of queues.
> 
> It sends packets out on the tx queue and expects device to
> return packets from the same flow on the rx queue.

Why? I can understand that there should be a mapping between
flows and queues in a way that there is no re-ordering, but
I can't see the relation between receiving a flow and a TX queue.

fbl

> During transmit, device needs to figure out the flow
> of packets as they are received from guest, and track
> which flows go on which tx queue.
> When it selects the rx queue, it has to use the same table.
> 
> There is currently no provision for controlling
> steering for uni-directional
> flows which are possible e.g. with UDP.
> 
> We might solve this in a future spec - for example, set a flag notifying
> guest that steering information is missing for a given flow, for example
> by setting a flag in a packet, or using the command queue, and have
> guest send a dummy empty packet to set steering rule for this flow.
> 
> 
> -- 
> MST
> 
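
As an aside, the automatic steering described above amounts to a
per-device table that remembers, per flow, which TX queue the guest
last used and then picks the paired RX queue.  A minimal sketch with
hypothetical names (not the virtio or vhost implementation):

    #include <stdint.h>

    #define STEER_TABLE_SIZE 256

    static uint16_t steer_table[STEER_TABLE_SIZE];

    static uint32_t steer_hash(uint32_t flow_id)
    {
        return flow_id % STEER_TABLE_SIZE;      /* toy hash */
    }

    /* TX path: the guest transmitted a packet of 'flow_id' on txq 'qid'. */
    static void steer_learn(uint32_t flow_id, uint16_t qid)
    {
        steer_table[steer_hash(flow_id)] = qid;
    }

    /* RX path: steer incoming packets of 'flow_id' to the paired rxq. */
    static uint16_t steer_select_rxq(uint32_t flow_id)
    {
        return steer_table[steer_hash(flow_id)];
    }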



[dpdk-dev] [PATCH v6 00/13] vhost-user multiple queues enabling

2015-10-19 Thread Flavio Leitner
On Fri, Oct 09, 2015 at 01:45:59PM +0800, Yuanhan Liu wrote:
> This patch set enables vhost-user multiple queues.
> 
> Overview
> 
> 
> It depends on some QEMU patches that has already been merged to upstream.
> Those qemu patches introduce some new vhost-user messages, for vhost-user
> mq enabling negotiation. Here is the main negotiation steps (Qemu
> as master, and DPDK vhost-user as slave):
> 
> - Master queries features by VHOST_USER_GET_FEATURES from slave
> 
> - Check if VHOST_USER_F_PROTOCOL_FEATURES exist. If not, mq is not
>   supported. (check patch 1 for why VHOST_USER_F_PROTOCOL_FEATURES
>   is introduced)
> 
> - Master then sends another command, VHOST_USER_GET_QUEUE_NUM, for
>   querying how many queues the slave supports.
> 
>   Master will compare the result with the requested queue number.
>   Qemu exits if the former is smaller.
> 
> - Master then tries to initiate all queue pairs by sending some vhost
>   user commands, including VHOST_USER_SET_VRING_CALL, which will
>   trigger the slave to do related vring setup, such as vring allocation.
> 
> 
> Till now, all necessary initiation and negotiation are done. And master
> could send another message, VHOST_USER_SET_VRING_ENABLE, to enable/disable
> a specific queue dynamically later.
> 
> 
> Patchset
> 
> 
> Patches 1-6 are all preparation work for enabling mq; they are all atomic
> changes, made with "do not break anything" borne in mind.
> 
> Patch 7 actually enables the mq feature, by setting two key feature flags.
> 
> Patch 8 handles the VHOST_USER_SET_VRING_ENABLE message, which is for
> enabling/disabling a specific virt queue pair; only one queue pair is
> enabled by default.
> 
> Patches 9-12 demonstrate the mq feature.
> 
> Patch 13 updates the doc release note.
> 
> 
> Testing
> ===
> 
> Host side
> --
> 
> - # Start vhost-switch
> 
>   sudo mount -t hugetlbfs nodev /mnt/huge
>   sudo modprobe uio
>   sudo insmod $RTE_SDK/$RTE_TARGET/kmod/igb_uio.ko
> 
>   sudo $RTE_SDK/tools/dpdk_nic_bind.py --bind igb_uio :08:00.0
> 
>   sudo $RTE_SDK/examples/vhost/build/vhost-switch -c 0xf0 -n 4 \
>--huge-dir /mnt/huge --socket-mem 2048,0 -- -p 1 --vm2vm 0  \
>--dev-basename usvhost --rxq 2
> 
>   # The above command generates a usvhost socket file at PWD. You could also
>   # specify the "--stats 1" option to enable stats dumping.
> 
> 
> 
> - # start qemu
> 
> 
>   sudo sudo mount -t hugetlbfs nodev $HOME/hugetlbfs
>   $QEMU_DIR/x86_64-softmmu/qemu-system-x86_64 -machine accel=kvm -m 4G \
> -object 
> memory-backend-file,id=mem,size=4G,mem-path=$HOME/hugetlbfs,share=on \
>   -numa node,memdev=mem -chardev socket,id=chr0,path=/path/to/usvhost \
>   -netdev vhost-user,id=net0,chardev=chr0,vhostforce,queues=2 \
>   -device 
> virtio-net-pci,netdev=net0,mq=on,vectors=6,mac=52:54:00:12:34:58,csum=off,gso=off,guest_tso4=off,guest_tso6=off,guest_ecn=off
>  \
>   -hda $HOME/iso/fc-22-x86_64.img -smp 10 -cpu 
> core2duo,+sse3,+sse4.1,+sse4.2
> 
> 
> Guest side
> --
> 
>modprobe uio
>insmod $RTE_SDK/$RTE_TARGET/kmod/igb_uio.ko
>echo 1024 > 
> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
>./tools/dpdk_nic_bind.py --bind igb_uio 00:03.0
> 
>$RTE_SDK/$RTE_TARGET/app/testpmd -c 1f -n 4 -- --rxq=2 --txq=2 \
> --nb-cores=4 -i --disable-hw-vlan --txqflags 0xf00
> 
>> set fwd mac
>> start tx_first
>  
> 
> After those setups, you then could use packet generator for packet tx/rx 
> testing.
> 
> 
> 
> Test with OVS
> =
> 
> Marcel also created a simple yet quite clear test guide with OVS at:
> 
>http://wiki.qemu.org/Features/vhost-user-ovs-dpdk
> 
> BTW, Marcel, would you please complete the page on mq testing?
> 
> 
> ---

I reviewed the patch and tested it successfully with Open vSwitch.
The Open vSwitch patch is posted here:
http://openvswitch.org/pipermail/dev/2015-October/061413.html

Acked-by: Flavio Leitner 

Thanks,
fbl
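
For readers following the thread, the negotiation described in the cover
letter boils down to roughly the master-side sequence below.  This is a
sketch only; the VHOST_USER_* message names come from the cover letter,
while the helpers and types are hypothetical:

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical helpers named after the vhost-user messages above. */
    extern uint64_t get_features(void);              /* VHOST_USER_GET_FEATURES */
    extern uint64_t get_queue_num(void);             /* VHOST_USER_GET_QUEUE_NUM */
    extern void set_vring_call(unsigned int vring);  /* VHOST_USER_SET_VRING_CALL */
    extern void set_vring_enable(unsigned int pair, bool on);
                                                     /* VHOST_USER_SET_VRING_ENABLE */

    #define VHOST_USER_F_PROTOCOL_FEATURES 30

    static int negotiate_mq(unsigned int requested_pairs)
    {
        if (!(get_features() & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)))
            return -1;                  /* slave does not support mq */

        if (requested_pairs > get_queue_num())
            return -1;                  /* QEMU exits in this case */

        /* Initiate every vring of every queue pair; this triggers the
         * vring setup (allocation, etc.) on the slave side. */
        for (unsigned int vring = 0; vring < requested_pairs * 2; vring++)
            set_vring_call(vring);

        /* Only the first queue pair is enabled by default; others can
         * be toggled later with VHOST_USER_SET_VRING_ENABLE. */
        set_vring_enable(0, true);
        return 0;
    }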



[dpdk-dev] DPDK 2.2 roadmap

2015-09-10 Thread Flavio Leitner
On Thu, Sep 10, 2015 at 02:43:59PM +0200, Thomas Monjalon wrote:
> 2015-09-09 12:56, O'Driscoll, Tim:
> > Vhost Multi-Queue Support:  The vhost-user library will be updated to 
> > provide multi-queue support in the host similar to the RSS model so that 
> > guest driver may allocate multiple rx/tx queues which then may be used to 
> > load balance between cores. See 
> > http://dpdk.org/ml/archives/dev/2015-August/022758.html for more details.
> 
> Does it require a patch on Qemu?

Yes, it does. Last patchset seems to be v7.
fbl



[dpdk-dev] [ovs-dev] Status of Open vSwitch with DPDK

2015-08-15 Thread Flavio Leitner
On Fri, Aug 14, 2015 at 04:04:40PM +, Gray, Mark D wrote:
> Hi Daniele,
> 
> Thanks for starting this conversation. It is a good list :) I have 
> cross-posted this
> to dpdk.org as I feel that some of the points could be interesting to that 
> community
> as they are related to how DPDK is used.
> 
> How do "users" of OVS with DPDK feel about this list? Does anyone disagree or
> does anyone have any additions? What are your experiences?
> 
> > 
> > There has been some discussion lately about the status of the Open vSwitch
> > port to DPDK.  While part of the code has been tested for quite some time,
> > I think we can agree that there are a few rough spots that prevent it from
> > being easily deployed and used.
> > 
> > I was hoping to get some feedback from the community about those rough
> > spots,
> > i.e. areas where OVS+DPDK can/needs to improve to become more
> > "production
> > ready" and user-friendly.
> > 
> > - PMD threads and queues management: the code has shown several bugs
> > and
> > the
> >   netdev interfaces don't seem up to the job anymore.
> 
> You had a few ideas about how to refactor this before but I was concerned 
> about the effect it would have on throughput. I can't find the thread. 
> 
> Do you have some further ideas about how to achieve this?

My main pain points are that we can't tell which queue goes to which PMD
and that all devices must have the same number of RX queues.  I agree
that there are other issues, but it seems the kind of configuration
knobs I am looking for might not be the end goal, since what has been
said is to look for a more automated way.  That said, I would also
like to hear if you have further ideas about how to achieve that.


> >   There's a lot of margin of improvement: we could factor out the code from
> >   dpif-netdev, add configuration parameters for advanced users, and figure
> > out
> >   a way to add unit tests.
> > 
> 
> I think this is a general issue with both the kernel datapath (and netdevs)
> and the userspace datapath. There isn't much unit testing (or testing) outside
> of the slow path. 

Maybe we could exercise the interfaces using the pcap PMD.


> >   Related to this, the system should be as fast as possible out-of-the-box,
> >   without requiring too much tuning.
> 
> This is a good point. I think the kernel datapath has a similar issue. You can
> get a certain level of performance without compiling with -Ofast or
> pinning threads but you will (even with the kernel datapath) get better
> performance if you pin threads (and possibly compile differently). I guess
> it is more visible with the dpdk datapath as performance is one of the key
> values. It is also more detrimental to the performance if you don't set it
> up correctly.

Not only that, you need to consider upfront how the resources will be
distributed so that you don't run out of hugepages, perhaps isolate
the PMD CPUs from the Linux scheduler, and so on.  So I think a more
realistic goal would be: the system should require minimal or no tuning
to run with acceptable performance.


> Perhaps we could provide scripts to help do this?

Or profiles (if those aren't already covered by your definition of scripts).


> I think this is also interesting to the DPDK community. There is 
> knowledge required when running DPDK enabled apps to
> get good performance: core pinning is one thing that comes to mind.
> 
> > 
> > - Userspace tunneling: while the code has been there for quite some time it
> >   hasn't received the level of testing that the Linux kernel datapath
> > tunneling
> >   has.
> > 
> 
> Again, there is a lack of test infrastructure in general for OVS. vsperf is a 
> good
> start, and it would be great to see more people use and contribute to it!

Yes.


> > - Documentation: other than a step by step tutorial,  it cannot be said
> > that
> >   DPDK is a first class citizen in the OVS documentation.  Manpages could
> > be
> >   improved.
> 
> Easily done. The INSTALL guide is pretty good but the structure could be 
> better. 
> There is also a lack of manpages. Good point.

Yup.


> > - Vhost: the code has not received the level of testing of the kernel
> > vhost.
> >   Another doubt shared by some developers is whether we should keep
> >   vhost-cuse, given its relatively low ease of use and the overlapping with
> >   the far more standard vhost-user.
> 
> vhost-cuse is required for older versions of qemu. I'm aware of some companies
> using it as they are restricted to an older version of qemu. I think it is 
> deprecated
> at the moment? Is there a notice to that effect? We just need a plan for when 
> to
> remove it and make sure that plan is clear?

Apparently, having two solutions to address the same issue causes more
harm than good, so removing vhost-cuse would be helpful.  I agree that
we need a clear plan with a soak period so users can either upgrade to
vhost-user or tell us why they can't.


> > - Interface management and naming: interfaces must be manually removed
> > from
> >  

[dpdk-dev] [PATCH v4 02/12] vhost: support multiple queues in virtio dev

2015-08-14 Thread Flavio Leitner
On Fri, Aug 14, 2015 at 02:29:51AM +, Ouyang, Changchun wrote:
> > -Original Message-
> > From: Flavio Leitner [mailto:fbl at sysclose.org]
> > Sent: Thursday, August 13, 2015 8:52 PM
> > To: Ouyang, Changchun
> > Cc: dev at dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v4 02/12] vhost: support multiple queues in
> > virtio dev
> > 
> > On Wed, Aug 12, 2015 at 04:02:37PM +0800, Ouyang Changchun wrote:
> > >   file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> > > + cur_qp_idx = file.index >> 1;
> > > +
> > > + if (dev->virt_qp_nb < cur_qp_idx + 1) {
> > > + if (alloc_vring_queue_pair(dev, cur_qp_idx) == 0)
> > > + dev->virt_qp_nb = cur_qp_idx + 1;
> > 
> > Looks like it is missing vring initialization here.
> > 
> > if (dev->virt_qp_nb < cur_qp_idx + 1) {
> > if (alloc_vring_queue_pair(dev, cur_qp_idx) == 0) {
> > dev->virt_qp_nb = cur_qp_idx + 1;
> > init_vring_queue_pair(dev, cur_qp_idx);
> 
> I have called the init_vring_queue_pair inside function 
> alloc_vring_queue_pair,
> It has same effect as your suggestion.

Yup, I missed that.
Thanks!
fbl





[dpdk-dev] [PATCH v4 02/12] vhost: support multiple queues in virtio dev

2015-08-13 Thread Flavio Leitner
On Wed, Aug 12, 2015 at 04:02:37PM +0800, Ouyang Changchun wrote:
> Each virtio device could have multiple queues, say 2 or 4, at most 8.
> Enabling this feature allows virtio device/port on guest has the ability to
> use different vCPU to receive/transmit packets from/to each queue.
> 
> In multiple queues mode, virtio device readiness means all queues of
> this virtio device are ready, cleanup/destroy a virtio device also
> requires clearing all queues belong to it.
> 
> Signed-off-by: Changchun Ouyang 
> ---
> Changes in v4:
>   - rebase and fix conflicts
>   - resolve comments
>   - init each virtq pair if mq is on
> 
> Changes in v3:
>   - fix coding style
>   - check virtqueue idx validity
> 
> Changes in v2:
>   - remove the q_num_set api
>   - add the qp_num_get api
>   - determine the queue pair num from qemu message
>   - rework for reset owner message handler
>   - dynamically alloc mem for dev virtqueue
>   - queue pair num could be 0x8000
>   - fix checkpatch errors
> 
>  lib/librte_vhost/rte_virtio_net.h |  10 +-
>  lib/librte_vhost/vhost-net.h  |   1 +
>  lib/librte_vhost/vhost_rxtx.c |  52 +---
>  lib/librte_vhost/vhost_user/vhost-net-user.c  |   4 +-
>  lib/librte_vhost/vhost_user/virtio-net-user.c |  76 +---
>  lib/librte_vhost/vhost_user/virtio-net-user.h |   2 +
>  lib/librte_vhost/virtio-net.c | 165 
> +-
>  7 files changed, 222 insertions(+), 88 deletions(-)
> 
> diff --git a/lib/librte_vhost/rte_virtio_net.h 
> b/lib/librte_vhost/rte_virtio_net.h
> index b9bf320..d9e887f 100644
> --- a/lib/librte_vhost/rte_virtio_net.h
> +++ b/lib/librte_vhost/rte_virtio_net.h
> @@ -59,7 +59,6 @@ struct rte_mbuf;
>  /* Backend value set by guest. */
>  #define VIRTIO_DEV_STOPPED -1
>  
> -
>  /* Enum for virtqueue management. */
>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
>  
> @@ -96,13 +95,14 @@ struct vhost_virtqueue {
>   * Device structure contains all configuration information relating to the 
> device.
>   */
>  struct virtio_net {
> - struct vhost_virtqueue  *virtqueue[VIRTIO_QNUM];/**< Contains 
> all virtqueue information. */
>   struct virtio_memory*mem;   /**< QEMU memory and memory 
> region information. */
> + struct vhost_virtqueue  **virtqueue;/**< Contains all virtqueue 
> information. */
>   uint64_tfeatures;   /**< Negotiated feature set. */
>   uint64_tdevice_fh;  /**< device identifier. */
>   uint32_tflags;  /**< Device flags. Only used to 
> check if device is running on data core. */
>  #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
>   charifname[IF_NAME_SZ]; /**< Name of the tap 
> device or socket path. */
> + uint32_tvirt_qp_nb;
>   void*priv;  /**< private context */
>  } __rte_cache_aligned;
>  
> @@ -235,4 +235,10 @@ uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, 
> uint16_t queue_id,
>  uint16_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>   struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
>  
> +/**
> + * This function get the queue pair number of one vhost device.
> + * @return
> + *  num of queue pair of specified virtio device.
> + */
> +uint16_t rte_vhost_qp_num_get(struct virtio_net *dev);
>  #endif /* _VIRTIO_NET_H_ */
> diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
> index c69b60b..7dff14d 100644
> --- a/lib/librte_vhost/vhost-net.h
> +++ b/lib/librte_vhost/vhost-net.h
> @@ -115,4 +115,5 @@ struct vhost_net_device_ops {
>  
>  
>  struct vhost_net_device_ops const *get_virtio_net_callbacks(void);
> +int alloc_vring_queue_pair(struct virtio_net *dev, uint16_t qp_idx);
>  #endif /* _VHOST_NET_CDEV_H_ */
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 0d07338..db4ad88 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -43,6 +43,18 @@
>  #define MAX_PKT_BURST 32
>  
>  /**
> + * Check the virtqueue idx validility,
> + * return 1 if pass, otherwise 0.
> + */
> +static inline uint8_t __attribute__((always_inline))
> +check_virtqueue_idx(uint16_t virtq_idx, uint8_t is_tx, uint32_t virtq_num)
> +{
> + if ((is_tx ^ (virtq_idx & 0x1)) || (virtq_idx >= virtq_num))
> + return 0;
> + return 1;
> +}
> +
> +/**
>   * This function adds buffers to the virtio devices RX virtqueue. Buffers can
>   * be received from the physical port or from another virtio device. A packet
>   * count is returned to indicate the number of packets that are succesfully
> @@ -68,12 +80,15 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>   uint8_t success = 0;
>  
>   LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
> - if (unlikely(queue_id != VIRTIO_RXQ)) {
> -  

[dpdk-dev] DPDK2.1 (rc3 & rc4) major performance drop.

2015-08-12 Thread Flavio Leitner
On Tue, Aug 11, 2015 at 01:10:10PM +, Mcnamara, John wrote:
> > -Original Message-
> > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Weglicki, MichalX
> > Sent: Tuesday, August 11, 2015 11:40 AM
> > To: dev at dpdk.org
> > Subject: [dpdk-dev] DPDK2.1 (rc3 & rc4) major performance drop.
> > 
> > Hello,
> > 
> > Currently I'm integrating OVS head with DPDK 2.1. Based on my tests
> > performance in all scenarios (confirmed on Phy2Phy and Vhostuser) has
> > dropped about 10%. Please find example results below:
> 
> Also:
> 
> > Michal:
> > It seems I can fix it on OVS side by passing old hardcoded 
> > size(2048 + RTE_PKTMBUF_HEADROOM) as argument instead of NULL.
> 
> Hi,
> 
> In commit 1d493a49490fa the bahaviour of rte_pktmbuf_pool_init() changed:
> 
> commit 1d493a49490fa90e09689d49280cff0d51d0193e
> Author: Olivier Matz 
> Date:   Wed Apr 22 11:57:18 2015 +0200
> 
> mbuf: fix data room size calculation in pool init
> 
> Previously passing opaque_arg == NULL initialized mbuf_data_room_size = 2048 
> + RTE_PKTMBUF_HEADROOM.
> 
> Now it is set as follows:
> 
> +   /* if no structure is provided, assume no mbuf private area */
> +   user_mbp_priv = opaque_arg;
> +   if (user_mbp_priv == NULL) {
> +   default_mbp_priv.mbuf_priv_size = 0;
> +   if (mp->elt_size > sizeof(struct rte_mbuf))
> +   roomsz = mp->elt_size - sizeof(struct rte_mbuf);
> +   else
> +   roomsz = 0;
> +   default_mbp_priv.mbuf_data_room_size = roomsz;
> +   user_mbp_priv = &default_mbp_priv;
> +   }
> 
> A workaround, for OVS, would be to pass the new opaque_arg struct with the 
> required default set. However, perhaps this should be fixed in DPDK.
> 
> The updated doc in the same patch says:
> 
> +DPDK 2.0 to DPDK 2.1
> +
> +
> +*   The second argument of rte_pktmbuf_pool_init(mempool, opaque) is now a
> +pointer to a struct rte_pktmbuf_pool_private instead of a uint16_t
> +casted into a pointer. Backward compatibility is preserved when the
> +argument was NULL which is the majority of use cases, but not if the
> +opaque pointer was not NULL, as it is not technically feasible. In
> +this case, the application has to be modified to properly fill a
> +rte_pktmbuf_pool_private structure and pass it to
> +rte_pktmbuf_pool_init().
> +
> 
> I think the OVS issue shows that backward compatibility isn't preserved (in 
> the strictest sense).

The text is referring to the fact that passing NULL is still valid and
won't segfault, since the NULL case is handled instead of being treated
as a valid rte_pktmbuf_pool_private pointer.
> Should this be fixed? Opinions?

If we fix OVS, then older OVS versions will still compile but will see a
performance issue with the new DPDK code.  On the other hand, a fixed OVS
won't work with previous DPDK code.  Both are bad user experiences.  The
alternative would be to make DPDK truly backwards compatible and allocate
the old default value for the NULL case, as before.

fbl
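
For completeness, the OVS-side workaround mentioned above (passing an
explicit rte_pktmbuf_pool_private instead of NULL) would look roughly
like the sketch below.  It is written against the DPDK 2.1 API and is
not the actual OVS patch; the pool name, counts and cache size are
arbitrary:

    #include <rte_mbuf.h>
    #include <rte_mempool.h>

    static struct rte_mempool *
    mbuf_pool_create(const char *name, unsigned int n, int socket_id)
    {
        /* Restore the old default data room size explicitly. */
        static struct rte_pktmbuf_pool_private mbp_priv = {
            .mbuf_data_room_size = 2048 + RTE_PKTMBUF_HEADROOM,
            .mbuf_priv_size = 0,
        };
        unsigned int elt_size = sizeof(struct rte_mbuf)
                                + mbp_priv.mbuf_data_room_size;

        return rte_mempool_create(name, n, elt_size, 32,
                                  sizeof(struct rte_pktmbuf_pool_private),
                                  rte_pktmbuf_pool_init, &mbp_priv, /* not NULL */
                                  rte_pktmbuf_init, NULL,
                                  socket_id, 0);
    }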



[dpdk-dev] [PATCH v3 2/9] lib_vhost: Support multiple queues in virtio dev

2015-06-18 Thread Flavio Leitner
On Mon, Jun 15, 2015 at 03:56:39PM +0800, Ouyang Changchun wrote:
> Each virtio device could have multiple queues, say 2 or 4, at most 8.
> Enabling this feature allows virtio device/port on guest has the ability to
> use different vCPU to receive/transmit packets from/to each queue.
> 
> In multiple queues mode, virtio device readiness means all queues of
> this virtio device are ready, cleanup/destroy a virtio device also
> requires clearing all queues belong to it.
> 
> Changes in v3:
>   - fix coding style
>   - check virtqueue idx validity
> 
> Changes in v2:
>   - remove the q_num_set api
>   - add the qp_num_get api
>   - determine the queue pair num from qemu message
>   - rework for reset owner message handler
>   - dynamically alloc mem for dev virtqueue
>   - queue pair num could be 0x8000
>   - fix checkpatch errors
> 
> Signed-off-by: Changchun Ouyang 
> ---
>  lib/librte_vhost/rte_virtio_net.h |  10 +-
>  lib/librte_vhost/vhost-net.h  |   1 +
>  lib/librte_vhost/vhost_rxtx.c |  49 +---
>  lib/librte_vhost/vhost_user/vhost-net-user.c  |   4 +-
>  lib/librte_vhost/vhost_user/virtio-net-user.c |  76 +---
>  lib/librte_vhost/vhost_user/virtio-net-user.h |   2 +
>  lib/librte_vhost/virtio-net.c | 161 
> +-
>  7 files changed, 216 insertions(+), 87 deletions(-)
> 
> diff --git a/lib/librte_vhost/rte_virtio_net.h 
> b/lib/librte_vhost/rte_virtio_net.h
> index 5d38185..873be3e 100644
> --- a/lib/librte_vhost/rte_virtio_net.h
> +++ b/lib/librte_vhost/rte_virtio_net.h
> @@ -59,7 +59,6 @@ struct rte_mbuf;
>  /* Backend value set by guest. */
>  #define VIRTIO_DEV_STOPPED -1
>  
> -
>  /* Enum for virtqueue management. */
>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
>  
> @@ -96,13 +95,14 @@ struct vhost_virtqueue {
>   * Device structure contains all configuration information relating to the 
> device.
>   */
>  struct virtio_net {
> - struct vhost_virtqueue  *virtqueue[VIRTIO_QNUM];/**< Contains 
> all virtqueue information. */
>   struct virtio_memory*mem;   /**< QEMU memory and memory 
> region information. */
> + struct vhost_virtqueue  **virtqueue;/**< Contains all virtqueue 
> information. */
>   uint64_tfeatures;   /**< Negotiated feature set. */
>   uint64_tdevice_fh;  /**< device identifier. */
>   uint32_tflags;  /**< Device flags. Only used to 
> check if device is running on data core. */
>  #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
>   charifname[IF_NAME_SZ]; /**< Name of the tap 
> device or socket path. */
> + uint32_tnum_virt_queues;
>   void*priv;  /**< private context */
>  } __rte_cache_aligned;


As already pointed out, this breaks the ABI.
Do you have a plan for that, or are you pushing this for DPDK 2.2?


> @@ -220,4 +220,10 @@ uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, 
> uint16_t queue_id,
>  uint16_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>   struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
>  
> +/**
> + * This function get the queue pair number of one vhost device.
> + * @return
> + *  num of queue pair of specified virtio device.
> + */
> +uint16_t rte_vhost_qp_num_get(struct virtio_net *dev);

This needs to go to rte_vhost_version.map too.

fbl



[dpdk-dev] [PATCH v3 2/9] lib_vhost: Support multiple queues in virtio dev

2015-06-18 Thread Flavio Leitner
On Mon, Jun 15, 2015 at 03:56:39PM +0800, Ouyang Changchun wrote:
> Each virtio device could have multiple queues, say 2 or 4, at most 8.
> Enabling this feature allows virtio device/port on guest has the ability to
> use different vCPU to receive/transmit packets from/to each queue.
> 
> In multiple queues mode, virtio device readiness means all queues of
> this virtio device are ready, cleanup/destroy a virtio device also
> requires clearing all queues belong to it.
> 
> Changes in v3:
>   - fix coding style
>   - check virtqueue idx validity
> 
> Changes in v2:
>   - remove the q_num_set api
>   - add the qp_num_get api
>   - determine the queue pair num from qemu message
>   - rework for reset owner message handler
>   - dynamically alloc mem for dev virtqueue
>   - queue pair num could be 0x8000
>   - fix checkpatch errors
> 
> Signed-off-by: Changchun Ouyang 
[...]

> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> index fced2ab..aaea7d5 100644
> --- a/lib/librte_vhost/virtio-net.c
> +++ b/lib/librte_vhost/virtio-net.c
> @@ -67,10 +67,10 @@ static struct virtio_net_config_ll *ll_root;
>  #define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
>   (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
>   (1ULL << VIRTIO_NET_F_CTRL_RX) | \
> - (1ULL << VHOST_F_LOG_ALL))
> + (1ULL << VHOST_F_LOG_ALL)) | \
> + (1ULL << VIRTIO_NET_F_MQ))

There is one extra parenthesis after VHOST_F_LOG_ALL.
BTW, this series needs a rebase on the latest DPDK.
fbl

>  static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
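
For clarity, with the misplaced parenthesis fixed, the macro would read
as follows (assuming the intent is simply to add VIRTIO_NET_F_MQ to the
supported feature set):

    #define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
                                      (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
                                      (1ULL << VIRTIO_NET_F_CTRL_RX) | \
                                      (1ULL << VHOST_F_LOG_ALL) | \
                                      (1ULL << VIRTIO_NET_F_MQ))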



[dpdk-dev] [PATCH] vhost: make vhost lockless enqueue configurable

2015-04-29 Thread Flavio Leitner
On Wed, 29 Apr 2015 13:56:58 +0200
Thomas Monjalon  wrote:

> 2015-04-29 13:38 GMT+02:00 Panu Matilainen :
> > On 04/29/2015 02:29 PM, Huawei Xie wrote:
> >>
> >> vhost enabled vSwitch could have their own thread-safe vring
> >> enqueue policy.
> >> Add the RTE_LIBRTE_VHOST_LOCKLESS_ENQ macro for vhost lockless
> >> enqueue. Turn it off by default.
> >>
> >> Signed-off-by: Huawei Xie 
> >> ---
> >>   config/common_linuxapp|  1 +
> >>   lib/librte_vhost/vhost_rxtx.c | 24 +++-
> >>   2 files changed, 24 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/config/common_linuxapp b/config/common_linuxapp
> >> index 0078dc9..7f59499 100644
> >> --- a/config/common_linuxapp
> >> +++ b/config/common_linuxapp
> >> @@ -421,6 +421,7 @@ CONFIG_RTE_KNI_VHOST_DEBUG_TX=n
> >>   #
> >>   CONFIG_RTE_LIBRTE_VHOST=n
> >>   CONFIG_RTE_LIBRTE_VHOST_USER=y
> >> +CONFIG_RTE_LIBRTE_VHOST_LOCKLESS_ENQ=n
> >>   CONFIG_RTE_LIBRTE_VHOST_DEBUG=n
> [...]
> >
> > These things should be runtime configurable, not build options.
> > Please do not assume everybody builds DPDK separately for each and
> > every application that might ever be.
> 
> +1
> Adding new build options must be exceptions and very well justified.

+1, that's especially hard when DPDK is packaged and distributed by
distros like Fedora/RHEL.

fbl




[dpdk-dev] [PATCH] vhost: Add -lfuse into the LDFLAGS list

2015-01-20 Thread Flavio Leitner
On Tuesday, January 20, 2015 03:49:58 PM Neil Horman wrote:
> the vhost library relies on libfuse, and that's included when we do a normal
> shared object build, but when we specify combined libs, it gets left out.
> Add it back in.
> 
> Signed-off-by: Neil Horman 

Fixes the build issue here.
Reported-by: Flavio Leitner 
Tested-by: Flavio Leitner 

Thanks Neil!
fbl


[dpdk-dev] [PATCH] net: get rid of SET_ETHTOOL_OPS

2014-07-02 Thread Flavio Leitner
On Wed, Jul 02, 2014 at 11:48:51AM +0200, Thomas Monjalon wrote:
> Hi Flavio,
> 
> 2014-07-01 15:19, Flavio Leitner:
> > The SET_ETHTOOL_OPS has been removed from upstream, so it
> > breaks the dpdk build with recent kernels.
> > 
> > Signed-off-by: Flavio Leitner 
> 
> You are removing SET_ETHTOOL_OPS calls.

Yes.

> In a previous patch, Aaro Koskinen made the choice to redefine the macro
> in kcompat files.

I missed that one.

> I don't know what will be the choice of Intel in the sourceforge base driver.
> So I've applied Aaro's patch as there are less modifications of the base 
> driver:
>   http://dpdk.org/browse/dpdk/commit/?id=e0b7ca0c0383411

Since the macro is a simple pointer assignment, I didn't see any reason for
keeping it around. But maybe I am missing something.
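
For reference, the kcompat-style approach that was applied amounts to
something like the following (a sketch; the exact guard in the tree may
differ):

    /* On kernels where SET_ETHTOOL_OPS was removed, fall back to the
     * plain pointer assignment the macro always expanded to. */
    #ifndef SET_ETHTOOL_OPS
    #define SET_ETHTOOL_OPS(netdev, ops) ((netdev)->ethtool_ops = (ops))
    #endif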

fbl


[dpdk-dev] [PATCH] net: get rid of SET_ETHTOOL_OPS

2014-07-01 Thread Flavio Leitner
The SET_ETHTOOL_OPS has been removed from upstream, so it
breaks the dpdk build with recent kernels.

Signed-off-by: Flavio Leitner 
---
 lib/librte_eal/linuxapp/kni/ethtool/igb/igb_ethtool.c | 4 ++--
 lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe.h | 1 -
 lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_ethtool.c | 2 +-
 lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_main.c| 2 +-
 lib/librte_eal/linuxapp/kni/kni_ethtool.c | 2 +-
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_ethtool.c 
b/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_ethtool.c
index f3c48b2..9a1a177 100644
--- a/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_ethtool.c
+++ b/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_ethtool.c
@@ -2844,14 +2844,14 @@ static const struct ethtool_ops_ext igb_ethtool_ops_ext 
= {

 void igb_set_ethtool_ops(struct net_device *netdev)
 {
-   SET_ETHTOOL_OPS(netdev, &igb_ethtool_ops);
+   netdev->ethtool_ops = &igb_ethtool_ops;
set_ethtool_ops_ext(netdev, &igb_ethtool_ops_ext);
 }
 #else
 void igb_set_ethtool_ops(struct net_device *netdev)
 {
/* have to "undeclare" const on this struct to remove warnings */
-   SET_ETHTOOL_OPS(netdev, (struct ethtool_ops *)&igb_ethtool_ops);
+   netdev->ethtool_ops = (struct ethtool_ops *)&igb_ethtool_ops;
 }
 #endif /* HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT */
 #endif /* SIOCETHTOOL */
diff --git a/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe.h 
b/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe.h
index 222c2c7..23e2899 100644
--- a/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe.h
+++ b/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe.h
@@ -818,7 +818,6 @@ extern u8 ixgbe_dcb_txq_to_tc(struct ixgbe_adapter 
*adapter, u8 index);
 /* needed by ixgbe_main.c */
 extern int ixgbe_validate_mac_addr(u8 *mc_addr);
 extern void ixgbe_check_options(struct ixgbe_adapter *adapter);
-extern void ixgbe_assign_netdev_ops(struct net_device *netdev);

 /* needed by ixgbe_ethtool.c */
 extern char ixgbe_driver_name[];
diff --git a/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_ethtool.c 
b/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_ethtool.c
index 11472bd..e0802c1 100644
--- a/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_ethtool.c
+++ b/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_ethtool.c
@@ -2896,6 +2896,6 @@ struct ethtool_ops ixgbe_ethtool_ops = {

 void ixgbe_set_ethtool_ops(struct net_device *netdev)
 {
-   SET_ETHTOOL_OPS(netdev, &ixgbe_ethtool_ops);
+   netdev->ethtool_ops = &ixgbe_ethtool_ops;
 }
 #endif /* SIOCETHTOOL */
diff --git a/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_main.c 
b/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_main.c
index cb56906..ba2aedb 100644
--- a/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_main.c
+++ b/lib/librte_eal/linuxapp/kni/ethtool/ixgbe/ixgbe_main.c
@@ -2454,7 +2454,7 @@ int ixgbe_kni_probe(struct pci_dev *pdev,
err = -EIO;
goto err_ioremap;
}
-   //ixgbe_assign_netdev_ops(netdev);
+
ixgbe_set_ethtool_ops(netdev);

strlcpy(netdev->name, pci_name(pdev), sizeof(netdev->name));
diff --git a/lib/librte_eal/linuxapp/kni/kni_ethtool.c 
b/lib/librte_eal/linuxapp/kni/kni_ethtool.c
index d0673e5..06b6d46 100644
--- a/lib/librte_eal/linuxapp/kni/kni_ethtool.c
+++ b/lib/librte_eal/linuxapp/kni/kni_ethtool.c
@@ -213,5 +213,5 @@ struct ethtool_ops kni_ethtool_ops = {
 void
 kni_set_ethtool_ops(struct net_device *netdev)
 {
-   SET_ETHTOOL_OPS(netdev, &kni_ethtool_ops);
+   netdev->ethtool_ops = &kni_ethtool_ops;
 }
-- 
2.0.0