date:20150120



> -Original Message-
> From: Wodkowski, PawelX
> Sent: Monday, January 19, 2015 5:14 PM
> To: Ouyang, Changchun; dev at dpdk.org
> Cc: Thomas Monjalon; Vlad Zolotarov
> Subject: RE: [dpdk-dev] [PATCH v6 3/6] ixgbe: Get VF queue number
> 
> 
> 
> > -Original Message-
> > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Ouyang
> Changchun
> > Sent: Monday, January 12, 2015 6:59 AM
> > To: dev at dpdk.org
> > Subject: [dpdk-dev] [PATCH v6 3/6] ixgbe: Get VF queue number
> >
> > Get the available Rx and Tx queue number when receiving
> > IXGBE_VF_GET_QUEUES message from VF.
> >
> > Signed-off-by: Changchun Ouyang 
> >
> > changes in v5
> >   - Add some 'FIX ME' comments for IXGBE_VF_TRANS_VLAN.
> >
> > ---
> >  lib/librte_pmd_ixgbe/ixgbe_pf.c | 40
> > +++-
> >  1 file changed, 39 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/librte_pmd_ixgbe/ixgbe_pf.c
> > b/lib/librte_pmd_ixgbe/ixgbe_pf.c index 495aff5..dbda9b5 100644
> > --- a/lib/librte_pmd_ixgbe/ixgbe_pf.c
> > +++ b/lib/librte_pmd_ixgbe/ixgbe_pf.c
> > @@ -53,6 +53,8 @@
> >  #include "ixgbe_ethdev.h"
> >
> >  #define IXGBE_MAX_VFTA (128)
> > +#define IXGBE_VF_MSG_SIZE_DEFAULT 1
> > +#define IXGBE_VF_GET_QUEUE_MSG_SIZE 5
> >
> >  static inline uint16_t
> >  dev_num_vf(struct rte_eth_dev *eth_dev) @@ -491,9 +493,41 @@
> > ixgbe_negotiate_vf_api(struct rte_eth_dev *dev, uint32_t vf, uint32_t
> > *msgbuf)  }
> >
> >  static int
> > +ixgbe_get_vf_queues(struct rte_eth_dev *dev, uint32_t vf, uint32_t
> > +*msgbuf) {
> > +   struct ixgbe_vf_info *vfinfo =
> > +   *IXGBE_DEV_PRIVATE_TO_P_VFDATA(dev->data-
> >dev_private);
> > +   uint32_t default_q = vf * RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool;
> > +
> > +   /* Verify if the PF supports the mbox APIs version or not */
> > +   switch (vfinfo[vf].api_version) {
> > +   case ixgbe_mbox_api_20:
> > +   case ixgbe_mbox_api_11:
> > +   break;
> > +   default:
> > +   return -1;
> > +   }
> > +
> > +   /* Notify VF of Rx and Tx queue number */
> > +   msgbuf[IXGBE_VF_RX_QUEUES] =
> RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool;
> > +   msgbuf[IXGBE_VF_TX_QUEUES] =
> RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool;
> 
> Are you sure this is a good approach to pass nb_q_per_pool to VF as the
> number of available queues? What if PF does not use RSS nor DCB? Are those
> queues always available in that case?
> 

In that case (neither RSS nor DCB), nb_q_per_pool is 1, so it also works; I have
validated it.
Thanks
Changchun

[dpdk-dev] [PATCH v6 4/6] ether: Check VMDq RSS mode



> -Original Message-
> From: Wodkowski, PawelX
> Sent: Monday, January 19, 2015 6:31 PM
> To: Ouyang, Changchun; dev at dpdk.org
> Cc: Thomas Monjalon; Vlad Zolotarov
> Subject: RE: [dpdk-dev] [PATCH v6 4/6] ether: Check VMDq RSS mode
> 
> > -Original Message-
> > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Ouyang
> Changchun
> > Sent: Monday, January 12, 2015 6:59 AM
> > To: dev at dpdk.org
> > Subject: [dpdk-dev] [PATCH v6 4/6] ether: Check VMDq RSS mode
> >
> > Check mq mode for VMDq RSS, handle it correctly instead of returning
> > an error; Also remove the limitation of per pool queue number has max
> > value of 1, because the per pool queue number could be 2 or 4 if it is
> > VMDq RSS mode;
> >
> > The number of rxq specified in config will determine the mq mode for
> > VMDq RSS.
> >
> > Signed-off-by: Changchun Ouyang 
> >
> > changes in v6:
> >   - More clear error message when queue number is invalid.
> >
> > changes in v5:
> >   - Fix '<' issue, it should be '<=' to test rxq number;
> >   - Extract a function to remove the embedded switch-case statement.
> >
> > ---
> >  lib/librte_ether/rte_ethdev.c | 51
> > ++-
> >  1 file changed, 46 insertions(+), 5 deletions(-)
> >
> > diff --git a/lib/librte_ether/rte_ethdev.c
> > b/lib/librte_ether/rte_ethdev.c index 95f2ceb..e9e3368 100644
> > --- a/lib/librte_ether/rte_ethdev.c
> > +++ b/lib/librte_ether/rte_ethdev.c
> > @@ -503,6 +503,31 @@ rte_eth_dev_tx_queue_config(struct
> rte_eth_dev
> > *dev, uint16_t nb_queues)  }
> >
> >  static int
> > +rte_eth_dev_check_vf_rss_rxq_num(uint8_t port_id, uint16_t nb_rx_q)
> {
> > +   struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> > +   switch (nb_rx_q) {
> > +   case 1:
> > +   case 2:
> > +   RTE_ETH_DEV_SRIOV(dev).active =
> > +   ETH_64_POOLS;
> > +   break;
> > +   case 4:
> > +   RTE_ETH_DEV_SRIOV(dev).active =
> > +   ETH_32_POOLS;
> > +   break;
> > +   default:
> > +   return -EINVAL;
> > +   }
> > +
> > +   RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool = nb_rx_q;
> > +   RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx =
> > +   dev->pci_dev->max_vfs * nb_rx_q;
> > +
> > +   return 0;
> > +}
> > +
> > +static int
> >  rte_eth_dev_check_mq_mode(uint8_t port_id, uint16_t nb_rx_q,
> uint16_t
> > nb_tx_q,
> >   const struct rte_eth_conf *dev_conf)  { @@ -510,8 +535,7
> @@
> > rte_eth_dev_check_mq_mode(uint8_t port_id, uint16_t nb_rx_q,
> uint16_t
> > nb_tx_q,
> >
> > if (RTE_ETH_DEV_SRIOV(dev).active != 0) {
> > /* check multi-queue mode */
> > -   if ((dev_conf->rxmode.mq_mode == ETH_MQ_RX_RSS) ||
> > -   (dev_conf->rxmode.mq_mode == ETH_MQ_RX_DCB) ||
> > +   if ((dev_conf->rxmode.mq_mode == ETH_MQ_RX_DCB) ||
> > (dev_conf->rxmode.mq_mode == ETH_MQ_RX_DCB_RSS)
> ||
> > (dev_conf->txmode.mq_mode == ETH_MQ_TX_DCB)) {
> > /* SRIOV only works in VMDq enable mode */ @@ -
> 525,7 +549,6 @@
> > rte_eth_dev_check_mq_mode(uint8_t port_id, uint16_t nb_rx_q,
> uint16_t
> > nb_tx_q,
> > }
> >
> > switch (dev_conf->rxmode.mq_mode) {
> > -   case ETH_MQ_RX_VMDQ_RSS:
> > case ETH_MQ_RX_VMDQ_DCB:
> > case ETH_MQ_RX_VMDQ_DCB_RSS:
> > /* DCB/RSS VMDQ in SRIOV mode, not implement
> yet */ @@ -534,6
> > +557,26 @@ rte_eth_dev_check_mq_mode(uint8_t port_id, uint16_t
> > nb_rx_q, uint16_t nb_tx_q,
> > "unsupported VMDQ mq_mode rx
> > %u\n",
> > port_id, dev_conf-
> > >rxmode.mq_mode);
> > return (-EINVAL);
> > +   case ETH_MQ_RX_RSS:
> > +   PMD_DEBUG_TRACE("ethdev port_id=%" PRIu8
> > +   " SRIOV active, "
> > +   "Rx mq mode is changed from:"
> > +   "mq_mode %u into VMDQ mq_mode
> > %u\n",
> > +   port_id,
> > +   dev_conf->rxmode.mq_mode,
> > +   dev->data-
> > >dev_conf.rxmode.mq_mode);
> > +   case ETH_MQ_RX_VMDQ_RSS:
> > +   dev->data->dev_conf.rxmode.mq_mode =
> > ETH_MQ_RX_VMDQ_RSS;
> > +   if (nb_rx_q <=
> > RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool)
> > +   if
> (rte_eth_dev_check_vf_rss_rxq_num(port_id,
> > nb_rx_q) != 0) {
> > +   PMD_DEBUG_TRACE("ethdev
> > port_id=%d"
> > +   " SRIOV active, invalid queue"
> > +   " number for VMDQ RSS,
> > allowed"
> > +   " value are 1, 2 or 4\n",
> > +   port_id);
> > +   return

[dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum forwarding engine



> -Original Message-
> From: Olivier MATZ [mailto:olivier.matz at 6wind.com]
> Sent: Monday, January 19, 2015 2:39 PM
> To: Ananyev, Konstantin; Liu, Jijiang
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum 
> forwarding engine
> 
> Hi Konstantin,
> 
> On 01/19/2015 02:04 PM, Ananyev, Konstantin wrote:
> >> case 2) calculate checksum of out_ip and out_udp
> >>
> >>mb->l2_len = len(out_eth)
> >>mb->l3_len = len(out_ip)
> >>mb->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CSUM | PKT_TX_UDP_CKSUM
> >>set out_ip checksum to 0 in the packet
> >>set out_udp checksum to pseudo header using rte_ipv4_phdr_cksum()
> >>
> >>supported on hardware advertising DEV_TX_OFFLOAD_IPV4_CKSUM and
> >>DEV_TX_OFFLOAD_UDP_CKSUM
> >>
> >>*Problem 1*: The comment above PKT_TX_IPV4 says "Packet is IPv4
> >>without requiring IP checksum offload" [2], and the help of L4
> >>checksum and TSO says that it is required to set the PKT_TX_IPV4
> >>flag [3]. This is not coherent.
> >
> > So what is the problem?
> > Comments in rte_mbuf.h are not coherent?
> 
> No, they're not coherent.

Ok, if the problem is just comments - let's fix it.

> 
> >>We are back on the debate about the meaning of PKT_TX_IPV4 vs
> >>PKT_TX_IP_CSUM from [4]. This incoherency in comments are introduced
> >>by patch [5]. The question is "when an application should set
> >>this flag? for any IP packet that does not require IP checksum?".
> >
> > Yes, if it is an IPv4 packet and application required TX offload for L4 
> > checksum or TSO,
> > but doesn't want HW offload for IPV4 checksum calculation.
> >
> >>This would break many applications.
> >
> > Which ones?
> > As I know, so far nothing is broken.
> 
> The problem today is that it's not obvious for a developer to
> know when an application should set the PKT_TX_IPV4 flag. From the
> comments, we could think that an application has to set it for any
> transmitted IP packet, even for packets that do not require tx
> offload. Asking to do this in the API would break many applications.
> 
> The comment should at least say that this flag is *only* required
> when asking for L4 checksum. As TSO implies IP checksum, it means the
> PKT_TX_IPV4 should not be set, but PKT_TX_IP_CSUM instead.

Ok, so the problem is in comments again?
If so, sure let's update them to make things clear.

> 
> >> I think a good definition would
> >>be:
> >>
> >>Packet is IPv4. This flag must be set when using any offload
> >>feature (TSO, L3 or L4 checksum) to tell the NIC that the packet
> >>is an IPv4 packet.
> >>
> >>That's why I added PKT_TX_IPV4 in the examples.
> >
> > I suppose we discussed it several times: both ways are possible.
> > From PMD perspective - treating PKT_TX_IPV4 and PKT_TX_IP_CSUM
> > As mutually exclusive seems a bit more plausible.
> > From the upper layer - my understanding is that it doesn't really matter.
> > I thought we had an agreement about it in 1.8, no?
> 
> Indeed, this was already discussed, but there was a lot of pressure
> for 1.8.0 to push something, even not perfect. The fog around comments
> shows that the API was not very clearly defined for 1.8.0. If you read
> the comments of the API, it is impossible to understand when the
> PKT_TX_IPV4 or PKT_TX_IP_CSUM flags must be set. I would even say
> more: the only place where the comments bring a valuable information
> (L4 checksum and TSO) describe the case where PKT_TX_IPV4 and
> PKT_TX_IP_CSUM are not exclusive...
> 
> So I will fix that in my coming patch series. Just for information,
> I'm pretty sure that having PKT_TX_IPV4 and PKT_TX_IP_CSUM as not
> exclusive flag would not require any change anywhere in the PMDs (even
> in i40e).

Right now - no.
Though as I said from PMD perspective having them exclusive is a bit preferable.
Again, I don't see any big difference from upper layer code.

> On the contrary, making them exclusive would require to
> change the ixgbe TSO code because we check PKT_TX_IPV4.

Hmm, so you are saying there is a bug somewhere in ixgbe_rxtx.c?
Which particular place are you talking about?

> 
> >>*Problem 3*: without using the word "fortville", it is difficult
> >>to understand the goal of the flag PKT_TX_UDP_TUNNEL_PKT. Indeed,
> >>once PKT_TX_OUTER_IPV4/6 is set, it looks obvious that it's a
> >>tunnel packet. I suggest to remove the PKT_TX_UDP_TUNNEL_PKT
> >>flag. In linux, the driver doesn't care about the tunnel type,
> >>it always set I40E_TXD_CTX_UDP_TUNNELING for all encapsulations [6].
> >
> > It might be obvious that it is a tunnel packet from PKT_TX_OUTER_* is set,
> > but it is not obvious what type of tunnelling it would be.
> > FVL HW supports HW TX offloads for different type of tunnelling and
> > requires that SW provide information about tunnelling type.
> > From i40e datasheet:
> > L4TUNT L4 Tunneling Type (Teredo / GRE header / VXLAN header) indicati

[dpdk-dev] [PATCH v2 2/4] ethdev: prevent changing of nb_q_per_pool in rte_eth_dev_check_mq_mode()



> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Pawel Wodkowski
> Sent: Monday, January 19, 2015 9:02 PM
> To: dev at dpdk.org
> Subject: [dpdk-dev] [PATCH v2 2/4] ethdev: prevent changing of
> nb_q_per_pool in rte_eth_dev_check_mq_mode()
> 
> If SRIOV is used and device configuration does not use MQ the
> RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool is set to 1 in
> rte_eth_dev_check_mq_mode().
> This is bad because of two reasons:
> 1. Port reconfiguration from non-MQ mode to MQ mode is impossible.
> 2. Configuring RX and TX side in different ways is impossible.
> 

This case is possible:
rxmode.mq_mode is ETH_MQ_RX_VMDQ_RSS, and txmode.mq_mode is ETH_MQ_TX_NONE.

[dpdk-dev] [PATCH v2 3/4] pmd: add support for DCB in SRIOV mode for ixgbe driver.



> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Pawel Wodkowski
> Sent: Monday, January 19, 2015 9:03 PM
> To: dev at dpdk.org
> Subject: [dpdk-dev] [PATCH v2 3/4] pmd: add support for DCB in SRIOV
> mode for ixgbe driver.
> 
> Add support for DCB in SRIOV mode. When no PFC is enabled this feature
> might be used as multiple queues for VF (up to 8 queues if VFs num is less or
> equal 16 or 4 if VFs num is less or equal 32).
> 
> The PF must initialize RX in ETH_MQ_RX_VMDQ_DCB and TX in
> ETH_MQ_TX_VMDQ_DCB.
> VF should initialize Rx in ETH_MQ_RX_DCB and Tx in ETH_MQ_TX_DCB to use
> multiple queues and/or DCB.
> 
> Signed-off-by: Pawel Wodkowski 
> ---
>  lib/librte_ether/rte_ethdev.c |   32 
>  lib/librte_ether/rte_ethdev.h |2 +-
>  lib/librte_pmd_ixgbe/ixgbe_pf.c   |   42 +++--
> 
>  lib/librte_pmd_ixgbe/ixgbe_rxtx.c |7 +++
>  4 files changed, 54 insertions(+), 29 deletions(-)
> 
> diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
> index 85385f8..115465e 100644
> --- a/lib/librte_ether/rte_ethdev.c
> +++ b/lib/librte_ether/rte_ethdev.c
> @@ -532,6 +532,7 @@ rte_eth_dev_check_mq_mode(uint8_t port_id,
> uint16_t nb_rx_q, uint16_t nb_tx_q,
> const struct rte_eth_conf *dev_conf)  {
>   struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> + struct rte_eth_dev_info dev_info;
> 
>   if (RTE_ETH_DEV_SRIOV(dev).active != 0) {
>   /* check multi-queue mode */
> @@ -553,8 +554,9 @@ rte_eth_dev_check_mq_mode(uint8_t port_id,
> uint16_t nb_rx_q, uint16_t nb_tx_q,
> 
>   switch (dev_conf->rxmode.mq_mode) {
>   case ETH_MQ_RX_VMDQ_DCB:
> + break;
>   case ETH_MQ_RX_VMDQ_DCB_RSS:
> - /* DCB/RSS VMDQ in SRIOV mode, not implement
> yet */
> + /* DCB+RSS VMDQ in SRIOV mode, not implement
> yet */
>   PMD_DEBUG_TRACE("ethdev port_id=%" PRIu8
>   " SRIOV active, "
>   "unsupported VMDQ mq_mode
> rx %u\n", @@ -589,13 +591,8 @@ rte_eth_dev_check_mq_mode(uint8_t
> port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
>   }
> 
>   switch (dev_conf->txmode.mq_mode) {
> - case ETH_MQ_TX_VMDQ_DCB:
> - /* DCB VMDQ in SRIOV mode, not implement yet */
> - PMD_DEBUG_TRACE("ethdev port_id=%" PRIu8
> - " SRIOV active, "
> - "unsupported VMDQ mq_mode
> tx %u\n",
> - port_id, dev_conf-
> >txmode.mq_mode);
> - return (-EINVAL);
> + case ETH_MQ_TX_VMDQ_DCB: /* DCB VMDQ in SRIOV
> mode*/
> + break;
>   default: /* ETH_MQ_TX_VMDQ_ONLY or
> ETH_MQ_TX_NONE */
>   /* if nothing mq mode configure, use default scheme
> */
>   dev->data->dev_conf.txmode.mq_mode =
> ETH_MQ_TX_VMDQ_ONLY; @@ -612,7 +609,7 @@
> rte_eth_dev_check_mq_mode(uint8_t port_id, uint16_t nb_rx_q, uint16_t
> nb_tx_q,
>   return (-EINVAL);
>   }
>   } else {
> - /* For vmdb+dcb mode check our configuration before we
> go further */
> + /* For vmdq+dcb mode check our configuration before we
> go further */
>   if (dev_conf->rxmode.mq_mode ==
> ETH_MQ_RX_VMDQ_DCB) {
>   const struct rte_eth_vmdq_dcb_conf *conf;
> 
> @@ -651,11 +648,20 @@ rte_eth_dev_check_mq_mode(uint8_t port_id,
> uint16_t nb_rx_q, uint16_t nb_tx_q,
>   }
>   }
> 
> - /* For DCB mode check our configuration before we go
> further */
> + /* For DCB we need to obtain maximum number of queues
> dinamically,
> +  * as this depends on max VF exported in PF */
> + if ((dev_conf->rxmode.mq_mode == ETH_MQ_RX_DCB) ||
> + (dev_conf->txmode.mq_mode ==
> ETH_MQ_TX_DCB)) {
> +
> + FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >dev_infos_get, -ENOTSUP);
> + (*dev->dev_ops->dev_infos_get)(dev,
> &dev_info);
> + }
> +
> + /* For DCB mode check out configuration before we go
> further */
>   if (dev_conf->rxmode.mq_mode == ETH_MQ_RX_DCB) {
>   const struct rte_eth_dcb_rx_conf *conf;
> 
> - if (nb_rx_q != ETH_DCB_NUM_QUEUES) {
> + if (nb_rx_q != dev_info.max_rx_queues) {
>   PMD_DEBUG_TRACE("ethdev port_id=%d
> DCB, nb_rx_q "
>   "!= %d\n",
>   port_id,
> ETH_DCB_NUM_QUEUES);
> @@ -675,7 +681,7 @@ rte_eth_dev_check_mq_mode(uint8_t port_id,
> u

[dpdk-dev] FW: [PATCH v4 01/11] eal/pci, ethdev: Remove assumption that port will not be detached

2015-01-20 Thread Tetsuya Mukawa

Hi Michael,

On 2015/01/19 23:24, Qiu, Michael wrote:
> Hi, Tetsuya
>
> You see lots of places have below three lines:
>
>   if (rte_eth_dev_validate_port(port_id) == DEV_INVALID) {
>   PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
>   return -EINVAL;
>   }
>
> They are all the same(only few has the print log different), so can we 
> improve it?
>
> See below:
>
> +static int
> +rte_eth_dev_validate_port(uint8_t port_id, bool trace) {
> + if (port_id >= RTE_MAX_ETHPORTS ||
> + rte_eth_devices[port_id].attached != DEV_CONNECTED) {
> + if (trace)
> + PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
> + return DEV_INVALID;
> + }
> + else
> + return DEV_VALID;
> +}
>
> For normal case we just call this function use
> rte_eth_dev_validate_port(port_id, 1)
> here 1 could be an enum value (thus trace should be defined as int).
> After call, we didn't need to add PMD_DEBUG_TRACE() any more.
>  
> For few cases like:
>   PMD_DEBUG_TRACE("set VF rate limit:invalid port id=%d\n",
>   port_id);
>
> We can call the function with secondary param set to "0", and add the trace 
> log after the function called, just as before.
>
> I think after this enhancement, the code seems cleaner and more efficient?

I appreciate your comment.
Sounds nice. I will change it as above.

Thanks,
Tetsuya


>
> Thanks,
> Michael
>
>>> -Original Message-
>>> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Tetsuya Mukawa
>>> Sent: Monday, January 19, 2015 10:40 AM
>>> To: dev at dpdk.org
>>> Subject: [dpdk-dev] [PATCH v4 01/11] eal/pci, ethdev: Remove assumption 
>>> that port will not be
>>> detached
>>>
>>> To remove assumption, do like followings.
>>>
>>> This patch adds "RTE_PCI_DRV_DETACHABLE" to drv_flags of rte_pci_driver 
>>> structure. The flags
>>> indicates the driver can detach devices at runtime.
>>> Also remove assumption that port will not be detached.
>>>
>>> To remove the assumption.
>>> - Add 'attached' member to rte_eth_dev structure.
>>>   This member is used for indicating the port is attached, or not.
>>> - Add rte_eth_dev_allocate_new_port().
>>>   This function is used for allocating new port.
>>>
>>> v4:
>>> - Use braces with 'for' loop.
>>> - Fix indent of 'if' statement.
>>>
>>> Signed-off-by: Tetsuya Mukawa 
>>> ---
>>>  lib/librte_eal/common/include/rte_pci.h |   2 +
>>>  lib/librte_ether/rte_ethdev.c   | 248 
>>> ++--
>>>  lib/librte_ether/rte_ethdev.h   |   5 +
>>>  3 files changed, 151 insertions(+), 104 deletions(-)
>>>
>>> diff --git a/lib/librte_eal/common/include/rte_pci.h 
>>> b/lib/librte_eal/common/include/rte_pci.h
>>> index 66ed793..dd1da28 100644
>>> --- a/lib/librte_eal/common/include/rte_pci.h
>>> +++ b/lib/librte_eal/common/include/rte_pci.h
>>> @@ -199,6 +199,8 @@ struct rte_pci_driver {  #define 
>>> RTE_PCI_DRV_FORCE_UNBIND 0x0004
>>>  /** Device driver supports link state interrupt */
>>>  #define RTE_PCI_DRV_INTR_LSC   0x0008
>>> +/** Device driver supports detaching capability */
>>> +#define RTE_PCI_DRV_DETACHABLE 0x0010
>>>
>>>  /**< Internal use only - Macro used by pci addr parsing functions **/
>>>  #define GET_PCIADDR_FIELD(in, fd, lim, dlm)   \
>>> diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c 
>>> index 077d430..46cabaf
>>> 100644
>>> --- a/lib/librte_ether/rte_ethdev.c
>>> +++ b/lib/librte_ether/rte_ethdev.c
>>> @@ -175,6 +175,16 @@ enum {
>>> STAT_QMAP_RX
>>>  };
>>>
>>> +enum {
>>> +   DEV_INVALID = 0,
>>> +   DEV_VALID,
>>> +};
>>> +
>>> +enum {
>>> +   DEV_DISCONNECTED = 0,
>>> +   DEV_CONNECTED
>>> +};
>>> +
>>>  static inline void
>>>  rte_eth_dev_data_alloc(void)
>>>  {
>>> @@ -201,19 +211,34 @@ rte_eth_dev_allocated(const char *name)  {
>>> unsigned i;
>>>
>>> -   for (i = 0; i < nb_ports; i++) {
>>> -   if (strcmp(rte_eth_devices[i].data->name, name) == 0)
>>> +   for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
>>> +   if ((rte_eth_devices[i].attached == DEV_CONNECTED) &&
>>> +   strcmp(rte_eth_devices[i].data->name, name) == 0)
>>> return &rte_eth_devices[i];
>>> }
>>> return NULL;
>>>  }
>>>
>>> +static uint8_t
>>> +rte_eth_dev_allocate_new_port(void)
>>> +{
>>> +   unsigned i;
>>> +
>>> +   for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
>>> +   if (rte_eth_devices[i].attached == DEV_DISCONNECTED)
>>> +   return i;
>>> +   }
>>> +   return RTE_MAX_ETHPORTS;
>>> +}
>>> +
>>>  struct rte_eth_dev *
>>>  rte_eth_dev_allocate(const char *name)
>>>  {
>>> +   uint8_t port_id;
>>> struct rte_eth_dev *eth_dev;
>>>
>>> -   if (nb_ports == RTE_MAX_ETHPORTS) {
>>> +   port_id = rte_eth_dev_allocate_new_port();
>>> +   if (port_id == RTE_MAX_ETHPORTS) {
>>> PMD_DEBUG_TRACE("Reached maximum number of Ethernet ports\n");
>>> return

[dpdk-dev] [RFC 01/17] mbuf: add definitions of unified packet types

2015-01-20 Thread Zhang, Helin



> -Original Message-
> From: Olivier MATZ [mailto:olivier.matz at 6wind.com]
> Sent: Tuesday, January 20, 2015 1:27 AM
> To: Neil Horman; Zhang, Helin
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [RFC 01/17] mbuf: add definitions of unified packet
> types
> 
> Hi,
> 
> On 01/19/2015 05:33 PM, Neil Horman wrote:
> > On Mon, Jan 19, 2015 at 11:23:07AM +0800, Helin Zhang wrote:
> >> As there are only 6 bit flags in ol_flags for indicating packet
> >> types, which is not enough to describe all the possible packet types
> >> hardware can recognize. For example, i40e hardware can recognize more
> >> than 150 packet types. Unified packet type is composed of tunnel
> >> type, L3 type,
> >> L4 type and inner L3 type fields, and can be stored in 16 bits mbuf
> >> field of 'packet_type'.
> >>
> >> Signed-off-by: Helin Zhang 
> >> Signed-off-by: Cunming Liang 
> >> Signed-off-by: Jijiang Liu 
> >> ---
> >>  lib/librte_mbuf/rte_mbuf.h | 68
> >> ++
> >>  1 file changed, 68 insertions(+)
> >>
> >> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> >> index 16059c6..94eb38f 100644
> >> --- a/lib/librte_mbuf/rte_mbuf.h
> >> +++ b/lib/librte_mbuf/rte_mbuf.h
> >> @@ -165,6 +165,74 @@ extern "C" {
> >>  /* Use final bit of flags to indicate a control mbuf */
> >>  #define CTRL_MBUF_FLAG   (1ULL << 63) /**< Mbuf contains
> control data */
> >>
> >> +/*
> >> + * Sixteen bits are divided into several fields to mark packet
> >> +types. Note that
> >> + * each field is indexical.
> >> + * - Bit 3:0 is for tunnel types.
> >> + * - Bit 7:4 is for L3 or outer L3 (for tunneling case) types.
> >> + * - Bit 10:8 is for L4 types. It can also be used for inner L4 types for
> >> + *   tunneling packets.
> > This seems a bit sparse, in that the protocol field is 8 bits wide in a 
> > packet.
> > There are several common protocols that you don't have listed, and
> > you've already exhausted your namespace with the list you have.
> > Neil
I have reviewed all packet types supported in igb, ixgbe and i40e, and read the
code to get the packet types used in vmxnet3, bond, enic ,etc.
Current design can support all packet types used in above PMDs.
Yes, we don't have too many space reserved for future, but we can try to make
more bits for packet_type field later, as we can save 6 bits in ol_flags with 
this
patch set.

> 
> Another question I've asked several times[1][2] : what does having
> RTE_PTYPE_TUNNEL_IP mean? What fields are checked by the hardware (or
> the driver) and what fields should be checked by the application?
> Are you sure that all the drivers (ixgbe, i40e, vmxnet3, enic) check the same
> fields? (ethertype, ip version, ip len correct, ip checksum correct, flags, 
> ...)
RTE_PTYPE_TUNNEL_IP means hardware recognizes the received packet as an
IP-in-IP packet.
All the fields are filled by PMD which is recognized by hardware. The 
application
can just use it which can save some cpu cycles to recognize the packet type by
software.
Drivers are responsible for filling in correct values according to the packet 
types
recognized by its hardware. Different PMDs may fill with different values based 
on
different capabilities.

> 
> To be clearer: Let's say I have a network stack that parses and validates an 
> IP
> packet. What tests can I remove if I get RTE_PTYPE_TUNNEL_IP?
That means it is a IP-in-IP tunnel packet, but not others. Also you can check 
other
fields in packet_type to get more information of the packet (e.g. L4 type).

> 
> This question can be asked for all defined packet type. To be usable by an
> application, I think a formal definition would be needed. This is also 
> important
> to know this for people wanting to develop a new PMD based on a new
> hardware. If the hardware does not behave exactly like ixgbe, i40e (I hope all
> drivers you implemented behave exactly the same), some work has to be done
> in the driver or the feature cannot be used.
The unified packet type defined here is aiming to support all hardwares. I40e 
has
different values from ixgbe. We can add more in the future if needed for future 
NICs.

> 
> One naïve question: are we sure that at the end, using these complex packet
> types is faster than parsing the packet?
I guess yes for almost all cases, as hardware reported the packet types, and PMD
just puts the correct values into packet_type field.
Later, we will try to measure the differences.

Regards,
Helin

> 
> Regards,
> Olivier
> 
> 
> [1] http://dpdk.org/ml/archives/dev/2014-November/008534.html
> [2] http://dpdk.org/ml/archives/dev/2014-November/008367.html

[dpdk-dev] [PATCH v4 1/2] librte_pmd_null: Add null PMD

2015-01-20 Thread Tetsuya Mukawa

'null PMD' is a driver of the virtual device particularly designed to measure
performance of DPDK PMDs. When an application calls rx, null PMD just allocates
mbufs and returns those. Also tx, the PMD just frees mbufs.

The PMD has following options.
- size: specify packet size allocated by RX. Default packet size is 64.
- copy: specify 1 or 0 to enable or disable copy while RX and TX.
Default value is 0(disabled).
This option is used for emulating more realistic data transfer.
Copy size is equal to packet size.

To use the PMD, enable CONFIG_RTE_BUILD_SHARED_LIB in config file. Then
compile the PMD as shared library. The library can be linked using '-d'
option when an application invokes.

Here is an example.
$ sudo ./testpmd -c f -n 4 -d librte_pmd_null.so \
--vdev 'eth_null0' --vdev 'eth_null1' -- -i --no-flush-rx

If testpmd is compiled with CONFIG_RTE_BUILD_SHARED_LIB, it may need to
specify more libraries using '-d' option.

v4:
 - Fix memory leak.
   (Thanks to Iremonger, Bernard)

Signed-off-by: Tetsuya Mukawa 
---
 config/common_bsdapp   |   5 +
 config/common_linuxapp |   5 +
 lib/Makefile   |   1 +
 lib/librte_pmd_null/Makefile   |  58 +
 lib/librte_pmd_null/rte_eth_null.c | 485 +
 5 files changed, 554 insertions(+)
 create mode 100644 lib/librte_pmd_null/Makefile
 create mode 100644 lib/librte_pmd_null/rte_eth_null.c

diff --git a/config/common_bsdapp b/config/common_bsdapp
index 9177db1..fa849be 100644
--- a/config/common_bsdapp
+++ b/config/common_bsdapp
@@ -224,6 +224,11 @@ CONFIG_RTE_LIBRTE_PMD_PCAP=y
 CONFIG_RTE_LIBRTE_PMD_BOND=y

 #
+# Compile null PMD
+#
+CONFIG_RTE_LIBRTE_PMD_NULL=y
+
+#
 # Do prefetch of packet data within PMD driver receive function
 #
 CONFIG_RTE_PMD_PACKET_PREFETCH=y
diff --git a/config/common_linuxapp b/config/common_linuxapp
index 27d05be..456fbfe 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -237,6 +237,11 @@ CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
 CONFIG_RTE_LIBRTE_PMD_XENVIRT=n

 #
+# Compile null PMD
+#
+CONFIG_RTE_LIBRTE_PMD_NULL=y
+
+#
 # Do prefetch of packet data within PMD driver receive function
 #
 CONFIG_RTE_PMD_PACKET_PREFETCH=y
diff --git a/lib/Makefile b/lib/Makefile
index 0ffc982..d246c53 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -52,6 +52,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += librte_pmd_virtio
 DIRS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += librte_pmd_vmxnet3
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += librte_pmd_xenvirt
 DIRS-$(CONFIG_RTE_LIBRTE_VHOST) += librte_vhost
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_NULL) += librte_pmd_null
 DIRS-$(CONFIG_RTE_LIBRTE_HASH) += librte_hash
 DIRS-$(CONFIG_RTE_LIBRTE_LPM) += librte_lpm
 DIRS-$(CONFIG_RTE_LIBRTE_ACL) += librte_acl
diff --git a/lib/librte_pmd_null/Makefile b/lib/librte_pmd_null/Makefile
new file mode 100644
index 000..0ec4db9
--- /dev/null
+++ b/lib/librte_pmd_null/Makefile
@@ -0,0 +1,58 @@
+#   BSD LICENSE
+#
+#   Copyright (C) IGEL Co.,Ltd.
+#   All rights reserved.
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions
+#   are met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in
+#   the documentation and/or other materials provided with the
+#   distribution.
+# * Neither the name of IGEL Co.,Ltd. nor the names of its
+#   contributors may be used to endorse or promote products derived
+#   from this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_null.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_NULL) += rte_eth_null.c
+
+#
+# Export include files
+#
+SYMLINK-y-include +=
+
+# this lib depends upon:
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_NULL) += lib/librte_mbuf
+

[dpdk-dev] [PATCH v4 2/2] librte_pmd_null: Support port hotplug function

2015-01-20 Thread Tetsuya Mukawa

This patch adds port hotplug support to null PMD.

v4:
- Fix commit title.

Signed-off-by: Tetsuya Mukawa 
---
 lib/librte_pmd_null/rte_eth_null.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/lib/librte_pmd_null/rte_eth_null.c 
b/lib/librte_pmd_null/rte_eth_null.c
index c54e90b..e9b1eee 100644
--- a/lib/librte_pmd_null/rte_eth_null.c
+++ b/lib/librte_pmd_null/rte_eth_null.c
@@ -292,6 +292,13 @@ eth_stats_reset(struct rte_eth_dev *dev)
}
 }

+static struct eth_driver rte_null_pmd = {
+   .pci_drv = {
+   .name = "rte_null_pmd",
+   .drv_flags = RTE_PCI_DRV_DETACHABLE,
+   },
+};
+
 static void
 eth_queue_release(void *q)
 {
@@ -382,10 +389,12 @@ eth_dev_null_create(const char *name __rte_unused,
data->nb_tx_queues = (uint16_t)nb_tx_queues;
data->dev_link = pmd_link;
data->mac_addrs = &eth_addr;
+   strncpy(data->name, eth_dev->data->name, strlen(eth_dev->data->name));

eth_dev->data = data;
eth_dev->dev_ops = &ops;
eth_dev->pci_dev = pci_dev;
+   eth_dev->driver = &rte_null_pmd;

/* finally assign rx and tx ops */
if (packet_copy) {
@@ -476,10 +485,33 @@ rte_pmd_null_devinit(const char *name, const char *params)
return eth_dev_null_create(name, numa_node, packet_size, packet_copy);
 }

+static int
+rte_pmd_null_devuninit(const char *name, const char *params __rte_unused)
+{
+   struct rte_eth_dev *eth_dev = NULL;
+
+   RTE_LOG(INFO, PMD, "Closing null ethdev on numa socket %u\n",
+   rte_socket_id());
+
+   /* reserve an ethdev entry */
+   eth_dev = rte_eth_dev_allocated(name);
+   if (eth_dev == NULL)
+   return -1;
+
+   rte_free(eth_dev->data->dev_private);
+   rte_free(eth_dev->data);
+   rte_free(eth_dev->pci_dev);
+
+   rte_eth_dev_free(name);
+
+   return 0;
+}
+
 static struct rte_driver pmd_null_drv = {
.name = "eth_null",
.type = PMD_VDEV,
.init = rte_pmd_null_devinit,
+   .uninit = rte_pmd_null_devuninit,
 };

 PMD_REGISTER_DRIVER(pmd_null_drv);
-- 
1.9.1

[dpdk-dev] [PATCH 0/4] DPDK memcpy optimization

2015-01-20 Thread Wang, Zhihong

> -Original Message-
> From: Neil Horman [mailto:nhorman at tuxdriver.com]
> Sent: Monday, January 19, 2015 9:02 PM
> To: Wang, Zhihong
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 0/4] DPDK memcpy optimization
> 
> On Mon, Jan 19, 2015 at 09:53:30AM +0800, zhihong.wang at intel.com wrote:
> > This patch set optimizes memcpy for DPDK for both SSE and AVX platforms.
> > It also extends memcpy test coverage with unaligned cases and more test
> points.
> >
> > Optimization techniques are summarized below:
> >
> > 1. Utilize full cache bandwidth
> >
> > 2. Enforce aligned stores
> >
> > 3. Apply load address alignment based on architecture features
> >
> > 4. Make load/store address available as early as possible
> >
> > 5. General optimization techniques like inlining, branch reducing,
> > prefetch pattern access
> >
> > Zhihong Wang (4):
> >   Disabled VTA for memcpy test in app/test/Makefile
> >   Removed unnecessary test cases in test_memcpy.c
> >   Extended test coverage in test_memcpy_perf.c
> >   Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX
> > platforms
> >
> >  app/test/Makefile  |   6 +
> >  app/test/test_memcpy.c |  52 +-
> >  app/test/test_memcpy_perf.c| 238 +---
> >  .../common/include/arch/x86/rte_memcpy.h   | 664
> +++--
> >  4 files changed, 656 insertions(+), 304 deletions(-)
> >
> > --
> > 1.9.3
> >
> >
> Are you able to compile this with gcc 4.9.2?  The compilation of
> test_memcpy_perf is taking forever for me.  It appears hung.
> Neil

Neil,

Thanks for reporting this!
It should compile, but it will take quite some time if the CPU doesn't support 
AVX2. The reasons are:
1. The SSE & AVX memcpy implementation is more complicated than the AVX2 version, 
so the compiler takes more time to compile and optimize it
2. The new test_memcpy_perf.c contains 126 constant memcpy calls for better 
test case coverage, which is quite a lot

I've just tested this patch on an Ivy Bridge machine with GCC 4.9.2:
1. The whole compile process takes 9'41" with the original test_memcpy_perf.c 
(63 + 63 = 126 constant memcpy calls)
2. It takes only 2'41" after I reduce the constant memcpy call number to 12 + 
12 = 24

I'll reduce the number of memcpy calls in the next version of the patch.

Zhihong (John)

[dpdk-dev] [RFC 01/17] mbuf: add definitions of unified packet types

2015-01-20 Thread Zhang, Helin



> -Original Message-
> From: Ananyev, Konstantin
> Sent: Tuesday, January 20, 2015 12:20 AM
> To: Zhang, Helin; dev at dpdk.org
> Cc: Liang, Cunming; Liu, Jijiang
> Subject: RE: [RFC 01/17] mbuf: add definitions of unified packet types
> 
> 
> 
> > -Original Message-
> > From: Zhang, Helin
> > Sent: Monday, January 19, 2015 3:23 AM
> > To: dev at dpdk.org
> > Cc: Liang, Cunming; Liu, Jijiang; Ananyev, Konstantin; Zhang, Helin
> > Subject: [RFC 01/17] mbuf: add definitions of unified packet types
> >
> > As there are only 6 bit flags in ol_flags for indicating packet types,
> > which is not enough to describe all the possible packet types hardware
> > can recognize. For example, i40e hardware can recognize more than 150
> > packet types. Unified packet type is composed of tunnel type, L3 type,
> > L4 type and inner L3 type fields, and can be stored in 16 bits mbuf
> > field of 'packet_type'.
> >
> > Signed-off-by: Helin Zhang 
> > Signed-off-by: Cunming Liang 
> > Signed-off-by: Jijiang Liu 
> > ---
> >  lib/librte_mbuf/rte_mbuf.h | 68
> > ++
> >  1 file changed, 68 insertions(+)
> >
> > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> > index 16059c6..94eb38f 100644
> > --- a/lib/librte_mbuf/rte_mbuf.h
> > +++ b/lib/librte_mbuf/rte_mbuf.h
> > @@ -165,6 +165,74 @@ extern "C" {
> >  /* Use final bit of flags to indicate a control mbuf */
> >  #define CTRL_MBUF_FLAG   (1ULL << 63) /**< Mbuf contains control
> data */
> >
> > +/*
> > + * Sixteen bits are divided into several fields to mark packet types.
> > +Note that
> > + * each field is indexical.
> > + * - Bit 3:0 is for tunnel types.
> > + * - Bit 7:4 is for L3 or outer L3 (for tunneling case) types.
> > + * - Bit 10:8 is for L4 types. It can also be used for inner L4 types for
> > + *   tunneling packets.
> > + * - Bit 13:11 is for inner L3 types.
> > + * - Bit 15:14 is reserved.
> > + *
> > + * To be compitable with Vector PMD, RTE_PTYPE_L3_IPV4,
> > +RTE_PTYPE_L3_IPV4_EXT,
> > + * RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV6_EXT, RTE_PTYPE_L4_TCP,
> > +RTE_PTYPE_L4_UDP
> > + * and RTE_PTYPE_L4_SCTP should be kept as below in a contiguous 7 bits.
> > + */
> > +#define RTE_PTYPE_UNKNOWN   0x /*
> 0b */
> > +/* bit 3:0 for tunnel types */
> > +#define RTE_PTYPE_TUNNEL_IP 0x0001 /*
> 0b0001 */
> > +#define RTE_PTYPE_TUNNEL_TCP0x0002 /*
> 0b0010 */
> > +#define RTE_PTYPE_TUNNEL_UDP0x0003 /*
> 0b0011 */
> > +#define RTE_PTYPE_TUNNEL_GRE0x0004 /*
> 0b0100 */
> > +#define RTE_PTYPE_TUNNEL_VXLAN  0x0005 /*
> 0b0101 */
> > +#define RTE_PTYPE_TUNNEL_NVGRE  0x0006 /*
> 0b0110 */
> > +#define RTE_PTYPE_TUNNEL_GENEVE 0x0007 /*
> 0b0111 */
> > +#define RTE_PTYPE_TUNNEL_GRENAT 0x0008 /*
> 0b1000 */
> > +#define RTE_PTYPE_TUNNEL_GRENAT_MAC 0x0009 /*
> 0b1001 */
> > +#define RTE_PTYPE_TUNNEL_GRENAT_MACVLAN 0x000a /*
> 0b1010 */
> > +#define RTE_PTYPE_TUNNEL_MASK   0x000f /*
> 0b */
> > +/* bit 7:4 for L3 types */
> > +#define RTE_PTYPE_L3_IPV4   0x0010 /*
> 0b0001 */
> > +#define RTE_PTYPE_L3_IPV4_EXT   0x0030 /*
> 0b0011 */
> > +#define RTE_PTYPE_L3_IPV6   0x0040 /*
> 0b0100 */
> > +#define RTE_PTYPE_L3_IPV6_EXT   0x00c0 /*
> 0b1100 */
> > +#define RTE_PTYPE_L3_IPV4_EXT_UNKNOWN   0x00d0 /*
> 0b1101 */
> > +#define RTE_PTYPE_L3_IPV6_EXT_UNKNOWN   0x00e0 /*
> 0b1110 */
> > +#define RTE_PTYPE_L3_MASK   0x00f0 /*
> 0b */
> 
> I still think it would be better to use enum not bit-set for IPv4/IPv6 
> distinction,
> but if you set it that way, can you at least take advantage of it and make
> RTE_ETH_IS_IPV4_HDR() not require 3 comparisons?
> I think it is doable if you set bit 4 for IPv4 types only (you already do 
> that) and bit
> 6 for IPv6 types only.
> For that, I think, you can make RTE_PTYPE_L3_IPV4_EXT_UNKNOWN == 0xb0
> /* 0b1011 */ Then you can:
> 
> #define  RTE_ETH_IS_IPV4_HDR(ptype)(((ptype) &
> RTE_PTYPE_L3_IPV4) != 0)
> #define  RTE_ETH_IS_IPV6_HDR(ptype)(((ptype) &
> RTE_PTYPE_L3_IPV6) != 0)
> 
> I suppose that would be faster then what you propose below, and would
> probably require less changes in our sample apps.
As wasting one bit lets us support the Vector PMD well, I prefer to keep it here.
Thank you very much for the good idea about bit selection, which may give higher
performance. I will add the idea in the next version. Thanks a lot!

Regards,
Helin

> 
> Konstantin
> 
> > +/* bit 10:8 for L4 types */
> > +#define RTE_PTYPE_L4_TCP0x0100 /*
> 0b0001 */
> > +#define

[dpdk-dev] [PATCH 1/3] librte_reorder: New reorder library

Hi,

2015-01-07 16:39, Reshma Pattan:
> 1)New library to provide reordering of out of ordered
> mbufs based on sequence number of mbuf. Library uses reorder 
> buffer structure
> which in tern uses two circular buffers called ready and order 
> buffers.
> *rte_reorder_create API creates instance of reorder buffer.
> *rte_reorder_init API initializes given reorder buffer instance.
> *rte_reorder_reset API resets given reorder buffer instance.
> *rte_reorder_insert API inserts the mbuf into order circular 
> buffer.
> *rte_reorder_fill_overflow moves mbufs from order buffer to ready 
> buffer
> to accomodate early packets in order buffer.
> *rte_reorder_drain API provides draining facility to fetch out
> reordered mbufs from order and ready buffers.
> 
> Signed-off-by: Reshma Pattan 
> Signed-off-by: Richardson Bruce 

I think 2 things are missing in this patchset:

1) Could you show some performance numbers to compare a simple forwarding
with and without this library, in the commit log?

2) Could you add some documentation in doc/ directory for programmer's guide?

Thank you
-- 
Thomas

[dpdk-dev] [PATCH v8 3/4] i40e: support of controlling hash functions

Hi Helin,

2014-12-02 10:19, Helin Zhang:
> Hash filter control has been implemented for i40e. It includes
> getting/setting,
> - global hash configurations (hash function type, and symmetric
>   hash enable per flow type)
> - symmetric hash enable per port
> 
> Signed-off-by: Helin Zhang 
> ---
>  lib/librte_ether/rte_eth_ctrl.h   |  63 
>  lib/librte_pmd_i40e/i40e_ethdev.c | 294 
> +-
>  2 files changed, 355 insertions(+), 2 deletions(-)

Please, could you split ethdev and i40e parts while keeping Konstantin's ack?

[...]
> + * Each bit in valid_bit_mask[] indicates if the coresponding bit in

Typo: corresponding

[...]
> + /** Bit mask indicates if the coresponding bit is valid */

Same typo

[...]
> + /** Details of hash filter infomation */

Typo: information

> + union {
> + /* For RTE_ETH_HASH_FILTER_SYM_HASH_ENA_PER_PORT */
> + uint8_t enable;
> + /* Global configurations of hash filter */
> + struct rte_eth_hash_global_conf global_conf;
> + } info;

Why these comments are not doxygen'ed?

Sorry for nitpicking, that's the last review pass ;)
-- 
Thomas

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

Thank you Neil for writing this document.
This is a really important change in DPDK.
It would be very good to have comments or acknowledgement from several
developers. This policy would be enforced by having several Acked-by lines.


2015-01-16 10:33, Neil Horman:
> Adding a document describing rudimentary ABI policy and adding notice space 
> for
> any deprecation announcements
> 
> Signed-off-by: Neil Horman 
> CC: Thomas Monjalon 
> CC: "Richardson, Bruce" 
> 
> ---
> Change notes:
> 
> v5) Updated documentation to add notes from Thomas M.
> ---
>  doc/abi.txt | 36 
>  1 file changed, 36 insertions(+)
>  create mode 100644 doc/abi.txt
> 
> diff --git a/doc/abi.txt b/doc/abi.txt
> new file mode 100644
> index 000..14be464
> --- /dev/null
> +++ b/doc/abi.txt
> @@ -0,0 +1,36 @@
> +ABI policy:
> + ABI versions are set at the time of major release labeling, and ABI may
> +change multiple times between the last labeling and the HEAD label of the git
> +tree without warning
> +
> + ABI versions, once released are available until such time as their
> +deprecation has been noted here for at least one major release cycle, after 
> it
> +has been tagged.  E.g. the ABI for DPDK 1.8 is shipped, and then the 
> decision to
> +remove it is made during the development of DPDK 1.9.  The decision will be
> +recorded here, shipped with the DPDK 1.9 release, and actually removed when 
> DPDK
> +1.10 ships.
> +
> + ABI versions may be deprecated in whole, or in part as needed by a given
> +update.
> +
> + Some ABI changes may be too significant to reasonably maintain multiple
> +versions of.  In those events ABI's may be updated without backward
> +compatibility provided.  The requirements for doing so are:
> + 1) At least 3 acknoweldgements of the need on the dpdk.org
> + 2) A full deprecation cycle must be made to offer downstream consumers
> +sufficient warning of the change.  E.g. if dpdk 2.0 is under development when
> +the change is proposed, a deprecation notice must be added to this file, and
> +released with dpdk 2.0.  Then the change may be incorporated for dpdk 2.1
> + 3) The LIBABIVER variable in the makefilei(s) where the ABI changes are
> +incorporated must be incremented in parallel with the ABI changes themselves
> +
> + Note that the above process for ABI deprecation should not be undertaken
> +lightly.  ABI stability is extreemely important for downstream consumers of 
> the
> +DPDK, especially when distributed in shared object form.  Every effort 
> should be
> +made to preserve ABI whenever possible.  For instance, reorganizing public
> +structure field for astetic or readability purposes should be avoided as it 
> will
> +cause ABI breakage.  Only significant (e.g. performance) reasons should be 
> seen
> +as cause to alter ABI.

[dpdk-dev] [PATCH v2 3/4] pmd: add support for DCB in SRIOV mode for ixgbe driver.

2015-01-19 14:02, Pawel Wodkowski:
> Add support for DCB in SRIOV mode. When no PFC is enabled this feature
> might be used as multiple queues for VF (up to 8 queues if VFs num is
> less or equal 16 or 4 if FVs num is less or equal 32).
> 
> The PF must initializes RX in ETH_MQ_RX_VMDQ_DCB and TX in
> ETH_MQ_TX_VMDQ_DCB.
> VF should initialize Rx in ETH_MQ_RX_DCB and Tx in ETH_MQ_TX_DCB to use
> multiple queues and/or DCB.
> 
> Signed-off-by: Pawel Wodkowski 

[...]

> --- a/lib/librte_pmd_ixgbe/ixgbe_pf.c
> +++ b/lib/librte_pmd_ixgbe/ixgbe_pf.c
> @@ -231,19 +231,19 @@ int ixgbe_pf_host_configure(struct rte_eth_dev *eth_dev)
>   }
>  
>   IXGBE_WRITE_REG(hw, IXGBE_GCR_EXT, gcr_ext);
> -IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie);
> + IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie);
>  
> -/*
> + /*
>* enable vlan filtering and allow all vlan tags through
>*/
> -vlanctrl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL);
> -vlanctrl |= IXGBE_VLNCTRL_VFE ; /* enable vlan filters */
> -IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlanctrl);
> + vlanctrl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL);
> + vlanctrl |= IXGBE_VLNCTRL_VFE ; /* enable vlan filters */
> + IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlanctrl);
>  
> -/* VFTA - enable all vlan filters */
> -for (i = 0; i < IXGBE_MAX_VFTA; i++) {
> -IXGBE_WRITE_REG(hw, IXGBE_VFTA(i), 0x);
> -}
> + /* VFTA - enable all vlan filters */
> + for (i = 0; i < IXGBE_MAX_VFTA; i++) {
> + IXGBE_WRITE_REG(hw, IXGBE_VFTA(i), 0x);
> + }

Please do not mix indent formatting with "real changes".
When looking at the history of these lines, it would be difficult to understand
that this patch doesn't make any real change to them. Having a dedicated cleanup commit is 
better.

Thanks
-- 
Thomas

[dpdk-dev] [PATCH v2 0/4] Integrate ethertype filter in igb/ixgbe driver to new API

> > v2 changes:
> >   change the return value if adding an existing filter from the filter's 
> > index to
> > negative value.
> > 
> > The patch set uses new filter_ctrl API to replace old ethertype filter APIs.
> > It uses new functions and structure to replace old ones in igb/ixgbe 
> > driver, new
> > commands to replace old ones in testpmd, and removes the old APIs.
> > 
> > Jingjing Wu (4):
> >   ixgbe: new functions replace old ones for ethertype filter
> >   e1000: new functions replace old ones for ethertype filter
> >   testpmd: new commands for ethertype filter
> >   ethdev: remove old APIs and structures of ethertype filter
> 
> Acked-by: Helin Zhang 

Applied

Thanks
-- 
Thomas

[dpdk-dev] [PATCH v2 2/4] ethdev: prevent changing of nb_q_per_pool in rte_eth_dev_check_mq_mode()

2015-01-20 Thread Wodkowski, PawelX

> -Original Message-
> From: Ouyang, Changchun
> Sent: Tuesday, January 20, 2015 2:33 AM
> To: Wodkowski, PawelX; dev at dpdk.org
> Cc: Ouyang, Changchun
> Subject: RE: [dpdk-dev] [PATCH v2 2/4] ethdev: prevent changing of
> nb_q_per_pool in rte_eth_dev_check_mq_mode()
> 
> 
> 
> > -Original Message-
> > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Pawel Wodkowski
> > Sent: Monday, January 19, 2015 9:02 PM
> > To: dev at dpdk.org
> > Subject: [dpdk-dev] [PATCH v2 2/4] ethdev: prevent changing of
> > nb_q_per_pool in rte_eth_dev_check_mq_mode()
> >
> > If SRIOV is used and device configuration does not use MQ the
> > RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool is set to 1 in
> > rte_eth_dev_check_mq_mode().
> > This is bad becouse of two reasons:
> > 1. Port reconfiguration from non-MQ mode to MQ mode is impossible 2.
> > Confguring RX and TX side in different way is impossible.
> >
> 
> This case is possible:
> rxmode.mq_mode is ETH_MQ_RX_VMDQ_RSS, and txmode.mq_mode is
> ETH_MQ_TX_NONE.
> 
but ETH_MQ_RX_NONE -> ETH_MQ_RX_VMDQ_RSS is not. 

I have 8 VFs
In testpmd

testpmd> port config all rxq 2
port config all rxq 2

testpmd> port start 0
port start 0

Configuring Port 0 (socket 0)
Fail to configure port 0
testpmd> port config all rxq 4
port config all rxq 4

testpmd> port start 0
port start 0

Configuring Port 0 (socket 0)
Fail to configure port 0
testpmd> port config all rxq 8
port config all rxq 8

testpmd> port start all
port start all

Configuring Port 0 (socket 0)
Fail to configure port 0
testpmd> port config all rxq 1
port config all rxq 1

testpmd> port start 0
port start 0

Configuring Port 0 (socket 0)
PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7ffec0ae9140 hw_ring=0x7ffec2c0bf00 
dma_addr=0x102c0bf00
PMD: set_tx_function(): Using full-featured tx code path
PMD: set_tx_function():  - txq_flags = 0 [IXGBE_SIMPLE_FLAGS=f01]
PMD: set_tx_function():  - tx_rs_thresh = 32 [RTE_PMD_IXGBE_TX_MAX_BURST=32]
PMD: ixgbe_dev_rx_queue_setup(): sw_ring=0x7ffec0ae88c0 hw_ring=0x7ffec2c1bf00 
dma_addr=0x102c1bf00
PMD: ixgbe_dev_rx_queue_setup(): Rx Burst Bulk Alloc Preconditions are 
satisfied. Rx Burst Bulk Alloc function will be used on port=0, queue=0.
PMD: ixgbe_dev_rx_queue_setup(): Vector rx enabled, please make sure RX burst 
size no less than 32.
Port 0: 00:1B:21:C7:33:B0
Checking link statuses...
Port 0 Link Up - speed 1 Mbps - full-duplex
Port 1 Link Down
Done
testpmd>

Please refer to RSS patch thread. I will post there second reply.

Pawel

[dpdk-dev] [PATCH] power: added missing extern keyword in rte_power.h

2015-01-20 Thread Pablo de Lara

rte_power_freq_min function did not include "extern" keyword,
causing linking errors.

Signed-off-by: Pablo de Lara 
Reported-by: Ildar Mustafin 
---
 lib/librte_power/rte_power.h |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/lib/librte_power/rte_power.h b/lib/librte_power/rte_power.h
index 9338069..7d57359 100644
--- a/lib/librte_power/rte_power.h
+++ b/lib/librte_power/rte_power.h
@@ -242,7 +242,7 @@ extern rte_power_freq_change_t rte_power_freq_max;
  *  - 0 on success without frequency changed.
  *  - Negative on error.
  */
-rte_power_freq_change_t rte_power_freq_min;
+extern rte_power_freq_change_t rte_power_freq_min;

 #ifdef __cplusplus
 }
-- 
1.7.4.1

[dpdk-dev] [PATCH v4 06/11] eal/linux/pci: Add functions for unmapping igb_uio resources

2015-01-20 Thread Qiu, Michael

On 1/19/2015 6:42 PM, Tetsuya Mukawa wrote:
> The patch adds functions for unmapping igb_uio resources. The patch is only
> for Linux and igb_uio environment. VFIO and BSD are not supported.
>
> v4:
> - Add paramerter checking.
> - Add header file to determine if hotplug can be enabled.
>
> Signed-off-by: Tetsuya Mukawa 
> ---
>  lib/librte_eal/common/Makefile  |  1 +
>  lib/librte_eal/common/include/rte_dev_hotplug.h | 44 +
>  lib/librte_eal/linuxapp/eal/eal_pci.c   | 38 +++
>  lib/librte_eal/linuxapp/eal/eal_pci_init.h  |  8 +++
>  lib/librte_eal/linuxapp/eal/eal_pci_uio.c   | 65 
> +
>  5 files changed, 156 insertions(+)
>  create mode 100644 lib/librte_eal/common/include/rte_dev_hotplug.h
>
> diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile
> index 52c1a5f..db7cc93 100644
> --- a/lib/librte_eal/common/Makefile
> +++ b/lib/librte_eal/common/Makefile
> @@ -41,6 +41,7 @@ INC += rte_eal_memconfig.h rte_malloc_heap.h
>  INC += rte_hexdump.h rte_devargs.h rte_dev.h
>  INC += rte_common_vect.h
>  INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h
> +INC += rte_dev_hotplug.h
>  
>  ifeq ($(CONFIG_RTE_INSECURE_FUNCTION_WARNING),y)
>  INC += rte_warnings.h
> diff --git a/lib/librte_eal/common/include/rte_dev_hotplug.h 
> b/lib/librte_eal/common/include/rte_dev_hotplug.h
> new file mode 100644
> index 000..b333e0f
> --- /dev/null
> +++ b/lib/librte_eal/common/include/rte_dev_hotplug.h
> @@ -0,0 +1,44 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2015 IGEL Co.,LTd.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + *   notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + *   notice, this list of conditions and the following disclaimer in
> + *   the documentation and/or other materials provided with the
> + *   distribution.
> + * * Neither the name of IGEL Co.,Ltd. nor the names of its
> + *   contributors may be used to endorse or promote products derived
> + *   from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _RTE_DEV_HOTPLUG_H_
> +#define _RTE_DEV_HOTPLUG_H_
> +
> +/*
> + * determine if hotplug can be enabled on the system
> + */
> +#if defined(RTE_LIBRTE_EAL_HOTPLUG) && defined(RTE_LIBRTE_EAL_LINUXAPP)

As you said, VFIO should not work with it, so does it need to add the
vfio check here?

Thanks,
Michael
> +#define ENABLE_HOTPLUG
> +#endif /* RTE_LIBRTE_EAL_HOTPLUG & RTE_LIBRTE_EAL_LINUXAPP */
> +
> +#endif /* _RTE_DEV_HOTPLUG_H_ */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c 
> b/lib/librte_eal/linuxapp/eal/eal_pci.c
> index 3d2d93c..52c464c 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
> @@ -137,6 +137,25 @@ pci_map_resource(void *requested_addr, int fd, off_t 
> offset, size_t size)
>   return mapaddr;
>  }
>  
> +#ifdef ENABLE_HOTPLUG
> +/* unmap a particular resource */
> +void
> +pci_unmap_resource(void *requested_addr, size_t size)
> +{
> + if (requested_addr == NULL)
> + return;
> +
> + /* Unmap the PCI memory resource of device */
> + if (munmap(requested_addr, size)) {
> + RTE_LOG(ERR, EAL, "%s(): cannot munmap(%p, 0x%lx): %s\n",
> + __func__, requested_addr, (unsigned long)size,
> + strerror(errno));
> + } else
> + RTE_LOG(DEBUG, EAL, "  PCI memory mapped at %p\n",
> + requested_addr);
> +}
> +#endif /* ENABLE_HOTPLUG */
> +
>  /* parse the "resource" sysfs file */
>  #define IORESOURCE_MEM  0x0200
>  
> @@ -510,6 +529,25 @@ pci_map_device(struct rte_pci_device *dev)
>   return 0;
>  }
>  
> +#ifdef ENABLE_HOTPLUG
> +static void
> +pci_unmap_device(struct rte_p

[dpdk-dev] [PATCH v6 5/6] ixgbe: Config VF RSS

2015-01-20 Thread Wodkowski, PawelX

> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Ouyang Changchun
> Sent: Monday, January 12, 2015 6:59 AM
> To: dev at dpdk.org
> Subject: [dpdk-dev] [PATCH v6 5/6] ixgbe: Config VF RSS
> 
> It needs config RSS and IXGBE_MRQC and IXGBE_VFPSRTYPE to enable VF RSS.
> 
> The psrtype will determine how many queues the received packets will 
> distribute
> to,
> and the value of psrtype should depends on both facet: max VF rxq number
> which
> has been negotiated with PF, and the number of rxq specified in config on 
> guest.
> 
> Signed-off-by: Changchun Ouyang 
> 
> Changes in v6:
>   - Raise an error for the case of ETH_16_POOLS in config vf rss, as the 
> previous
> logic have changed it into: ETH_32_POOLS.
> 
> Changes in v4:
>  - The number of rxq from config should be power of 2 and should not bigger
> than
> max VF rxq number(negotiated between guest and host).
> 
> ---
>  lib/librte_pmd_ixgbe/ixgbe_pf.c   |  15 ++
>  lib/librte_pmd_ixgbe/ixgbe_rxtx.c | 102
> +-
>  2 files changed, 105 insertions(+), 12 deletions(-)
> 
> diff --git a/lib/librte_pmd_ixgbe/ixgbe_pf.c b/lib/librte_pmd_ixgbe/ixgbe_pf.c
> index dbda9b5..93f6e43 100644
> --- a/lib/librte_pmd_ixgbe/ixgbe_pf.c
> +++ b/lib/librte_pmd_ixgbe/ixgbe_pf.c
> @@ -187,6 +187,21 @@ int ixgbe_pf_host_configure(struct rte_eth_dev
> *eth_dev)
>   IXGBE_WRITE_REG(hw, IXGBE_MPSAR_LO(hw->mac.num_rar_entries),
> 0);
>   IXGBE_WRITE_REG(hw, IXGBE_MPSAR_HI(hw->mac.num_rar_entries),
> 0);
> 
> + /*
> +  * VF RSS can support at most 4 queues for each VF, even if
> +  * 8 queues are available for each VF, it need refine to 4
> +  * queues here due to this limitation, otherwise no queue
> +  * will receive any packet even RSS is enabled.
> +  */
> + if (eth_dev->data->dev_conf.rxmode.mq_mode ==
> ETH_MQ_RX_VMDQ_RSS) {
> + if (RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool == 8) {
> + RTE_ETH_DEV_SRIOV(eth_dev).active =
> ETH_32_POOLS;
> + RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool = 4;
> + RTE_ETH_DEV_SRIOV(eth_dev).def_pool_q_idx =
> + dev_num_vf(eth_dev) * 4;
> + }
> + }
> +

I did not look at your patches before, but I think you are messing with things 
that should not be changed:

Why are you changing those values? They are set up during ixgbe_pf_host_init(). 
The limitation you are
describing is only RSS related. If there is a reconfiguration from 
ETH_MQ_RX_VMDQ_RSS to another mode, those values need to be re-evaluated. If you 
find this
kind of limitation, you should handle it during the RSS part of the configuration. Or, if 
your way is the right way,
you should explicitly make a separate function that will re-evaluate those 
parameters each time.

Second issue with this code is that the nb_q_per_pool is changed from:
ixgbe_pf_host_configure() -> ixgbe_dev_start() -> rte_eth_dev_start()
and
rte_eth_dev_check_vf_rss_rxq_num() -> rte_eth_dev_check_mq_mode() -> 
rte_eth_dev_configure()

Which one is the right one? If both, why are they calculated twice?

I don't think that the rte_eth_dev_data::sriov field should be changed at all - it 
holds the current SRIOV capabilities.
If this changes during runtime, there is no point in having this field at all, and 
there should be some kind of "siov_get()"
function that will calculate and return those parameters dynamically.

Please refer also to 
for further issues.

I think this patchset should not be applied.

>   /* set VMDq map to default PF pool */
>   hw->mac.ops.set_vmdq(hw, 0,
> RTE_ETH_DEV_SRIOV(eth_dev).def_vmdq_idx);
> 
> diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
> b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
> index f69abda..20627df 100644
> --- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
> +++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
> @@ -3327,6 +3327,67 @@ ixgbe_alloc_rx_queue_mbufs(struct igb_rx_queue
> *rxq)
>  }
> 
>  static int
> +ixgbe_config_vf_rss(struct rte_eth_dev *dev)
> +{
> + struct ixgbe_hw *hw;
> + uint32_t mrqc;
> +
> + ixgbe_rss_configure(dev);
> +
> + hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +
> + /* MRQC: enable VF RSS */
> + mrqc = IXGBE_READ_REG(hw, IXGBE_MRQC);
> + mrqc &= ~IXGBE_MRQC_MRQE_MASK;
> + switch (RTE_ETH_DEV_SRIOV(dev).active) {
> + case ETH_64_POOLS:
> + mrqc |= IXGBE_MRQC_VMDQRSS64EN;
> + break;
> +
> + case ETH_32_POOLS:
> + mrqc |= IXGBE_MRQC_VMDQRSS32EN;
> + break;
> +
> + default:
> + PMD_INIT_LOG(ERR, "Invalid pool number in IOV mode with
> VMDQ RSS");
> + return -EINVAL;
> + }
> +
> + IXGBE_WRITE_REG(hw, IXGBE_MRQC, mrqc);
> +
> + return 0;
> +}
> +
> +static int
> +ixgbe_config_vf_default(struct rte_eth_dev *dev)
> +{
> + struct ixgbe_hw *hw =
> + IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +
> + switch (RTE_ETH_DEV_SRI

[dpdk-dev] [RFC 01/17] mbuf: add definitions of unified packet types

2015-01-20 Thread Olivier MATZ

Hi Helin,

On 01/20/2015 03:28 AM, Zhang, Helin wrote:
>> Another question I've asked several times[1][2] : what does having
>> RTE_PTYPE_TUNNEL_IP mean? What fields are checked by the hardware (or
>> the driver) and what fields should be checked by the application?
>> Are you sure that all the drivers (ixgbe, i40e, vmxnet3, enic) check the same
>> fields? (ethertype, ip version, ip len correct, ip checksum correct, flags, 
>> ...)
> RTE_PTYPE_TUNNEL_IP means hardware recognizes the received packet as an
> IP-in-IP packet.
> All the fields are filled by PMD which is recognized by hardware. The 
> application
> can just use it which can save some cpu cycles to recognize the packet type by
> software.
> Drivers is responsible for filling with correct values according to the 
> packet types
> recognized by its hardware. Different PMDs may fill with different values 
> based on
> different capabilities.

Sorry, that does not answer my question.

Let's take a simple example. Imagine a hardware-1 that is able to
recognize an IP packet by checking the ethertype and that the IP
version is set to 4.
Another hardware-2 recognizes an IP packet by checking the ethertype,
the IP version and that the IP length is correct compared to m_len(m).

For the same packet, both hardwares will return RTE_PTYPE_L3_IPV4, but
they don't do the same checks on the packet. As I want my application
to behave exactly the same whatever the hardware, I need to know what
checks are done in hardware, so I can decide what checks must be
done in my application.

Example of definition: RTE_PTYPE_L3_IPV4 means that ethertype is
0x0800 and IP.version is 4.

It means that I can skip these 2 tests in my application if I have
this packet_type, but all other checks must be done in software
(ip length, flags, checksum, ...)

For each packet type, we need a definition like above, and we must
check that all drivers setting a packet type behave like described.

Regards,
Olivier

[dpdk-dev] [RFC PATCH 1/5] ethdev: add rx interrupt enable/disable functions


Signed-off-by: Danny Zhou 
---
 lib/librte_ether/rte_ethdev.c | 45 ++
 lib/librte_ether/rte_ethdev.h | 57 +++
 2 files changed, 102 insertions(+)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 077d430..65ddd01 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -2825,6 +2825,51 @@ _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
}
rte_spinlock_unlock(&rte_eth_dev_cb_lock);
 }
+
+int
+rte_eth_dev_rx_queue_intr_enable(uint8_t port_id,
+   uint16_t queue_id)
+{
+   struct rte_eth_dev *dev;
+
+   if (port_id >= nb_ports) {
+   PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+   return (-ENODEV);
+   }
+
+   dev = &rte_eth_devices[port_id];
+   if (dev == NULL) {
+   PMD_DEBUG_TRACE("Invalid port device\n");
+   return (-ENODEV);
+   }
+
+   FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_enable, -ENOTSUP);
+   (*dev->dev_ops->rx_queue_intr_enable)(dev, queue_id);
+   return 0;
+}
+
+int
+rte_eth_dev_rx_queue_intr_disable(uint8_t port_id,
+   uint16_t queue_id)
+{
+   struct rte_eth_dev *dev;
+
+   if (port_id >= nb_ports) {
+   PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+   return (-ENODEV);
+   }
+
+   dev = &rte_eth_devices[port_id];
+   if (dev == NULL) {
+   PMD_DEBUG_TRACE("Invalid port device\n");
+   return (-ENODEV);
+   }
+
+   FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_disable, -ENOTSUP);
+   (*dev->dev_ops->rx_queue_intr_disable)(dev, queue_id);
+   return 0;
+}
+
 #ifdef RTE_NIC_BYPASS
 int rte_eth_dev_bypass_init(uint8_t port_id)
 {
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index ce0528f..9e02a0c 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -848,6 +848,8 @@ struct rte_eth_fdir {
 struct rte_intr_conf {
/** enable/disable lsc interrupt. 0 (default) - disable, 1 enable */
uint16_t lsc;
+   /** enable/disable rxq interrupt. 0 (default) - disable, 1 enable */
+   uint16_t rxq;
 };

 /**
@@ -1117,6 +1119,14 @@ typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev 
*dev,
const struct rte_eth_txconf *tx_conf);
 /**< @internal Setup a transmit queue of an Ethernet device. */

+typedef int (*eth_rx_enable_intr_t)(struct rte_eth_dev *dev,
+   uint16_t rx_queue_id);
+/**< @internal Enable interrupt of a receive queue of an Ethernet device. */
+
+typedef int (*eth_rx_disable_intr_t)(struct rte_eth_dev *dev,
+   uint16_t rx_queue_id);
+/**< @internal Disable interrupt of a receive queue of an Ethernet device. */
+
 typedef void (*eth_queue_release_t)(void *queue);
 /**< @internal Release memory resources allocated by given RX/TX queue. */

@@ -1467,6 +1477,8 @@ struct eth_dev_ops {
eth_queue_start_t  tx_queue_start;/**< Start TX for a queue.*/
eth_queue_stop_t   tx_queue_stop;/**< Stop TX for a queue.*/
eth_rx_queue_setup_t   rx_queue_setup;/**< Set up device RX queue.*/
+   eth_rx_enable_intr_t   rx_queue_intr_enable; /**< Enable Rx queue 
interrupt. */
+   eth_rx_disable_intr_t  rx_queue_intr_disable; /**< Disable Rx queue 
interrupt.*/
eth_queue_release_trx_queue_release;/**< Release RX queue.*/
eth_rx_queue_count_t   rx_queue_count; /**< Get Rx queue count. */
eth_rx_descriptor_done_t   rx_descriptor_done;  /**< Check rxd DD bit */
@@ -2836,6 +2848,51 @@ void _rte_eth_dev_callback_process(struct rte_eth_dev 
*dev,
enum rte_eth_event_type event);

 /**
+ * When there is no rx packet coming in Rx Queue for a long time, we can
+ * sleep lcore related to RX Queue for power saving, and enable rx interrupt
+ * to be triggered when rx packect arrives.
+ *
+ * The rte_eth_dev_rx_queue_intr_enable() function enables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue from which to retrieve input packets.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ * that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_queue_intr_enable(uint8_t port_id,
+   uint16_t queue_id);
+
+/**
+ * When lcore wakes up from rx interrupt indicating packet coming, disable rx
+ * interrupt and returns to polling mode.
+ *
+ * The rte_eth_dev_rx_queue_intr_disable() function d

[dpdk-dev] [RFC PATCH 3/5] eal: add per rx queue interrupt handling based on VFIO


Signed-off-by: Danny Zhou 
---
 lib/librte_eal/common/include/rte_eal.h|   9 ++
 lib/librte_eal/linuxapp/eal/eal_interrupts.c   | 120 -
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c |  11 +-
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |   3 +
 4 files changed, 137 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_eal.h 
b/lib/librte_eal/common/include/rte_eal.h
index f4ecd2e..5f31aa5 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -150,6 +150,15 @@ int rte_eal_iopl_init(void);
  *   - On failure, a negative error value.
  */
 int rte_eal_init(int argc, char **argv);
+
+/**
+ * @param port_id
+ *   the port id
+ * @return
+ *   - On success, return 0
+ */
+int rte_eal_wait_rx_intr(uint8_t port_id, uint8_t queue_id);
+
 /**
  * Usage function typedef used by the application usage function.
  *
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c 
b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index dc2668a..1be4ba7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -64,6 +64,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "eal_private.h"
 #include "eal_vfio.h"
@@ -127,6 +128,7 @@ static pthread_t intr_thread;
 #ifdef VFIO_PRESENT

 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int) * 
(VFIO_MAX_QUEUE_ID + 1))

 /* enable legacy (INTx) interrupts */
 static int
@@ -293,7 +295,7 @@ vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 static int
 vfio_enable_msix(struct rte_intr_handle *intr_handle) {
int len, ret;
-   char irq_set_buf[IRQ_SET_BUF_LEN];
+   char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
struct vfio_irq_set *irq_set;
int *fd_ptr;

@@ -301,12 +303,13 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {

irq_set = (struct vfio_irq_set *) irq_set_buf;
irq_set->argsz = len;
-   irq_set->count = 1;
+   irq_set->count = VFIO_MAX_QUEUE_ID + 1;
irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 
VFIO_IRQ_SET_ACTION_TRIGGER;
irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
irq_set->start = 0;
fd_ptr = (int *) &irq_set->data;
-   *fd_ptr = intr_handle->fd;
+   memcpy(fd_ptr, intr_handle->queue_fd, sizeof(intr_handle->queue_fd));
+   fd_ptr[VFIO_MAX_QUEUE_ID] = intr_handle->fd;

ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

@@ -323,7 +326,7 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
irq_set->count = 1;
irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-   irq_set->start = 0;
+   irq_set->start = VFIO_MAX_QUEUE_ID;

ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

@@ -332,6 +335,7 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
intr_handle->fd);
return -1;
}
+
return 0;
 }

@@ -339,7 +343,7 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 static int
 vfio_disable_msix(struct rte_intr_handle *intr_handle) {
struct vfio_irq_set *irq_set;
-   char irq_set_buf[IRQ_SET_BUF_LEN];
+   char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
int len, ret;

len = sizeof(struct vfio_irq_set);
@@ -824,3 +828,109 @@ rte_eal_intr_init(void)
return -ret;
 }

+static int
+eal_intr_process_rx_interrupts(uint8_t port_id, struct epoll_event *events, 
int nfds)
+{
+   int n, bytes_read;
+   union rte_intr_read_buffer buf;
+   struct rte_intr_handle intr_handle = 
rte_eth_devices[port_id].pci_dev->intr_handle;
+
+   for (n = 0; n < nfds; n++) {
+   /* set the length to be read dor different handle type */
+   switch (intr_handle.type) {
+   case RTE_INTR_HANDLE_UIO:
+   bytes_read = sizeof(buf.uio_intr_count);
+   break;
+   case RTE_INTR_HANDLE_ALARM:
+   bytes_read = sizeof(buf.timerfd_num);
+   break;
+#ifdef VFIO_PRESENT
+   case RTE_INTR_HANDLE_VFIO_MSIX:
+   case RTE_INTR_HANDLE_VFIO_MSI:
+   case RTE_INTR_HANDLE_VFIO_LEGACY:
+   bytes_read = sizeof(buf.vfio_intr_count);
+   break;
+#endif
+   default:
+   bytes_read = 1;
+   break;
+   }
+
+   /**
+   * read out to clear the ready-to-be-read flag
+   * for epoll_wait.
+   */
+   bytes_read = read(events[n].data.fd, &buf, bytes_read);
+   if (bytes_read < 0)
+   RTE_LOG(ERR, EAL, "Error reading from file "
+

[dpdk-dev] [RFC PATCH 0/5] Introduce low-latency one-shot rx interrupt into DPDK with polling/interrupt switch control example

The DPDK interrupt notification/handling mechanism is based on UIO, with
the following limitations:
1) It is designed to handle only the LSC interrupt, using an inefficient
suspended-pthread wakeup procedure (e.g. UIO wakes up the LSC interrupt
handling thread, which then wakes up the DPDK polling thread). This
introduces non-deterministic wakeup latency for the DPDK polling thread,
as well as packet latency if it is used to handle Rx interrupts.
2) UIO only supports a single interrupt vector, which has to be shared
by the LSC interrupt and the interrupts assigned to dedicated rx queues.

This patchset includes the following features:
1) Enable one-shot rx queue interrupt in ixgbe PMD for PF as well as VF.
2) Build on top of the VFIO mechanism instead of UIO, so it could support
up to 64 interrupt vectors for rx queue interrupts.
3) Have one DPDK polling thread handle each Rx queue interrupt with a
dedicated VFIO eventfd, which eliminates non-deterministic pthread wakeup
latency in user space.
4) Demonstrate interrupt control APIs and userspace NAPI-like
polling/interrupt switch algorithms in the L3fwd-power example.

Danny Zhou (5):
  ethdev: add rx interrupt enable/disable functions
  ixgbe: enable rx queue interrupts
  eal: add per rx queue interrupt handling based on VFIO
  L3fwd-power: enable one-shot rx interrupt and polling/interrupt mode  
  switch
  ixgbe: enable rx queue interrupts for VF

 examples/l3fwd-power/main.c| 171 +++---
 lib/librte_eal/common/include/rte_eal.h|   9 +
 lib/librte_eal/linuxapp/eal/eal_interrupts.c   | 189 ---
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c |  11 +-
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |   4 +
 lib/librte_ether/rte_ethdev.c  |  45 +++
 lib/librte_ether/rte_ethdev.h  |  57 
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c| 356 +
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h|   6 +
 9 files changed, 765 insertions(+), 83 deletions(-)

-- 
1.8.1.4

[dpdk-dev] [RFC PATCH 2/5] ixgbe: enable rx queue interrupts


Signed-off-by: Danny Zhou 
---
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c | 203 
 1 file changed, 203 insertions(+)

diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c 
b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
index 3fc3738..1d694c5 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
@@ -173,6 +173,7 @@ static int ixgbe_dev_rss_reta_query(struct rte_eth_dev *dev,
uint16_t reta_size);
 static void ixgbe_dev_link_status_print(struct rte_eth_dev *dev);
 static int ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_action(struct rte_eth_dev *dev);
 static void ixgbe_dev_interrupt_handler(struct rte_intr_handle *handle,
@@ -217,6 +218,11 @@ static int ixgbe_mirror_rule_set(struct rte_eth_dev *dev,
 static int ixgbe_mirror_rule_reset(struct rte_eth_dev *dev,
uint8_t rule_id);

+static int ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t 
queue_id);
+static int ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t 
queue_id);
+static void ixgbe_set_ivar(struct ixgbe_hw *hw, s8 direction, u8 queue, u8 
msix_vector);
+static void ixgbe_configure_msix(struct  ixgbe_hw *hw);
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
uint16_t queue_idx, uint16_t tx_rate);
 static int ixgbe_set_vf_rate_limit(struct rte_eth_dev *dev, uint16_t vf,
@@ -332,6 +338,8 @@ static struct eth_dev_ops ixgbe_eth_dev_ops = {
.tx_queue_start   = ixgbe_dev_tx_queue_start,
.tx_queue_stop= ixgbe_dev_tx_queue_stop,
.rx_queue_setup   = ixgbe_dev_rx_queue_setup,
+   .rx_queue_intr_enable = ixgbe_dev_rx_queue_intr_enable,
+   .rx_queue_intr_disable = ixgbe_dev_rx_queue_intr_disable,
.rx_queue_release = ixgbe_dev_rx_queue_release,
.rx_queue_count   = ixgbe_dev_rx_queue_count,
.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
@@ -1481,6 +1489,9 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
/* configure PF module if SRIOV enabled */
ixgbe_pf_host_configure(dev);

+   /* confiugre msix for  sleep until  rx interrupt */
+   ixgbe_configure_msix(hw);
+
/* initialize transmission unit */
ixgbe_dev_tx_init(dev);

@@ -1550,6 +1561,10 @@ skip_link_setup:
if (dev->data->dev_conf.intr_conf.lsc != 0)
ixgbe_dev_lsc_interrupt_setup(dev);

+   /* check if rxq interrupt is enabled */
+   if (dev->data->dev_conf.intr_conf.rxq != 0)
+   ixgbe_dev_rxq_interrupt_setup(dev);
+
/* resume enabled intr since hw reset */
ixgbe_enable_intr(dev);

@@ -2212,6 +2227,28 @@ ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev)
return 0;
 }

+/**
+ * It clears the interrupt causes and enables the interrupt.
+ * It will be called once only during nic initialized.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+static int
+ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+   struct ixgbe_interrupt *intr =
+   IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+   intr->mask |= IXGBE_EICR_RTX_QUEUE;
+
+   return 0;
+}
+
 /*
  * It reads ICR and sets flag (IXGBE_EICR_LSC) for the link_update.
  *
@@ -3502,6 +3539,172 @@ ixgbe_mirror_rule_reset(struct rte_eth_dev *dev, 
uint8_t rule_id)
return 0;
 }

+static int
+ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+   u32 mask;
+   struct ixgbe_hw *hw =
+   IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+   struct ixgbe_interrupt *intr =
+   IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+   if (queue_id < 16) {
+   ixgbe_disable_intr(hw);
+   intr->mask |= (1 << queue_id);
+   ixgbe_enable_intr(dev);
+   } else if (queue_id < 32) {
+   mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(0));
+   mask &= (1 << queue_id);
+   IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+   } else if (queue_id < 64) {
+   mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(1));
+   mask &= (1 << (queue_id - 32));
+   IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+   }
+
+   return 0;
+}
+
+static int
+ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+   u32 mask;
+   struct ixgbe_hw *hw =
+   IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+   struct ixgbe_interrupt *intr =
+   IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+   if (queue_id < 16) {
+   ixgbe_disable_intr(hw);
+   intr->mask &= ~(1 << queue_id);
+   ixgbe_

[dpdk-dev] [RFC PATCH 4/5] L3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch


Signed-off-by: Danny Zhou 
---
 examples/l3fwd-power/main.c | 127 ++--
 1 file changed, 98 insertions(+), 29 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index f6b55b9..71f1d90 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -75,12 +75,13 @@
 #include 
 #include 
 #include 
+#include 

 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1

 #define MAX_PKT_BURST 32

-#define MIN_ZERO_POLL_COUNT 5
+#define MIN_ZERO_POLL_COUNT 10

 /* around 100ms at 2 Ghz */
 #define TIMER_RESOLUTION_CYCLES   2ULL
@@ -188,6 +189,9 @@ struct lcore_rx_queue {
 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
 #define MAX_RX_QUEUE_PER_PORT 128

+#define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16
+
+
 #define MAX_LCORE_PARAMS 1024
 struct lcore_params {
uint8_t port_id;
@@ -214,7 +218,7 @@ static uint16_t nb_lcore_params = 
sizeof(lcore_params_array_default) /

 static struct rte_eth_conf port_conf = {
.rxmode = {
-   .mq_mode= ETH_MQ_RX_RSS,
+   .mq_mode = ETH_MQ_RX_RSS,
.max_rx_pkt_len = ETHER_MAX_LEN,
.split_hdr_size = 0,
.header_split   = 0, /**< Header Split disabled */
@@ -226,11 +230,15 @@ static struct rte_eth_conf port_conf = {
.rx_adv_conf = {
.rss_conf = {
.rss_key = NULL,
-   .rss_hf = ETH_RSS_IP,
+   .rss_hf = ETH_RSS_UDP,
},
},
.txmode = {
-   .mq_mode = ETH_DCB_NONE,
+   .mq_mode = ETH_MQ_TX_NONE,
+   },
+   .intr_conf = {
+   .lsc = 1,
+   .rxq = 1, /**< rxq interrupt feature enabled */
},
 };

@@ -402,7 +410,6 @@ power_timer_cb(__attribute__((unused)) struct rte_timer 
*tim,
/* accumulate total execution time in us when callback is invoked */
sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
(float)SCALING_PERIOD;
-
/**
 * check whether need to scale down frequency a step if it sleep a lot.
 */
@@ -707,22 +714,20 @@ l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid,

 }

-#define SLEEP_GEAR1_THRESHOLD100
-#define SLEEP_GEAR2_THRESHOLD1000
+#define MINIMUM_SLEEP_TIME 1
+#define SUSPEND_THRESHOLD  300

 static inline uint32_t
 power_idle_heuristic(uint32_t zero_rx_packet_count)
 {
-   /* If zero count is less than 100, use it as the sleep time in us */
-   if (zero_rx_packet_count < SLEEP_GEAR1_THRESHOLD)
-   return zero_rx_packet_count;
-   /* If zero count is less than 1000, sleep time should be 100 us */
-   else if ((zero_rx_packet_count >= SLEEP_GEAR1_THRESHOLD) &&
-   (zero_rx_packet_count < SLEEP_GEAR2_THRESHOLD))
-   return SLEEP_GEAR1_THRESHOLD;
-   /* If zero count is greater than 1000, sleep time should be 1000 us */
-   else if (zero_rx_packet_count >= SLEEP_GEAR2_THRESHOLD)
-   return SLEEP_GEAR2_THRESHOLD;
+   /* If zero count is less than 100,  sleep 1us */
+   if (zero_rx_packet_count < SUSPEND_THRESHOLD)
+   return MINIMUM_SLEEP_TIME;
+   /* If zero count is less than 1000, sleep 100 us which is the minimum 
latency
+   switching from C3/C6 to C0
+   */
+   else
+   return SUSPEND_THRESHOLD;

return 0;
 }
@@ -762,6 +767,35 @@ power_freq_scaleup_heuristic(unsigned lcore_id,
return FREQ_CURRENT;
 }

+/**
+ * force polling thread sleep until one-shot rx interrupt triggers
+ * @param port_id
+ *  Port id.
+ * @param queue_id
+ *  Rx queue id.
+ * @return
+ *  0 on success
+ */
+static int
+sleep_until_rx_interrupt(uint8_t port_id, uint8_t queue_id)
+{
+   /* Enable one-shot rx interrupt */
+   rte_eth_dev_rx_queue_intr_enable(port_id, queue_id);
+
+   RTE_LOG(INFO, L3FWD_POWER,
+   "lcore %u sleeps until interrupt on port%d,rxq%d triggers\n",
+   rte_lcore_id(), port_id, queue_id);
+   rte_eal_wait_rx_intr(port_id, queue_id);
+   RTE_LOG(INFO, L3FWD_POWER,
+   "lcore %u is waked up from rx interrupt on port%d,rxq%d\n",
+   rte_lcore_id(), port_id, queue_id);
+
+   /* Disable one-shot rx interrupt */
+   rte_eth_dev_rx_queue_intr_disable(port_id, queue_id);
+
+   return 0;
+}
+
 /* main processing loop */
 static int
 main_loop(__attribute__((unused)) void *dummy)
@@ -775,7 +809,6 @@ main_loop(__attribute__((unused)) void *dummy)
struct lcore_conf *qconf;
struct lcore_rx_queue *rx_queue;
enum freq_scale_hint_t lcore_scaleup_hint;
-
uint32_t lcore_rx_idle_count = 0;
uint32_t lcore_idle_hint = 0;

@@ -835,6 +868,8 @@ main_loop(__attribute__((unused)) void *dummy)
prev_tsc_power = cur_tsc_power;

[dpdk-dev] [RFC PATCH 5/5] ixgbe: enable rx queue interrupts for VF

This patch enables rx queue interrupts for ixgbevf with the following changes:
1) Configure ixgbevf rx queue interrupts
2) Initialize ixgbevf devices in L3fwd-power appropriately
3) Fix VFIO interrupt vector settings

Signed-off-by: Yong Liu 
Signed-off-by: Danny Zhou 
---
 examples/l3fwd-power/main.c|  46 +--
 lib/librte_eal/linuxapp/eal/eal_interrupts.c   |  83 ++-
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |   1 +
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c| 153 +
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h|   6 +
 5 files changed, 233 insertions(+), 56 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index 71f1d90..3262db2 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -237,7 +237,6 @@ static struct rte_eth_conf port_conf = {
.mq_mode = ETH_MQ_TX_NONE,
},
.intr_conf = {
-   .lsc = 1,
.rxq = 1, /**< rxq interrupt feature enabled */
},
 };
@@ -413,15 +412,19 @@ power_timer_cb(__attribute__((unused)) struct rte_timer 
*tim,
/**
 * check whether need to scale down frequency a step if it sleep a lot.
 */
-   if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD)
-   rte_power_freq_down(lcore_id);
+   if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
+   if (rte_power_freq_down)
+   rte_power_freq_down(lcore_id);
+   }
else if ( (unsigned)(stats[lcore_id].nb_rx_processed /
-   stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST)
+   stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
/**
 * scale down a step if average packet per iteration less
 * than expectation.
 */
-   rte_power_freq_down(lcore_id);
+   if (rte_power_freq_down)
+   rte_power_freq_down(lcore_id);
+   }

/**
 * initialize another timer according to current frequency to ensure
@@ -946,10 +949,14 @@ start_rx:
rx_queue->freq_up_hint;
}

-   if (lcore_scaleup_hint == FREQ_HIGHEST)
-   rte_power_freq_max(lcore_id);
-   else if (lcore_scaleup_hint == FREQ_HIGHER)
-   rte_power_freq_up(lcore_id);
+   if (lcore_scaleup_hint == FREQ_HIGHEST) {
+   if (rte_power_freq_max)
+   rte_power_freq_max(lcore_id);
+   }
+   else if (lcore_scaleup_hint == FREQ_HIGHER) {
+   if (rte_power_freq_up)
+   rte_power_freq_up(lcore_id);
+   }
} else {
/**
 * All Rx queues empty in recent consecutive polls,
@@ -1546,6 +1553,7 @@ main(int argc, char **argv)
unsigned lcore_id;
uint64_t hz;
uint32_t n_tx_queue, nb_lcores;
+   uint32_t dev_rxq_num, dev_txq_num;
uint8_t portid, nb_rx_queue, queue, socketid;

/* catch SIGINT and restore cpufreq governor to ondemand */
@@ -1595,10 +1603,18 @@ main(int argc, char **argv)
printf("Initializing port %d ... ", portid );
fflush(stdout);

+   rte_eth_dev_info_get(portid, &dev_info);
+   dev_rxq_num = dev_info.max_rx_queues;
+   dev_txq_num = dev_info.max_tx_queues;
+
nb_rx_queue = get_port_n_rx_queues(portid);
+   if (nb_rx_queue > dev_rxq_num)
+   rte_exit(EXIT_FAILURE, "Cannot configure not existed 
rxq: "
+   "port=%d\n", portid);
+
n_tx_queue = nb_lcores;
-   if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-   n_tx_queue = MAX_TX_QUEUE_PER_PORT;
+   if (n_tx_queue > dev_txq_num)
+   n_tx_queue = dev_txq_num;
printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
nb_rx_queue, (unsigned)n_tx_queue );
ret = rte_eth_dev_configure(portid, nb_rx_queue,
@@ -1622,6 +1638,9 @@ main(int argc, char **argv)
if (rte_lcore_is_enabled(lcore_id) == 0)
continue;

+   if (queueid >= dev_txq_num)
+   continue;
+
if (numa_on)
socketid = \
(uint8_t)rte_lcore_to_socket_id(lcore_id);
@@ -1656,8 +1675,9 @@ main(int argc, char **argv)
/* init power management library */
ret = rte_power_init(lcore_id);
if (re

[dpdk-dev] [PATCH v2 17/17] libte_acl: remove unused macros.

Hi Thomas,

> -Original Message-
> From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com]
> Sent: Monday, January 19, 2015 5:18 PM
> To: Ananyev, Konstantin
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 17/17] libte_acl: remove unused macros.
> 
> 2015-01-12 19:16, Konstantin Ananyev:
> >  /*
> > + * ACL RT structure is a set of multibit tries (with stride == 8)
> > + * represented by an array of transitions. The next position is calculated
> > + * based on the current position and the input byte.
> > + * Each transition is 64 bit value with the following format:
> > + * | node_type_specific : 32 | node_type : 3 | node_addr : 29 |
> > + * For all node types except RTE_ACL_NODE_MATCH, node_addr is an index
> > + * to the start of the node in the transtions array.
> > + * Few different node types are used:
> > + * RTE_ACL_NODE_MATCH:
> > + * node_addr value is and index into an array that contains the return 
> > value
> > + * and its priority for each category.
> > + * Upper 32 bits of the transtion value are not used for that node type.
> > + * RTE_ACL_NODE_QRANGE:
> > + * that node consist of up to 5 transitions.
> > + * Upper 32 bits are interpreted as 4 signed character values which
> > + * are ordered from smallest(INT8_MIN) to largest (INT8_MAX).
> > + * These values define 5 ranges:
> > + * INT8_MIN <= range[0]  <= ((int8_t *)&transition)[4]
> > + * ((int8_t *)&transition)[4] < range[1] <= ((int8_t *)&transition)[5]
> > + * ((int8_t *)&transition)[5] < range[2] <= ((int8_t *)&transition)[6]
> > + * ((int8_t *)&transition)[6] < range[3] <= ((int8_t *)&transition)[7]
> > + * ((int8_t *)&transition)[7] < range[4] <= INT8_MAX
> > + * So for input byte value within range[i] i-th transition within that node
> > + * will be used.
> > + * RTE_ACL_NODE_SINGLE:
> > + * always transitions to the same node regardless of the input value.
> > + * RTE_ACL_NODE_DFA:
> > + * that node consits of up to 256 transitions.
> > + * In attempt to conserve space all transitions are divided into 4 
> > consecutive
> > + * groups, by 64 transitions per group:
> > + * group64[i] contains transitions[i * 64, .. i * 64 + 63].
> > + * Upper 32 bits are interpreted as 4 unsigned character values one per 
> > group,
> > + * which contain index to the start of the given group within the node.
> > + * So to calculate transition index within the node for given input byte 
> > value:
> > + * input_byte - ((uint8_t *)&transition)[4 + input_byte / 64].
> > + */
> 
> It's maybe an error. You were only supposed to remove some macros in this 
> patch.

Ah yes, I added some comments about ACL internal layout.
Thought it might be useful.
Forgot to add it into patch description. 
Are you saying I need to split it into 2 patches, or is it ok like that?
Konstantin

> 
> --
> Thomas

[dpdk-dev] [PATCH v2 00/17] ACL: New AVX2 classify method and several other enhancements.



> -Original Message-
> From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com]
> Sent: Monday, January 19, 2015 5:16 PM
> To: Ananyev, Konstantin
> Cc: dev at dpdk.org; Neil Horman
> Subject: Re: [dpdk-dev] [PATCH v2 00/17] ACL: New AVX2 classify method and 
> several other enhancements.
> 
> 2015-01-14 13:39, Neil Horman:
> > On Mon, Jan 12, 2015 at 07:16:04PM +, Konstantin Ananyev wrote:
> > > v2 changes:
> > > - When build with the compilers that don't support AVX2 instructions,
> > > make rte_acl_classify_avx2() do nothing and return an error.
> > > - Remove unneeded 'ifdef __AVX2__' in acl_run_avx2.*.
> > > - Reorder order of patches in the set, to keep RTE_LIBRTE_ACL_STANDALONE=y
> > > always buildable.
> > >
> > > This patch series contain several fixes and enhancements for ACL library.
> > > See complete list below.
> > > Two main changes that are externally visible:
> > > - Introduce new classify method:  RTE_ACL_CLASSIFY_AVX2.
> > > It uses AVX2 instructions and 256 bit wide data types
> > > to perform internal trie traversal.
> > > That helps to increase classify() throughput.
> > > This method is selected as default one on CPUs that supports AVX2.
> > > - Introduce new field in the build config structure: max_size.
> > > It specifies maximum size that internal RT structure for given context
> > > can reach.
> > > The purpose of that is to allow user to decide about space/performance 
> > > trade-off
> > > (faster classify() vs less space for RT internal structures)
> > > for each given set of rules.
> > >
> > > Konstantin Ananyev (17):
> > >   fix fix compilation issues with RTE_LIBRTE_ACL_STANDALONE=y
> > >   app/test: few small fixes fot test_acl.c
> > >   librte_acl: make data_indexes long enough to survive idle transitions.
> > >   librte_acl: remove build phase heuristsic with negative perfomance
> > > effect.
> > >   librte_acl: fix a bug at build phase that can cause matches beeing
> > > overwirtten.
> > >   librte_acl: introduce DFA nodes compression (group64) for identical
> > > entries.
> > >   librte_acl: build/gen phase - simplify the way match nodes are
> > > allocated.
> > >   librte_acl: make scalar RT code to be more similar to vector one.
> > >   librte_acl: a bit of RT code deduplication.
> > >   EAL: introduce rte_ymm and relatives in rte_common_vect.h.
> > >   librte_acl: add AVX2 as new rte_acl_classify() method
> > >   test-acl: add ability to manually select RT method.
> > >   librte_acl: Remove search_sse_2 and relatives.
> > >   libter_acl: move lo/hi dwords shuffle out from calc_addr
> > >   libte_acl: make calc_addr a define to deduplicate the code.
> > >   libte_acl: introduce max_size into rte_acl_config.
> > >   libte_acl: remove unused macros.
> > >
> > >  app/test-acl/main.c | 126 +++--
> > >  app/test/test_acl.c |   8 +-
> > >  examples/l3fwd-acl/main.c   |   3 +-
> > >  examples/l3fwd/main.c   |   2 +-
> > >  lib/librte_acl/Makefile |  18 +
> > >  lib/librte_acl/acl.h|  58 ++-
> > >  lib/librte_acl/acl_bld.c| 392 +++-
> > >  lib/librte_acl/acl_gen.c| 268 +++
> > >  lib/librte_acl/acl_run.h|   7 +-
> > >  lib/librte_acl/acl_run_avx2.c   |  54 +++
> > >  lib/librte_acl/acl_run_avx2.h   | 284 
> > >  lib/librte_acl/acl_run_scalar.c |  65 ++-
> > >  lib/librte_acl/acl_run_sse.c| 585 
> > > +---
> > >  lib/librte_acl/acl_run_sse.h| 357 +++
> > >  lib/librte_acl/acl_vect.h   | 132 +++---
> > >  lib/librte_acl/rte_acl.c|  47 +-
> > >  lib/librte_acl/rte_acl.h|   4 +
> > >  lib/librte_acl/rte_acl_osdep_alone.h|  47 +-
> > >  lib/librte_eal/common/include/rte_common_vect.h |  39 +-
> > >  lib/librte_lpm/rte_lpm.h|   2 +-
> > >  20 files changed, 1444 insertions(+), 1054 deletions(-)
> > >  create mode 100644 lib/librte_acl/acl_run_avx2.c
> > >  create mode 100644 lib/librte_acl/acl_run_avx2.h
> > >  create mode 100644 lib/librte_acl/acl_run_sse.h
> > >
> > Series
> > Acked-by: Neil Horman 
> 
> Are you sure there is nothing to change or add in the documentation?
> Maybe that explaining the space/performance trade-off would be a good idea.

Ok, after that patch will be applied, I'll work on the docs update.
Thanks
Konstantin

> 
> > Nice work
> 
> Yes, great work!
> 
> --
> Thomas

[dpdk-dev] [PATCH v2 3/4] testpmd: new commands for ethertype filter

2015-01-12 15:16, Jingjing Wu:
> Following commands of ethertype filter are removed:
>   - add_ethertype_filter (port_id) ethertype (eth_value)
>   - remove_ethertype_filter (port_id) index (idx)
>   - get_ethertype_filter (port_id) index (idx)
> New command is added for ethertype filter by using filter_ctrl API and new
> ethertype filter structure:
>   - ethertype_filter (port_id) (add|del) (mac_addr|mac_ignr)
> (mac_address) ethertype (ether_type) (drop|fwd) queue (queue_id)

Please update doc accordingly:

http://dpdk.org/browse/dpdk/tree/doc/guides/testpmd_app_ug/testpmd_funcs.rst

Thanks
-- 
Thomas

[dpdk-dev] Segmentation fault in ixgbe_rxtx_vec.c:444 with 1.8.0

2015-01-20 Thread Martin Weiser

Hi again,

I did some further testing and it seems like this issue is linked to
jumbo frames. I think a similar issue has already been reported by
Prashant Upadhyaya with the subject 'Packet Rx issue with DPDK1.8'.
In our application we use the following rxmode port configuration:

.mq_mode= ETH_MQ_RX_RSS,
.split_hdr_size = 0,
.header_split   = 0,
.hw_ip_checksum = 1,
.hw_vlan_filter = 0,
.jumbo_frame= 1,
.hw_strip_crc   = 1,
.max_rx_pkt_len = 9000,

and the mbuf size is calculated like the following:

(2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

This works fine with DPDK 1.7 and jumbo frames are split into buffer
chains and can be forwarded on another port without a problem.
With DPDK 1.8 and the default configuration (CONFIG_RTE_IXGBE_INC_VECTOR
enabled) the application sometimes crashes like described in my first
mail and sometimes packet receiving stops with subsequently arriving
packets counted as rx errors. When CONFIG_RTE_IXGBE_INC_VECTOR is
disabled the packet processing also comes to a halt as soon as jumbo
frames arrive, with the slightly different effect that now
rte_eth_tx_burst refuses to send any previously received packets.

Is there anything special to consider regarding jumbo frames when moving
from DPDK 1.7 to 1.8 that we might have missed?

Martin

On 19.01.15 11:26, Martin Weiser wrote:
> Hi everybody,
>
> we quite recently updated one of our applications to DPDK 1.8.0 and are
> now seeing a segmentation fault in ixgbe_rxtx_vec.c:444 after a few minutes.
> I just did some quick debugging and I only have a very limited
> understanding of the code in question but it seems that the 'continue'
> in line 445 without increasing 'buf_idx' might cause the problem. In one
> debugging session when the crash occurred the value of 'buf_idx' was 2
> and the value of 'pkt_idx' was 8965.
> Any help with this issue would be greatly appreciated. If you need any
> further information just let me know.
>
> Martin
>
>

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

2015-01-20 Thread Bruce Richardson

On Tue, Jan 20, 2015 at 08:14:50AM +0100, Thomas Monjalon wrote:
> Thank you Neil for writing this document.
> This is a really important change in DPDK.
> It would be very good to have comments or acknowledgement from several
> developers. This policy would be enforced by having several Acked-by lines.
> 
> 
> 2015-01-16 10:33, Neil Horman:
> > Adding a document describing rudimentary ABI policy and adding notice space 
> > for
> > any deprecation announcements
> > 
> > Signed-off-by: Neil Horman 
> > CC: Thomas Monjalon 
> > CC: "Richardson, Bruce" 

This policy looks sensible to me.
Acked-by: Bruce Richardson 

> > 
> > ---
> > Change notes:
> > 
> > v5) Updated documentation to add notes from Thomas M.
> > ---
> >  doc/abi.txt | 36 
> >  1 file changed, 36 insertions(+)
> >  create mode 100644 doc/abi.txt
> > 
> > diff --git a/doc/abi.txt b/doc/abi.txt
> > new file mode 100644
> > index 000..14be464
> > --- /dev/null
> > +++ b/doc/abi.txt
> > @@ -0,0 +1,36 @@
> > +ABI policy:
> > +   ABI versions are set at the time of major release labeling, and ABI may
> > +change multiple times between the last labeling and the HEAD label of the 
> > git
> > +tree without warning
> > +
> > +   ABI versions, once released are available until such time as their
> > +deprecation has been noted here for at least one major release cycle, 
> > after it
> > +has been tagged.  E.g. the ABI for DPDK 1.8 is shipped, and then the 
> > decision to
> > +remove it is made during the development of DPDK 1.9.  The decision will be
> > +recorded here, shipped with the DPDK 1.9 release, and actually removed 
> > when DPDK
> > +1.10 ships.
> > +
> > +   ABI versions may be deprecated in whole, or in part as needed by a given
> > +update.
> > +
> > +   Some ABI changes may be too significant to reasonably maintain multiple
> > +versions of.  In those events ABI's may be updated without backward
> > +compatibility provided.  The requirements for doing so are:
> > +   1) At least 3 acknoweldgements of the need on the dpdk.org
> > +   2) A full deprecation cycle must be made to offer downstream consumers
> > +sufficient warning of the change.  E.g. if dpdk 2.0 is under development 
> > when
> > +the change is proposed, a deprecation notice must be added to this file, 
> > and
> > +released with dpdk 2.0.  Then the change may be incorporated for dpdk 2.1
> > +   3) The LIBABIVER variable in the makefilei(s) where the ABI changes are
> > +incorporated must be incremented in parallel with the ABI changes 
> > themselves
> > +
> > +   Note that the above process for ABI deprecation should not be undertaken
> > +lightly.  ABI stability is extreemely important for downstream consumers 
> > of the
> > +DPDK, especially when distributed in shared object form.  Every effort 
> > should be
> > +made to preserve ABI whenever possible.  For instance, reorganizing public
> > +structure field for astetic or readability purposes should be avoided as 
> > it will
> > +cause ABI breakage.  Only significant (e.g. performance) reasons should be 
> > seen
> > +as cause to alter ABI.
>

[dpdk-dev] [PATCH v2 17/17] libte_acl: remove unused macros.

2015-01-20 Thread Jim Thompson


> On Jan 20, 2015, at 4:09 AM, Ananyev, Konstantin  intel.com> wrote:
> 
> Hi Thomas,
> 
>> -Original Message-
>> From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com]
>> Sent: Monday, January 19, 2015 5:18 PM
>> To: Ananyev, Konstantin
>> Cc: dev at dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH v2 17/17] libte_acl: remove unused macros.
>> 
>> 2015-01-12 19:16, Konstantin Ananyev:
>>> /*
>>> + * ACL RT structure is a set of multibit tries (with stride == 8)
>>> + * represented by an array of transitions. The next position is calculated
>>> + * based on the current position and the input byte.
>>> + * Each transition is 64 bit value with the following format:
>>> + * | node_type_specific : 32 | node_type : 3 | node_addr : 29 |
>>> + * For all node types except RTE_ACL_NODE_MATCH, node_addr is an index
>>> + * to the start of the node in the transtions array.
>>> + * Few different node types are used:
>>> + * RTE_ACL_NODE_MATCH:
>>> + * node_addr value is and index into an array that contains the return 
>>> value
>>> + * and its priority for each category.
>>> + * Upper 32 bits of the transtion value are not used for that node type.
>>> + * RTE_ACL_NODE_QRANGE:
>>> + * that node consist of up to 5 transitions.
>>> + * Upper 32 bits are interpreted as 4 signed character values which
>>> + * are ordered from smallest(INT8_MIN) to largest (INT8_MAX).
>>> + * These values define 5 ranges:
>>> + * INT8_MIN <= range[0]  <= ((int8_t *)&transition)[4]
>>> + * ((int8_t *)&transition)[4] < range[1] <= ((int8_t *)&transition)[5]
>>> + * ((int8_t *)&transition)[5] < range[2] <= ((int8_t *)&transition)[6]
>>> + * ((int8_t *)&transition)[6] < range[3] <= ((int8_t *)&transition)[7]
>>> + * ((int8_t *)&transition)[7] < range[4] <= INT8_MAX
>>> + * So for input byte value within range[i] i-th transition within that node
>>> + * will be used.
>>> + * RTE_ACL_NODE_SINGLE:
>>> + * always transitions to the same node regardless of the input value.
>>> + * RTE_ACL_NODE_DFA:
>>> + * that node consits of up to 256 transitions.
>>> + * In attempt to conserve space all transitions are divided into 4 
>>> consecutive
>>> + * groups, by 64 transitions per group:
>>> + * group64[i] contains transitions[i * 64, .. i * 64 + 63].
>>> + * Upper 32 bits are interpreted as 4 unsigned character values one per 
>>> group,
>>> + * which contain index to the start of the given group within the node.
>>> + * So to calculate transition index within the node for given input byte 
>>> value:
>>> + * input_byte - ((uint8_t *)&transition)[4 + input_byte / 64].
>>> + */
>> 
>> It's maybe an error. You were only supposed to remove some macros in this 
>> patch.
> 
> Ah yes, I added some comments about ACL internal layout.
> Thought it might be useful.
> Forgot to add it into patch description. 
> Are you saying I need to split it into 2 patches, or it is ok like that?

it's great info, but it should probably go in 
doc/guides/prog_guide/packet_classif_access_ctrl.rst.

Jim

[dpdk-dev] [PATCH v2 11/17] librte_acl: add AVX2 as new rte_acl_classify() method



> -Original Message-
> From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com]
> Sent: Monday, January 19, 2015 5:23 PM
> To: Ananyev, Konstantin
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 11/17] librte_acl: add AVX2 as new 
> rte_acl_classify() method
> 
> 2015-01-12 19:16, Konstantin Ananyev:
> > +/*
> > + * Select highest avaialbe classify method as default one.
> 
> Typo here.
> Actually, I've seen few typos when browsing your big patchset
> but I don't remember exactly where.
> Maybe that a tool like codespell could help.
> 
> [...]
> 
> > +   else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1))
> > +#else
> > if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1))
> > +#endif
> 
> Minor nit, it could be
> + else
> +#endif
>   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1))

I made it deliberately.
From my point of view, it seems clearer and easier to read with 'else if' on one line.
Konstantin


> 
> --
> Thomas

[dpdk-dev] [PATCH v2 17/17] libte_acl: remove unused macros.


> From: Jim Thompson [mailto:jim at netgate.com]
> Sent: Tuesday, January 20, 2015 10:48 AM
> To: Ananyev, Konstantin
> Cc: Thomas Monjalon; dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 17/17] libte_acl: remove unused macros.
> 
> 
> On Jan 20, 2015, at 4:09 AM, Ananyev, Konstantin  intel.com> wrote:
> 
> Hi Thomas,
> 
> 
> -Original Message-
> From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com]
> Sent: Monday, January 19, 2015 5:18 PM
> To: Ananyev, Konstantin
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 17/17] libte_acl: remove unused macros.
> 
> 2015-01-12 19:16, Konstantin Ananyev:
> 
> /*
> + * ACL RT structure is a set of multibit tries (with stride == 8)
> + * represented by an array of transitions. The next position is calculated
> + * based on the current position and the input byte.
> + * Each transition is 64 bit value with the following format:
> + * | node_type_specific : 32 | node_type : 3 | node_addr : 29 |
> + * For all node types except RTE_ACL_NODE_MATCH, node_addr is an index
> + * to the start of the node in the transtions array.
> + * Few different node types are used:
> + * RTE_ACL_NODE_MATCH:
> + * node_addr value is and index into an array that contains the return value
> + * and its priority for each category.
> + * Upper 32 bits of the transtion value are not used for that node type.
> + * RTE_ACL_NODE_QRANGE:
> + * that node consist of up to 5 transitions.
> + * Upper 32 bits are interpreted as 4 signed character values which
> + * are ordered from smallest(INT8_MIN) to largest (INT8_MAX).
> + * These values define 5 ranges:
> + * INT8_MIN <= range[0]  <= ((int8_t *)&transition)[4]
> + * ((int8_t *)&transition)[4] < range[1] <= ((int8_t *)&transition)[5]
> + * ((int8_t *)&transition)[5] < range[2] <= ((int8_t *)&transition)[6]
> + * ((int8_t *)&transition)[6] < range[3] <= ((int8_t *)&transition)[7]
> + * ((int8_t *)&transition)[7] < range[4] <= INT8_MAX
> + * So for input byte value within range[i] i-th transition within that node
> + * will be used.
> + * RTE_ACL_NODE_SINGLE:
> + * always transitions to the same node regardless of the input value.
> + * RTE_ACL_NODE_DFA:
> + * that node consits of up to 256 transitions.
> + * In attempt to conserve space all transitions are divided into 4 
> consecutive
> + * groups, by 64 transitions per group:
> + * group64[i] contains transitions[i * 64, .. i * 64 + 63].
> + * Upper 32 bits are interpreted as 4 unsigned character values one per 
> group,
> + * which contain index to the start of the given group within the node.
> + * So to calculate transition index within the node for given input byte 
> value:
> + * input_byte - ((uint8_t *)&transition)[4 + input_byte / 64].
> + */
> 
> It's maybe an error. You were only supposed to remove some macros in this 
> patch.
> 
> Ah yes, I added some comments about ACL internal layout.
> Thought it might be useful.
> Forgot to add it into patch description.
> Are you saying I need to split it into 2 patches, or it is ok like that?
> 
> it's great info, but it should probably go 
> in doc/guides/prog_guide/packet_classif_access_ctrl.rst.

Well, I think it is good practice to have a brief description of the 
internal structure in the header file.
Same as rte_ring.h, rte_mempool.h, rte_timer.h.
When I start updating the docs, I can put some sort of internal structure 
description into the PG too, though I am not sure it is worth it.

[dpdk-dev] [PATCH v6 0/6] enicpmd: Cisco Systems Inc. VIC Ethernet PMD

2015-01-20 Thread David Marchand

Hello Sujith,

Any news on the documentation and the performance numbers you said you
would send ?

Thanks.

-- 
David Marchand

On Thu, Nov 27, 2014 at 4:31 PM, Thomas Monjalon 
wrote:

> 2014-11-27 04:27, Sujith Sankar:
> > Thanks Thomas, David and Neil !
> >
> > I shall work on finishing the documentation.
> > About that, you had mentioned that you wanted it in doc/drivers/ path.
> > Could I send a patch with documentation in the path doc/drivers/enicpmd/
> ?
>
> Yes.
> I'd prefer doc/drivers/enic/ but it's a detail ;)
> The format must be sphinx rst to allow web publishing.
>
> It would be great to have some design documentation of every drivers
> in doc/drivers.
>
> Thanks
> --
> Thomas
>

[dpdk-dev] [PATCH v2 17/17] libte_acl: remove unused macros.

2015-01-20 10:09, Ananyev, Konstantin:
> Hi Thomas,
> 
> > -Original Message-
> > From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com]
> > Sent: Monday, January 19, 2015 5:18 PM
> > To: Ananyev, Konstantin
> > Cc: dev at dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 17/17] libte_acl: remove unused macros.
> > 
> > 2015-01-12 19:16, Konstantin Ananyev:
> > >  /*
> > > + * ACL RT structure is a set of multibit tries (with stride == 8)
> > > + * represented by an array of transitions. The next position is 
> > > calculated
> > > + * based on the current position and the input byte.
> > > + * Each transition is 64 bit value with the following format:
> > > + * | node_type_specific : 32 | node_type : 3 | node_addr : 29 |
> > > + * For all node types except RTE_ACL_NODE_MATCH, node_addr is an index
> > > + * to the start of the node in the transtions array.
> > > + * Few different node types are used:
> > > + * RTE_ACL_NODE_MATCH:
> > > + * node_addr value is and index into an array that contains the return 
> > > value
> > > + * and its priority for each category.
> > > + * Upper 32 bits of the transtion value are not used for that node type.
> > > + * RTE_ACL_NODE_QRANGE:
> > > + * that node consist of up to 5 transitions.
> > > + * Upper 32 bits are interpreted as 4 signed character values which
> > > + * are ordered from smallest(INT8_MIN) to largest (INT8_MAX).
> > > + * These values define 5 ranges:
> > > + * INT8_MIN <= range[0]  <= ((int8_t *)&transition)[4]
> > > + * ((int8_t *)&transition)[4] < range[1] <= ((int8_t *)&transition)[5]
> > > + * ((int8_t *)&transition)[5] < range[2] <= ((int8_t *)&transition)[6]
> > > + * ((int8_t *)&transition)[6] < range[3] <= ((int8_t *)&transition)[7]
> > > + * ((int8_t *)&transition)[7] < range[4] <= INT8_MAX
> > > + * So for input byte value within range[i] i-th transition within that 
> > > node
> > > + * will be used.
> > > + * RTE_ACL_NODE_SINGLE:
> > > + * always transitions to the same node regardless of the input value.
> > > + * RTE_ACL_NODE_DFA:
> > > + * that node consits of up to 256 transitions.
> > > + * In attempt to conserve space all transitions are divided into 4 
> > > consecutive
> > > + * groups, by 64 transitions per group:
> > > + * group64[i] contains transitions[i * 64, .. i * 64 + 63].
> > > + * Upper 32 bits are interpreted as 4 unsigned character values one per 
> > > group,
> > > + * which contain index to the start of the given group within the node.
> > > + * So to calculate transition index within the node for given input byte 
> > > value:
> > > + * input_byte - ((uint8_t *)&transition)[4 + input_byte / 64].
> > > + */
> > 
> > It's maybe an error. You were only supposed to remove some macros in this 
> > patch.
> 
> Ah yes, I added some comments about ACL internal layout.
> Thought it might be useful.
> Forgot to add it into patch description. 
> Are you saying I need to split it into 2 patches, or it is ok like that?

As it is not related to the other topic of the patch, yes please make
a separate patch.

-- 
Thomas

[dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum forwarding engine

2015-01-20 Thread Olivier MATZ

Hi,

On 01/20/2015 02:12 AM, Ananyev, Konstantin wrote:
 I think a good definition would
 be:

 Packet is IPv4. This flag must be set when using any offload
 feature (TSO, L3 or L4 checksum) to tell the NIC that the packet
 is an IPv4 packet.

 That's why I added PKT_TX_IPV4 in the examples.
>>>
>>> I suppose we discussed it several times: both ways are possible.
>>>  From PMD perspective - treating PKT_TX_IPV4 and PKT_TX_IP_CSUM
>>> As mutually exclusive seems a bit more plausible.
>>>  From the upper layer - my understanding is that it doesn't really matter.
>>> I thought we had an agreement about it in 1.8, no?
>>
>> Indeed, this was already discussed, but there was a lot of pressure
>> for 1.8.0 to push something, even not perfect. The fog around comments
>> shows that the API was not very clearly defined for 1.8.0. If you read
>> the comments of the API, it is impossible to understand when the
>> PKT_TX_IPV4 or PKT_TX_IP_CSUM flags must be set. I would even say
>> more: the only place where the comments bring a valuable information
>> (L4 checksum and TSO) describe the case where PKT_TX_IPV4 and
>> PKT_TX_IP_CSUM are not exclusive...
>>
>> So I will fix that in my coming patch series. Just for information,
>> I'm pretty sure that having PKT_TX_IPV4 and PKT_TX_IP_CSUM as not
>> exclusive flag would not require any change anywhere in the PMDs (even
>> in i40e).
>
> Right now - no.
> Though as I said from PMD perspective having them exclusive is a bit 
> preferable.
> Again, I don't see any big difference from upper layer code.

Sure, it does not make a big difference in terms of code. But
in terms of API, the naming of the flag is coherent to what it is
used for. And it's easier to find a simple definition, like:

  * Packet is IPv4. This flag must be set when using any offload feature
  * (TSO, L3 or L4 checksum) to tell the NIC that the packet is an IPv4
  * packet.

>> On the contrary, making them exclusive would require to
>> change the ixgbe TSO code because we check.
>
> Hmm, so you are saying there is a bug somewhere in ixgbe_rxtx.c?
> What particular place you are talking about?

Sorry, I spoke too fast. In TSO code, we check PKT_TX_IP_CKSUM (and not
PKT_TX_IPV4 as I thought), so it would work for both methods without
patching the code.

In this case, it means that neither approach would require us to
modify the code.

 *Problem 3*: without using the word "fortville", it is difficult
 to understand the goal of the flag PKT_TX_UDP_TUNNEL_PKT. Indeed,
 once PKT_TX_OUTER_IPV4/6 is set, it looks obvious that it's a
 tunnel packet. I suggest to remove the PKT_TX_UDP_TUNNEL_PKT
 flag. In linux, the driver doesn't care about the tunnel type,
 it always set I40E_TXD_CTX_UDP_TUNNELING for all encapsulations [6].
>>>
>>> It might be obvious that it is a tunnel packet from PKT_TX_OUTER_* is set,
>>> but it is not obvious what type of tunnelling it would be.
>>> FVL HW supports HW TX offloads for different type of tunnelling and
>>> requires that SW provide information about tunnelling type.
>>>  From i40e datasheet:
>>> L4TUNT L4 Tunneling Type (Teredo / GRE header / VXLAN header) indication:
>>> 00b - No UDP / GRE tunneling (field must be set to zero if EIPT equals to 
>>> zero)
>>> 01b - UDP tunneling header (any UDP tunneling, VXLAN and Geneve).
>>> 10b - GRE tunneling header
>>> As we do plan to support other than UDP tunnelling types, I suppose we'll 
>>> need to keep
>>> PKT_TX_UDP_TUNNEL_PKT flag.
>>
>> As I've said: in linux, the driver doesn't care about the tunnel type,
>> it always set I40E_TXD_CTX_UDP_TUNNELING for all encapsulations.
>
> Ok, and why it should be our problem?
> We have a lot of things done in a different manner then linux/freebsd kernel 
> drivers,
> Why now it became a problem?

If linux doesn't need an equivalent flag for doing the same thing,
it probably means we don't need it either.

In performance-oriented software like dpdk, having a flag that we
don't know what the hardware does with, that is not needed in other
drivers of the same hardware, and that makes the API harder to understand
could be a problem.

Another argument: if we can remove this flag, it would make the
testpmd commands rework proposed by Jijiang much easier to
understand: only a new "csum parse-tunnel on|off" would be required,
and it can be explained in a few words.

I'll try to do some tests on a fortville NIC if I can find one. I'm
curious to see if we can transmit any encapsulation packet (ip in ip,
ip in gre, eth in gre, eth in vxlan, or even a proprietary tunnel).

We should avoid the need to specify the tunnel type in the OUTER
checksum API if we can, else it would limit us to specific
supported protocols.

 I think the following cases should be *forbidden by the API*:

 case 9) calculate checksum of in_ip and in_tcp  (was case B.1 in [1])

 mb->outer_l2_len = len(out_eth)
>>

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

2015-01-20 Thread Iremonger, Bernard

> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Thomas Monjalon
> Sent: Tuesday, January 20, 2015 7:15 AM
> To: Neil Horman
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> 
> Thank you Neil for writing this document.
> This is a really important change in DPDK.
> It would be very good to have comments or acknowledgement from several 
> developers. This policy
> would be enforced by having several Acked-by lines.
> 
> 
> 2015-01-16 10:33, Neil Horman:
> > Adding a document describing rudimentary ABI policy and adding notice
> > space for any deprecation announcements
> >
> > Signed-off-by: Neil Horman 
> > CC: Thomas Monjalon 
> > CC: "Richardson, Bruce" 
> >
> > ---
> > Change notes:
> >
> > v5) Updated documentation to add notes from Thomas M.
> > ---
> >  doc/abi.txt | 36 
> >  1 file changed, 36 insertions(+)
> >  create mode 100644 doc/abi.txt
> >
> > diff --git a/doc/abi.txt b/doc/abi.txt new file mode 100644 index
> > 000..14be464
> > --- /dev/null
> > +++ b/doc/abi.txt
> > @@ -0,0 +1,36 @@
> > +ABI policy:
> > +   ABI versions are set at the time of major release labeling, and ABI
> > +may change multiple times between the last labeling and the HEAD
> > +label of the git tree without warning
> > +
> > +   ABI versions, once released are available until such time as their
> > +deprecation has been noted here for at least one major release cycle,
> > +after it has been tagged.  E.g. the ABI for DPDK 1.8 is shipped, and
> > +then the decision to remove it is made during the development of DPDK
> > +1.9.  The decision will be recorded here, shipped with the DPDK 1.9
> > +release, and actually removed when DPDK
> > +1.10 ships.
> > +
> > +   ABI versions may be deprecated in whole, or in part as needed by a
> > +given update.
> > +
> > +   Some ABI changes may be too significant to reasonably maintain
> > +multiple versions of.  In those events ABI's may be updated without
> > +backward compatibility provided.  The requirements for doing so are:
> > +   1) At least 3 acknoweldgements of the need on the dpdk.org
> > +   2) A full deprecation cycle must be made to offer downstream
> > +consumers sufficient warning of the change.  E.g. if dpdk 2.0 is
> > +under development when the change is proposed, a deprecation notice
> > +must be added to this file, and released with dpdk 2.0.  Then the change 
> > may be incorporated for
> dpdk 2.1
> > +   3) The LIBABIVER variable in the makefilei(s) where the ABI changes
> > +are incorporated must be incremented in parallel with the ABI changes
> > +themselves
> > +
> > +   Note that the above process for ABI deprecation should not be
> > +undertaken lightly.  ABI stability is extreemely important for
> > +downstream consumers of the DPDK, especially when distributed in
> > +shared object form.  Every effort should be made to preserve ABI
> > +whenever possible.  For instance, reorganizing public structure field
> > +for astetic or readability purposes should be avoided as it will
> > +cause ABI breakage.  Only significant (e.g. performance) reasons should be 
> > seen as cause to alter
> ABI.

Hi Thomas,

Should there be a reference to this document in the programmers guide?

Regards,

Bernard.

[dpdk-dev] DPDK - Linux communication

2015-01-20 Thread Al Stewart

Hi,
I am writing a DPI application on DPDK that needs to communicate with another 
non-DPDK Linux application on the same host for configuration/stats exchange. 
This needs to be a low-latency communication.
Can the RTE ring be used for communication between DPDK & non-DPDK 
applications? If not, which other mechanism would you recommend for this IPC?
Thank you, Al

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

2015-01-20 13:37, Iremonger, Bernard:
> Should there be a reference to this document in the programmers guide?

Maybe. You mean that an application developer must be aware of the deprecation
policy? So probably yes.
And I'd add that the release notes should reference the deprecations.

-- 
Thomas

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

2015-01-16 10:33, Neil Horman:
> --- /dev/null
> +++ b/doc/abi.txt
> @@ -0,0 +1,36 @@
> +ABI policy:
> + ABI versions are set at the time of major release labeling, and ABI may
> +change multiple times between the last labeling and the HEAD label of the git
> +tree without warning
> +
> + ABI versions, once released are available until such time as their
> +deprecation has been noted here for at least one major release cycle, after 
> it
> +has been tagged.  E.g. the ABI for DPDK 1.8 is shipped, and then the 
> decision to
> +remove it is made during the development of DPDK 1.9.  The decision will be
> +recorded here, shipped with the DPDK 1.9 release, and actually removed when 
> DPDK
> +1.10 ships.
> +
> + ABI versions may be deprecated in whole, or in part as needed by a given
> +update.
> +
> + Some ABI changes may be too significant to reasonably maintain multiple
> +versions of.  In those events ABI's may be updated without backward
> +compatibility provided.  The requirements for doing so are:
> + 1) At least 3 acknoweldgements of the need on the dpdk.org
> + 2) A full deprecation cycle must be made to offer downstream consumers
> +sufficient warning of the change.  E.g. if dpdk 2.0 is under development when
> +the change is proposed, a deprecation notice must be added to this file, and
> +released with dpdk 2.0.  Then the change may be incorporated for dpdk 2.1
> + 3) The LIBABIVER variable in the makefilei(s) where the ABI changes are
> +incorporated must be incremented in parallel with the ABI changes themselves
> +
> + Note that the above process for ABI deprecation should not be undertaken
> +lightly.  ABI stability is extreemely important for downstream consumers of 
> the
> +DPDK, especially when distributed in shared object form.  Every effort 
> should be
> +made to preserve ABI whenever possible.  For instance, reorganizing public
> +structure field for astetic or readability purposes should be avoided as it 
> will

astetic? typo?

> +cause ABI breakage.  Only significant (e.g. performance) reasons should be 
> seen
> +as cause to alter ABI.
> +  
> +Deprecation Notices:

Neil, are you sure it's a good idea to put deprecations notices here instead
of release notes?

I'm also thinking that we need to add more things in this doc:
- case of macros/constant deprecation (API only)
- case of structure update: must be renamed to provide ABI 
compatibility?

Do you think we can have a tool to test the ABI compatibility by building
examples/apps of previous version and checking them with built DSO of
current version?

Thanks
-- 
Thomas

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

On Tue, Jan 20, 2015 at 01:37:35PM +0000, Iremonger, Bernard wrote:
> > -Original Message-
> > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Thomas Monjalon
> > Sent: Tuesday, January 20, 2015 7:15 AM
> > To: Neil Horman
> > Cc: dev at dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> > 
> > Thank you Neil for writing this document.
> > This is a really important change in DPDK.
> > It would be very good to have comments or acknowledgement from several 
> > developers. This policy
> > would be enforced by having several Acked-by lines.
> > 
> > 
> > 2015-01-16 10:33, Neil Horman:
> > > Adding a document describing rudimentary ABI policy and adding notice
> > > space for any deprecation announcements
> > >
> > > Signed-off-by: Neil Horman 
> > > CC: Thomas Monjalon 
> > > CC: "Richardson, Bruce" 
> > >
> > > ---
> > > Change notes:
> > >
> > > v5) Updated documentation to add notes from Thomas M.
> > > ---
> > >  doc/abi.txt | 36 
> > >  1 file changed, 36 insertions(+)
> > >  create mode 100644 doc/abi.txt
> > >
> > > diff --git a/doc/abi.txt b/doc/abi.txt new file mode 100644 index
> > > 000..14be464
> > > --- /dev/null
> > > +++ b/doc/abi.txt
> > > @@ -0,0 +1,36 @@
> > > +ABI policy:
> > > + ABI versions are set at the time of major release labeling, and ABI
> > > +may change multiple times between the last labeling and the HEAD
> > > +label of the git tree without warning
> > > +
> > > + ABI versions, once released are available until such time as their
> > > +deprecation has been noted here for at least one major release cycle,
> > > +after it has been tagged.  E.g. the ABI for DPDK 1.8 is shipped, and
> > > +then the decision to remove it is made during the development of DPDK
> > > +1.9.  The decision will be recorded here, shipped with the DPDK 1.9
> > > +release, and actually removed when DPDK
> > > +1.10 ships.
> > > +
> > > + ABI versions may be deprecated in whole, or in part as needed by a
> > > +given update.
> > > +
> > > + Some ABI changes may be too significant to reasonably maintain
> > > +multiple versions of.  In those events ABI's may be updated without
> > > +backward compatibility provided.  The requirements for doing so are:
> > > + 1) At least 3 acknoweldgements of the need on the dpdk.org
> > > + 2) A full deprecation cycle must be made to offer downstream
> > > +consumers sufficient warning of the change.  E.g. if dpdk 2.0 is
> > > +under development when the change is proposed, a deprecation notice
> > > +must be added to this file, and released with dpdk 2.0.  Then the change 
> > > may be incorporated for
> > dpdk 2.1
> > > + 3) The LIBABIVER variable in the makefilei(s) where the ABI changes
> > > +are incorporated must be incremented in parallel with the ABI changes
> > > +themselves
> > > +
> > > + Note that the above process for ABI deprecation should not be
> > > +undertaken lightly.  ABI stability is extreemely important for
> > > +downstream consumers of the DPDK, especially when distributed in
> > > +shared object form.  Every effort should be made to preserve ABI
> > > +whenever possible.  For instance, reorganizing public structure field
> > > +for astetic or readability purposes should be avoided as it will
> > > +cause ABI breakage.  Only significant (e.g. performance) reasons should 
> > > be seen as cause to alter
> > ABI.
> 
> Hi Thomas,
> 
> Should there be a reference to this document in the programmers guide?
> 
That's a good question. I think, as Thomas notes, it probably should be
referenced in some way.  The programmers guide might be good.  What might be
better would be checking the deprecation notices and adding them to the release
notes for any given release.

Thoughts?
Neil

> Regards,
> 
> Bernard.
> 
>

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

2015-01-20 Thread Butler, Siobhan A



> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Neil Horman
> Sent: Tuesday, January 20, 2015 2:24 PM
> To: Iremonger, Bernard
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> 
> On Tue, Jan 20, 2015 at 01:37:35PM +0000, Iremonger, Bernard wrote:
> > > -Original Message-
> > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Thomas
> Monjalon
> > > Sent: Tuesday, January 20, 2015 7:15 AM
> > > To: Neil Horman
> > > Cc: dev at dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> > >
> > > Thank you Neil for writing this document.
> > > This is a really important change in DPDK.
> > > It would be very good to have comments or acknowledgement from
> > > several developers. This policy would be enforced by having several
> Acked-by lines.
> > >
> > >
> > > 2015-01-16 10:33, Neil Horman:
> > > > Adding a document describing rudimentary ABI policy and adding
> > > > notice space for any deprecation announcements
> > > >
> > > > Signed-off-by: Neil Horman 
> > > > CC: Thomas Monjalon 
> > > > CC: "Richardson, Bruce" 
> > > >
> > > > ---
> > > > Change notes:
> > > >
> > > > v5) Updated documentation to add notes from Thomas M.
> > > > ---
> > > >  doc/abi.txt | 36 
> > > >  1 file changed, 36 insertions(+)
> > > >  create mode 100644 doc/abi.txt
> > > >
> > > > diff --git a/doc/abi.txt b/doc/abi.txt new file mode 100644 index
> > > > 000..14be464
> > > > --- /dev/null
> > > > +++ b/doc/abi.txt
> > > > @@ -0,0 +1,36 @@
> > > > +ABI policy:
> > > > +   ABI versions are set at the time of major release labeling, and
> > > > +ABI may change multiple times between the last labeling and the
> > > > +HEAD label of the git tree without warning
> > > > +
> > > > +   ABI versions, once released are available until such time as
> > > > +their deprecation has been noted here for at least one major
> > > > +release cycle, after it has been tagged.  E.g. the ABI for DPDK
> > > > +1.8 is shipped, and then the decision to remove it is made during
> > > > +the development of DPDK 1.9.  The decision will be recorded here,
> > > > +shipped with the DPDK 1.9 release, and actually removed when DPDK
> > > > +1.10 ships.
> > > > +
> > > > +   ABI versions may be deprecated in whole, or in part as needed by
> > > > +a given update.
> > > > +
> > > > +   Some ABI changes may be too significant to reasonably maintain
> > > > +multiple versions of.  In those events ABI's may be updated
> > > > +without backward compatibility provided.  The requirements for doing
> so are:
> > > > +   1) At least 3 acknoweldgements of the need on the dpdk.org
> > > > +   2) A full deprecation cycle must be made to offer downstream
> > > > +consumers sufficient warning of the change.  E.g. if dpdk 2.0 is
> > > > +under development when the change is proposed, a deprecation
> > > > +notice must be added to this file, and released with dpdk 2.0.
> > > > +Then the change may be incorporated for
> > > dpdk 2.1
> > > > +   3) The LIBABIVER variable in the makefilei(s) where the ABI
> > > > +changes are incorporated must be incremented in parallel with the
> > > > +ABI changes themselves
> > > > +
> > > > +   Note that the above process for ABI deprecation should not be
> > > > +undertaken lightly.  ABI stability is extreemely important for
> > > > +downstream consumers of the DPDK, especially when distributed in
> > > > +shared object form.  Every effort should be made to preserve ABI
> > > > +whenever possible.  For instance, reorganizing public structure
> > > > +field for astetic or readability purposes should be avoided as it
> > > > +will cause ABI breakage.  Only significant (e.g. performance)
> > > > +reasons should be seen as cause to alter
> > > ABI.
> >
> > Hi Thomas,
> >
> > Should there be a reference to this document in the programmers guide?
> >
> Thats a good question. I think, as Thomas notes, it probably should be
> referenced in some way.  The programmers guide might be good.  What
> might be better would be checking the deprecation notices and adding them
> to the release notes for any given release.
> 
> Thoughts?
> Neil
> 
> > Regards,
> >
> > Bernard.
> >
> >

Sorry to be pedantic but would you also mind sending it as a .rst file instead 
of .txt if you're going to send as patches to Programmer's Guide anyway? :)
Thanks,
Siobhan

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

2015-01-20 Thread O'driscoll, Tim

> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Neil Horman
> Sent: Tuesday, January 20, 2015 2:24 PM
> To: Iremonger, Bernard
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> 
> On Tue, Jan 20, 2015 at 01:37:35PM +, Iremonger, Bernard wrote:
> > > -Original Message-
> > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Thomas
> Monjalon
> > > Sent: Tuesday, January 20, 2015 7:15 AM
> > > To: Neil Horman
> > > Cc: dev at dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> > >
> > > Thank you Neil for writing this document.
> > > This is a really important change in DPDK.
> > > It would be very good to have comments or acknowledgement from
> several developpers. This policy
> > > would be enforced by having several Acked-by lines.
> > >
> > >
> > > 2015-01-16 10:33, Neil Horman:
> > > > Adding a document describing rudimentary ABI policy and adding notice
> > > > space for any deprecation announcements
> > > >
> > > > Signed-off-by: Neil Horman 
> > > > CC: Thomas Monjalon 
> > > > CC: "Richardson, Bruce" 
> > > >
> > > > ---
> > > > Change notes:
> > > >
> > > > v5) Updated documentation to add notes from Thomas M.
> > > > ---
> > > >  doc/abi.txt | 36 
> > > >  1 file changed, 36 insertions(+)
> > > >  create mode 100644 doc/abi.txt
> > > >
> > > > diff --git a/doc/abi.txt b/doc/abi.txt new file mode 100644 index
> > > > 000..14be464
> > > > --- /dev/null
> > > > +++ b/doc/abi.txt
> > > > @@ -0,0 +1,36 @@
> > > > +ABI policy:
> > > > +   ABI versions are set at the time of major release labeling, and 
> > > > ABI
> > > > +may change multiple times between the last labeling and the HEAD
> > > > +label of the git tree without warning
> > > > +
> > > > +   ABI versions, once released are available until such time as 
> > > > their
> > > > +deprecation has been noted here for at least one major release cycle,
> > > > +after it has been tagged.  E.g. the ABI for DPDK 1.8 is shipped, and
> > > > +then the decision to remove it is made during the development of
> DPDK
> > > > +1.9.  The decision will be recorded here, shipped with the DPDK 1.9
> > > > +release, and actually removed when DPDK
> > > > +1.10 ships.
> > > > +
> > > > +   ABI versions may be deprecated in whole, or in part as needed 
> > > > by a
> > > > +given update.
> > > > +
> > > > +   Some ABI changes may be too significant to reasonably maintain
> > > > +multiple versions of.  In those events ABI's may be updated without
> > > > +backward compatibility provided.  The requirements for doing so are:
> > > > +   1) At least 3 acknoweldgements of the need on the dpdk.org
> > > > +   2) A full deprecation cycle must be made to offer downstream
> > > > +consumers sufficient warning of the change.  E.g. if dpdk 2.0 is
> > > > +under development when the change is proposed, a deprecation
> notice
> > > > +must be added to this file, and released with dpdk 2.0.  Then the
> change may be incorporated for
> > > dpdk 2.1
> > > > +   3) The LIBABIVER variable in the makefilei(s) where the ABI 
> > > > changes
> > > > +are incorporated must be incremented in parallel with the ABI changes
> > > > +themselves
> > > > +
> > > > +   Note that the above process for ABI deprecation should not be
> > > > +undertaken lightly.  ABI stability is extreemely important for
> > > > +downstream consumers of the DPDK, especially when distributed in
> > > > +shared object form.  Every effort should be made to preserve ABI
> > > > +whenever possible.  For instance, reorganizing public structure field
> > > > +for astetic or readability purposes should be avoided as it will
> > > > +cause ABI breakage.  Only significant (e.g. performance) reasons
> should be seen as cause to alter
> > > ABI.
> >
> > Hi Thomas,
> >
> > Should there be a reference to this document in the programmers guide?
> >
> Thats a good question. I think, as Thomas notes, it probably should be
> referenced in some way.  The programmers guide might be good.  What
> might be
> better would be checking the deprecation notices and adding them to the
> release
> notes for any given release.
> 
> Thoughts?

I'd suggest that the policy itself should go in, or at least be referenced 
from, the programmer's guide. I agree that the deprecation notices themselves 
should go in the release notes.

> Neil
> 
> > Regards,
> >
> > Bernard.
> >
> >

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

On Tue, Jan 20, 2015 at 03:00:01PM +0100, Thomas Monjalon wrote:
> 2015-01-16 10:33, Neil Horman:
> > --- /dev/null
> > +++ b/doc/abi.txt
> > @@ -0,0 +1,36 @@
> > +ABI policy:
> > +   ABI versions are set at the time of major release labeling, and ABI may
> > +change multiple times between the last labeling and the HEAD label of the 
> > git
> > +tree without warning
> > +
> > +   ABI versions, once released are available until such time as their
> > +deprecation has been noted here for at least one major release cycle, 
> > after it
> > +has been tagged.  E.g. the ABI for DPDK 1.8 is shipped, and then the 
> > decision to
> > +remove it is made during the development of DPDK 1.9.  The decision will be
> > +recorded here, shipped with the DPDK 1.9 release, and actually removed 
> > when DPDK
> > +1.10 ships.
> > +
> > +   ABI versions may be deprecated in whole, or in part as needed by a given
> > +update.
> > +
> > +   Some ABI changes may be too significant to reasonably maintain multiple
> > +versions of.  In those events ABI's may be updated without backward
> > +compatibility provided.  The requirements for doing so are:
> > +   1) At least 3 acknoweldgements of the need on the dpdk.org
> > +   2) A full deprecation cycle must be made to offer downstream consumers
> > +sufficient warning of the change.  E.g. if dpdk 2.0 is under development 
> > when
> > +the change is proposed, a deprecation notice must be added to this file, 
> > and
> > +released with dpdk 2.0.  Then the change may be incorporated for dpdk 2.1
> > +   3) The LIBABIVER variable in the makefilei(s) where the ABI changes are
> > +incorporated must be incremented in parallel with the ABI changes 
> > themselves
> > +
> > +   Note that the above process for ABI deprecation should not be undertaken
> > +lightly.  ABI stability is extreemely important for downstream consumers 
> > of the
> > +DPDK, especially when distributed in shared object form.  Every effort 
> > should be
> > +made to preserve ABI whenever possible.  For instance, reorganizing public
> > +structure field for astetic or readability purposes should be avoided as 
> > it will
> 
> astetic? typo?
> 
> > +cause ABI breakage.  Only significant (e.g. performance) reasons should be 
> > seen
> > +as cause to alter ABI.
> > +  
> > +Deprecation Notices:
> 
> Neil, are you sure it's a good idea to put deprecations notices here instead
> of release notes?
> 
Funny, I just made mention of that in my last note.  I do think that the release
notes is the right place to "officially" announce deprecation warnings, but I
think we need a way for developers to communicate that efficiently (given that
the release notes aren't stored in the git tree).  I think this is the place for
developers to canonically list deprecations, and make reading this file part of
the release notes generation process.  That way, updates can be made as part of
the commit process easily.

> I'm also thinking that we need to add more things in this doc:
>   - case of macros/constant deprecation (API only)
>   - case of structure update: must be renamed to provide ABI 
> compatibility?
> 
I'm definitely in favor of adding such notices here, but I hadn't planned for
any strict formatting of any given notice.  That is to say, I considered your
two issues above to be able to be included here.  I have no issue with listing a
deprecation note that indicates macros are being removed or that sections of api
are being versioned to accommodate structure changes of any sort.

> Do you think we can have a tool to test the ABI compatibility by building
> examples/apps of previous version and checking them with built DSO of
> current version?
> 
I do, though I'm not sure it's within the scope of this update.  The easiest way
to do it currently is to checkout the last released version of the dpdk, build
it as a DSO build, copy out one of the test/example apps, checkout the HEAD of
the tree, rebuild, and run the saved off test app from the first build using the
shared objects of the second build.  That does some rudimentary validation,
but it only touches on the API aspects that the application you're using makes
use of.  What would be better would be if we had a test application that made a
call to every exported API call that we have, so that we could be confident that
we were exhaustively testing the ABI surface.  I think that's a large piece of
work, but it would be beneficial to have.

Thanks
Neil

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

On Tue, Jan 20, 2015 at 02:29:54PM +, Butler, Siobhan A wrote:
> 
> 
> > -Original Message-
> > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Neil Horman
> > Sent: Tuesday, January 20, 2015 2:24 PM
> > To: Iremonger, Bernard
> > Cc: dev at dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> > 
> > On Tue, Jan 20, 2015 at 01:37:35PM +, Iremonger, Bernard wrote:
> > > > -Original Message-
> > > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Thomas
> > Monjalon
> > > > Sent: Tuesday, January 20, 2015 7:15 AM
> > > > To: Neil Horman
> > > > Cc: dev at dpdk.org
> > > > Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> > > >
> > > > Thank you Neil for writing this document.
> > > > This is a really important change in DPDK.
> > > > It would be very good to have comments or acknowledgement from
> > > > several developpers. This policy would be enforced by having several
> > Acked-by lines.
> > > >
> > > >
> > > > 2015-01-16 10:33, Neil Horman:
> > > > > Adding a document describing rudimentary ABI policy and adding
> > > > > notice space for any deprecation announcements
> > > > >
> > > > > Signed-off-by: Neil Horman 
> > > > > CC: Thomas Monjalon 
> > > > > CC: "Richardson, Bruce" 
> > > > >
> > > > > ---
> > > > > Change notes:
> > > > >
> > > > > v5) Updated documentation to add notes from Thomas M.
> > > > > ---
> > > > >  doc/abi.txt | 36 
> > > > >  1 file changed, 36 insertions(+)
> > > > >  create mode 100644 doc/abi.txt
> > > > >
> > > > > diff --git a/doc/abi.txt b/doc/abi.txt new file mode 100644 index
> > > > > 000..14be464
> > > > > --- /dev/null
> > > > > +++ b/doc/abi.txt
> > > > > @@ -0,0 +1,36 @@
> > > > > +ABI policy:
> > > > > + ABI versions are set at the time of major release labeling, and
> > > > > +ABI may change multiple times between the last labeling and the
> > > > > +HEAD label of the git tree without warning
> > > > > +
> > > > > + ABI versions, once released are available until such time as
> > > > > +their deprecation has been noted here for at least one major
> > > > > +release cycle, after it has been tagged.  E.g. the ABI for DPDK
> > > > > +1.8 is shipped, and then the decision to remove it is made during
> > > > > +the development of DPDK 1.9.  The decision will be recorded here,
> > > > > +shipped with the DPDK 1.9 release, and actually removed when DPDK
> > > > > +1.10 ships.
> > > > > +
> > > > > + ABI versions may be deprecated in whole, or in part as needed by
> > > > > +a given update.
> > > > > +
> > > > > + Some ABI changes may be too significant to reasonably maintain
> > > > > +multiple versions of.  In those events ABI's may be updated
> > > > > +without backward compatibility provided.  The requirements for doing
> > so are:
> > > > > + 1) At least 3 acknoweldgements of the need on the dpdk.org
> > > > > + 2) A full deprecation cycle must be made to offer downstream
> > > > > +consumers sufficient warning of the change.  E.g. if dpdk 2.0 is
> > > > > +under development when the change is proposed, a deprecation
> > > > > +notice must be added to this file, and released with dpdk 2.0.
> > > > > +Then the change may be incorporated for
> > > > dpdk 2.1
> > > > > + 3) The LIBABIVER variable in the makefilei(s) where the ABI
> > > > > +changes are incorporated must be incremented in parallel with the
> > > > > +ABI changes themselves
> > > > > +
> > > > > + Note that the above process for ABI deprecation should not be
> > > > > +undertaken lightly.  ABI stability is extreemely important for
> > > > > +downstream consumers of the DPDK, especially when distributed in
> > > > > +shared object form.  Every effort should be made to preserve ABI
> > > > > +whenever possible.  For instance, reorganizing public structure
> > > > > +field for astetic or readability purposes should be avoided as it
> > > > > +will cause ABI breakage.  Only significant (e.g. performance)
> > > > > +reasons should be seen as cause to alter
> > > > ABI.
> > >
> > > Hi Thomas,
> > >
> > > Should there be a reference to this document in the programmers guide?
> > >
> > Thats a good question. I think, as Thomas notes, it probably should be
> > referenced in some way.  The programmers guide might be good.  What
> > might be better would be checking the deprecation notices and adding them
> > to the release notes for any given release.
> > 
> > Thoughts?
> > Neil
> > 
> > > Regards,
> > >
> > > Bernard.
> > >
> > >
> 
> Sorry to be pedantic but would you also mind sending it as a .rst file 
> instead of .txt if you're going to send as patches to Programmer's Guide 
> anyway? :)
> Thanks,
Actually I'm not sure this is a good idea.  The release notes get formatted and
reviewed by a documentation team, right?  I'm not sure there's value in having a
developer write formatted text if it's just going to get reviewed and reformatted
later, is there?
Neil

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

2015-01-20 Thread Butler, Siobhan A



> -Original Message-
> From: Neil Horman [mailto:nhorman at tuxdriver.com]
> Sent: Tuesday, January 20, 2015 2:42 PM
> To: Butler, Siobhan A
> Cc: Iremonger, Bernard; dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> 
> On Tue, Jan 20, 2015 at 02:29:54PM +, Butler, Siobhan A wrote:
> >
> >
> > > -Original Message-
> > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Neil Horman
> > > Sent: Tuesday, January 20, 2015 2:24 PM
> > > To: Iremonger, Bernard
> > > Cc: dev at dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> > >
> > > On Tue, Jan 20, 2015 at 01:37:35PM +, Iremonger, Bernard wrote:
> > > > > -Original Message-
> > > > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Thomas
> > > Monjalon
> > > > > Sent: Tuesday, January 20, 2015 7:15 AM
> > > > > To: Neil Horman
> > > > > Cc: dev at dpdk.org
> > > > > Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI
> > > > > documentation
> > > > >
> > > > > Thank you Neil for writing this document.
> > > > > This is a really important change in DPDK.
> > > > > It would be very good to have comments or acknowledgement from
> > > > > several developpers. This policy would be enforced by having
> > > > > several
> > > Acked-by lines.
> > > > >
> > > > >
> > > > > 2015-01-16 10:33, Neil Horman:
> > > > > > Adding a document describing rudimentary ABI policy and adding
> > > > > > notice space for any deprecation announcements
> > > > > >
> > > > > > Signed-off-by: Neil Horman 
> > > > > > CC: Thomas Monjalon 
> > > > > > CC: "Richardson, Bruce" 
> > > > > >
> > > > > > ---
> > > > > > Change notes:
> > > > > >
> > > > > > v5) Updated documentation to add notes from Thomas M.
> > > > > > ---
> > > > > >  doc/abi.txt | 36 
> > > > > >  1 file changed, 36 insertions(+)  create mode 100644
> > > > > > doc/abi.txt
> > > > > >
> > > > > > diff --git a/doc/abi.txt b/doc/abi.txt new file mode 100644
> > > > > > index
> > > > > > 000..14be464
> > > > > > --- /dev/null
> > > > > > +++ b/doc/abi.txt
> > > > > > @@ -0,0 +1,36 @@
> > > > > > +ABI policy:
> > > > > > +   ABI versions are set at the time of major release labeling,
> > > > > > +and ABI may change multiple times between the last labeling
> > > > > > +and the HEAD label of the git tree without warning
> > > > > > +
> > > > > > +   ABI versions, once released are available until such time as
> > > > > > +their deprecation has been noted here for at least one major
> > > > > > +release cycle, after it has been tagged.  E.g. the ABI for
> > > > > > +DPDK
> > > > > > +1.8 is shipped, and then the decision to remove it is made
> > > > > > +during the development of DPDK 1.9.  The decision will be
> > > > > > +recorded here, shipped with the DPDK 1.9 release, and
> > > > > > +actually removed when DPDK
> > > > > > +1.10 ships.
> > > > > > +
> > > > > > +   ABI versions may be deprecated in whole, or in part as
> > > > > > +needed by a given update.
> > > > > > +
> > > > > > +   Some ABI changes may be too significant to reasonably
> > > > > > +maintain multiple versions of.  In those events ABI's may be
> > > > > > +updated without backward compatibility provided.  The
> > > > > > +requirements for doing
> > > so are:
> > > > > > +   1) At least 3 acknoweldgements of the need on the dpdk.org
> > > > > > +   2) A full deprecation cycle must be made to offer
> downstream
> > > > > > +consumers sufficient warning of the change.  E.g. if dpdk 2.0
> > > > > > +is under development when the change is proposed, a
> > > > > > +deprecation notice must be added to this file, and released with
> dpdk 2.0.
> > > > > > +Then the change may be incorporated for
> > > > > dpdk 2.1
> > > > > > +   3) The LIBABIVER variable in the makefilei(s) where the ABI
> > > > > > +changes are incorporated must be incremented in parallel with
> > > > > > +the ABI changes themselves
> > > > > > +
> > > > > > +   Note that the above process for ABI deprecation should not
> > > > > > +be undertaken lightly.  ABI stability is extreemely important
> > > > > > +for downstream consumers of the DPDK, especially when
> > > > > > +distributed in shared object form.  Every effort should be
> > > > > > +made to preserve ABI whenever possible.  For instance,
> > > > > > +reorganizing public structure field for astetic or
> > > > > > +readability purposes should be avoided as it will cause ABI
> > > > > > +breakage.  Only significant (e.g. performance) reasons should
> > > > > > +be seen as cause to alter
> > > > > ABI.
> > > >
> > > > Hi Thomas,
> > > >
> > > > Should there be a reference to this document in the programmers
> guide?
> > > >
> > > Thats a good question. I think, as Thomas notes, it probably should
> > > be referenced in some way.  The programmers guide might be good.
> > > What might be better would be checking the deprecation notices and
> > > adding them to the release notes for any g

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

2015-01-20 09:37, Neil Horman:
> On Tue, Jan 20, 2015 at 03:00:01PM +0100, Thomas Monjalon wrote:
> > 2015-01-16 10:33, Neil Horman:
> > > --- /dev/null
> > > +++ b/doc/abi.txt
> > > @@ -0,0 +1,36 @@
> > > +ABI policy:
> > > + ABI versions are set at the time of major release labeling, and ABI may
> > > +change multiple times between the last labeling and the HEAD label of 
> > > the git
> > > +tree without warning
> > > +
> > > + ABI versions, once released are available until such time as their
> > > +deprecation has been noted here for at least one major release cycle, 
> > > after it
> > > +has been tagged.  E.g. the ABI for DPDK 1.8 is shipped, and then the 
> > > decision to
> > > +remove it is made during the development of DPDK 1.9.  The decision will 
> > > be
> > > +recorded here, shipped with the DPDK 1.9 release, and actually removed 
> > > when DPDK
> > > +1.10 ships.
> > > +
> > > + ABI versions may be deprecated in whole, or in part as needed by a given
> > > +update.
> > > +
> > > + Some ABI changes may be too significant to reasonably maintain multiple
> > > +versions of.  In those events ABI's may be updated without backward
> > > +compatibility provided.  The requirements for doing so are:
> > > + 1) At least 3 acknoweldgements of the need on the dpdk.org
> > > + 2) A full deprecation cycle must be made to offer downstream consumers
> > > +sufficient warning of the change.  E.g. if dpdk 2.0 is under development 
> > > when
> > > +the change is proposed, a deprecation notice must be added to this file, 
> > > and
> > > +released with dpdk 2.0.  Then the change may be incorporated for dpdk 2.1
> > > + 3) The LIBABIVER variable in the makefilei(s) where the ABI changes are
> > > +incorporated must be incremented in parallel with the ABI changes 
> > > themselves
> > > +
> > > + Note that the above process for ABI deprecation should not be undertaken
> > > +lightly.  ABI stability is extreemely important for downstream consumers 
> > > of the
> > > +DPDK, especially when distributed in shared object form.  Every effort 
> > > should be
> > > +made to preserve ABI whenever possible.  For instance, reorganizing 
> > > public
> > > +structure field for astetic or readability purposes should be avoided as 
> > > it will
> > 
> > astetic? typo?
> > 
> > > +cause ABI breakage.  Only significant (e.g. performance) reasons should 
> > > be seen
> > > +as cause to alter ABI.
> > > +  
> > > +Deprecation Notices:
> > 
> > Neil, are you sure it's a good idea to put deprecations notices here instead
> > of release notes?
> > 
> Funny, I just made mention of that in my last note.  I do think that the 
> release
> notes is the right place to "officially" announce deprecation warnings, but I
> think we need a way for developers to communicate that efficiently (given that
> the release notes aren't stored in the git tree).

Yes, they are:
http://dpdk.org/browse/dpdk/tree/doc/guides/rel_notes
So I suggest to remove Deprecation Notices from abi.txt and create an entry
in release notes.

> I think this is the place for
> developers to canonically list deprecations, and make reading this file part 
> of
> the release notes generation process.  That way, updates can be made as part 
> of
> the commit process easily.

Developers can update the release notes themselves.

> > I'm also thinking that we need to add more things in this doc:
> > - case of macros/constant deprecation (API only)
> > - case of structure update: must be renamed to provide ABI 
> > compatibility?
> > 
> I'm definately in favor of adding such notices here, but I hadn't planned for
> any strict formatting of any given notice.  That is to say, I considered 
> you're
> two issues above to be able to be included here.  I have no issue with 
> listing a
> deprecation note that indicates macros are being removed or that sections of 
> api
> are being versioned to accomodate structure changes. of any sort

No, I was suggesting to explain in this doc that macro removal must be
announced with a deprecation notice,
and that in case structure must be reworked, the name must change if we
want to preserve ABI compatibility with old structure.

> > Do you think we can have a tool to test the ABI compatibility by building
> > examples/apps of previous version and checking them with built DSO of
> > current version?
> > 
> I do, though I'm not sure its within the scope of this update.  The easiest 
> way
> to do it currently is to checkout the last released version of the dpdk, build
> it as a DSO build, copy out one of the test/example apps, checkout the HEAD of
> the tree, rebuild, and run the saved off test app from the first build using 
> the
> shared objects of the second build.  That does some rudimentary validation,
> but it only touches on the API aspects that the application you're using makes
> use of.  What would be better would be if we had a test application that made 
> a
> call to every exported API call that we have, so tha

[dpdk-dev] [PATCH 0/4] DPDK memcpy optimization

On Tue, Jan 20, 2015 at 03:01:44AM +, Wang, Zhihong wrote:
> 
> 
> > -Original Message-
> > From: Neil Horman [mailto:nhorman at tuxdriver.com]
> > Sent: Monday, January 19, 2015 9:02 PM
> > To: Wang, Zhihong
> > Cc: dev at dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH 0/4] DPDK memcpy optimization
> > 
> > On Mon, Jan 19, 2015 at 09:53:30AM +0800, zhihong.wang at intel.com wrote:
> > > This patch set optimizes memcpy for DPDK for both SSE and AVX platforms.
> > > It also extends memcpy test coverage with unaligned cases and more test
> > points.
> > >
> > > Optimization techniques are summarized below:
> > >
> > > 1. Utilize full cache bandwidth
> > >
> > > 2. Enforce aligned stores
> > >
> > > 3. Apply load address alignment based on architecture features
> > >
> > > 4. Make load/store address available as early as possible
> > >
> > > 5. General optimization techniques like inlining, branch reducing,
> > > prefetch pattern access
> > >
> > > Zhihong Wang (4):
> > >   Disabled VTA for memcpy test in app/test/Makefile
> > >   Removed unnecessary test cases in test_memcpy.c
> > >   Extended test coverage in test_memcpy_perf.c
> > >   Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX
> > > platforms
> > >
> > >  app/test/Makefile  |   6 +
> > >  app/test/test_memcpy.c |  52 +-
> > >  app/test/test_memcpy_perf.c| 238 +---
> > >  .../common/include/arch/x86/rte_memcpy.h   | 664
> > +++--
> > >  4 files changed, 656 insertions(+), 304 deletions(-)
> > >
> > > --
> > > 1.9.3
> > >
> > >
> > Are you able to compile this with gcc 4.9.2?  The compilation of
> > test_memcpy_perf is taking forever for me.  It appears hung.
> > Neil
> 
> 
> Neil,
> 
> Thanks for reporting this!
> It should compile but will take quite some time if the CPU doesn't support 
> AVX2, the reason is that:
> 1. The SSE & AVX memcpy implementation is more complicated than AVX2 version 
> thus the compiler takes more time to compile and optimize
> 2. The new test_memcpy_perf.c contains 126 constants memcpy calls for better 
> test case coverage, that's quite a lot
> 
> I've just tested this patch on an Ivy Bridge machine with GCC 4.9.2:
> 1. The whole compile process takes 9'41" with the original test_memcpy_perf.c 
> (63 + 63 = 126 constant memcpy calls)
> 2. It takes only 2'41" after I reduce the constant memcpy call number to 12 + 
> 12 = 24
> 
> I'll reduce memcpy call in the next version of patch.
> 
ok, thank you.  I'm all for optimization, but I think a compile that takes almost
10 minutes for a single file is going to generate some raised eyebrows when end
users start tinkering with it.

Neil

> Zhihong (John)
>

[dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum forwarding engine

2015-01-20 13:39, Olivier MATZ:
> On 01/20/2015 02:12 AM, Ananyev, Konstantin wrote:
> >> So I will fix that in my coming patch series. Just for information,
> >> I'm pretty sure that having PKT_TX_IPV4 and PKT_TX_IP_CSUM as not
> >> exclusive flag would not require any change anywhere in the PMDs (even
> >> in i40e).
> >
> > Right now - no.
> > Though as I said from PMD perspective having them exclusive is a bit 
> > preferable.
> > Again, I don't see any big difference from upper layer code.
> 
> Sure, it does not make a big difference in terms of code. But
> in terms of API, the naming of the flag is coherent to what it is
> used for. And it's easier to find a simple definition, like:
> 
>   * Packet is IPv4. This flag must be set when using any offload feature
>   * (TSO, L3 or L4 checksum) to tell the NIC that the packet is an IPv4
>   * packet.

+1
It's clearer to set PKT_TX_IPV4 in all offload cases of IPv4 packets,
and add PKT_TX_IP_CSUM when checksum offload is required.

Simply simpler ;)

-- 
Thomas

[dpdk-dev] DPDK Community Call, Monday 2nd February, 17:00 GMT

2015-01-20 Thread O'driscoll, Tim

We had our last community call in December, and then took a break over the
holiday period. I think we should reinstate these, so I've scheduled the next
one for Monday 2nd February. Since our last call was at a time convenient for
Asia, this one is at a time that's more convenient for people based in the USA.
As for previous calls, I'll post a recording to youtube afterwards for anybody
who can't make it.

I don't have an agenda yet, but will send one out in advance of the meeting.

Meeting Time:
Dublin (Ireland), Monday, February 2, 2015 at 5:00:00 PM GMT UTC
San Francisco (U.S.A. - California), Monday, February 2, 2015 at 9:00:00 AM
PST UTC-8 hours
Phoenix (U.S.A. - Arizona), Monday, February 2, 2015 at 10:00:00 AM MST UTC-7
hours
New York (U.S.A. - New York), Monday, February 2, 2015 at 12:00:00 Noon EST
UTC-5 hours
Ottawa (Canada - Ontario), Monday, February 2, 2015 at 12:00:00 Noon EST UTC-5
hours
Paris (France), Monday, February 2, 2015 at 6:00:00 PM CET UTC+1 hour
Tel Aviv (Israel), Monday, February 2, 2015 at 7:00:00 PM IST UTC+2 hours
Moscow (Russia), Monday, February 2, 2015 at 8:00:00 PM MSK UTC+3 hours
New Delhi (India - Delhi), Monday, February 2, 2015 at 10:30:00 PM IST
UTC+5:30 hours
Shanghai (China - Shanghai Municipality), Tuesday, February 3, 2015 at 1:00:00
AM CST UTC+8 hours
Tokyo (Japan), Tuesday, February 3, 2015 at 2:00:00 AM JST UTC+9 hours
Corresponding UTC (GMT), Monday, February 2, 2015 at 17:00:00

GoToMeeting Details:
To join, follow the meeting link:
https://global.gotomeeting.com/join/557845085. This will start the GoToMeeting
web viewer. You then have two options for audio:

1. To use your computer's audio via a headset, you need to switch to the
desktop version of GoToMeeting. You can do this by clicking the GoToMeeting
icon on the top right hand side of the web viewer, and then selecting "Switch
to the desktop version". The desktop version will need to download and install,
so if you plan to use this you may want to get it set up in advance. Once it
starts, under the Audio section, you can select "Mic & Speakers". The desktop
version is only available for Windows and Mac, so if you're using Linux then
you need to use option 2 below.

2. You can join using a phone via one of the numbers listed below. The Access
Code is 557-845-085. You'll also be asked for an Audio PIN, which is accessible
by clicking the phone icon in the GoToMeeting web viewer after you've joined
the meeting.
Canada +1 (647) 497-9391
France +33 (0) 170 950 593
Ireland +353 (0) 15 290 180
United Kingdom +44 (0) 20 3713 5028
United States +1 (646) 982-0002
More phone numbers: https://global.gotomeeting.com/557845085/numbersdisplay.html

Info on downloading the desktop app is available at:
http://support.citrixonline.com/en_US/meeting/help_files/G2M010002?title=Download%7D
Info on the web viewer is available at:
http://support.citrixonline.com/en_US/GoToMeeting/help_files/GTM130019?title=Web+Viewer+FAQs

Thanks,
Tim

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

On Tue, Jan 20, 2015 at 02:50:43PM +, Butler, Siobhan A wrote:
> 
> 
> > -Original Message-
> > From: Neil Horman [mailto:nhorman at tuxdriver.com]
> > Sent: Tuesday, January 20, 2015 2:42 PM
> > To: Butler, Siobhan A
> > Cc: Iremonger, Bernard; dev at dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> > 
> > On Tue, Jan 20, 2015 at 02:29:54PM +, Butler, Siobhan A wrote:
> > >
> > >
> > > > -Original Message-
> > > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Neil Horman
> > > > Sent: Tuesday, January 20, 2015 2:24 PM
> > > > To: Iremonger, Bernard
> > > > Cc: dev at dpdk.org
> > > > Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation
> > > >
> > > > On Tue, Jan 20, 2015 at 01:37:35PM +, Iremonger, Bernard wrote:
> > > > > > -Original Message-
> > > > > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Thomas
> > > > Monjalon
> > > > > > Sent: Tuesday, January 20, 2015 7:15 AM
> > > > > > To: Neil Horman
> > > > > > Cc: dev at dpdk.org
> > > > > > Subject: Re: [dpdk-dev] [PATCH v5 4/4] docs: Add ABI
> > > > > > documentation
> > > > > >
> > > > > > Thank you Neil for writing this document.
> > > > > > This is a really important change in DPDK.
> > > > > > It would be very good to have comments or acknowledgement from
> > > > > > several developpers. This policy would be enforced by having
> > > > > > several
> > > > Acked-by lines.
> > > > > >
> > > > > >
> > > > > > 2015-01-16 10:33, Neil Horman:
> > > > > > > Adding a document describing rudimentary ABI policy and adding
> > > > > > > notice space for any deprecation announcements
> > > > > > >
> > > > > > > Signed-off-by: Neil Horman 
> > > > > > > CC: Thomas Monjalon 
> > > > > > > CC: "Richardson, Bruce" 
> > > > > > >
> > > > > > > ---
> > > > > > > Change notes:
> > > > > > >
> > > > > > > v5) Updated documentation to add notes from Thomas M.
> > > > > > > ---
> > > > > > >  doc/abi.txt | 36 
> > > > > > >  1 file changed, 36 insertions(+)  create mode 100644
> > > > > > > doc/abi.txt
> > > > > > >
> > > > > > > diff --git a/doc/abi.txt b/doc/abi.txt new file mode 100644
> > > > > > > index
> > > > > > > 000..14be464
> > > > > > > --- /dev/null
> > > > > > > +++ b/doc/abi.txt
> > > > > > > @@ -0,0 +1,36 @@
> > > > > > > +ABI policy:
> > > > > > > + ABI versions are set at the time of major release labeling,
> > > > > > > +and ABI may change multiple times between the last labeling
> > > > > > > +and the HEAD label of the git tree without warning
> > > > > > > +
> > > > > > > + ABI versions, once released are available until such time as
> > > > > > > +their deprecation has been noted here for at least one major
> > > > > > > +release cycle, after it has been tagged.  E.g. the ABI for
> > > > > > > +DPDK
> > > > > > > +1.8 is shipped, and then the decision to remove it is made
> > > > > > > +during the development of DPDK 1.9.  The decision will be
> > > > > > > +recorded here, shipped with the DPDK 1.9 release, and
> > > > > > > +actually removed when DPDK
> > > > > > > +1.10 ships.
> > > > > > > +
> > > > > > > + ABI versions may be deprecated in whole, or in part as
> > > > > > > +needed by a given update.
> > > > > > > +
> > > > > > > + Some ABI changes may be too significant to reasonably
> > > > > > > +maintain multiple versions of.  In those events ABI's may be
> > > > > > > +updated without backward compatibility provided.  The
> > > > > > > +requirements for doing
> > > > so are:
> > > > > > > + 1) At least 3 acknoweldgements of the need on the dpdk.org
> > > > > > > + 2) A full deprecation cycle must be made to offer
> > downstream
> > > > > > > +consumers sufficient warning of the change.  E.g. if dpdk 2.0
> > > > > > > +is under development when the change is proposed, a
> > > > > > > +deprecation notice must be added to this file, and released with
> > dpdk 2.0.
> > > > > > > +Then the change may be incorporated for
> > > > > > dpdk 2.1
> > > > > > > + 3) The LIBABIVER variable in the makefilei(s) where the ABI
> > > > > > > +changes are incorporated must be incremented in parallel with
> > > > > > > +the ABI changes themselves
> > > > > > > +
> > > > > > > + Note that the above process for ABI deprecation should not
> > > > > > > +be undertaken lightly.  ABI stability is extreemely important
> > > > > > > +for downstream consumers of the DPDK, especially when
> > > > > > > +distributed in shared object form.  Every effort should be
> > > > > > > +made to preserve ABI whenever possible.  For instance,
> > > > > > > +reorganizing public structure field for astetic or
> > > > > > > +readability purposes should be avoided as it will cause ABI
> > > > > > > +breakage.  Only significant (e.g. performance) reasons should
> > > > > > > +be seen as cause to alter
> > > > > > ABI.
> > > > >
> > > > > Hi Thomas,
> > > > >
> > > > > Should there be a reference to this document in the programmers
> > guide?
>

[dpdk-dev] [PATCH v5 4/4] docs: Add ABI documentation

On Tue, Jan 20, 2015 at 04:06:07PM +0100, Thomas Monjalon wrote:
> 2015-01-20 09:37, Neil Horman:
> > On Tue, Jan 20, 2015 at 03:00:01PM +0100, Thomas Monjalon wrote:
> > > 2015-01-16 10:33, Neil Horman:
> > > > --- /dev/null
> > > > +++ b/doc/abi.txt
> > > > @@ -0,0 +1,36 @@
> > > > +ABI policy:
> > > > +   ABI versions are set at the time of major release labeling, and 
> > > > ABI may
> > > > +change multiple times between the last labeling and the HEAD label of 
> > > > the git
> > > > +tree without warning
> > > > +
> > > > +   ABI versions, once released are available until such time as 
> > > > their
> > > > +deprecation has been noted here for at least one major release cycle, 
> > > > after it
> > > > +has been tagged.  E.g. the ABI for DPDK 1.8 is shipped, and then the 
> > > > decision to
> > > > +remove it is made during the development of DPDK 1.9.  The decision 
> > > > will be
> > > > +recorded here, shipped with the DPDK 1.9 release, and actually removed 
> > > > when DPDK
> > > > +1.10 ships.
> > > > +
> > > > +   ABI versions may be deprecated in whole, or in part as needed 
> > > > by a given
> > > > +update.
> > > > +
> > > > +   Some ABI changes may be too significant to reasonably maintain 
> > > > multiple
> > > > +versions of.  In those events ABI's may be updated without backward
> > > > +compatibility provided.  The requirements for doing so are:
> > > > +   1) At least 3 acknoweldgements of the need on the dpdk.org
> > > > +   2) A full deprecation cycle must be made to offer downstream 
> > > > consumers
> > > > +sufficient warning of the change.  E.g. if dpdk 2.0 is under 
> > > > development when
> > > > +the change is proposed, a deprecation notice must be added to this 
> > > > file, and
> > > > +released with dpdk 2.0.  Then the change may be incorporated for dpdk 
> > > > 2.1
> > > > +   3) The LIBABIVER variable in the makefilei(s) where the ABI 
> > > > changes are
> > > > +incorporated must be incremented in parallel with the ABI changes 
> > > > themselves
> > > > +
> > > > +   Note that the above process for ABI deprecation should not be 
> > > > undertaken
> > > > +lightly.  ABI stability is extreemely important for downstream 
> > > > consumers of the
> > > > +DPDK, especially when distributed in shared object form.  Every effort 
> > > > should be
> > > > +made to preserve ABI whenever possible.  For instance, reorganizing 
> > > > public
> > > > +structure field for astetic or readability purposes should be avoided 
> > > > as it will
> > > 
> > > astetic? typo?
> > > 
> > > > +cause ABI breakage.  Only significant (e.g. performance) reasons 
> > > > should be seen
> > > > +as cause to alter ABI.
> > > > +  
> > > > +Deprecation Notices:
> > > 
> > > Neil, are you sure it's a good idea to put deprecations notices here 
> > > instead
> > > of release notes?
> > > 
> > Funny, I just made mention of that in my last note.  I do think that the 
> > release
> > notes is the right place to "officially" announce deprecation warnings, but 
> > I
> > think we need a way for developers to communicate that efficiently (given 
> > that
> > the release notes aren't stored in the git tree).
> 
> Yes, they are:
>   http://dpdk.org/browse/dpdk/tree/doc/guides/rel_notes
> So I suggest to remove Deprecation Notices from abi.txt and create an entry
> in release notes.
> 
> > I think this is the place for
> > developers to canonically list deprecations, and make reading this file 
> > part of
> > the release notes generation process.  That way, updates can be made as 
> > part of
> > the commit process easily.
> 
> Developpers can update the release notes themselves.
> 
ok, I was unaware. If that's the case, then yes, putting these deprecations
directly in the release notes makes the most sense. I'll resubmit with that
changed.


> > > I'm also thinking that we need to add more things in this doc:
> > >   - case of macros/constant deprecation (API only)
> > >   - case of structure update: must be renamed to provide ABI 
> > > compatibility?
> > > 
> > I'm definately in favor of adding such notices here, but I hadn't planned 
> > for
> > any strict formatting of any given notice.  That is to say, I considered 
> > you're
> > two issues above to be able to be included here.  I have no issue with 
> > listing a
> > deprecation note that indicates macros are being removed or that sections 
> > of api
> > are being versioned to accomodate structure changes. of any sort
> 
> No, I was suggesting to explain in this doc that macro removal must be
> announced with a deprecation notice,
> and that in case structure must be reworked, the name must change if we
> want to preserve ABI compatibility with old structure.
> 
> > > Do you think we can have a tool to test the ABI compatibility by building
> > > examples/apps of previous version and checking them with built DSO of
> > > current version?
> > > 
> > I do, though I'm not sure its within the scope of this u

[dpdk-dev] [PATCH 0/4] DPDK memcpy optimization

2015-01-20 Thread Bruce Richardson

On Tue, Jan 20, 2015 at 10:11:18AM -0500, Neil Horman wrote:
> On Tue, Jan 20, 2015 at 03:01:44AM +, Wang, Zhihong wrote:
> > 
> > 
> > > -Original Message-
> > > From: Neil Horman [mailto:nhorman at tuxdriver.com]
> > > Sent: Monday, January 19, 2015 9:02 PM
> > > To: Wang, Zhihong
> > > Cc: dev at dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH 0/4] DPDK memcpy optimization
> > > 
> > > On Mon, Jan 19, 2015 at 09:53:30AM +0800, zhihong.wang at intel.com wrote:
> > > > This patch set optimizes memcpy for DPDK for both SSE and AVX platforms.
> > > > It also extends memcpy test coverage with unaligned cases and more test
> > > points.
> > > >
> > > > Optimization techniques are summarized below:
> > > >
> > > > 1. Utilize full cache bandwidth
> > > >
> > > > 2. Enforce aligned stores
> > > >
> > > > 3. Apply load address alignment based on architecture features
> > > >
> > > > 4. Make load/store address available as early as possible
> > > >
> > > > 5. General optimization techniques like inlining, branch reducing,
> > > > prefetch pattern access
> > > >
> > > > Zhihong Wang (4):
> > > >   Disabled VTA for memcpy test in app/test/Makefile
> > > >   Removed unnecessary test cases in test_memcpy.c
> > > >   Extended test coverage in test_memcpy_perf.c
> > > >   Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX
> > > > platforms
> > > >
> > > >  app/test/Makefile  |   6 +
> > > >  app/test/test_memcpy.c |  52 +-
> > > >  app/test/test_memcpy_perf.c| 238 +---
> > > >  .../common/include/arch/x86/rte_memcpy.h   | 664
> > > +++--
> > > >  4 files changed, 656 insertions(+), 304 deletions(-)
> > > >
> > > > --
> > > > 1.9.3
> > > >
> > > >
> > > Are you able to compile this with gcc 4.9.2?  The compilation of
> > > test_memcpy_perf is taking forever for me.  It appears hung.
> > > Neil
> > 
> > 
> > Neil,
> > 
> > Thanks for reporting this!
> > It should compile but will take quite some time if the CPU doesn't support 
> > AVX2, the reason is that:
> > 1. The SSE & AVX memcpy implementation is more complicated than AVX2 
> > version thus the compiler takes more time to compile and optimize
> > 2. The new test_memcpy_perf.c contains 126 constants memcpy calls for 
> > better test case coverage, that's quite a lot
> > 
> > I've just tested this patch on an Ivy Bridge machine with GCC 4.9.2:
> > 1. The whole compile process takes 9'41" with the original 
> > test_memcpy_perf.c (63 + 63 = 126 constant memcpy calls)
> > 2. It takes only 2'41" after I reduce the constant memcpy call number to 12 
> > + 12 = 24
> > 
> > I'll reduce memcpy call in the next version of patch.
> > 
> ok, thank you.  I'm all for optimzation, but I think a compile that takes 
> almost
> 10 minutes for a single file is going to generate some raised eyebrows when 
> end
> users start tinkering with it
> 
> Neil
> 
> > Zhihong (John)
> > 
Even two minutes is a very long time to compile, IMHO. The whole of DPDK doesn't
take that long to compile right now, and that's with a couple of huge header
files with routing tables in them. Any chance you could cut compile time down to
a few seconds while still having reasonable tests?
Also, when there is AVX2 present on the system, what is the compile time like
for that code?

/Bruce

[dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum forwarding engine

2015-01-20 Thread Stephen Hemminger

On Tue, 20 Jan 2015 16:18:01 +0100
Thomas Monjalon  wrote:

> 2015-01-20 13:39, Olivier MATZ:
> > On 01/20/2015 02:12 AM, Ananyev, Konstantin wrote:
> > >> So I will fix that in my coming patch series. Just for information,
> > >> I'm pretty sure that having PKT_TX_IPV4 and PKT_TX_IP_CSUM as not
> > >> exclusive flag would not require any change anywhere in the PMDs (even
> > >> in i40e).
> > >
> > > Right now - no.
> > > Though as I said from PMD perspective having them exclusive is a bit 
> > > preferable.
> > > Again, I don't see any big difference from upper layer code.
> > 
> > Sure, it does not make a big difference in terms of code. But
> > in terms of API, the naming of the flag is coherent to what it is
> > used for. And it's easier to find a simple definition, like:
> > 
> >   * Packet is IPv4. This flag must be set when using any offload feature
> >   * (TSO, L3 or L4 checksum) to tell the NIC that the packet is an IPv4
> >   * packet.
> 
> +1
> It's clearer to set PKT_TX_IPV4 in all offload cases of IPv4 packets,
> and add PKT_TX_IP_CSUM when checksum offload is required.
> 
> Simply simpler ;)
> 

Sure. Although in my experience IP checksum is just as cheap done in
software since the header fits in cache.

[dpdk-dev] DPDK Community Call, Monday 2nd February, 17:00 GMT

2015-01-20 Thread Stephen Hemminger

On Tue, 20 Jan 2015 15:21:40 +
"O'driscoll, Tim"  wrote:

> We had our last community call in December, and then took a break over the 
> holiday period. I think we should reinstate these, so I've scheduled the next 
> one for Monday 2nd February. Since our last call was at a time convenient for 
> Asia, this one is at a time that's more convenient for people based in the 
> USA. As for previous calls, I'll post a recording to youtube afterwards for 
> anybody who can't make it.
> 
> I don't have an agenda yet, but will send one out in advance of the meeting.

This is right after FOSDEM and many people will be returning home.

[dpdk-dev] [PATCH 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms

2015-01-20 Thread Stephen Hemminger

On Mon, 19 Jan 2015 09:53:34 +0800
zhihong.wang at intel.com wrote:

> Main code changes:
> 
> 1. Differentiate architectural features based on CPU flags
> 
> a. Implement separated move functions for SSE/AVX/AVX2 to make full 
> utilization of cache bandwidth
> 
> b. Implement separated copy flow specifically optimized for target 
> architecture
> 
> 2. Rewrite the memcpy function "rte_memcpy"
> 
> a. Add store aligning
> 
> b. Add load aligning based on architectural features
> 
> c. Put block copy loop into inline move functions for better control of 
> instruction order
> 
> d. Eliminate unnecessary MOVs
> 
> 3. Rewrite the inline move functions
> 
> a. Add move functions for unaligned load cases
> 
> b. Change instruction order in copy loops for better pipeline utilization
> 
> c. Use intrinsics instead of assembly code
> 
> 4. Remove slow glibc call for constant copies
> 
> Signed-off-by: Zhihong Wang 

Dumb question: why not fix glibc memcpy instead?
What is special about rte_memcpy?

[dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum forwarding engine

Hi Olivier,

> -Original Message-
> From: Olivier MATZ [mailto:olivier.matz at 6wind.com]
> Sent: Tuesday, January 20, 2015 12:39 PM
> To: Ananyev, Konstantin; Liu, Jijiang
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum 
> forwarding engine
> 
> Hi,
> 
> On 01/20/2015 02:12 AM, Ananyev, Konstantin wrote:
>  I think a good definition would
>  be:
> 
>  Packet is IPv4. This flag must be set when using any offload
>  feature (TSO, L3 or L4 checksum) to tell the NIC that the packet
>  is an IPv4 packet.
> 
>  That's why I added PKT_TX_IPV4 in the examples.
> >>>
> >>> I suppose we discussed it several times: both ways are possible.
> >>>  From PMD perspective - treating PKT_TX_IPV4 and PKT_TX_IP_CSUM
> >>> As mutually exclusive seems a bit more plausible.
> >>>  From the upper layer - my understanding, that it is doesn't really 
> >>> matter.
> >>> I thought we had an agreement about it in 1.8, no?
> >>
> >> Indeed, this was already discussed, but there was a lot of pressure
> >> for 1.8.0 to push something, even not perfect. The fog around comments
> >> shows that the API was not very clearly defined for 1.8.0. If you read
> >> the comments of the API, it is impossible to understand when the
> >> PKT_TX_IPV4 or PKT_TX_IP_CSUM flags must be set. I would even say
> >> more: the only place where the comments bring a valuable information
> >> (L4 checksum and TSO) describe the case where PKT_TX_IPV4 and
> >> PKT_TX_IP_CSUM are not exclusive...
> >>
> >> So I will fix that in my coming patch series. Just for information,
> >> I'm pretty sure that having PKT_TX_IPV4 and PKT_TX_IP_CSUM as not
> >> exclusive flag would not require any change anywhere in the PMDs (even
> >> in i40e).
> >
> > Right now - no.
> > Though as I said from PMD perspective having them exclusive is a bit 
> > preferable.
> > Again, I don't see any big difference from upper layer code.
> 
> Sure, it does not make a big difference in terms of code. But
> in terms of API, the naming of the flag is coherent to what it is
> used for. And it's easier to find a simple definition, like:
> 
>   * Packet is IPv4. This flag must be set when using any offload feature
>   * (TSO, L3 or L4 checksum) to tell the NIC that the packet is an IPv4
>   * packet.

Ok, and what's wrong with:
"Packet is IPv4. This flag must be set when using any offload feature
(TSO, L3 or L4 checksum) to tell the NIC that the packet is an IPv4
packet and no HW offload for IPv4 header checksum calculation is required"
?

> 
> >> On the contrary, making them exclusive would require to
> >> change the ixgbe TSO code because we check.
> >
> > Hmm, so you are saying there is a bug somewhere  in ixbe_rxtx.c?
> > What particular place you are talking about?
> 
> Sorry, I spoke too fast. In TSO code, we check PKT_TX_IP_CKSUM (and not
> PKT_TX_IPV4 as I thought), so it would work for both methods without
> patching the code.
> 
> In this case, it means that both approach would not require to
> modify the code.

Ok.

> 
>  *Problem 3*: without using the word "fortville", it is difficult
>  to understand the goal of the flag PKT_TX_UDP_TUNNEL_PKT. Indeed,
>  once PKT_TX_OUTER_IPV4/6 is set, it looks obvious that it's a
>  tunnel packet. I suggest to remove the PKT_TX_UDP_TUNNEL_PKT
>  flag. In linux, the driver doesn't care about the tunnel type,
>  it always set I40E_TXD_CTX_UDP_TUNNELING for all encapsulations [6].
> >>>
> >>> It might be obvious that it is a tunnel packet from PKT_TX_OUTER_* is set,
> >>> but it is not obvious what type of tunnelling it would be.
> >>> FVL HW supports HW TX offloads for different type of tunnelling and
> >>> requires that SW provide information about tunnelling type.
> >>>  From i40e datasheet:
> >>> L4TUNT L4 Tunneling Type (Teredo / GRE header / VXLAN header) indication:
> >>> 00b - No UDP / GRE tunneling (field must be set to zero if EIPT equals to 
> >>> zero)
> >>> 01b - UDP tunneling header (any UDP tunneling, VXLAN and Geneve).
> >>> 10b - GRE tunneling header
> >>> As we do plan to support other than UDP tunnelling types, I suppose we'll 
> >>> need to keep
> >>> PKT_TX_UDP_TUNNEL_PKT flag.
> >>
> >> As I've said: in linux, the driver doesn't care about the tunnel type,
> >> it always set I40E_TXD_CTX_UDP_TUNNELING for all encapsulations.
> >
> > Ok, and why it should be our problem?
> > We have a lot of things done in a different manner then linux/freebsd 
> > kernel drivers,
> > Why now it became a problem?
> 
> If linux doesn't need an equivalent flag for doing the same thing,
> it probably means we don't need it either.

Probably yes  Or probably not.
Why do we need to guess what was the intention of guys who wrote that part of 
linux driver?
BTW, the macro for GRE is here:
find lib/librte_pmd_i40e/i40e -type f | xargs grep TUN | grep TXD
lib/librte_pmd_i40e/i40e/i40e_type.h:#define

[dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum forwarding engine

2015-01-20 Thread Olivier MATZ

Hi Konstantin,

On 01/20/2015 06:23 PM, Ananyev, Konstantin wrote:
>> Sure, it does not make a big difference in terms of code. But
>> in terms of API, the naming of the flag is coherent to what it is
>> used for. And it's easier to find a simple definition, like:
>>
>>* Packet is IPv4. This flag must be set when using any offload feature
>>* (TSO, L3 or L4 checksum) to tell the NIC that the packet is an IPv4
>>* packet.
>
> Ok, and what's wrong with:
> "Packet is IPv4. This flag must be set when using any offload feature
> (TSO, L3 or L4 checksum) to tell the NIC that the packet is an IPv4
> packet and no HW offload for IPv4 header checksum calculation is required"
> ?

I honestly find the first one simpler.

Again, I understand both are possible, but I think choosing the
most trivial one is the right way for an API.


>>> Ok, and why it should be our problem?
>>> We have a lot of things done in a different manner then linux/freebsd 
>>> kernel drivers,
>>> Why now it became a problem?
>>
>> If linux doesn't need an equivalent flag for doing the same thing,
>> it probably means we don't need it either.
>
> Probably yes  Or probably not.
> Why do we need to guess what was the intention of guys who wrote that part of 
> linux driver?

Because the dpdk looks very similar to that part of linux driver.

> BTW, the macro for GRE is here:
> find lib/librte_pmd_i40e/i40e -type f | xargs grep TUN | grep TXD
> lib/librte_pmd_i40e/i40e/i40e_type.h:#define I40E_TXD_CTX_UDP_TUNNELING 
> (0x1ULL << I40E_TXD_CTX_QW0_NATT_SHIFT)
> lib/librte_pmd_i40e/i40e/i40e_type.h:#define I40E_TXD_CTX_GRE_TUNNELING 
> (0x2ULL << I40E_TXD_CTX_QW0_NATT_SHIFT)
>
> Though it not used (yet?) by some reason.
>
>>
>> In a performance-oriented software like dpdk, having a flag that we
>> don't know what the hardware does with, that is not needed in other
>> drivers of the same harware, that makes the API harder to understand
>> could be a problem.
>
> Here is a HW spec, that says what values have to be setup for L4TUNT.
> Yes, I am not sure why they need to distinguish between VXLAN/GRE tunnelling.
> Though, I suppose that wouldn't eliminate the requirement.
> But for same, there is no good explanation why FVL HW need to know that it is 
> IPv4 or IPv6 packet,
> in the case when only L4 checksum offload is required (IIPT field).
> Niantic, as I remember, is able to work ok without that requirement.
> Though, we still have to set it up.
>
>> Another argument: if we can remove this flag, it would make the
>> testpmd commands reworkd proposed by Jijiang much more easy to
>> understand: only a new "csum parse-tunnel on|off" would be required,
>> and it can be explained in a few words.
>
> Well, from my point - testpmd commands that Jijiang proposed are perfectly 
> clear and understandable.
> Another thing, as I remember, our primary concern should be public API, no 
> testpmd.

OK let's talk about testpmd later.

>> We should avoid the need to specify the tunnel type in the OUTER
>> checksum API if we can, else it would limit us to specific
>> supported protocols.
>
>  From the FVL spec it is required by HW, it is not what we introducing on our 
> own.
> Spec stays explicitly that L4TUNT (L4 tunneling type) has to be setup for 
> tunnelling packets.
> Again from the spec, there are 3 different values it can take.
> If you have an idea how to pass that information to  PMD without using flags, 
> sure we can consider it.
>
>>
>> I think the following cases should be *forbidden by the API*:
>>
>> case 9) calculate checksum of in_ip and in_tcp  (was case B.1 in [1])
>>
>>  mb->outer_l2_len = len(out_eth)
>>  mb->outer_l3_len = len(out_ip)
>>  mb->l2_len = len(out_udp + vxlan + in_eth)
>>  mb->l3_len = len(out_ip)
>>  mb->ol_flags |= PKT_TX_IPV4 | PKT_TX_UDP_TUNNEL_PKT | \
>>PKT_TX_IP_CSUM | PKT_TX_UDP_CKSUM;
>>  set out_ip checksum to 0 in the packet
>>  set out_udp checksum to pseudo header using rte_ipv4_phdr_cksum()
>>
>>  If we remove the flag PKT_TX_UDP_TUNNEL_PKT, this cannot be
>>  supported, but there is no reason to support it as there is
>>  already one way to do the same.
>>
>>  I think the driver should not even look at mb->outer_l2_len
>>  and mb->outer_l3_len if no flag PKT_TX_OUTER_* is set.
>
> Why it should be forbidden?
> I admit it might be a bit slower than case 4),
> but I think absolutely legal way to setup HW offloads for inner L3/L4.
> As I said we need a PKT_TX_UDP_TUNNEL_PKT anyway, so I suppose
> PKT_TX_*_TUNNEL_PKT should be an indication is it a tunnel packet or not.
> PKT_TX_OUTER_* flags indicate does outer cksum offload is required or not.

 I don't understand. The result in terms of hardware is exactly the
 same than case 4). Why should we have 2 different ways for doing the
 same thing?
>>>
>>> If HW supports that capability, w

[dpdk-dev] [PATCH v3 01/18] fix fix compilation issues with RTE_LIBRTE_ACL_STANDALONE=y

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/rte_acl_osdep_alone.h | 12 
 1 file changed, 12 insertions(+)

diff --git a/lib/librte_acl/rte_acl_osdep_alone.h 
b/lib/librte_acl/rte_acl_osdep_alone.h
index a84b6f9..2a99860 100644
--- a/lib/librte_acl/rte_acl_osdep_alone.h
+++ b/lib/librte_acl/rte_acl_osdep_alone.h
@@ -214,6 +214,13 @@ rte_rdtsc(void)
 /*
  * rte_tailq related.
  */
+
+struct rte_tailq_entry {
+   TAILQ_ENTRY(rte_tailq_entry) next; /**< Pointer entries for a tailq list
+ */
+   void *data; /**< Pointer to the data referenced by this tailq entry */
+};
+
 static inline void *
 rte_dummy_tailq(void)
 {
@@ -248,6 +255,7 @@ rte_zmalloc_socket(__rte_unused const char *type, size_t 
size, unsigned align,
void *ptr;
int rc;

+   align = (align != 0) ? align : RTE_CACHE_LINE_SIZE;
rc = posix_memalign(&ptr, align, size);
if (rc != 0) {
rte_errno = rc;
@@ -258,6 +266,8 @@ rte_zmalloc_socket(__rte_unused const char *type, size_t 
size, unsigned align,
return ptr;
 }

+#define rte_zmalloc(type, sz, align) rte_zmalloc_socket(type, sz, align, 0)
+
 /*
  * rte_debug related
  */
@@ -271,6 +281,8 @@ rte_zmalloc_socket(__rte_unused const char *type, size_t 
size, unsigned align,
exit(err);   \
 } while (0)

+#definerte_cpu_get_flag_enabled(x) (0)
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.5.3

[dpdk-dev] [PATCH v3 00/18] ACL: New AVX2 classify method and several other enhancements.

v3 changes:
Applied review comments from Thomas:
- fix spelling errors reported by codespell.
- split last patch into two:
first to remove unused macros,
second to add some comments about ACL internal layout.

v2 changes:
- When built with compilers that don't support AVX2 instructions,
make rte_acl_classify_avx2() do nothing and return an error.
- Remove unneeded 'ifdef __AVX2__' in acl_run_avx2.*.
- Reorder the patches in the set, to keep RTE_LIBRTE_ACL_STANDALONE=y
always buildable.

This patch series contains several fixes and enhancements for the ACL library.
See complete list below.
Two main changes that are externally visible:
- Introduce new classify method:  RTE_ACL_CLASSIFY_AVX2.
It uses AVX2 instructions and 256 bit wide data types
to perform internal trie traversal.
That helps to increase classify() throughput.
This method is selected as the default one on CPUs that support AVX2.
- Introduce new field in the build config structure: max_size.
It specifies maximum size that internal RT structure for given context
can reach.
The purpose of that is to allow user to decide about space/performance trade-off
(faster classify() vs less space for RT internal structures)
for each given set of rules.

Konstantin Ananyev (18):
  fix compilation issues with RTE_LIBRTE_ACL_STANDALONE=y
  app/test: few small fixes for test_acl.c
  librte_acl: make data_indexes long enough to survive idle transitions.
  librte_acl: remove build phase heuristic with negative performance
effect.
  librte_acl: fix a bug at build phase that can cause matches being
overwritten.
  librte_acl: introduce DFA nodes compression (group64) for identical
entries.
  librte_acl: build/gen phase - simplify the way match nodes are
allocated.
  librte_acl: make scalar RT code to be more similar to vector one.
  librte_acl: a bit of RT code deduplication.
  EAL: introduce rte_ymm and relatives in rte_common_vect.h.
  librte_acl: add AVX2 as new rte_acl_classify() method
  test-acl: add ability to manually select RT method.
  librte_acl: Remove search_sse_2 and relatives.
  librte_acl: move lo/hi dwords shuffle out from calc_addr
  librte_acl: make calc_addr a define to deduplicate the code.
  librte_acl: introduce max_size into rte_acl_config.
  librte_acl: remove unused macros.
  librte_acl: add some comments about ACL internal layout.

 app/test-acl/main.c | 126 +++--
 app/test/test_acl.c |   8 +-
 examples/l3fwd-acl/main.c   |   3 +-
 examples/l3fwd/main.c   |   2 +-
 lib/librte_acl/Makefile |  18 +
 lib/librte_acl/acl.h|  58 ++-
 lib/librte_acl/acl_bld.c| 392 +++-
 lib/librte_acl/acl_gen.c| 268 +++
 lib/librte_acl/acl_run.h|   7 +-
 lib/librte_acl/acl_run_avx2.c   |  54 +++
 lib/librte_acl/acl_run_avx2.h   | 284 
 lib/librte_acl/acl_run_scalar.c |  65 ++-
 lib/librte_acl/acl_run_sse.c| 585 +---
 lib/librte_acl/acl_run_sse.h| 357 +++
 lib/librte_acl/acl_vect.h   | 132 +++---
 lib/librte_acl/rte_acl.c|  47 +-
 lib/librte_acl/rte_acl.h|   4 +
 lib/librte_acl/rte_acl_osdep_alone.h|  47 +-
 lib/librte_eal/common/include/rte_common_vect.h |  39 +-
 lib/librte_lpm/rte_lpm.h|   2 +-
 20 files changed, 1444 insertions(+), 1054 deletions(-)
 create mode 100644 lib/librte_acl/acl_run_avx2.c
 create mode 100644 lib/librte_acl/acl_run_avx2.h
 create mode 100644 lib/librte_acl/acl_run_sse.h

-- 
1.8.5.3

[dpdk-dev] [PATCH v3 08/18] librte_acl: make scalar RT code to be more similar to vector one.

Make classify_scalar behave in the same way as its vector counterpart:
move match check out of the inner loop, etc.
That makes scalar and vector code look more identical.
Plus it improves scalar code performance.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl_run_scalar.c | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/lib/librte_acl/acl_run_scalar.c b/lib/librte_acl/acl_run_scalar.c
index 40691ce..9935125 100644
--- a/lib/librte_acl/acl_run_scalar.c
+++ b/lib/librte_acl/acl_run_scalar.c
@@ -162,31 +162,34 @@ rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, 
const uint8_t **data,
transition0 = index_array[0];
transition1 = index_array[1];

+   while ((transition0 | transition1) & RTE_ACL_NODE_MATCH) {
+   transition0 = acl_match_check(transition0,
+   0, ctx, parms, &flows, resolve_priority_scalar);
+   transition1 = acl_match_check(transition1,
+   1, ctx, parms, &flows, resolve_priority_scalar);
+   }
+
while (flows.started > 0) {

input0 = GET_NEXT_4BYTES(parms, 0);
input1 = GET_NEXT_4BYTES(parms, 1);

for (n = 0; n < 4; n++) {
-   if (likely((transition0 & RTE_ACL_NODE_MATCH) == 0))
-   transition0 = scalar_transition(flows.trans,
-   transition0, (uint8_t)input0);

+   transition0 = scalar_transition(flows.trans,
+   transition0, (uint8_t)input0);
input0 >>= CHAR_BIT;

-   if (likely((transition1 & RTE_ACL_NODE_MATCH) == 0))
-   transition1 = scalar_transition(flows.trans,
-   transition1, (uint8_t)input1);
-
+   transition1 = scalar_transition(flows.trans,
+   transition1, (uint8_t)input1);
input1 >>= CHAR_BIT;
-
}
-   if ((transition0 | transition1) & RTE_ACL_NODE_MATCH) {
+
+   while ((transition0 | transition1) & RTE_ACL_NODE_MATCH) {
transition0 = acl_match_check(transition0,
0, ctx, parms, &flows, resolve_priority_scalar);
transition1 = acl_match_check(transition1,
1, ctx, parms, &flows, resolve_priority_scalar);
-
}
}
return 0;
-- 
1.8.5.3

[dpdk-dev] [PATCH v3 03/18] librte_acl: make data_indexes long enough to survive idle transitions.

Make data_indexes long enough to survive idle transitions.
That allows the match processing code to be simplified.
Also fix incorrect size calculations for data indexes.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl_bld.c | 5 +++--
 lib/librte_acl/acl_run.h | 4 
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
index d6e0c45..c5a674a 100644
--- a/lib/librte_acl/acl_bld.c
+++ b/lib/librte_acl/acl_bld.c
@@ -1948,7 +1948,7 @@ acl_set_data_indexes(struct rte_acl_ctx *ctx)
memcpy(ctx->data_indexes + ofs, ctx->trie[i].data_index,
n * sizeof(ctx->data_indexes[0]));
ctx->trie[i].data_index = ctx->data_indexes + ofs;
-   ofs += n;
+   ofs += RTE_ACL_MAX_FIELDS;
}
 }

@@ -1988,7 +1988,8 @@ rte_acl_build(struct rte_acl_ctx *ctx, const struct 
rte_acl_config *cfg)
/* allocate and fill run-time  structures. */
rc = rte_acl_gen(ctx, bcx.tries, bcx.bld_tries,
bcx.num_tries, bcx.cfg.num_categories,
-   RTE_ACL_IPV4VLAN_NUM * RTE_DIM(bcx.tries),
+   RTE_ACL_MAX_FIELDS * RTE_DIM(bcx.tries) *
+   sizeof(ctx->data_indexes[0]),
bcx.num_build_rules);
if (rc == 0) {

diff --git a/lib/librte_acl/acl_run.h b/lib/librte_acl/acl_run.h
index c191053..4c843c1 100644
--- a/lib/librte_acl/acl_run.h
+++ b/lib/librte_acl/acl_run.h
@@ -256,10 +256,6 @@ acl_match_check(uint64_t transition, int slot,

/* Fill the slot with the next trie or idle trie */
transition = acl_start_next_trie(flows, parms, slot, ctx);
-
-   } else if (transition == ctx->idle) {
-   /* reset indirection table for idle slots */
-   parms[slot].data_index = idle;
}

return transition;
-- 
1.8.5.3

[dpdk-dev] [PATCH v3 04/18] librte_acl: remove build phase heuristic with negative performance effect.

The current rule-wildness based heuristics can cause unnecessary splits of
the ruleset.
That might have a negative performance effect:
more tries to traverse, bigger RT tables.
After removing them, on some test-cases with big rulesets (~10K)
a ~50% speedup was observed.
No difference for smaller rulesets.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl_bld.c | 277 +--
 1 file changed, 97 insertions(+), 180 deletions(-)

diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
index c5a674a..8bf4a54 100644
--- a/lib/librte_acl/acl_bld.c
+++ b/lib/librte_acl/acl_bld.c
@@ -1539,11 +1539,9 @@ acl_calc_wildness(struct rte_acl_build_rule *head,
return 0;
 }

-static int
-acl_rule_stats(struct rte_acl_build_rule *head, struct rte_acl_config *config,
-   uint32_t *wild_limit)
+static void
+acl_rule_stats(struct rte_acl_build_rule *head, struct rte_acl_config *config)
 {
-   int min;
struct rte_acl_build_rule *rule;
uint32_t n, m, fields_deactivated = 0;
uint32_t start = 0, deactivate = 0;
@@ -1604,129 +1602,58 @@ acl_rule_stats(struct rte_acl_build_rule *head, struct 
rte_acl_config *config,

for (k = 0; k < config->num_fields; k++) {
if (tally[k][TALLY_DEACTIVATED] == 0) {
-   memcpy(&tally[l][0], &tally[k][0],
+   memmove(&tally[l][0], &tally[k][0],
TALLY_NUM * sizeof(tally[0][0]));
-   memcpy(&config->defs[l++],
+   memmove(&config->defs[l++],
&config->defs[k],
sizeof(struct rte_acl_field_def));
}
}
config->num_fields = l;
}
-
-   min = RTE_ACL_SINGLE_TRIE_SIZE;
-   if (config->num_fields == 2)
-   min *= 4;
-   else if (config->num_fields == 3)
-   min *= 3;
-   else if (config->num_fields == 4)
-   min *= 2;
-
-   if (tally[0][TALLY_0] < min)
-   return 0;
-   for (n = 0; n < config->num_fields; n++)
-   wild_limit[n] = 0;
-
-   /*
-* If trailing fields are 100% wild, group those together.
-* This allows the search length of the trie to be shortened.
-*/
-   for (n = 1; n < config->num_fields; n++) {
-
-   double rule_percentage = (double)tally[n][TALLY_DEPTH] /
-   tally[n][0];
-
-   if (rule_percentage > RULE_PERCENTAGE) {
-   /* if it crosses an input boundary then round up */
-   while (config->defs[n - 1].input_index ==
-   config->defs[n].input_index)
-   n++;
-
-   /* set the limit for selecting rules */
-   while (n < config->num_fields)
-   wild_limit[n++] = 100;
-
-   if (wild_limit[n - 1] == 100)
-   return 1;
-   }
-   }
-
-   /* look for the most wild that's 40% or more of the rules */
-   for (n = 1; n < config->num_fields; n++) {
-   for (m = TALLY_100; m > 0; m--) {
-
-   double rule_percentage = (double)tally[n][m] /
-   tally[n][0];
-
-   if (tally[n][TALLY_DEACTIVATED] == 0 &&
-   tally[n][TALLY_0] >
-   RTE_ACL_SINGLE_TRIE_SIZE &&
-   rule_percentage > NODE_PERCENTAGE &&
-   rule_percentage < 0.80) {
-   wild_limit[n] = wild_limits[m];
-   return 1;
-   }
-   }
-   }
-   return 0;
 }

 static int
-order(struct rte_acl_build_rule **insert, struct rte_acl_build_rule *rule)
+rule_cmp_wildness(struct rte_acl_build_rule *r1, struct rte_acl_build_rule *r2)
 {
uint32_t n;
-   struct rte_acl_build_rule *left = *insert;
-
-   if (left == NULL)
-   return 0;

-   for (n = 1; n < left->config->num_fields; n++) {
-   int field_index = left->config->defs[n].field_index;
+   for (n = 1; n < r1->config->num_fields; n++) {
+   int field_index = r1->config->defs[n].field_index;

-   if (left->wildness[field_index] != rule->wildness[field_index])
-   return (left->wildness[field_index] >=
-   rule->wildness[field_index]);
+   if (r1->wildness[field_index] != r2->wildness[field_index])
+   return (r1->wildness[field_index] -
+   r2->wildness[field_index]);
}
return 0;
 }

 static struct

[dpdk-dev] [PATCH v3 17/18] librte_acl: remove unused macros.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl.h | 1 -
 lib/librte_acl/acl_run.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
index 61b849a..217bab3 100644
--- a/lib/librte_acl/acl.h
+++ b/lib/librte_acl/acl.h
@@ -62,7 +62,6 @@ struct rte_acl_bitset {

 #defineRTE_ACL_NODE_DFA(0 << RTE_ACL_TYPE_SHIFT)
 #defineRTE_ACL_NODE_SINGLE (1U << RTE_ACL_TYPE_SHIFT)
-#defineRTE_ACL_NODE_QEXACT (2U << RTE_ACL_TYPE_SHIFT)
 #defineRTE_ACL_NODE_QRANGE (3U << RTE_ACL_TYPE_SHIFT)
 #defineRTE_ACL_NODE_MATCH  (4U << RTE_ACL_TYPE_SHIFT)
 #defineRTE_ACL_NODE_TYPE   (7U << RTE_ACL_TYPE_SHIFT)
diff --git a/lib/librte_acl/acl_run.h b/lib/librte_acl/acl_run.h
index 850bc81..b2fc42c 100644
--- a/lib/librte_acl/acl_run.h
+++ b/lib/librte_acl/acl_run.h
@@ -40,7 +40,6 @@
 #define MAX_SEARCHES_AVX16 16
 #define MAX_SEARCHES_SSE8  8
 #define MAX_SEARCHES_SSE4  4
-#define MAX_SEARCHES_SSE2  2
 #define MAX_SEARCHES_SCALAR2

 #define GET_NEXT_4BYTES(prm, idx)  \
-- 
1.8.5.3

[dpdk-dev] [PATCH v3 06/18] librte_acl: introduce DFA nodes compression (group64) for identical entries.

Introduce division of all 256 child transition entries
into 4 sub-groups (64 kids per group).
So 2 groups within the same node with identical children
can use one set of transition entries.
That allows compacting some DFA nodes and saving space in the RT table,
without any negative performance impact.
From what I've seen, the average space savings is ~20%.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl.h|  12 ++-
 lib/librte_acl/acl_gen.c| 195 
 lib/librte_acl/acl_run_scalar.c |  38 
 lib/librte_acl/acl_run_sse.c|  99 ++--
 4 files changed, 196 insertions(+), 148 deletions(-)

diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
index 102fa51..3f6ac79 100644
--- a/lib/librte_acl/acl.h
+++ b/lib/librte_acl/acl.h
@@ -47,6 +47,11 @@ extern"C" {
 #define RTE_ACL_DFA_MAXUINT8_MAX
 #define RTE_ACL_DFA_SIZE   (UINT8_MAX + 1)

+#defineRTE_ACL_DFA_GR64_SIZE   64
+#defineRTE_ACL_DFA_GR64_NUM(RTE_ACL_DFA_SIZE / 
RTE_ACL_DFA_GR64_SIZE)
+#defineRTE_ACL_DFA_GR64_BIT\
+   (CHAR_BIT * sizeof(uint32_t) / RTE_ACL_DFA_GR64_NUM)
+
 typedef int bits_t;

 #defineRTE_ACL_BIT_SET_SIZE((UINT8_MAX + 1) / (sizeof(bits_t) * 
CHAR_BIT))
@@ -100,8 +105,11 @@ struct rte_acl_node {
/* number of ranges (transitions w/ consecutive bits) */
int32_t id;
struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */
-   char transitions[RTE_ACL_QUAD_SIZE];
-   /* boundaries for ranged node */
+   union {
+   chartransitions[RTE_ACL_QUAD_SIZE];
+   /* boundaries for ranged node */
+   uint8_t dfa_gr64[RTE_ACL_DFA_GR64_NUM];
+   };
struct rte_acl_node *next;
/* free list link or pointer to duplicate node during merge */
struct rte_acl_node *prev;
diff --git a/lib/librte_acl/acl_gen.c b/lib/librte_acl/acl_gen.c
index b1f766b..c9b7839 100644
--- a/lib/librte_acl/acl_gen.c
+++ b/lib/librte_acl/acl_gen.c
@@ -43,13 +43,14 @@
 } while (0)

 struct acl_node_counters {
-   intmatch;
-   intmatch_used;
-   intsingle;
-   intquad;
-   intquad_vectors;
-   intdfa;
-   intsmallest_match;
+   int32_t match;
+   int32_t match_used;
+   int32_t single;
+   int32_t quad;
+   int32_t quad_vectors;
+   int32_t dfa;
+   int32_t dfa_gr64;
+   int32_t smallest_match;
 };

 struct rte_acl_indices {
@@ -61,24 +62,118 @@ struct rte_acl_indices {

 static void
 acl_gen_log_stats(const struct rte_acl_ctx *ctx,
-   const struct acl_node_counters *counts)
+   const struct acl_node_counters *counts,
+   const struct rte_acl_indices *indices)
 {
RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n"
"runtime memory footprint on socket %d:\n"
"single nodes/bytes used: %d/%zu\n"
-   "quad nodes/bytes used: %d/%zu\n"
-   "DFA nodes/bytes used: %d/%zu\n"
+   "quad nodes/vectors/bytes used: %d/%d/%zu\n"
+   "DFA nodes/group64/bytes used: %d/%d/%zu\n"
"match nodes/bytes used: %d/%zu\n"
"total: %zu bytes\n",
ctx->name, ctx->socket_id,
counts->single, counts->single * sizeof(uint64_t),
-   counts->quad, counts->quad_vectors * sizeof(uint64_t),
-   counts->dfa, counts->dfa * RTE_ACL_DFA_SIZE * sizeof(uint64_t),
+   counts->quad, counts->quad_vectors,
+   (indices->quad_index - indices->dfa_index) * sizeof(uint64_t),
+   counts->dfa, counts->dfa_gr64,
+   indices->dfa_index * sizeof(uint64_t),
counts->match,
counts->match * sizeof(struct rte_acl_match_results),
ctx->mem_sz);
 }

+static uint64_t
+acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index)
+{
+   uint64_t idx;
+   uint32_t i;
+
+   idx = 0;
+   for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
+   RTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM);
+   RTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout);
+   idx |= (i - node->dfa_gr64[i]) <<
+   (6 + RTE_ACL_DFA_GR64_BIT * i);
+   }
+
+   return idx << (CHAR_BIT * sizeof(index)) | index | node->node_type;
+}
+
+static void
+acl_dfa_fill_gr64(const struct rte_acl_node *node,
+   const uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE])
+{
+   uint32_t i;
+
+   for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
+   memcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE,
+   src + i * RTE_ACL_DFA_GR64_SIZE,
+   RTE_ACL_DF

[dpdk-dev] [PATCH v3 07/18] librte_acl: build/gen phase - simplify the way match nodes are allocated.

Right now we allocate indexes for all types of nodes, except MATCH,
at the 'gen final RT table' stage.
For MATCH type nodes we do it at the 'build temporary tree' stage.
This is totally unnecessary and makes the code more complex and error prone.
Rework the code so that MATCH indexes are allocated at the same stage
as all the others.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl.h |  3 +--
 lib/librte_acl/acl_bld.c |  4 +--
 lib/librte_acl/acl_gen.c | 69 ++--
 3 files changed, 34 insertions(+), 42 deletions(-)

diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
index 3f6ac79..96bb318 100644
--- a/lib/librte_acl/acl.h
+++ b/lib/librte_acl/acl.h
@@ -146,7 +146,6 @@ enum {
 struct rte_acl_trie {
uint32_ttype;
uint32_tcount;
-   int32_t smallest;  /* smallest rule in this trie */
uint32_troot_index;
const uint32_t *data_index;
uint32_tnum_data_indexes;
@@ -181,7 +180,7 @@ struct rte_acl_ctx {

 int rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
-   uint32_t num_categories, uint32_t data_index_sz, int match_num);
+   uint32_t num_categories, uint32_t data_index_sz);

 typedef int (*rte_acl_classify_t)
 (const struct rte_acl_ctx *, const uint8_t **, uint32_t *, uint32_t, uint32_t);
diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
index 22f7934..1fd59ee 100644
--- a/lib/librte_acl/acl_bld.c
+++ b/lib/librte_acl/acl_bld.c
@@ -1719,7 +1719,6 @@ acl_build_tries(struct acl_build_context *context,
context->tries[n].type = RTE_ACL_UNUSED_TRIE;
context->bld_tries[n].trie = NULL;
context->tries[n].count = 0;
-   context->tries[n].smallest = INT32_MAX;
}

context->tries[0].type = RTE_ACL_FULL_TRIE;
@@ -1906,8 +1905,7 @@ rte_acl_build(struct rte_acl_ctx *ctx, const struct 
rte_acl_config *cfg)
rc = rte_acl_gen(ctx, bcx.tries, bcx.bld_tries,
bcx.num_tries, bcx.cfg.num_categories,
RTE_ACL_MAX_FIELDS * RTE_DIM(bcx.tries) *
-   sizeof(ctx->data_indexes[0]),
-   bcx.num_build_rules + 1);
+   sizeof(ctx->data_indexes[0]));
if (rc == 0) {

/* set data indexes. */
diff --git a/lib/librte_acl/acl_gen.c b/lib/librte_acl/acl_gen.c
index c9b7839..d3def66 100644
--- a/lib/librte_acl/acl_gen.c
+++ b/lib/librte_acl/acl_gen.c
@@ -50,14 +50,14 @@ struct acl_node_counters {
int32_t quad_vectors;
int32_t dfa;
int32_t dfa_gr64;
-   int32_t smallest_match;
 };

 struct rte_acl_indices {
-   intdfa_index;
-   intquad_index;
-   intsingle_index;
-   intmatch_index;
+   int32_t dfa_index;
+   int32_t quad_index;
+   int32_t single_index;
+   int32_t match_index;
+   int32_t match_start;
 };

 static void
@@ -243,9 +243,9 @@ acl_count_fanout(struct rte_acl_node *node)
 /*
  * Determine the type of nodes and count each type
  */
-static int
+static void
 acl_count_trie_types(struct acl_node_counters *counts,
-   struct rte_acl_node *node, uint64_t no_match, int match, int force_dfa)
+   struct rte_acl_node *node, uint64_t no_match, int force_dfa)
 {
uint32_t n;
int num_ptrs;
@@ -253,16 +253,12 @@ acl_count_trie_types(struct acl_node_counters *counts,

/* skip if this node has been counted */
if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED)
-   return match;
+   return;

if (node->match_flag != 0 || node->num_ptrs == 0) {
counts->match++;
-   if (node->match_flag == -1)
-   node->match_flag = match++;
node->node_type = RTE_ACL_NODE_MATCH;
-   if (counts->smallest_match > node->match_flag)
-   counts->smallest_match = node->match_flag;
-   return match;
+   return;
}

num_ptrs = acl_count_fanout(node);
@@ -299,11 +295,9 @@ acl_count_trie_types(struct acl_node_counters *counts,
 */
for (n = 0; n < node->num_ptrs; n++) {
if (node->ptrs[n].ptr != NULL)
-   match = acl_count_trie_types(counts, node->ptrs[n].ptr,
-   no_match, match, 0);
+   acl_count_trie_types(counts, node->ptrs[n].ptr,
+   no_match, 0);
}
-
-   return match;
 }

 static void
@@ -400,9 +394,13 @@ acl_gen_node(struct rte_acl_node *node, uint64_t 
*node_array,
break;
case RTE_ACL_NODE_MATCH:
match = ((struct rte_acl_match_results *)
-

[dpdk-dev] [PATCH v3 09/18] librte_acl: a bit of RT code deduplication.

Move common check for input parameters up into rte_acl_classify_alg().

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl_run_scalar.c |  4 
 lib/librte_acl/acl_run_sse.c|  4 
 lib/librte_acl/rte_acl.c| 19 ---
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/lib/librte_acl/acl_run_scalar.c b/lib/librte_acl/acl_run_scalar.c
index 9935125..5be216c 100644
--- a/lib/librte_acl/acl_run_scalar.c
+++ b/lib/librte_acl/acl_run_scalar.c
@@ -147,10 +147,6 @@ rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, 
const uint8_t **data,
struct completion cmplt[MAX_SEARCHES_SCALAR];
struct parms parms[MAX_SEARCHES_SCALAR];

-   if (categories != 1 &&
-   ((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0)
-   return -EINVAL;
-
acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, num,
categories, ctx->trans_table);

diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c
index 576c92b..09e32be 100644
--- a/lib/librte_acl/acl_run_sse.c
+++ b/lib/librte_acl/acl_run_sse.c
@@ -572,10 +572,6 @@ int
 rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data,
uint32_t *results, uint32_t num, uint32_t categories)
 {
-   if (categories != 1 &&
-   ((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0)
-   return -EINVAL;
-
if (likely(num >= MAX_SEARCHES_SSE8))
return search_sse_8(ctx, data, results, num, categories);
else if (num >= MAX_SEARCHES_SSE4)
diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
index 547e6da..a16c4a4 100644
--- a/lib/librte_acl/rte_acl.c
+++ b/lib/librte_acl/rte_acl.c
@@ -76,20 +76,25 @@ rte_acl_init(void)
 }

 int
-rte_acl_classify(const struct rte_acl_ctx *ctx, const uint8_t **data,
-   uint32_t *results, uint32_t num, uint32_t categories)
-{
-   return classify_fns[ctx->alg](ctx, data, results, num, categories);
-}
-
-int
 rte_acl_classify_alg(const struct rte_acl_ctx *ctx, const uint8_t **data,
uint32_t *results, uint32_t num, uint32_t categories,
enum rte_acl_classify_alg alg)
 {
+   if (categories != 1 &&
+   ((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0)
+   return -EINVAL;
+
return classify_fns[alg](ctx, data, results, num, categories);
 }

+int
+rte_acl_classify(const struct rte_acl_ctx *ctx, const uint8_t **data,
+   uint32_t *results, uint32_t num, uint32_t categories)
+{
+   return rte_acl_classify_alg(ctx, data, results, num, categories,
+   ctx->alg);
+}
+
 struct rte_acl_ctx *
 rte_acl_find_existing(const char *name)
 {
-- 
1.8.5.3

[dpdk-dev] [PATCH v3 15/18] librte_acl: make calc_addr a define to deduplicate the code.

Vector code reorganisation/deduplication:
To avoid maintaining two nearly identical implementations of calc_addr()
(one for SSE, another for AVX2), replace it with a new macro that suits
both SSE and AVX2 code-paths.
Also remove the MM_* macros that are no longer needed.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl_run_avx2.h   |  87 +---
 lib/librte_acl/acl_run_sse.h| 178 
 lib/librte_acl/acl_vect.h   | 132 --
 lib/librte_eal/common/include/rte_common_vect.h |  12 ++
 4 files changed, 160 insertions(+), 249 deletions(-)

diff --git a/lib/librte_acl/acl_run_avx2.h b/lib/librte_acl/acl_run_avx2.h
index 1688c50..b01a46a 100644
--- a/lib/librte_acl/acl_run_avx2.h
+++ b/lib/librte_acl/acl_run_avx2.h
@@ -73,51 +73,19 @@ static const rte_ymm_t ymm_ones_16 = {
},
 };

-static inline __attribute__((always_inline)) ymm_t
-calc_addr_avx2(ymm_t index_mask, ymm_t next_input, ymm_t shuffle_input,
-   ymm_t ones_16, ymm_t tr_lo, ymm_t tr_hi)
-{
-   ymm_t in, node_type, r, t;
-   ymm_t dfa_msk, dfa_ofs, quad_ofs;
-   ymm_t addr;
-
-   const ymm_t range_base = _mm256_set_epi32(
-   0xff0c, 0xff08, 0xff04, 0xff00,
-   0xff0c, 0xff08, 0xff04, 0xff00);
-
-   t = _mm256_xor_si256(index_mask, index_mask);
-   in = _mm256_shuffle_epi8(next_input, shuffle_input);
-
-   /* Calc node type and node addr */
-   node_type = _mm256_andnot_si256(index_mask, tr_lo);
-   addr = _mm256_and_si256(index_mask, tr_lo);
-
-   /* DFA calculations. */
-
-   dfa_msk = _mm256_cmpeq_epi32(node_type, t);
-
-   r = _mm256_srli_epi32(in, 30);
-   r = _mm256_add_epi8(r, range_base);
-
-   t = _mm256_srli_epi32(in, 24);
-   r = _mm256_shuffle_epi8(tr_hi, r);
-
-   dfa_ofs = _mm256_sub_epi32(t, r);
-
-   /* QUAD/SINGLE caluclations. */
-
-   t = _mm256_cmpgt_epi8(in, tr_hi);
-   t = _mm256_sign_epi8(t, t);
-   t = _mm256_maddubs_epi16(t, t);
-   quad_ofs = _mm256_madd_epi16(t, ones_16);
-
-   /* blend DFA and QUAD/SINGLE. */
-   t = _mm256_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
-
-   addr = _mm256_add_epi32(addr, t);
-   return addr;
-}
+static const rte_ymm_t ymm_range_base = {
+   .u32 = {
+   0xff00, 0xff04, 0xff08, 0xff0c,
+   0xff00, 0xff04, 0xff08, 0xff0c,
+   },
+};

+/*
+ * Process 8 transitions in parallel.
+ * tr_lo contains low 32 bits for 8 transition.
+ * tr_hi contains high 32 bits for 8 transition.
+ * next_input contains up to 4 input bytes for 8 flows.
+ */
 static inline __attribute__((always_inline)) ymm_t
 transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t 
*tr_hi)
 {
@@ -126,8 +94,10 @@ transition8(ymm_t next_input, const uint64_t *trans, ymm_t 
*tr_lo, ymm_t *tr_hi)

tr = (const int32_t *)(uintptr_t)trans;

-   addr = calc_addr_avx2(ymm_index_mask.y, next_input, ymm_shuffle_input.y,
-   ymm_ones_16.y, *tr_lo, *tr_hi);
+   /* Calculate the address (array index) for all 8 transitions. */
+   ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input,
+   ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y,
+   *tr_lo, *tr_hi);

/* load lower 32 bits of 8 transactions at once. */
*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
@@ -140,6 +110,11 @@ transition8(ymm_t next_input, const uint64_t *trans, ymm_t 
*tr_lo, ymm_t *tr_hi)
return next_input;
 }

+/*
+ * Process matches for  8 flows.
+ * tr_lo contains low 32 bits for 8 transition.
+ * tr_hi contains high 32 bits for 8 transition.
+ */
 static inline void
 acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
struct parms *parms, struct acl_flow_data *flows, uint32_t slot,
@@ -155,6 +130,11 @@ acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
l0 = _mm256_castsi256_si128(*tr_lo);

for (i = 0; i != RTE_DIM(tr) / 2; i++) {
+
+   /*
+* Extract low 32bits of each transition.
+* That's enough to process the match.
+*/
tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);

@@ -167,12 +147,14 @@ acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
ctx, parms, flows, resolve_priority_sse);
}

+   /* Collect new transitions into 2 YMM registers. */
t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);

-   lo = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
-   hi = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
+   /* For each transition: put low 32 into tr_lo and high 32 into tr_hi */
+   ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);

+

[dpdk-dev] [PATCH v3 18/18] librte_acl: add some comments about ACL internal layout.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl.h | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
index 217bab3..4dadab5 100644
--- a/lib/librte_acl/acl.h
+++ b/lib/librte_acl/acl.h
@@ -68,6 +68,44 @@ struct rte_acl_bitset {
 #defineRTE_ACL_NODE_UNDEFINED  UINT32_MAX

 /*
+ * ACL RT structure is a set of multibit tries (with stride == 8)
+ * represented by an array of transitions. The next position is calculated
+ * based on the current position and the input byte.
+ * Each transition is 64 bit value with the following format:
+ * | node_type_specific : 32 | node_type : 3 | node_addr : 29 |
+ * For all node types except RTE_ACL_NODE_MATCH, node_addr is an index
+ * to the start of the node in the transitions array.
+ * Few different node types are used:
+ * RTE_ACL_NODE_MATCH:
+ * node_addr value is an index into an array that contains the return value
+ * and its priority for each category.
+ * Upper 32 bits of the transition value are not used for that node type.
+ * RTE_ACL_NODE_QRANGE:
+ * that node consists of up to 5 transitions.
+ * Upper 32 bits are interpreted as 4 signed character values which
+ * are ordered from smallest(INT8_MIN) to largest (INT8_MAX).
+ * These values define 5 ranges:
+ * INT8_MIN <= range[0]  <= ((int8_t *)&transition)[4]
+ * ((int8_t *)&transition)[4] < range[1] <= ((int8_t *)&transition)[5]
+ * ((int8_t *)&transition)[5] < range[2] <= ((int8_t *)&transition)[6]
+ * ((int8_t *)&transition)[6] < range[3] <= ((int8_t *)&transition)[7]
+ * ((int8_t *)&transition)[7] < range[4] <= INT8_MAX
+ * So for input byte value within range[i] i-th transition within that node
+ * will be used.
+ * RTE_ACL_NODE_SINGLE:
+ * always transitions to the same node regardless of the input value.
+ * RTE_ACL_NODE_DFA:
+ * that node consists of up to 256 transitions.
+ * In attempt to conserve space all transitions are divided into 4 consecutive
+ * groups, by 64 transitions per group:
+ * group64[i] contains transitions[i * 64, .. i * 64 + 63].
+ * Upper 32 bits are interpreted as 4 unsigned character values one per group,
+ * which contain index to the start of the given group within the node.
+ * So to calculate transition index within the node for given input byte value:
+ * input_byte - ((uint8_t *)&transition)[4 + input_byte / 64].
+ */
+
+/*
  * Structure of a node is a set of ptrs and each ptr has a bit map
  * of values associated with this transition.
  */
-- 
1.8.5.3

[dpdk-dev] [PATCH v3 12/18] test-acl: add ability to manually select RT method.

In test-acl, replace the command-line option "--scalar" with a new one:
"--alg=scalar|sse|avx2".
This allows the user to manually select the preferred classify() method.

Signed-off-by: Konstantin Ananyev 
---
 app/test-acl/main.c | 93 ++---
 1 file changed, 75 insertions(+), 18 deletions(-)

diff --git a/app/test-acl/main.c b/app/test-acl/main.c
index b3d4294..52f43c6 100644
--- a/app/test-acl/main.c
+++ b/app/test-acl/main.c
@@ -82,7 +82,7 @@
 #defineOPT_RULE_NUM"rulenum"
 #defineOPT_TRACE_NUM   "tracenum"
 #defineOPT_TRACE_STEP  "tracestep"
-#defineOPT_SEARCH_SCALAR   "scalar"
+#defineOPT_SEARCH_ALG  "alg"
 #defineOPT_BLD_CATEGORIES  "bldcat"
 #defineOPT_RUN_CATEGORIES  "runcat"
 #defineOPT_ITER_NUM"iter"
@@ -102,6 +102,26 @@ enum {
DUMP_MAX
 };

+struct acl_alg {
+   const char *name;
+   enum rte_acl_classify_alg alg;
+};
+
+static const struct acl_alg acl_alg[] = {
+   {
+   .name = "scalar",
+   .alg = RTE_ACL_CLASSIFY_SCALAR,
+   },
+   {
+   .name = "sse",
+   .alg = RTE_ACL_CLASSIFY_SSE,
+   },
+   {
+   .name = "avx2",
+   .alg = RTE_ACL_CLASSIFY_AVX2,
+   },
+};
+
 static struct {
const char *prgname;
const char *rule_file;
@@ -114,11 +134,11 @@ static struct {
uint32_ttrace_sz;
uint32_titer_num;
uint32_tverbose;
-   uint32_tscalar;
+   uint32_tipv6;
+   struct acl_alg  alg;
uint32_tused_traces;
void   *traces;
struct rte_acl_ctx *acx;
-   uint32_tipv6;
 } config = {
.bld_categories = 3,
.run_categories = 1,
@@ -127,6 +147,10 @@ static struct {
.trace_step = TRACE_STEP_DEF,
.iter_num = 1,
.verbose = DUMP_MAX,
+   .alg = {
+   .name = "default",
+   .alg = RTE_ACL_CLASSIFY_DEFAULT,
+   },
.ipv6 = 0
 };

@@ -774,13 +798,12 @@ acx_init(void)
if (config.acx == NULL)
rte_exit(rte_errno, "failed to create ACL context\n");

-   /* set default classify method to scalar for this context. */
-   if (config.scalar) {
-   ret = rte_acl_set_ctx_classify(config.acx,
-   RTE_ACL_CLASSIFY_SCALAR);
+   /* set default classify method for this context. */
+   if (config.alg.alg != RTE_ACL_CLASSIFY_DEFAULT) {
+   ret = rte_acl_set_ctx_classify(config.acx, config.alg.alg);
if (ret != 0)
-   rte_exit(ret, "failed to setup classify method "
-   "for ACL context\n");
+   rte_exit(ret, "failed to setup %s method "
+   "for ACL context\n", config.alg.name);
}

/* add ACL rules. */
@@ -809,7 +832,7 @@ acx_init(void)
 }

 static uint32_t
-search_ip5tuples_once(uint32_t categories, uint32_t step, int scalar)
+search_ip5tuples_once(uint32_t categories, uint32_t step, const char *alg)
 {
int ret;
uint32_t i, j, k, n, r;
@@ -847,7 +870,7 @@ search_ip5tuples_once(uint32_t categories, uint32_t step, 
int scalar)

dump_verbose(DUMP_SEARCH, stdout,
"%s(%u, %u, %s) returns %u\n", __func__,
-   categories, step, scalar != 0 ? "scalar" : "sse", i);
+   categories, step, alg, i);
return i;
 }

@@ -863,7 +886,7 @@ search_ip5tuples(__attribute__((unused)) void *arg)

for (i = 0; i != config.iter_num; i++) {
pkt += search_ip5tuples_once(config.run_categories,
-   config.trace_step, config.scalar);
+   config.trace_step, config.alg.name);
}

tm = rte_rdtsc() - start;
@@ -891,8 +914,40 @@ get_uint32_opt(const char *opt, const char *name, uint32_t 
min, uint32_t max)
 }

 static void
+get_alg_opt(const char *opt, const char *name)
+{
+   uint32_t i;
+
+   for (i = 0; i != RTE_DIM(acl_alg); i++) {
+   if (strcmp(opt, acl_alg[i].name) == 0) {
+   config.alg = acl_alg[i];
+   return;
+   }
+   }
+
+   rte_exit(-EINVAL, "invalid value: \"%s\" for option: %s\n",
+   opt, name);
+}
+
+static void
 print_usage(const char *prgname)
 {
+   uint32_t i, n, rc;
+   char buf[PATH_MAX];
+
+   n = 0;
+   buf[0] = 0;
+
+   for (i = 0; i < RTE_DIM(acl_alg) - 1; i++) {
+   rc = snprintf(buf + n, sizeof(buf) - n, "%s|",
+   acl_alg[i].name);
+   if (rc > sizeof(buf) - n)
+   break;
+   n += rc;
+   }
+
+   snprintf(buf + n, sizeof(buf) - n, "%s", acl_alg[i].name);
+

[dpdk-dev] [PATCH v3 16/18] librte_acl: introduce max_size into rte_acl_config.

If we don't do any trie splitting at the build phase,
then the temporary build structures and the resulting RT structure might be
much bigger than they are currently.
On the other hand, having just one trie instead of multiple can speed up
search quite significantly.
From my measurements on rule-sets with ~10K rules:
the RT table is up to 8 times bigger, and classify() is up to 80% faster
than with the current implementation.
To make it possible for the user to decide on the performance/space trade-off,
a new parameter for the build config structure (max_size) is introduced.
Setting it to a value greater than zero instructs rte_acl_build() to:
- make sure that the size of the RT table doesn't exceed the given value.
- attempt to minimise the number of tries in the table.
Setting it to zero maintains the current behaviour.
That introduces a minor change in the public API, but I think the possible
performance gain is too big to ignore.

Signed-off-by: Konstantin Ananyev 
---
 app/test-acl/main.c   |  33 
 examples/l3fwd-acl/main.c |   3 +-
 lib/librte_acl/acl.h  |   2 +-
 lib/librte_acl/acl_bld.c  | 134 +-
 lib/librte_acl/acl_gen.c  |  22 +---
 lib/librte_acl/rte_acl.c  |   1 +
 lib/librte_acl/rte_acl.h  |   2 +
 7 files changed, 131 insertions(+), 66 deletions(-)

diff --git a/app/test-acl/main.c b/app/test-acl/main.c
index 52f43c6..5e8db04 100644
--- a/app/test-acl/main.c
+++ b/app/test-acl/main.c
@@ -85,6 +85,7 @@
 #defineOPT_SEARCH_ALG  "alg"
 #defineOPT_BLD_CATEGORIES  "bldcat"
 #defineOPT_RUN_CATEGORIES  "runcat"
+#defineOPT_MAX_SIZE"maxsize"
 #defineOPT_ITER_NUM"iter"
 #defineOPT_VERBOSE "verbose"
 #defineOPT_IPV6"ipv6"
@@ -126,6 +127,7 @@ static struct {
const char *prgname;
const char *rule_file;
const char *trace_file;
+   size_t  max_size;
uint32_tbld_categories;
uint32_trun_categories;
uint32_tnb_rules;
@@ -780,6 +782,8 @@ acx_init(void)
FILE *f;
struct rte_acl_config cfg;

+   memset(&cfg, 0, sizeof(cfg));
+
/* setup ACL build config. */
if (config.ipv6) {
cfg.num_fields = RTE_DIM(ipv6_defs);
@@ -789,6 +793,7 @@ acx_init(void)
memcpy(&cfg.defs, ipv4_defs, sizeof(ipv4_defs));
}
cfg.num_categories = config.bld_categories;
+   cfg.max_size = config.max_size;

/* setup ACL creation parameters. */
prm.rule_size = RTE_ACL_RULE_SZ(cfg.num_fields);
@@ -899,8 +904,8 @@ search_ip5tuples(__attribute__((unused)) void *arg)
return 0;
 }

-static uint32_t
-get_uint32_opt(const char *opt, const char *name, uint32_t min, uint32_t max)
+static unsigned long
+get_ulong_opt(const char *opt, const char *name, size_t min, size_t max)
 {
unsigned long val;
char *end;
@@ -964,6 +969,9 @@ print_usage(const char *prgname)
"= "
"should be either 1 or multiple of %zu, "
"but not greater then %u]\n"
+   "[--" OPT_MAX_SIZE
+   "= "
+   "leave 0 for default behaviour]\n"
"[--" OPT_ITER_NUM "=]\n"
"[--" OPT_VERBOSE "=]\n"
"[--" OPT_SEARCH_ALG "=%s]\n"
@@ -984,6 +992,7 @@ dump_config(FILE *f)
fprintf(f, "%s:%u\n", OPT_TRACE_STEP, config.trace_step);
fprintf(f, "%s:%u\n", OPT_BLD_CATEGORIES, config.bld_categories);
fprintf(f, "%s:%u\n", OPT_RUN_CATEGORIES, config.run_categories);
+   fprintf(f, "%s:%zu\n", OPT_MAX_SIZE, config.max_size);
fprintf(f, "%s:%u\n", OPT_ITER_NUM, config.iter_num);
fprintf(f, "%s:%u\n", OPT_VERBOSE, config.verbose);
fprintf(f, "%s:%u(%s)\n", OPT_SEARCH_ALG, config.alg.alg,
@@ -1010,6 +1019,7 @@ get_input_opts(int argc, char **argv)
{OPT_TRACE_FILE, 1, 0, 0},
{OPT_TRACE_NUM, 1, 0, 0},
{OPT_RULE_NUM, 1, 0, 0},
+   {OPT_MAX_SIZE, 1, 0, 0},
{OPT_TRACE_STEP, 1, 0, 0},
{OPT_BLD_CATEGORIES, 1, 0, 0},
{OPT_RUN_CATEGORIES, 1, 0, 0},
@@ -1034,29 +1044,32 @@ get_input_opts(int argc, char **argv)
} else if (strcmp(lgopts[opt_idx].name, OPT_TRACE_FILE) == 0) {
config.trace_file = optarg;
} else if (strcmp(lgopts[opt_idx].name, OPT_RULE_NUM) == 0) {
-   config.nb_rules = get_uint32_opt(optarg,
+   config.nb_rules = get_ulong_opt(optarg,
lgopts[opt_idx].name, 1, RTE_ACL_MAX_INDEX + 1);
+   } else if (strcmp(lgopts[opt_idx].name, OPT_MAX_SIZE) == 0) {
+   config.max_size = get_ulong_opt(optarg,
+   lgopts[opt_idx].name, 0, SIZE_MAX);

[dpdk-dev] [PATCH v3 11/18] librte_acl: add AVX2 as new rte_acl_classify() method

v2 changes:
When built with compilers that don't support AVX2 instructions,
make rte_acl_classify_avx2() do nothing and return an error.
Remove unneeded 'ifdef __AVX2__' in acl_run_avx2.*.

Introduce new classify() method that uses AVX2 instructions.
From my measurements:
On HSW boards when processing >= 16 packets per call,
the AVX2 method outperforms its SSE counterpart by 10-25%
(depending on the ruleset).
At runtime, if librte_acl was built with a compiler that supports AVX2,
this method is selected as the default one on HW that supports AVX2.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/Makefile   |  18 ++
 lib/librte_acl/acl.h  |   4 +
 lib/librte_acl/acl_run.h  |   2 +-
 lib/librte_acl/acl_run_avx2.c |  54 +
 lib/librte_acl/acl_run_avx2.h | 301 +++
 lib/librte_acl/acl_run_sse.c  | 537 +-
 lib/librte_acl/acl_run_sse.h  | 533 +
 lib/librte_acl/rte_acl.c  |  27 +++
 lib/librte_acl/rte_acl.h  |   2 +
 9 files changed, 941 insertions(+), 537 deletions(-)
 create mode 100644 lib/librte_acl/acl_run_avx2.c
 create mode 100644 lib/librte_acl/acl_run_avx2.h
 create mode 100644 lib/librte_acl/acl_run_sse.h

diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
index 65e566d..6b74dc9 100644
--- a/lib/librte_acl/Makefile
+++ b/lib/librte_acl/Makefile
@@ -48,6 +48,24 @@ SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c

 CFLAGS_acl_run_sse.o += -msse4.1

+#
+# If the compiler supports AVX2 instructions,
+# then add support for AVX2 classify method.
+#
+
+CC_AVX2_SUPPORT=$(shell $(CC) -march=core-avx2 -dM -E - </dev/null 2>&1 | \
+grep -q AVX2 && echo 1)
+
+ifeq ($(CC_AVX2_SUPPORT), 1)
+   SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_avx2.c
+   CFLAGS_rte_acl.o += -DCC_AVX2_SUPPORT
+   ifeq ($(CC), icc)
+   CFLAGS_acl_run_avx2.o += -march=core-avx2
+   else
+   CFLAGS_acl_run_avx2.o += -mavx2
+   endif
+endif
+
 # install this header file
 SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include := rte_acl_osdep.h
 SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include += rte_acl.h
diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
index 96bb318..d33d7ad 100644
--- a/lib/librte_acl/acl.h
+++ b/lib/librte_acl/acl.h
@@ -196,6 +196,10 @@ int
 rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data,
uint32_t *results, uint32_t num, uint32_t categories);

+int
+rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
+   uint32_t *results, uint32_t num, uint32_t categories);
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
diff --git a/lib/librte_acl/acl_run.h b/lib/librte_acl/acl_run.h
index 4c843c1..850bc81 100644
--- a/lib/librte_acl/acl_run.h
+++ b/lib/librte_acl/acl_run.h
@@ -35,9 +35,9 @@
 #define_ACL_RUN_H_

 #include 
-#include "acl_vect.h"
 #include "acl.h"

+#define MAX_SEARCHES_AVX16 16
 #define MAX_SEARCHES_SSE8  8
 #define MAX_SEARCHES_SSE4  4
 #define MAX_SEARCHES_SSE2  2
diff --git a/lib/librte_acl/acl_run_avx2.c b/lib/librte_acl/acl_run_avx2.c
new file mode 100644
index 000..0a42f72
--- /dev/null
+++ b/lib/librte_acl/acl_run_avx2.c
@@ -0,0 +1,54 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "acl_run_avx2.h"
+
+/*
+ * Note, that to be

[dpdk-dev] [PATCH v3 14/18] librte_acl: move lo/hi dwords shuffle out from calc_addr

Reorganise the SSE code-path a bit by moving the lo/hi dwords shuffle
out of calc_addr().
That makes calc_addr() for SSE and AVX2 practically identical
and opens an opportunity for further code deduplication.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl_run_sse.h | 38 --
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h
index 1b7870e..4a174e9 100644
--- a/lib/librte_acl/acl_run_sse.h
+++ b/lib/librte_acl/acl_run_sse.h
@@ -172,9 +172,9 @@ acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, 
struct parms *parms,
  */
 static inline __attribute__((always_inline)) xmm_t
 calc_addr_sse(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
-   xmm_t ones_16, xmm_t indices1, xmm_t indices2)
+   xmm_t ones_16, xmm_t tr_lo, xmm_t tr_hi)
 {
-   xmm_t addr, node_types, range, temp;
+   xmm_t addr, node_types;
xmm_t dfa_msk, dfa_ofs, quad_ofs;
xmm_t in, r, t;

@@ -187,18 +187,14 @@ calc_addr_sse(xmm_t index_mask, xmm_t next_input, xmm_t 
shuffle_input,
 * it reaches a match.
 */

-   /* Shuffle low 32 into temp and high 32 into indices2 */
-   temp = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0x88);
-   range = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0xdd);
-
t = MM_XOR(index_mask, index_mask);

/* shuffle input byte to all 4 positions of 32 bit value */
in = MM_SHUFFLE8(next_input, shuffle_input);

/* Calc node type and node addr */
-   node_types = MM_ANDNOT(index_mask, temp);
-   addr = MM_AND(index_mask, temp);
+   node_types = MM_ANDNOT(index_mask, tr_lo);
+   addr = MM_AND(index_mask, tr_lo);

/*
 * Calc addr for DFAs - addr = dfa_index + input_byte
@@ -211,7 +207,7 @@ calc_addr_sse(xmm_t index_mask, xmm_t next_input, xmm_t 
shuffle_input,
r = _mm_add_epi8(r, range_base);

t = _mm_srli_epi32(in, 24);
-   r = _mm_shuffle_epi8(range, r);
+   r = _mm_shuffle_epi8(tr_hi, r);

dfa_ofs = _mm_sub_epi32(t, r);

@@ -224,22 +220,22 @@ calc_addr_sse(xmm_t index_mask, xmm_t next_input, xmm_t 
shuffle_input,
 */

/* check ranges */
-   temp = MM_CMPGT8(in, range);
+   t = MM_CMPGT8(in, tr_hi);

/* convert -1 to 1 (bytes greater than input byte */
-   temp = MM_SIGN8(temp, temp);
+   t = MM_SIGN8(t, t);

/* horizontal add pairs of bytes into words */
-   temp = MM_MADD8(temp, temp);
+   t = MM_MADD8(t, t);

/* horizontal add pairs of words into dwords */
-   quad_ofs = MM_MADD16(temp, ones_16);
+   quad_ofs = MM_MADD16(t, ones_16);

-   /* mask to range type nodes */
-   temp = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
+   /* blend DFA and QUAD/SINGLE. */
+   t = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);

/* add index into node position */
-   return MM_ADD32(addr, temp);
+   return MM_ADD32(addr, t);
 }

 /*
@@ -249,13 +245,19 @@ static inline __attribute__((always_inline)) xmm_t
 transition4(xmm_t next_input, const uint64_t *trans,
xmm_t *indices1, xmm_t *indices2)
 {
-   xmm_t addr;
+   xmm_t addr, tr_lo, tr_hi;
uint64_t trans0, trans2;

+   /* Shuffle low 32 into tr_lo and high 32 into tr_hi */
+   tr_lo = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2,
+   0x88);
+   tr_hi = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2,
+   0xdd);
+
 /* Calculate the address (array index) for all 4 transitions. */

addr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x,
-   xmm_ones_16.x, *indices1, *indices2);
+   xmm_ones_16.x, tr_lo, tr_hi);

 /* Gather 64 bit transitions and pack back into 2 registers. */

-- 
1.8.5.3

[dpdk-dev] [PATCH v3 02/18] app/test: few small fixes for test_acl.c

Make sure that test_acl would not ignore error conditions.
Run classify() with all possible values.

Signed-off-by: Konstantin Ananyev 
---
 app/test/test_acl.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/app/test/test_acl.c b/app/test/test_acl.c
index 356d620..7119ad3 100644
--- a/app/test/test_acl.c
+++ b/app/test/test_acl.c
@@ -111,7 +111,7 @@ test_classify_run(struct rte_acl_ctx *acx)
 * these will run quite a few times, it's necessary to test code paths
 * from num=0 to num>8
 */
-   for (count = 0; count < RTE_DIM(acl_test_data); count++) {
+   for (count = 0; count <= RTE_DIM(acl_test_data); count++) {
ret = rte_acl_classify(acx, data, results,
count, RTE_ACL_MAX_CATEGORIES);
if (ret != 0) {
@@ -128,6 +128,7 @@ test_classify_run(struct rte_acl_ctx *acx)
"(expected %"PRIu32" got %"PRIu32")!\n",
__LINE__, i, acl_test_data[i].allow,
result);
+   ret = -EINVAL;
goto err;
}
}
@@ -140,6 +141,7 @@ test_classify_run(struct rte_acl_ctx *acx)
"(expected %"PRIu32" got %"PRIu32")!\n",
__LINE__, i, acl_test_data[i].deny,
result);
+   ret = -EINVAL;
goto err;
}
}
@@ -150,7 +152,7 @@ test_classify_run(struct rte_acl_ctx *acx)
RTE_DIM(acl_test_data), RTE_ACL_MAX_CATEGORIES,
RTE_ACL_CLASSIFY_SCALAR);
if (ret != 0) {
-   printf("Line %i: SSE classify failed!\n", __LINE__);
+   printf("Line %i: scalar classify failed!\n", __LINE__);
goto err;
}

@@ -162,6 +164,7 @@ test_classify_run(struct rte_acl_ctx *acx)
"(expected %"PRIu32" got %"PRIu32")!\n",
__LINE__, i, acl_test_data[i].allow,
result);
+   ret = -EINVAL;
goto err;
}
}
@@ -174,6 +177,7 @@ test_classify_run(struct rte_acl_ctx *acx)
"(expected %"PRIu32" got %"PRIu32")!\n",
__LINE__, i, acl_test_data[i].deny,
result);
+   ret = -EINVAL;
goto err;
}
}
-- 
1.8.5.3

[dpdk-dev] [PATCH v3 05/18] librte_acl: fix a bug at build phase that can cause matches being overwritten.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl_bld.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
index 8bf4a54..22f7934 100644
--- a/lib/librte_acl/acl_bld.c
+++ b/lib/librte_acl/acl_bld.c
@@ -1907,7 +1907,7 @@ rte_acl_build(struct rte_acl_ctx *ctx, const struct 
rte_acl_config *cfg)
bcx.num_tries, bcx.cfg.num_categories,
RTE_ACL_MAX_FIELDS * RTE_DIM(bcx.tries) *
sizeof(ctx->data_indexes[0]),
-   bcx.num_build_rules);
+   bcx.num_build_rules + 1);
if (rc == 0) {

/* set data indexes. */
-- 
1.8.5.3

[dpdk-dev] [PATCH v3 13/18] librte_acl: Remove search_sse_2 and relatives.

Previous improvements made the scalar method the fastest one
for a tiny bunch of packets (< 4).
That allows us to remove the specific vector code-path for a small number of
packets (search_sse_2)
and always use the scalar method for such cases.

Signed-off-by: Konstantin Ananyev 
---
 lib/librte_acl/acl_run_avx2.c |   2 +-
 lib/librte_acl/acl_run_sse.c  |   3 +-
 lib/librte_acl/acl_run_sse.h  | 110 --
 3 files changed, 3 insertions(+), 112 deletions(-)

diff --git a/lib/librte_acl/acl_run_avx2.c b/lib/librte_acl/acl_run_avx2.c
index 0a42f72..79ebbd6 100644
--- a/lib/librte_acl/acl_run_avx2.c
+++ b/lib/librte_acl/acl_run_avx2.c
@@ -49,6 +49,6 @@ rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const 
uint8_t **data,
else if (num >= MAX_SEARCHES_SSE4)
return search_sse_4(ctx, data, results, num, categories);
else
-   return search_sse_2(ctx, data, results, num,
+   return rte_acl_classify_scalar(ctx, data, results, num,
categories);
 }
diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c
index 77b32b3..a5a7d36 100644
--- a/lib/librte_acl/acl_run_sse.c
+++ b/lib/librte_acl/acl_run_sse.c
@@ -42,5 +42,6 @@ rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const 
uint8_t **data,
else if (num >= MAX_SEARCHES_SSE4)
return search_sse_4(ctx, data, results, num, categories);
else
-   return search_sse_2(ctx, data, results, num, categories);
+   return rte_acl_classify_scalar(ctx, data, results, num,
+   categories);
 }
diff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h
index e33e16b..1b7870e 100644
--- a/lib/librte_acl/acl_run_sse.h
+++ b/lib/librte_acl/acl_run_sse.h
@@ -45,10 +45,6 @@ static const rte_xmm_t xmm_shuffle_input = {
.u32 = {0x, 0x04040404, 0x08080808, 0x0c0c0c0c},
 };

-static const rte_xmm_t xmm_shuffle_input64 = {
-   .u32 = {0x, 0x04040404, 0x80808080, 0x80808080},
-};
-
 static const rte_xmm_t xmm_ones_16 = {
.u16 = {1, 1, 1, 1, 1, 1, 1, 1},
 };
@@ -62,15 +58,6 @@ static const rte_xmm_t xmm_match_mask = {
},
 };

-static const rte_xmm_t xmm_match_mask64 = {
-   .u32 = {
-   RTE_ACL_NODE_MATCH,
-   0,
-   RTE_ACL_NODE_MATCH,
-   0,
-   },
-};
-
 static const rte_xmm_t xmm_index_mask = {
.u32 = {
RTE_ACL_NODE_INDEX,
@@ -80,16 +67,6 @@ static const rte_xmm_t xmm_index_mask = {
},
 };

-static const rte_xmm_t xmm_index_mask64 = {
-   .u32 = {
-   RTE_ACL_NODE_INDEX,
-   RTE_ACL_NODE_INDEX,
-   0,
-   0,
-   },
-};
-
-
 /*
  * Resolve priority for multiple results (sse version).
  * This consists comparing the priority of the current traversal with the
@@ -161,22 +138,6 @@ acl_process_matches(xmm_t *indices, int slot, const struct 
rte_acl_ctx *ctx,
 }

 /*
- * Check for a match in 2 transitions (contained in SSE register)
- */
-static inline __attribute__((always_inline)) void
-acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms 
*parms,
-   struct acl_flow_data *flows, xmm_t *indices, xmm_t match_mask)
-{
-   xmm_t temp;
-
-   temp = MM_AND(match_mask, *indices);
-   while (!MM_TESTZ(temp, temp)) {
-   acl_process_matches(indices, slot, ctx, parms, flows);
-   temp = MM_AND(match_mask, *indices);
-   }
-}
-
-/*
  * Check for any match in 4 transitions (contained in 2 SSE registers)
  */
 static inline __attribute__((always_inline)) void
@@ -460,74 +421,3 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t 
**data,

return 0;
 }
-
-static inline __attribute__((always_inline)) xmm_t
-transition2(xmm_t next_input, const uint64_t *trans, xmm_t *indices1)
-{
-   uint64_t t;
-   xmm_t addr, indices2;
-
-   indices2 = _mm_setzero_si128();
-
-   addr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x,
-   xmm_ones_16.x, *indices1, indices2);
-
-   /* Gather 64 bit transitions and pack 2 per register. */
-
-   t = trans[MM_CVT32(addr)];
-
-   /* get slot 1 */
-   addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
-   *indices1 = MM_SET64(trans[MM_CVT32(addr)], t);
-
-   return MM_SRL32(next_input, CHAR_BIT);
-}
-
-/*
- * Execute trie traversal with 2 traversals in parallel.
- */
-static inline int
-search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
-   uint32_t *results, uint32_t total_packets, uint32_t categories)
-{
-   int n;
-   struct acl_flow_data flows;
-   uint64_t index_array[MAX_SEARCHES_SSE2];
-   struct completion cmplt[MAX_SEARCHES_SSE2];
-   struct parms parms[MAX_SEARCHES_SSE2];
-   xmm_t input, indices;
-
-   acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
-   total_packets, cate

[dpdk-dev] [PATCH v3 10/18] EAL: introduce rte_ymm and relatives in rte_common_vect.h.

New data type to manipulate 256 bit AVX values.
Rename field in the rte_xmm to keep common naming across SSE/AVX fields.

Signed-off-by: Konstantin Ananyev 
---
 examples/l3fwd/main.c   |  2 +-
 lib/librte_acl/acl_run_sse.c| 88 -
 lib/librte_acl/rte_acl_osdep_alone.h| 35 +-
 lib/librte_eal/common/include/rte_common_vect.h | 27 +++-
 lib/librte_lpm/rte_lpm.h|  2 +-
 5 files changed, 104 insertions(+), 50 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 918f2cb..6f7d7d4 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -1170,7 +1170,7 @@ processx4_step2(const struct lcore_conf *qconf, __m128i 
dip, uint32_t flag,
if (likely(flag != 0)) {
rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dprt, portid);
} else {
-   dst.m = dip;
+   dst.x = dip;
dprt[0] = get_dst_port(qconf, pkt[0], dst.u32[0], portid);
dprt[1] = get_dst_port(qconf, pkt[1], dst.u32[1], portid);
dprt[2] = get_dst_port(qconf, pkt[2], dst.u32[2], portid);
diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c
index 09e32be..4605b58 100644
--- a/lib/librte_acl/acl_run_sse.c
+++ b/lib/librte_acl/acl_run_sse.c
@@ -359,16 +359,16 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t 
**data,

 /* Check for any matches. */
acl_match_check_x4(0, ctx, parms, &flows,
-   &indices1, &indices2, mm_match_mask.m);
+   &indices1, &indices2, mm_match_mask.x);
acl_match_check_x4(4, ctx, parms, &flows,
-   &indices3, &indices4, mm_match_mask.m);
+   &indices3, &indices4, mm_match_mask.x);

while (flows.started > 0) {

/* Gather 4 bytes of input data for each stream. */
-   input0 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0),
+   input0 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0),
0);
-   input1 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 4),
+   input1 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 4),
0);

input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
@@ -382,43 +382,43 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t 
**data,

 /* Process the 4 bytes of input on each stream. */

-   input0 = transition4(mm_index_mask.m, input0,
-   mm_shuffle_input.m, mm_ones_16.m,
+   input0 = transition4(mm_index_mask.x, input0,
+   mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);

-   input1 = transition4(mm_index_mask.m, input1,
-   mm_shuffle_input.m, mm_ones_16.m,
+   input1 = transition4(mm_index_mask.x, input1,
+   mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices3, &indices4);

-   input0 = transition4(mm_index_mask.m, input0,
-   mm_shuffle_input.m, mm_ones_16.m,
+   input0 = transition4(mm_index_mask.x, input0,
+   mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);

-   input1 = transition4(mm_index_mask.m, input1,
-   mm_shuffle_input.m, mm_ones_16.m,
+   input1 = transition4(mm_index_mask.x, input1,
+   mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices3, &indices4);

-   input0 = transition4(mm_index_mask.m, input0,
-   mm_shuffle_input.m, mm_ones_16.m,
+   input0 = transition4(mm_index_mask.x, input0,
+   mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);

-   input1 = transition4(mm_index_mask.m, input1,
-   mm_shuffle_input.m, mm_ones_16.m,
+   input1 = transition4(mm_index_mask.x, input1,
+   mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices3, &indices4);

-   input0 = transition4(mm_index_mask.m, input0,
-   mm_shuffle_input.m, mm_ones_16.m,
+   input0 = transition4(mm_index_mask.x, input0,
+   mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices1, &indices2);

-   input1 = transition4(mm_index_mask.m, input1,
-   mm_shuffle_input.m, mm_ones_16.m,
+   input1 = transition4(mm_index_mask.x, input1,
+   mm_shuffle_input.x, mm_ones_16.x,
flows.trans, &indices3, &indices4);

 /* Check for any matches. */

[dpdk-dev] [PATCH] eal/common: Fix enabled core number with core list argument

2015-01-20 Thread Remi Pommarel

When using the core list argument to define which cores to enable (i.e. -l), the
lcore_count field of the rte configuration is not updated the same way as when
using a coremask. This causes rte_lcore_count() to yield a different value from
the one obtained when using a coremask.

Signed-off-by: Remi Pommarel 
---
 lib/librte_eal/common/eal_common_options.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_options.c 
b/lib/librte_eal/common/eal_common_options.c
index e2810ab..67e02dc 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -268,6 +268,9 @@ eal_parse_corelist(const char *corelist)
if (count == 0)
return -1;

+   /* Update the count of enabled logical cores of the EAL configuration */
+   cfg->lcore_count = count;
+
lcores_parsed = 1;
return 0;
 }
-- 
2.0.1

[dpdk-dev] [PATCH] pcap: Fix ethernet device's name for pcap port

2015-01-20 Thread Remi Pommarel

The Ethernet device's data should contain the virtual device name for a pcap port.
This name is correctly set by rte_eth_dev_allocate() at initialization time,
but it is immediately lost.

Signed-off-by: Remi Pommarel 
---
 lib/librte_pmd_pcap/rte_eth_pcap.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/librte_pmd_pcap/rte_eth_pcap.c 
b/lib/librte_pmd_pcap/rte_eth_pcap.c
index f12d1e7..aa01464 100644
--- a/lib/librte_pmd_pcap/rte_eth_pcap.c
+++ b/lib/librte_pmd_pcap/rte_eth_pcap.c
@@ -735,6 +735,7 @@ rte_pmd_init_internals(const char *name, const unsigned 
nb_rx_queues,

data->dev_private = *internals;
data->port_id = (*eth_dev)->data->port_id;
+   snprintf(data->name, sizeof(data->name), "%s", (*eth_dev)->data->name);
data->nb_rx_queues = (uint16_t)nb_rx_queues;
data->nb_tx_queues = (uint16_t)nb_tx_queues;
data->dev_link = pmd_link;
-- 
2.0.1

[dpdk-dev] [PATCH 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms

On Tue, Jan 20, 2015 at 09:15:38AM -0800, Stephen Hemminger wrote:
> On Mon, 19 Jan 2015 09:53:34 +0800
> zhihong.wang at intel.com wrote:
> 
> > Main code changes:
> > 
> > 1. Differentiate architectural features based on CPU flags
> > 
> > a. Implement separated move functions for SSE/AVX/AVX2 to make full 
> > utilization of cache bandwidth
> > 
> > b. Implement separated copy flow specifically optimized for target 
> > architecture
> > 
> > 2. Rewrite the memcpy function "rte_memcpy"
> > 
> > a. Add store aligning
> > 
> > b. Add load aligning based on architectural features
> > 
> > c. Put block copy loop into inline move functions for better control of 
> > instruction order
> > 
> > d. Eliminate unnecessary MOVs
> > 
> > 3. Rewrite the inline move functions
> > 
> > a. Add move functions for unaligned load cases
> > 
> > b. Change instruction order in copy loops for better pipeline 
> > utilization
> > 
> > c. Use intrinsics instead of assembly code
> > 
> > 4. Remove slow glibc call for constant copies
> > 
> > Signed-off-by: Zhihong Wang 
> 
> Dumb question: why not fix glibc memcpy instead?
> What is special about rte_memcpy?
> 
> 
Fair point.  Though, does glibc implement optimized memcpys per arch?  Or do
they just rely on the __builtin's from gcc to get optimized variants?

Neil

[dpdk-dev] [PATCH] vhost: Add -lfuse into the LDFLAGS list

The vhost library relies on libfuse, and that's included when we do a normal
shared object build, but when we specify combined libs, it gets left out.  Add
it back in.

Signed-off-by: Neil Horman 
---
 mk/rte.app.mk | 4 
 1 file changed, 4 insertions(+)

diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index e1a0dbf..86d9865 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -127,6 +127,10 @@ ifeq ($(CONFIG_RTE_LIBRTE_PMD_PCAP),y)
 LDLIBS += -lpcap
 endif

+ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
+LDLIBS += -lfuse
+endif
+
 LDLIBS += --start-group

 ifeq ($(CONFIG_RTE_BUILD_COMBINE_LIBS),n)
-- 
2.1.0

[dpdk-dev] [PATCH v6 1/4] compat: Add infrastructure to support symbol versioning

Add initial pass header files to support symbol versioning.

Signed-off-by: Neil Horman 
CC: Thomas Monjalon 
CC: "Richardson, Bruce" 
CC: "Gonzalez Monroy, Sergio" 

---
Change Notes:
V2)
Moved ifeq to _INSTALL target

V3)
Undo V2 changes and make librte_compat use the rte.install.mk file
instead

v4)
changed --version-script to accept SRCDIR in this patch as per request
documented versioning macros
cleaned up macro parameter consistency
converted SA macro to RTE_STR macro
fixed copyright
---
 lib/Makefile   |   1 +
 lib/librte_compat/Makefile |  38 +
 lib/librte_compat/rte_compat.h | 117 +
 mk/rte.lib.mk  |   4 ++
 4 files changed, 160 insertions(+)
 create mode 100644 lib/librte_compat/Makefile
 create mode 100644 lib/librte_compat/rte_compat.h

diff --git a/lib/Makefile b/lib/Makefile
index 0ffc982..d617d81 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -31,6 +31,7 @@

 include $(RTE_SDK)/mk/rte.vars.mk

+DIRS-y += librte_compat
 DIRS-$(CONFIG_RTE_LIBRTE_EAL) += librte_eal
 DIRS-$(CONFIG_RTE_LIBRTE_MALLOC) += librte_malloc
 DIRS-$(CONFIG_RTE_LIBRTE_RING) += librte_ring
diff --git a/lib/librte_compat/Makefile b/lib/librte_compat/Makefile
new file mode 100644
index 000..0bab870
--- /dev/null
+++ b/lib/librte_compat/Makefile
@@ -0,0 +1,38 @@
+#   BSD LICENSE
+#
+#   Copyright(c) 2013 Neil Horman 
+#   All rights reserved.
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions
+#   are met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in
+#   the documentation and/or other materials provided with the
+#   distribution.
+# * Neither the name of Intel Corporation nor the names of its
+#   contributors may be used to endorse or promote products derived
+#   from this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+
+# install includes
+SYMLINK-y-include := rte_compat.h
+
+include $(RTE_SDK)/mk/rte.install.mk
diff --git a/lib/librte_compat/rte_compat.h b/lib/librte_compat/rte_compat.h
new file mode 100644
index 000..d7cc176
--- /dev/null
+++ b/lib/librte_compat/rte_compat.h
@@ -0,0 +1,117 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010 Neil Horman .
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF TH

[dpdk-dev] [PATCH v6 2/4] Provide initial versioning for all DPDK libraries

Add linker version script files to each DPDK library to put a stake in the
ground from which we can start cleaning up APIs.

Signed-off-by: Neil Horman 
CC: Thomas Monjalon 
CC: "Richardson, Bruce" 

---
Change Notes:

v2)
* Updated export map to not require full path
---
 lib/librte_acl/Makefile|   2 +
 lib/librte_acl/rte_acl_version.map |  21 
 lib/librte_cfgfile/Makefile|   2 +
 lib/librte_cfgfile/rte_cfgfile_version.map |  14 +++
 lib/librte_cmdline/Makefile|   2 +
 lib/librte_cmdline/rte_cmdline_version.map |  69 +
 lib/librte_distributor/Makefile|   2 +
 lib/librte_distributor/rte_distributor_version.map |  16 +++
 lib/librte_eal/bsdapp/eal/Makefile |   2 +
 lib/librte_eal/bsdapp/eal/rte_eal_version.map  |  90 
 lib/librte_eal/linuxapp/eal/Makefile   |   2 +
 lib/librte_eal/linuxapp/eal/rte_eal_version.map|  90 
 lib/librte_ether/Makefile  |   2 +
 lib/librte_ether/rte_ether_version.map | 113 +
 lib/librte_hash/Makefile   |   2 +
 lib/librte_hash/rte_hash_version.map   |  18 
 lib/librte_ip_frag/Makefile|   2 +
 lib/librte_ip_frag/rte_ipfrag_version.map  |  14 +++
 lib/librte_ivshmem/Makefile|   2 +
 lib/librte_ivshmem/rte_ivshmem_version.map |  13 +++
 lib/librte_kni/Makefile|   2 +
 lib/librte_kni/rte_kni_version.map |  20 
 lib/librte_kvargs/Makefile |   2 +
 lib/librte_kvargs/rte_kvargs_version.map   |  10 ++
 lib/librte_lpm/Makefile|   2 +
 lib/librte_lpm/rte_lpm_version.map |  24 +
 lib/librte_malloc/Makefile |   2 +
 lib/librte_malloc/rte_malloc_version.map   |  19 
 lib/librte_mbuf/Makefile   |   2 +
 lib/librte_mbuf/rte_mbuf_version.map   |  14 +++
 lib/librte_mempool/Makefile|   2 +
 lib/librte_mempool/rte_mempool_version.map |  18 
 lib/librte_meter/Makefile  |   2 +
 lib/librte_meter/rte_meter_version.map |  13 +++
 lib/librte_pipeline/Makefile   |   2 +
 lib/librte_pipeline/rte_pipeline_version.map   |  23 +
 lib/librte_pmd_af_packet/Makefile  |   2 +
 .../rte_pmd_af_packet_version.map  |   7 ++
 lib/librte_pmd_bond/Makefile   |   2 +
 lib/librte_pmd_bond/rte_eth_bond_version.map   |  21 
 lib/librte_pmd_e1000/Makefile  |   2 +
 lib/librte_pmd_e1000/rte_pmd_e1000_version.map |   5 +
 lib/librte_pmd_enic/Makefile   |   2 +
 lib/librte_pmd_enic/rte_pmd_enic_version.map   |   5 +
 lib/librte_pmd_i40e/Makefile   |   2 +
 lib/librte_pmd_i40e/rte_pmd_i40e_version.map   |   5 +
 lib/librte_pmd_ixgbe/Makefile  |   2 +
 lib/librte_pmd_ixgbe/rte_pmd_ixgbe_version.map |   5 +
 lib/librte_pmd_pcap/Makefile   |   2 +
 lib/librte_pmd_pcap/rte_pmd_pcap_version.map   |   5 +
 lib/librte_pmd_ring/Makefile   |   2 +
 lib/librte_pmd_ring/rte_eth_ring.c |   2 +-
 lib/librte_pmd_ring/rte_eth_ring.h |   6 --
 lib/librte_pmd_ring/rte_eth_ring_version.map   |  10 ++
 lib/librte_pmd_virtio/Makefile |   1 +
 lib/librte_pmd_virtio/rte_pmd_virtio_version.map   |   5 +
 lib/librte_pmd_vmxnet3/Makefile|   2 +
 lib/librte_pmd_vmxnet3/rte_pmd_vmxnet3_version.map |   5 +
 lib/librte_pmd_xenvirt/Makefile|   2 +
 lib/librte_pmd_xenvirt/rte_eth_xenvirt_version.map |   8 ++
 lib/librte_port/Makefile   |   2 +
 lib/librte_port/rte_port_version.map   |  18 
 lib/librte_power/Makefile  |   2 +
 lib/librte_power/rte_power_version.map |  18 
 lib/librte_ring/Makefile   |   2 +
 lib/librte_ring/rte_ring_version.map   |  12 +++
 lib/librte_sched/Makefile  |   2 +
 lib/librte_sched/rte_sched_version.map |  22 
 lib/librte_table/Makefile  |   2 +
 lib/librte_table/rte_table_version.map |  22 
 lib/librte_timer/Makefile  |   2 +
 lib/librte_timer/rte_timer_version.map |  16 +++
 lib/librte_vhost/Makefile  |   2 +
 lib/librte_vhost/rte_vhost_version.map |  14 +++
 74 files changed, 874 insertions(+), 7 deletions(-)
 create mode 100644 lib/librte_acl/rte_acl_version.map
 create mode 100644 lib/librte_cfgfile/rte_cfg

[dpdk-dev] [PATCH v6 3/4] Add library version extension

To differentiate libraries that break ABI, we add a library version number
suffix to the library, which must be incremented when a given library's ABI is
broken.  This patch enforces that addition, sets the initial ABI soname
extension to 1 for each library and creates a symlink to the base SONAME so that
the test applications will link properly.

Signed-off-by: Neil Horman 
CC: Thomas Monjalon 
CC: "Richardson, Bruce" 

---
Change Notes:
v3)
Made symlinking of libraries conditional on a DSO build

v4) Removed erroneous newline
changed @exit 1 to @false
changed ./$(LIB) to $<
---
 lib/librte_acl/Makefile  |  2 ++
 lib/librte_cfgfile/Makefile  |  2 ++
 lib/librte_cmdline/Makefile  |  2 ++
 lib/librte_compat/Makefile   |  2 ++
 lib/librte_distributor/Makefile  |  2 ++
 lib/librte_eal/bsdapp/eal/Makefile   |  2 ++
 lib/librte_eal/linuxapp/eal/Makefile |  2 ++
 lib/librte_ether/Makefile|  2 ++
 lib/librte_hash/Makefile |  2 ++
 lib/librte_ip_frag/Makefile  |  2 ++
 lib/librte_ivshmem/Makefile  |  2 ++
 lib/librte_kni/Makefile  |  2 ++
 lib/librte_kvargs/Makefile   |  2 ++
 lib/librte_lpm/Makefile  |  2 ++
 lib/librte_malloc/Makefile   |  2 ++
 lib/librte_mbuf/Makefile |  2 ++
 lib/librte_mempool/Makefile  |  2 ++
 lib/librte_meter/Makefile|  2 ++
 lib/librte_pipeline/Makefile |  2 ++
 lib/librte_pmd_af_packet/Makefile|  2 ++
 lib/librte_pmd_bond/Makefile |  2 ++
 lib/librte_pmd_e1000/Makefile|  2 ++
 lib/librte_pmd_enic/Makefile |  2 ++
 lib/librte_pmd_i40e/Makefile |  2 ++
 lib/librte_pmd_ixgbe/Makefile|  2 ++
 lib/librte_pmd_pcap/Makefile |  2 ++
 lib/librte_pmd_ring/Makefile |  2 ++
 lib/librte_pmd_virtio/Makefile   |  2 ++
 lib/librte_pmd_vmxnet3/Makefile  |  2 ++
 lib/librte_pmd_xenvirt/Makefile  |  2 ++
 lib/librte_port/Makefile |  2 ++
 lib/librte_power/Makefile|  2 ++
 lib/librte_ring/Makefile |  2 ++
 lib/librte_sched/Makefile|  2 ++
 lib/librte_table/Makefile|  2 ++
 lib/librte_timer/Makefile|  2 ++
 lib/librte_vhost/Makefile|  2 ++
 mk/rte.lib.mk| 12 ++--
 38 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
index 45cbf80..765deb1 100644
--- a/lib/librte_acl/Makefile
+++ b/lib/librte_acl/Makefile
@@ -39,6 +39,8 @@ CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)

 EXPORT_MAP := rte_acl_version.map

+LIBABIVER := 1
+
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_ACL) += tb_mem.c

diff --git a/lib/librte_cfgfile/Makefile b/lib/librte_cfgfile/Makefile
index a4f73de..032c240 100644
--- a/lib/librte_cfgfile/Makefile
+++ b/lib/librte_cfgfile/Makefile
@@ -41,6 +41,8 @@ CFLAGS += $(WERROR_FLAGS)

 EXPORT_MAP := rte_cfgfile_version.map

+LIBABIVER := 1
+
 #
 # all source are stored in SRCS-y
 #
diff --git a/lib/librte_cmdline/Makefile b/lib/librte_cmdline/Makefile
index 3c71831..719dff6 100644
--- a/lib/librte_cmdline/Makefile
+++ b/lib/librte_cmdline/Makefile
@@ -38,6 +38,8 @@ CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3

 EXPORT_MAP := rte_cmdline_version.map

+LIBABIVER := 1
+
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) := cmdline.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_cirbuf.c
diff --git a/lib/librte_compat/Makefile b/lib/librte_compat/Makefile
index 0bab870..0c57533 100644
--- a/lib/librte_compat/Makefile
+++ b/lib/librte_compat/Makefile
@@ -32,6 +32,8 @@
 include $(RTE_SDK)/mk/rte.vars.mk


+LIBABIVER := 1
+
 # install includes
 SYMLINK-y-include := rte_compat.h

diff --git a/lib/librte_distributor/Makefile b/lib/librte_distributor/Makefile
index 3674a2c..4c9af17 100644
--- a/lib/librte_distributor/Makefile
+++ b/lib/librte_distributor/Makefile
@@ -39,6 +39,8 @@ CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)

 EXPORT_MAP := rte_distributor_version.map

+LIBABIVER := 1
+
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) := rte_distributor.c

diff --git a/lib/librte_eal/bsdapp/eal/Makefile 
b/lib/librte_eal/bsdapp/eal/Makefile
index 0b5f9d9..ae214a4 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -48,6 +48,8 @@ CFLAGS += $(WERROR_FLAGS) -O3

 EXPORT_MAP := rte_eal_version.map

+LIBABIVER := 1
+
 # specific to linuxapp exec-env
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) := eal.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_memory.c
diff --git a/lib/librte_eal/linuxapp/eal/Makefile 
b/lib/librte_eal/linuxapp/eal/Makefile
index bae8af1..e117cec 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -35,6 +35,8 @@ LIB = librte_eal.a

 EXPORT_MAP := rte_eal_version.map

+LIBABIVER := 1
+
 VPATH += $(RTE_SDK)/lib/librte_eal/common

 CFLAGS += -I$(SRCDI

[dpdk-dev] [PATCH v6 4/4] docs: Add ABI documentation