[PATCH v2] net: fix incorrect original ingress device index in PKTINFO

2016-12-29 Thread Wei Zhang
When we send a packet to one of our own local addresses on a non-loopback
interface (e.g. eth0), the change introduced by commit 0b922b7a829c
("net: original ingress device index in PKTINFO") causes the original
ingress device index to be reported as the loopback interface. However,
the packet should be treated as if it arrived via the sending interface
(eth0); otherwise it breaks the expectations of userspace applications
(e.g. the DHCPRELEASE message sent by the dhcp_release binary is ignored
by the dnsmasq daemon, because it appears to come from lo, which is not
the interface dnsmasq is bound to).
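
Illustrative sketch (not part of the patch): roughly how a daemon like
dnsmasq learns the ingress ifindex affected here, assuming a UDP socket
that already has the IP_PKTINFO option enabled; error handling is trimmed.

#include <netinet/in.h>
#include <sys/socket.h>
#include <string.h>

static int recv_pktinfo_ifindex(int sock)
{
	char data[1500], cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	struct in_pktinfo pi;

	if (recvmsg(sock, &msg, 0) < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IP &&
		    cmsg->cmsg_type == IP_PKTINFO) {
			memcpy(&pi, CMSG_DATA(cmsg), sizeof(pi));
			/* before this fix: lo's index for locally addressed
			 * packets; with it: the sending interface's index
			 */
			return pi.ipi_ifindex;
		}
	}
	return 0;
}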

Fixes: 0b922b7a829c ("net: original ingress device index in PKTINFO")
Acked-by: David Ahern 
Signed-off-by: Wei Zhang 
---
v2:
 - add the missing Fixes line
 - improve the code comment, as suggested by David Ahern

 net/ipv4/ip_sockglue.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 57e1405..53ae0c6 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1225,8 +1225,14 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct 
sk_buff *skb)
 * which has interface index (iif) as the first member of the
 * underlying inet{6}_skb_parm struct. This code then overlays
 * PKTINFO_SKB_CB and in_pktinfo also has iif as the first
-* element so the iif is picked up from the prior IPCB
+* element so the iif is picked up from the prior IPCB. If iif
+* is the loopback interface, then return the sending interface
+* (e.g., process binds socket to eth0 for Tx which is
+* redirected to loopback in the rtable/dst).
 */
+   if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
+   pktinfo->ipi_ifindex = inet_iif(skb);
+
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
pktinfo->ipi_ifindex = 0;
-- 
1.8.3.1




Re: [PATCH 05/12] Support for NIC-specific code

2016-12-29 Thread David VomLehn

Responses inline.

On 12/27/2016 09:21 PM, Rami Rosen wrote:

Hi, David,

Several nitpicks and comments, from a brief overview:

The commented label //err_exit:  should be removed

+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -0,0 +1,993 @@
+//err_exit:
+//err_exit:

Shouldn't aq_nic_rss_init() be static? Isn't it called only from
aq_nic_cfg_init_defaults()? And since it always returns 0, shouldn't it
be void as well (and the return-code check removed where it is invoked
in aq_nic_cfg_init_defaults())?

Yes, thanks.


+int aq_nic_rss_init(struct aq_nic_s *self, unsigned int num_rss_queues)
+{
+   struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+   struct aq_receive_scale_parameters *rss_params = &cfg->aq_rss;
+   int i = 0;
+

...

+   return 0;
+}


Shouldn't aq_nic_ndev_alloc() be static ? Isn't it invoked only from
aq_nic_alloc_cold()?

Yes.



+struct net_device *aq_nic_ndev_alloc(void)
+{

...

+}




+
+static unsigned int aq_nic_map_skb_lso(struct aq_nic_s *self,
+  struct sk_buff *skb,
+  struct aq_ring_buff_s *dx)
+{
+   unsigned int ret = 0U;
+
+   dx->flags = 0U;
+   dx->len_pkt = skb->len;
+   dx->len_l2 = ETH_HLEN;
+   dx->len_l3 = ip_hdrlen(skb);
+   dx->len_l4 = tcp_hdrlen(skb);
+   dx->mss = skb_shinfo(skb)->gso_size;
+   dx->is_txc = 1U;
+   ret = 1U;
+

Why not remove this "ret" variable and simply return 1? The method
always returns 1:


+   return ret;
+}
+

Yes, better.

+int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
+{
+   struct aq_ring_s *ring = NULL;
+   unsigned int frags = 0U;
+   unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs;
+   unsigned int tc = 0U;
+   int err = 0;
+   bool is_nic_in_bad_state;
+   bool is_locked = false;
+   bool is_busy = false;
+   struct aq_ring_buff_s buffers[AQ_CFG_SKB_FRAGS_MAX];
+
+   frags = skb_shinfo(skb)->nr_frags + 1;
+
+   ring = self->aq_ring_tx[AQ_NIC_TCVEC2RING(self, tc, vec)];
+
+   atomic_inc(&self->busy_count);
+   is_busy = true;
+
+   if (frags > AQ_CFG_SKB_FRAGS_MAX) {
+   dev_kfree_skb_any(skb);
+   goto err_exit;
+   }
+
+   is_nic_in_bad_state = AQ_OBJ_TST(self, AQ_NIC_FLAGS_IS_NOT_TX_READY) ||
+   (aq_ring_avail_dx(ring) < AQ_CFG_SKB_FRAGS_MAX);
+
+   if (is_nic_in_bad_state) {
+   aq_nic_ndev_queue_stop(self, ring->idx);
+   err = NETDEV_TX_BUSY;
+   goto err_exit;
+   }
+

Usage of such an internal block is not common (unless it is under #ifdef,
and it is not very common even then). I suggest moving "unsigned int trys"
to the variable definitions at the beginning of the function and removing
the opening and closing braces of the following block:

+   {
+   unsigned int trys = AQ_CFG_LOCK_TRYS;
+
+   frags = aq_nic_map_skb(self, skb, &buffers[0]);
+
+   do {
+   is_locked = spin_trylock(&ring->lock);
+   } while (--trys && !is_locked);
+   if (!(is_locked)) {
+   err = NETDEV_TX_BUSY;
+   goto err_exit;
+   }
+

Yes, this is better.

Usually you don't let the MTU be less than 68, for example:
http://lxr.free-electrons.com/source/drivers/net/ethernet/intel/i40e/i40e_main.c#L2246
See also RFC 791:
https://tools.ietf.org/html/rfc791



+int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu)
+{
+   int err = 0;
+
+   if (new_mtu > self->aq_hw_caps.mtu) {
+   err = 0;
+   goto err_exit;
+   }
+   self->aq_nic_cfg.mtu = new_mtu;
+
+err_exit:
+   return err;
+}

Clearly a must--thanks!
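
Illustrative sketch only (not the posted code): one way the suggested
lower bound could look, reusing the fields quoted above and the
conventional 68-byte minimum from RFC 791.

int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu)
{
	int err = 0;

	/* Reviewer's point: reject MTUs below the 68-byte IPv4 minimum */
	if (new_mtu < 68) {
		err = -EINVAL;
		goto err_exit;
	}

	/* Keep the original behaviour for values above the HW capability */
	if (new_mtu > self->aq_hw_caps.mtu)
		goto err_exit;

	self->aq_nic_cfg.mtu = new_mtu;

err_exit:
	return err;
}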

+
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h 
b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
new file mode 100644
index 000..89958e7
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -0,0 +1,111 @@
+/*
+ * Aquantia Corporation Network Driver
+ * Copyright (C) 2014-2016 Aquantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/*

Should be, of course, aq_nic.h:


+ * File aq_nic.c: Declaration of common code for NIC.
+ */
+
Good point. Better still, including the name of the file has little 
value and makes the comment incorrect if it gets renamed. So, thanks!

Regards,
Rami Rosen



--
David VL



RE: [PATCH net] net: stmmac: Fix error path after register_netdev move

2016-12-29 Thread Kweh, Hock Leong
> -Original Message-
> From: Florian Fainelli [mailto:f.faine...@gmail.com]
> Sent: Thursday, December 29, 2016 7:45 AM
> To: netdev@vger.kernel.org
> Cc: Florian Fainelli ; pa...@ucw.cz;
> joao.pi...@synopsys.com; seraphin.bonna...@st.com;
> alexandre.tor...@gmail.com; manab...@gmail.com; niklas.cas...@axis.com;
> jo...@kernel.org; Ong, Boon Leong ; Voon,
> Weifeng ; lars.pers...@axis.com; linux-
> ker...@vger.kernel.org; Giuseppe Cavallaro ;
> Alexandre Torgue 
> Subject: [PATCH net] net: stmmac: Fix error path after register_netdev move
> 
> Commit 5701659004d6 ("net: stmmac: Fix race between stmmac_drv_probe and
> stmmac_open") re-ordered how the MDIO bus registration and the network
> device are registered, but missed to unwind the MDIO bus registration in case
> we fail to register the network device.
> 
> Fixes: 5701659004d6 ("net: stmmac: Fix race between stmmac_drv_probe and
> stmmac_open")
> Signed-off-by: Florian Fainelli 
> ---

Acked-by: Kweh, Hock Leong 



>  drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 9 -
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index 5910ea51f8f6..39eb7a65bb9f 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -3366,12 +3366,19 @@ int stmmac_dvr_probe(struct device *device,
>   }
> 
>   ret = register_netdev(ndev);
> - if (ret)
> + if (ret) {
>   netdev_err(priv->dev, "%s: ERROR %i registering the device\n",
>  __func__, ret);
> + goto error_netdev_register;
> + }
> 
>   return ret;
> 
> +error_netdev_register:
> + if (priv->hw->pcs != STMMAC_PCS_RGMII &&
> + priv->hw->pcs != STMMAC_PCS_TBI &&
> + priv->hw->pcs != STMMAC_PCS_RTBI)
> + stmmac_mdio_unregister(ndev);
>  error_mdio_register:
>   netif_napi_del(&priv->napi);
>  error_hw_init:
> --
> 2.9.3



[PATCH v4] net: dev_weight: TX/RX orthogonality

2016-12-29 Thread Matthias Tafelmeier
Often it is not desirable that adjusting one of TX/RX via sysctl
introduces side effects on packet processing on the other half of the
stack. There are cases that demand asymmetric, orthogonal
configurability.

This holds true especially for nodes where RPS (with RFS on top) is
configured and which therefore use the 'old' dev_weight. This is quite a
common base configuration nowadays, even with NICs that offer more
capable offloads (e.g. aRFS).

A good example use case are nodes acting as NoSQL databases, with a
large number of tiny requests and rather fewer, but larger, packets as
responses. It is affordable to have a large budget and RX dev_weight for
the requests, but as a side effect, having that large a number processed
on TX in one run can overwhelm drivers.

This patch therefore introduces independent RX/TX configurability via
sysctl to userland.
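
As an illustration (not part of the patch), a minimal userspace sketch of
the intended asymmetric setup, assuming the new knobs sit next to the
existing dev_weight under /proc/sys/net/core/:

#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* dev_rx_weight = dev_weight * dev_weight_rx_bias = 64 * 8 = 512,
	 * while dev_tx_weight stays at 64 * 1 = 64.
	 */
	write_sysctl("/proc/sys/net/core/dev_weight", "64");
	write_sysctl("/proc/sys/net/core/dev_weight_rx_bias", "8");
	write_sysctl("/proc/sys/net/core/dev_weight_tx_bias", "1");
	return 0;
}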
---
 Documentation/sysctl/net.txt | 21 +
 include/linux/netdevice.h|  4 
 net/core/dev.c   |  6 +-
 net/core/sysctl_net_core.c   | 31 ++-
 net/sched/sch_generic.c  |  2 +-
 5 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7..53cef32 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a 
NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+dev_weight_rx_bias
+--
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll 
function
+of the driver for the per softirq cycle netdev_budget. This parameter 
influences
+the proportion of the configured netdev_budget that is spent on RPS based 
packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network 
stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is 
based
+on dev_weight and is calculated multiplicative (dev_weight * 
dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--
+
+Scales the maximum number of packets that can be processed during a TX softirq 
cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for 
asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a 
CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
 default_qdisc
 --
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..ecd78b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,10 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 
*stats64,
 extern int netdev_max_backlog;
 extern int netdev_tstamp_prequeue;
 extern int weight_p;
+extern int dev_weight_rx_bias;
+extern int dev_weight_tx_bias;
+extern int dev_rx_weight;
+extern int dev_tx_weight;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device 
*upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..f2fe98b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3428,6 +3428,10 @@ EXPORT_SYMBOL(netdev_max_backlog);
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
 int weight_p __read_mostly = 64;/* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1;/* bias for backlog 
weight */
+int dev_weight_tx_bias __read_mostly = 1;/* bias for output_queue 
quota */
+int dev_rx_weight __read_mostly = weight_p;
+int dev_tx_weight __read_mostly = weight_p;
 
 /* Called with irq disabled */
 static inline void napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4837,7 @@ static int process_backlog(struct napi_struct *napi, int 
quota)
net_rps_action_and_irq_enable(sd);
}
 
-   napi->weight = weight_p;
+   napi->weight = dev_rx_weight;
while (again) {
struct sk_buff *skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..698ddd7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int 
write,
 }
 #endif
 
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+   int ret;
+
+   ret = proc_dointvec(table, write, buffer, lenp, ppos);
+   if (ret != 0)
+   return ret;
+
+   dev_rx_weight = weight_p * dev_weight_rx_bias;
+   dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+   return ret;
+}
+
 static int proc_do_rss_key(struct ctl_table *table, int write,
   void __user *buffer, size_t *

Re: [PATCH] vif queue counters from int to long

2016-12-29 Thread Wei Liu
On Fri, Dec 23, 2016 at 04:09:23PM +0100, Mart van Santen wrote:
> 
> Hello,
> 
> This patch fixes an issue where counters in the queue have type int,
> while the counters of the vif itself are specified as long. This can
> cause incorrect reporting of tx/rx values of the vif interface.
> More extensively reported on xen-devel mailinglist.
> 

Hello,

Please also CC xen-de...@lists.xenproject.org for your future patch(es).
And please note that the most up to date maintainer information should
be used.

Wei.

> 
> 
> Signed-off-by: Mart van Santen 
> --- a/drivers/net/xen-netback/common.h  2016-12-22 15:41:07.785535748 +
> +++ b/drivers/net/xen-netback/common.h  2016-12-23 13:08:18.123080064 +
> @@ -113,10 +113,10 @@ struct xenvif_stats {
>  * A subset of struct net_device_stats that contains only the
>  * fields that are updated in netback.c for each queue.
>  */
> -   unsigned int rx_bytes;
> -   unsigned int rx_packets;
> -   unsigned int tx_bytes;
> -   unsigned int tx_packets;
> +   unsigned long rx_bytes;
> +   unsigned long rx_packets;
> +   unsigned long tx_bytes;
> +   unsigned long tx_packets;
> 
> /* Additional stats used by xenvif */
> unsigned long rx_gso_checksum_fixup;
> 
> -- 
> Mart van Santen
> Greenhost
> E: m...@greenhost.nl
> T: +31 20 4890444
> W: https://greenhost.nl
> 
> A PGP signature can be attached to this e-mail,
> you need PGP software to verify it. 
> My public key is available in keyserver(s)
> see: http://tinyurl.com/openpgp-manual
> 
> PGP Fingerprint: CA85 EB11 2B70 042D AF66  B29A 6437 01A1 10A3 D3A5
> 
> 





[PATCH] scm: fix possible control message header alignment issue

2016-12-29 Thread yuan linyu
From: yuan linyu 

1. put_cmsg{_compat}() may copy data to userspace even when the buffer's
   free space is smaller than the aligned control message header size.
2. scm_detach_fds{_compat}() may calculate a wrong fdmax when the aligned
   control message header size is larger than sizeof(struct cmsghdr)
   (see the short illustration after this list).
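
A short userspace illustration (not part of the patch) of the size
difference the fix cares about; on many architectures the two values
happen to coincide, which is why the problem is easy to miss:

#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	/* CMSG_SPACE(0) is the aligned header size the fdmax math needs */
	printf("sizeof(struct cmsghdr) = %zu\n", sizeof(struct cmsghdr));
	printf("CMSG_SPACE(0)          = %zu\n", (size_t)CMSG_SPACE(0));
	return 0;
}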

Signed-off-by: yuan linyu 
---
 net/compat.c   | 10 --
 net/core/scm.c |  8 +---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/net/compat.c b/net/compat.c
index 96c544b..fe1f41c 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -245,7 +245,9 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int 
type, int len, void *dat
 
if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
return -EFAULT;
-   if (copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct 
compat_cmsghdr)))
+   if (cmlen > CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) && 
+   copy_to_user(CMSG_COMPAT_DATA(cm), data, 
+   cmlen - CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))))
return -EFAULT;
cmlen = CMSG_COMPAT_SPACE(len);
if (kmsg->msg_controllen < cmlen)
@@ -258,12 +260,16 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int 
type, int len, void *dat
 void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
 {
struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) 
kmsg->msg_control;
-   int fdmax = (kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / 
sizeof(int);
+   int fdmax = 0;
int fdnum = scm->fp->count;
struct file **fp = scm->fp->fp;
int __user *cmfptr;
int err = 0, i;
 
+   if (kmsg->msg_controllen > CMSG_COMPAT_ALIGN(sizeof(struct 
compat_cmsghdr)))
+   fdmax = (kmsg->msg_controllen - 
+CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))) / 
sizeof(int);
+
if (fdnum < fdmax)
fdmax = fdnum;
 
diff --git a/net/core/scm.c b/net/core/scm.c
index d882043..5d8ef4f 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -238,7 +238,9 @@ int put_cmsg(struct msghdr * msg, int level, int type, int 
len, void *data)
err = -EFAULT;
if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
goto out;
-   if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
+   if (cmlen > CMSG_ALIGN(sizeof(struct cmsghdr)) && 
+   copy_to_user(CMSG_DATA(cm), data, 
+   cmlen - CMSG_ALIGN(sizeof(struct cmsghdr))))
goto out;
cmlen = CMSG_SPACE(len);
if (msg->msg_controllen < cmlen)
@@ -267,8 +269,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie 
*scm)
return;
}
 
-   if (msg->msg_controllen > sizeof(struct cmsghdr))
-   fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
+   if (msg->msg_controllen > CMSG_ALIGN(sizeof(struct cmsghdr)))
+   fdmax = ((msg->msg_controllen - CMSG_ALIGN(sizeof(struct 
cmsghdr)))
 / sizeof(int));
 
if (fdnum < fdmax)
-- 
2.7.4




[PATCH v2] scm: fix possible control message header alignment issue

2016-12-29 Thread yuan linyu
From: yuan linyu 

1. put_cmsg{_compat}() may copy data to userspace even when the buffer's
   free space is smaller than the aligned control message header size.
2. scm_detach_fds{_compat}() may calculate a wrong fdmax when the aligned
   control message header size is larger than sizeof(struct cmsghdr).

Signed-off-by: yuan linyu 
---
 net/compat.c   | 10 --
 net/core/scm.c |  8 +---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/net/compat.c b/net/compat.c
index 96c544b..ffe7a04 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -245,7 +245,9 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int 
type, int len, void *dat
 
if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
return -EFAULT;
-   if (copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct 
compat_cmsghdr)))
+   if (cmlen > CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) &&
+   copy_to_user(CMSG_COMPAT_DATA(cm), data,
+   cmlen - CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))))
return -EFAULT;
cmlen = CMSG_COMPAT_SPACE(len);
if (kmsg->msg_controllen < cmlen)
@@ -258,12 +260,16 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int 
type, int len, void *dat
 void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
 {
struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) 
kmsg->msg_control;
-   int fdmax = (kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / 
sizeof(int);
+   int fdmax = 0;
int fdnum = scm->fp->count;
struct file **fp = scm->fp->fp;
int __user *cmfptr;
int err = 0, i;
 
+   if (kmsg->msg_controllen > CMSG_COMPAT_ALIGN(sizeof(struct 
compat_cmsghdr)))
+   fdmax = (kmsg->msg_controllen -
+CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))) / 
sizeof(int);
+
if (fdnum < fdmax)
fdmax = fdnum;
 
diff --git a/net/core/scm.c b/net/core/scm.c
index d882043..b2e60fd 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -238,7 +238,9 @@ int put_cmsg(struct msghdr * msg, int level, int type, int 
len, void *data)
err = -EFAULT;
if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
goto out;
-   if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
+   if (cmlen > CMSG_ALIGN(sizeof(struct cmsghdr)) &&
+   copy_to_user(CMSG_DATA(cm), data,
+   cmlen - CMSG_ALIGN(sizeof(struct cmsghdr))))
goto out;
cmlen = CMSG_SPACE(len);
if (msg->msg_controllen < cmlen)
@@ -267,8 +269,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie 
*scm)
return;
}
 
-   if (msg->msg_controllen > sizeof(struct cmsghdr))
-   fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
+   if (msg->msg_controllen > CMSG_ALIGN(sizeof(struct cmsghdr)))
+   fdmax = ((msg->msg_controllen - CMSG_ALIGN(sizeof(struct 
cmsghdr)))
 / sizeof(int));
 
if (fdnum < fdmax)
-- 
2.7.4




[PATCH] ipv4: make tcp_notsent_lowat sysctl knob behave as true unsigned int

2016-12-29 Thread Pavel Tikhomirov
> cat /proc/sys/net/ipv4/tcp_notsent_lowat
-1
> echo 4294967295 > /proc/sys/net/ipv4/tcp_notsent_lowat
-bash: echo: write error: Invalid argument
> echo -2147483648 > /proc/sys/net/ipv4/tcp_notsent_lowat
> cat /proc/sys/net/ipv4/tcp_notsent_lowat
-2147483648

but in the documentation we have "tcp_notsent_lowat - UNSIGNED INTEGER",
i.e. the knob currently accepts negative values and rejects the upper
half of the unsigned range. Use an unsigned handler bounded to
[0, UINT_MAX] instead.

Signed-off-by: Pavel Tikhomirov 
---
 net/ipv4/sysctl_net_ipv4.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 80bc36b..5361373 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -41,6 +41,7 @@ static int tcp_syn_retries_min = 1;
 static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
 static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+static unsigned int uint_max = UINT_MAX;
 
 /* Update system visible IP port range */
 static void set_local_port_range(struct net *net, int range[2])
@@ -958,7 +959,9 @@ static struct ctl_table ipv4_net_table[] = {
.data   = &init_net.ipv4.sysctl_tcp_notsent_lowat,
.maxlen = sizeof(unsigned int),
.mode   = 0644,
-   .proc_handler   = proc_dointvec,
+   .proc_handler   = proc_doulongvec_minmax,
+   .extra1 = &zero,
+   .extra2 = &uint_max,
},
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
{
-- 
2.9.3



Re: [PATCH net] sctp: do not loose window information if in rwnd_over

2016-12-29 Thread Neil Horman
On Fri, Dec 23, 2016 at 02:29:02PM -0200, Marcelo Ricardo Leitner wrote:
> It's possible that we receive a packet that is larger than current
> window. If it's the first packet in this way, it will cause it to
> increase rwnd_over. Then, if we receive another data chunk (specially as
> SCTP allows you to have one data chunk in flight even during 0 window),
> rwnd_over will be overwritten instead of added to.
> 
> In the long run, this could cause the window to grow bigger than its
> initial size, as rwnd_over would be charged only for the last received
> data chunk while the code will try open the window for all packets that
> were received and had its value in rwnd_over overwritten. This, then,
> can lead to the worsening of payload/buffer ratio and cause rwnd_press
> to kick in more often.
> 
> The fix is to sum it too, same as is done for rwnd_press, so that if we
> receive 3 chunks after closing the window, we still have to release that
> same amount before re-opening it.
> 
> Log snippet from sctp_test exhibiting the issue:
> [  146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:88013928e000
> rwnd decreased by 1 to (0, 1, 114221)
> [  146.209232] sctp: sctp_assoc_rwnd_decrease:
> association:88013928e000 has asoc->rwnd:0, asoc->rwnd_over:1!
> [  146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:88013928e000
> rwnd decreased by 1 to (0, 1, 114221)
> [  146.209232] sctp: sctp_assoc_rwnd_decrease:
> association:88013928e000 has asoc->rwnd:0, asoc->rwnd_over:1!
> [  146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:88013928e000
> rwnd decreased by 1 to (0, 1, 114221)
> [  146.209232] sctp: sctp_assoc_rwnd_decrease:
> association:88013928e000 has asoc->rwnd:0, asoc->rwnd_over:1!
> [  146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:88013928e000
> rwnd decreased by 1 to (0, 1, 114221)
> 
> Signed-off-by: Marcelo Ricardo Leitner 
> ---
>  net/sctp/associola.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index 
> 68428e1f71810fbe65b7f86c750c3ad61f0266ec..56ddcfaeb4f64235b54b0daa915fabf0cc0170a9
>  100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -1539,7 +1539,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association 
> *asoc, unsigned int len)
>   asoc->rwnd = 0;
>   }
>   } else {
> - asoc->rwnd_over = len - asoc->rwnd;
> + asoc->rwnd_over += len - asoc->rwnd;
>   asoc->rwnd = 0;
>   }
>  
> -- 
> 2.9.3
> 
> 
Acked-by: Neil Horman 



[PATCH] rtlwifi: fix spelling mistake: "contry" -> "country"

2016-12-29 Thread Colin King
From: Colin Ian King 

trivial fix to spelling mistake in RT_TRACE message

Signed-off-by: Colin Ian King 
---
 drivers/net/wireless/realtek/rtlwifi/regd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/regd.c 
b/drivers/net/wireless/realtek/rtlwifi/regd.c
index 6ee6bf8..558c31b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/regd.c
+++ b/drivers/net/wireless/realtek/rtlwifi/regd.c
@@ -440,7 +440,7 @@ int rtl_regd_init(struct ieee80211_hw *hw,
 
if (rtlpriv->regd.country_code >= COUNTRY_CODE_MAX) {
RT_TRACE(rtlpriv, COMP_REGD, DBG_DMESG,
-"rtl: EEPROM indicates invalid contry code, world wide 
13 should be used\n");
+"rtl: EEPROM indicates invalid country code, world 
wide 13 should be used\n");
 
rtlpriv->regd.country_code = COUNTRY_CODE_WORLD_WIDE_13;
}
-- 
2.10.2



Re: [PATCH] net: ethernet: ti: davinci_cpdma: fix access to uninitialized variable in cpdma_chan_set_descs()

2016-12-29 Thread Grygorii Strashko
Hi Ivan,

On 12/28/2016 07:49 PM, Ivan Khoronzhuk wrote:
> On Wed, Dec 28, 2016 at 05:42:13PM -0600, Grygorii Strashko wrote:
>> Now below code sequence causes "Unable to handle kernel NULL pointer
>> dereference.." exception and system crash during CPSW CPDMA initialization:
>>
>> cpsw_probe
>> |-cpdma_chan_create (TX channel)
>>   |-cpdma_chan_split_pool
>> |-cpdma_chan_set_descs(for TX channels)
>> |-cpdma_chan_set_descs(for RX channels) [1]
>>
>> - and -
>> static void cpdma_chan_set_descs(struct cpdma_ctlr *ctlr,
>>   int rx, int desc_num,
>>   int per_ch_desc)
>> {
>>  struct cpdma_chan *chan, *most_chan = NULL;
>>
>> ...
>>
>>  for (i = min; i < max; i++) {
>>  chan = ctlr->channels[i];
>>  if (!chan)
>>  continue;
>> ...
>>
>>  if (most_dnum < chan->desc_num) {
>>  most_dnum = chan->desc_num;
>>  most_chan = chan;
>>  }
>>  }
>>  /* use remains */
>>  most_chan->desc_num += desc_cnt; [2]
>> }
>>
>> So, most_chan value will never be reassigned when cpdma_chan_set_descs() is
>> called second time [1], because there are no RX channels yet and system
>> will crash at [2].
> 
> How did you get this?
> I just remember as I fixed it before sending patchset.
> 
> Maybe it was some experiment with it.
> I just wonder and want to find actual reason what's happening.
> 
> Look bellow:
> 
> cpsw_probe
> |-cpdma_chan_create (TX channel)
>   |-cpdma_chan_split_pool
> |-cpdma_chan_set_descs(for TX channels)
> |-cpdma_chan_set_descs(for RX channels) [1]
> 
> |-cpdma_chan_set_descs(for RX channels) in case you'be described has to be
> called with rx_desc_num = 0, because all descs are assigned already for tx
> channel. And, if desc_num = 0, cpdma_chan_set_descs just exits and no issues.
> So, could you please explain how you get this, in what circumstances.

You are right. I've hit this issue while working on another feature which
allows splitting the pool between the RX and TX paths, and as part of it
cpdma_chan_set_descs() is called with a different set of arguments. I will
probably just squash it into my changes or send it as part of my series.


-- 
regards,
-grygorii


Re: [PATCH net-next V2 3/3] tun: rx batching

2016-12-29 Thread David Miller
From: Jason Wang 
Date: Wed, 28 Dec 2016 16:09:31 +0800

> + spin_lock(&queue->lock);
> + qlen = skb_queue_len(queue);
> + if (qlen > rx_batched)
> + goto drop;
> + __skb_queue_tail(queue, skb);
> + if (!more || qlen + 1 > rx_batched) {
> + __skb_queue_head_init(&process_queue);
> + skb_queue_splice_tail_init(queue, &process_queue);
> + rcv = true;
> + }
> + spin_unlock(&queue->lock);

Since you always clear the 'queue' when you insert the skb that hits
the limit, I don't see how the "goto drop" path can possibly be taken.


[PATCH net 1/5] net/mlx4_core: Use-after-free causes a resource leak in flow-steering detach

2016-12-29 Thread Tariq Toukan
From: Jack Morgenstein 

mlx4_QP_FLOW_STEERING_DETACH_wrapper first removes the steering
rule (which results in freeing the rule structure), and then
references a field in this struct (the qp number) when releasing the
busy-status on the rule's qp.

Since this memory was freed, it could be reallocated and changed.
Therefore, the qp number in the struct may be incorrect,
so that we are releasing the incorrect qp. This leaves the rule's qp
in the busy state (and could possibly release an incorrect qp as well).

Fix this by saving the qp number in a local variable, for use after
removing the steering rule.

Fixes: 2c473ae7e582 ("net/mlx4_core: Disallow releasing VF QPs which have 
steering rules")
Signed-off-by: Jack Morgenstein 
Signed-off-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c 
b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index c548beaaf910..4b3e139e9c82 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -4473,6 +4473,7 @@ int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev 
*dev, int slave,
struct res_qp *rqp;
struct res_fs_rule *rrule;
u64 mirr_reg_id;
+   int qpn;
 
if (dev->caps.steering_mode !=
MLX4_STEERING_MODE_DEVICE_MANAGED)
@@ -4489,10 +4490,11 @@ int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct 
mlx4_dev *dev, int slave,
}
mirr_reg_id = rrule->mirr_rule_id;
kfree(rrule->mirr_mbox);
+   qpn = rrule->qpn;
 
/* Release the rule form busy state before removal */
put_res(dev, slave, vhcr->in_param, RES_FS_RULE);
-   err = get_res(dev, slave, rrule->qpn, RES_QP, &rqp);
+   err = get_res(dev, slave, qpn, RES_QP, &rqp);
if (err)
return err;
 
@@ -4517,7 +4519,7 @@ int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev 
*dev, int slave,
if (!err)
atomic_dec(&rqp->ref_count);
 out:
-   put_res(dev, slave, rrule->qpn, RES_QP);
+   put_res(dev, slave, qpn, RES_QP);
return err;
 }
 
-- 
1.8.3.1



[PATCH net 2/5] net/mlx4_en: Fix bad WQE issue

2016-12-29 Thread Tariq Toukan
From: Eugenia Emantayev 

The single send WQE in the RX buffer should be stamped with software
ownership in order to prevent the QP from moving to the error state in
FW once UPDATE_QP is called.


Fixes: 9f519f68cfff ('mlx4_en: Not using Shared Receive Queues')
Signed-off-by: Eugenia Emantayev 
Signed-off-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 3c37e216bbf3..eac527e25ec9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -445,8 +445,14 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
ring->cqn = priv->rx_cq[ring_ind]->mcq.cqn;
 
ring->stride = stride;
-   if (ring->stride <= TXBB_SIZE)
+   if (ring->stride <= TXBB_SIZE) {
+   /* Stamp first unused send wqe */
+   __be32 *ptr = (__be32 *)ring->buf;
+   __be32 stamp = cpu_to_be32(1 << STAMP_SHIFT);
+   *ptr = stamp;
+   /* Move pointer to start of rx section */
ring->buf += TXBB_SIZE;
+   }
 
ring->log_stride = ffs(ring->stride) - 1;
ring->buf_size = ring->size * ring->stride;
-- 
1.8.3.1



[PATCH net 5/5] net/mlx4_core: Fix raw qp flow steering rules under SRIOV

2016-12-29 Thread Tariq Toukan
From: Jack Morgenstein 

Demoting simple flow steering rule priority (for DPDK) was achieved by
wrapping FW commands MLX4_QP_FLOW_STEERING_ATTACH/DETACH for the PF
as well, and forcing the priority to MLX4_DOMAIN_NIC in the wrapper
function for the PF and all VFs.

In function mlx4_ib_create_flow(), this change caused the main rule
creation for the PF to be wrapped, while it left the associated
tunnel steering rule creation unwrapped for the PF.

This mismatch caused rule deletion failures in mlx4_ib_destroy_flow()
for the PF when the detach wrapper function did not find the associated
tunnel-steering rule (since creation of that rule for the PF did not
go through the wrapper function).

Fix this by setting MLX4_QP_FLOW_STEERING_ATTACH/DETACH to be "native"
(so that the PF invocation does not go through the wrapper), and perform
the required priority demotion for the PF in the mlx4_ib_create_flow()
code path.

Fixes: 48564135cba8 ("net/mlx4_core: Demote simple multicast and broadcast flow 
steering rules")
Signed-off-by: Jack Morgenstein 
Signed-off-by: Tariq Toukan 
---
 drivers/infiniband/hw/mlx4/main.c  | 14 --
 drivers/net/ethernet/mellanox/mlx4/main.c  | 18 ++
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  | 22 +-
 include/linux/mlx4/device.h|  2 ++
 4 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index c8413fc120e6..7031a8dd4d14 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1682,9 +1682,19 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, 
struct ib_flow_attr *flow_att
size += ret;
}
 
+   if (mlx4_is_master(mdev->dev) && flow_type == MLX4_FS_REGULAR &&
+   flow_attr->num_of_specs == 1) {
+   struct _rule_hw *rule_header = (struct _rule_hw *)(ctrl + 1);
+   enum ib_flow_spec_type header_spec =
+   ((union ib_flow_spec *)(flow_attr + 1))->type;
+
+   if (header_spec == IB_FLOW_SPEC_ETH)
+   mlx4_handle_eth_header_mcast_prio(ctrl, rule_header);
+   }
+
ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0,
   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
-  MLX4_CMD_WRAPPED);
+  MLX4_CMD_NATIVE);
if (ret == -ENOMEM)
pr_err("mcg table is full. Fail to register network rule.\n");
else if (ret == -ENXIO)
@@ -1701,7 +1711,7 @@ static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, 
u64 reg_id)
int err;
err = mlx4_cmd(dev, reg_id, 0, 0,
   MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
-  MLX4_CMD_WRAPPED);
+  MLX4_CMD_NATIVE);
if (err)
pr_err("Fail to detach network rule. registration id = 
0x%llx\n",
   reg_id);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
b/drivers/net/ethernet/mellanox/mlx4/main.c
index 5e7840a7a33b..bffa6f345f2f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -782,6 +783,23 @@ int mlx4_is_slave_active(struct mlx4_dev *dev, int slave)
 }
 EXPORT_SYMBOL(mlx4_is_slave_active);
 
+void mlx4_handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl 
*ctrl,
+  struct _rule_hw *eth_header)
+{
+   if (is_multicast_ether_addr(eth_header->eth.dst_mac) ||
+   is_broadcast_ether_addr(eth_header->eth.dst_mac)) {
+   struct mlx4_net_trans_rule_hw_eth *eth =
+   (struct mlx4_net_trans_rule_hw_eth *)eth_header;
+   struct _rule_hw *next_rule = (struct _rule_hw *)(eth + 1);
+   bool last_rule = next_rule->size == 0 && next_rule->id == 0 &&
+   next_rule->rsvd == 0;
+
+   if (last_rule)
+   ctrl->prio = cpu_to_be16(MLX4_DOMAIN_NIC);
+   }
+}
+EXPORT_SYMBOL(mlx4_handle_eth_header_mcast_prio);
+
 static void slave_adjust_steering_mode(struct mlx4_dev *dev,
   struct mlx4_dev_cap *dev_cap,
   struct mlx4_init_hca_param *hca_param)
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c 
b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 4b3e139e9c82..56185a0b827d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -4164,22 +4164,6 @@ static int validate_eth_header_mac(int slave, struct 
_rule_hw *eth_header,
return 0;
 }
 
-static void handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl 
*ctrl,
-

[PATCH net 3/5] net/mlx4: Remove BUG_ON from ICM allocation routine

2016-12-29 Thread Tariq Toukan
From: Leon Romanovsky 

This patch removes BUG_ON() macro from mlx4_alloc_icm_coherent()
by checking DMA address alignment in advance and performing proper
folding in case of error.

Fixes: 5b0bf5e25efe ("mlx4_core: Support ICM tables in coherent memory")
Reported-by: Ozgur Karatas 
Signed-off-by: Leon Romanovsky 
Signed-off-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx4/icm.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c 
b/drivers/net/ethernet/mellanox/mlx4/icm.c
index 2a9dd460a95f..e1f9e7cebf8f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -118,8 +118,13 @@ static int mlx4_alloc_icm_coherent(struct device *dev, 
struct scatterlist *mem,
if (!buf)
return -ENOMEM;
 
+   if (offset_in_page(buf)) {
+   dma_free_coherent(dev, PAGE_SIZE << order,
+ buf, sg_dma_address(mem));
+   return -ENOMEM;
+   }
+
sg_set_buf(mem, buf, PAGE_SIZE << order);
-   BUG_ON(mem->offset);
sg_dma_len(mem) = PAGE_SIZE << order;
return 0;
 }
-- 
1.8.3.1



[PATCH net 4/5] net/mlx4_en: Fix type mismatch for 32-bit systems

2016-12-29 Thread Tariq Toukan
From: Slava Shwartsman 

is_power_of_2() expects an unsigned long and we pass the u64
max_val_cycles; this is truncated on 32-bit systems and the result is
not what we expect. div_u64() expects a u32 as its second argument and
we pass max_val_cycles_rounded, which is a u64, hence it is always
truncated. The fix was tested on both 64-bit and 32-bit systems and
gives the same results for max_val_cycles and max_val_cycles_rounded.
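
Plain userspace illustration (not part of the patch) of the truncation
described above:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* A cycle counter value that needs more than 32 bits */
	uint64_t max_val_cycles = 0x180000000ULL;
	/* What an unsigned-long/u32 parameter receives on a 32-bit system */
	uint32_t truncated = (uint32_t)max_val_cycles;

	printf("full: %llu  truncated: %u\n",
	       (unsigned long long)max_val_cycles, truncated);
	return 0;
}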

Fixes: 4850cf458157 ("net/mlx4_en: Resolve dividing by zero in 32-bit system")
Signed-off-by: Slava Shwartsman 
Signed-off-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx4/en_clock.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c 
b/drivers/net/ethernet/mellanox/mlx4/en_clock.c
index 015198c14fa8..504461a464c5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c
@@ -245,13 +245,9 @@ static u32 freq_to_shift(u16 freq)
 {
u32 freq_khz = freq * 1000;
u64 max_val_cycles = freq_khz * 1000 * MLX4_EN_WRAP_AROUND_SEC;
-   u64 tmp_rounded =
-   roundup_pow_of_two(max_val_cycles) > max_val_cycles ?
-   roundup_pow_of_two(max_val_cycles) - 1 : UINT_MAX;
-   u64 max_val_cycles_rounded = is_power_of_2(max_val_cycles + 1) ?
-   max_val_cycles : tmp_rounded;
+   u64 max_val_cycles_rounded = 1ULL << fls64(max_val_cycles - 1);
/* calculate max possible multiplier in order to fit in 64bit */
-   u64 max_mul = div_u64(0xffffffffffffffffULL, max_val_cycles_rounded);
+   u64 max_mul = div64_u64(ULLONG_MAX, max_val_cycles_rounded);
 
/* This comes from the reverse of clocksource_khz2mult */
return ilog2(div_u64(max_mul * freq_khz, 1000000));
-- 
1.8.3.1



[PATCH net 0/5] mlx4 misc fixes

2016-12-29 Thread Tariq Toukan
Hi Dave,

This patchset contains several bug fixes from the team to the
mlx4 Eth and Core drivers.

Series generated against net commit:
60133867f1f1 'net: wan: slic_ds26522: fix spelling mistake: "configurated" -> 
"configured"'

Thanks,
Tariq.

Eugenia Emantayev (1):
  net/mlx4_en: Fix bad WQE issue

Jack Morgenstein (2):
  net/mlx4_core: Use-after-free causes a resource leak in flow-steering
detach
  net/mlx4_core: Fix raw qp flow steering rules under SRIOV

Leon Romanovsky (1):
  net/mlx4: Remove BUG_ON from ICM allocation routine

Slava Shwartsman (1):
  net/mlx4_en: Fix type mismatch for 32-bit systems

 drivers/infiniband/hw/mlx4/main.c  | 14 +--
 drivers/net/ethernet/mellanox/mlx4/en_clock.c  |  8 ++-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c |  8 ++-
 drivers/net/ethernet/mellanox/mlx4/icm.c   |  7 +-
 drivers/net/ethernet/mellanox/mlx4/main.c  | 18 ++
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  | 28 --
 include/linux/mlx4/device.h|  2 ++
 7 files changed, 52 insertions(+), 33 deletions(-)

-- 
1.8.3.1



Re: [PATCH 1/2] ipv4: Namespaceify tcp_tw_recycle and tcp_max_tw_buckets knob

2016-12-29 Thread David Miller
From: Haishuang Yan 
Date: Wed, 28 Dec 2016 17:52:32 +0800

> Different namespace application might require fast recycling
> TIME-WAIT sockets independently of the host.
> 
> Signed-off-by: Haishuang Yan 

Applied, but:

> @@ -111,6 +121,7 @@ struct netns_ipv4 {
>   int sysctl_tcp_fin_timeout;
>   unsigned int sysctl_tcp_notsent_lowat;
>   int sysctl_tcp_tw_reuse;
> + struct inet_timewait_death_row tcp_death_row; 
 ^^

Trailing whitespace I had to fix up.



Re: [PATCH 2/2] ipv4: Namespaceify tcp_max_syn_backlog knob

2016-12-29 Thread David Miller
From: Haishuang Yan 
Date: Wed, 28 Dec 2016 17:52:33 +0800

> Different namespace application might require different maximal
> number of remembered connection requests.
> 
> Signed-off-by: Haishuang Yan 

Applied.


Re: [PATCH v4] stmmac: enable rx queues

2016-12-29 Thread David Miller
From: Joao Pinto 
Date: Wed, 28 Dec 2016 12:57:48 +

> When the hardware is synthesized with multiple queues, all queues are
> disabled for default. This patch adds the rx queues configuration.
> This patch was successfully tested in a Synopsys QoS Reference design.
> 
> Signed-off-by: Joao Pinto 

Applied, thanks.


Re: [PATCH] ipv6: Should use consistent conditional judgement for ip6 fragment between __ip6_append_data and ip6_finish_output

2016-12-29 Thread David Miller
From: Zheng Li 
Date: Wed, 28 Dec 2016 23:23:46 +0800

> From: Zheng Li 
> 
> There is an inconsistent conditional judgement between __ip6_append_data
> and ip6_finish_output functions, the variable length in __ip6_append_data
> just include the length of application's payload and udp6 header, don't
> include the length of ipv6 header, but in ip6_finish_output use
> (skb->len > ip6_skb_dst_mtu(skb)) as judgement, and skb->len include the
> length of ipv6 header.
> 
> That causes some particular application's udp6 payloads whose length are
> between (MTU - IPv6 Header) and MTU were fragmented by ip6_fragment even
> though the rst->dev support UFO feature.
> 
> Add the length of ipv6 header to length in __ip6_append_data to keep
> consistent conditional judgement as ip6_finish_output for ip6 fragment.
> 
> Signed-off-by: Zheng Li 

Applied, thank you.


Re: [PATCHv2 net-next 00/11] net: mvpp2: misc improvements and preparation patches

2016-12-29 Thread David Miller
From: Thomas Petazzoni 
Date: Wed, 28 Dec 2016 17:45:56 +0100

> This series contains a number of misc improvements and preparation
> patches for an upcoming series that adds support for the new PPv2.2
> network controller to the mvpp2 driver.
> 
> The most significant improvements are:
> 
>  - Switching to using build_skb(), which is necessary for the upcoming
>PPv2.2 support, but anyway a good improvement to the current mvpp2
>driver (supporting PPv2.1).
> 
>  - Making the driver build on 64-bit platforms.
> 
> Changes since v1:
> 
>  - This series is split as a separate series from the larger patch set
>adding support for PPv2.2 in the mvpp2 driver, as requested by
>David Miller.
> 
>  - Rebased on top of v4.10-rc1.

You still have warnings to fix for the 64-bit build:

drivers/net/ethernet/marvell/mvpp2.c: In function ‘mvpp2_rx’:
drivers/net/ethernet/marvell/mvpp2.c:5125:10: warning: cast to pointer from 
integer of different size [-Wint-to-pointer-cast]
   data = (void *)rx_desc->buf_cookie;
  ^
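
For reference, the usual way to keep such a cast warning-free on a 32-bit
build is to narrow through uintptr_t first; a minimal sketch (not mvpp2
code):

#include <stdint.h>

static void *cookie_to_ptr(uint64_t buf_cookie)
{
	/* Explicitly narrow the 64-bit descriptor field before the
	 * pointer cast so 32-bit builds do not warn.
	 */
	return (void *)(uintptr_t)buf_cookie;
}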


Re: ipv6: remove unnecessary inet6_sk check

2016-12-29 Thread David Miller
From: Dave Jones 
Date: Wed, 28 Dec 2016 11:53:18 -0500

> np is already assigned in the variable declaration of ping_v6_sendmsg.
> At this point, we have already dereferenced np several times, so the
> NULL check is also redundant.
> 
> Suggested-by: Eric Dumazet 
> Signed-off-by: Dave Jones 

Applied, thanks Dave.


Re: [PATCH] drivers: atm: eni: rename macro DAUGTHER_ID to fix spelling mistake

2016-12-29 Thread David Miller
From: Colin King 
Date: Wed, 28 Dec 2016 17:31:20 +

> From: Colin Ian King 
> 
> Rename DAUGTHER_ID to DAUGHTER_ID to fix spelling mistake
> 
> Signed-off-by: Colin Ian King 

Applied.


Re: [PATCH net] net: stmmac: Fix error path after register_netdev move

2016-12-29 Thread David Miller
From: Florian Fainelli 
Date: Wed, 28 Dec 2016 15:44:41 -0800

> Commit 5701659004d6 ("net: stmmac: Fix race between stmmac_drv_probe and
> stmmac_open") re-ordered how the MDIO bus registration and the network
> device are registered, but missed to unwind the MDIO bus registration in
> case we fail to register the network device.
> 
> Fixes: 5701659004d6 ("net: stmmac: Fix race between stmmac_drv_probe and 
> stmmac_open")
> Signed-off-by: Florian Fainelli 

Applied, thanks Florian.


[PATCH] stmmac: adding EEE to GMAC4

2016-12-29 Thread Joao Pinto
This patch adds Energy Efficient Ethernet (EEE) support to GMAC4.

Signed-off-by: Joao Pinto 
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h  | 12 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 59 +++
 2 files changed, 71 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h 
b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index b524598..73d1dab 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -90,6 +90,18 @@ enum power_event {
power_down = 0x0001,
 };
 
+/* Energy Efficient Ethernet (EEE) for GMAC4
+ *
+ * LPI status, timer and control register offset
+ */
+#define GMAC4_LPI_CTRL_STATUS  0xd0
+#define GMAC4_LPI_TIMER_CTRL   0xd4
+
+/* LPI control and status defines */
+#define GMAC4_LPI_CTRL_STATUS_LPITXA   BIT(19) /* Enable LPI TX Automate */
+#define GMAC4_LPI_CTRL_STATUS_PLS  BIT(17) /* PHY Link Status */
+#define GMAC4_LPI_CTRL_STATUS_LPIEN BIT(16) /* LPI Enable */
+
 /* MAC Debug bitmap */
 #define GMAC_DEBUG_TFCSTS_MASK GENMASK(18, 17)
#define GMAC_DEBUG_TFCSTS_SHIFT 17
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index ecfbf57..02eab79 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -137,6 +137,61 @@ static void dwmac4_get_umac_addr(struct mac_device_info 
*hw,
   GMAC_ADDR_LOW(reg_n));
 }
 
+static void dwmac4_set_eee_mode(struct mac_device_info *hw)
+{
+   void __iomem *ioaddr = hw->pcsr;
+   u32 value;
+
+   /* Enable the link status receive on RGMII, SGMII or SMII
+* receive path and instruct the transmit to enter in LPI
+* state.
+*/
+   value = readl(ioaddr + GMAC4_LPI_CTRL_STATUS);
+   value |= GMAC4_LPI_CTRL_STATUS_LPIEN | GMAC4_LPI_CTRL_STATUS_LPITXA;
+
+   writel(value, ioaddr + GMAC4_LPI_CTRL_STATUS);
+}
+
+static void dwmac4_reset_eee_mode(struct mac_device_info *hw)
+{
+   void __iomem *ioaddr = hw->pcsr;
+   u32 value;
+
+   value = readl(ioaddr + GMAC4_LPI_CTRL_STATUS);
+   value &= ~(GMAC4_LPI_CTRL_STATUS_LPIEN | GMAC4_LPI_CTRL_STATUS_LPITXA);
+   writel(value, ioaddr + GMAC4_LPI_CTRL_STATUS);
+}
+
+static void dwmac4_set_eee_pls(struct mac_device_info *hw, int link)
+{
+   void __iomem *ioaddr = hw->pcsr;
+   u32 value;
+
+   value = readl(ioaddr + GMAC4_LPI_CTRL_STATUS);
+
+   if (link)
+   value |= GMAC4_LPI_CTRL_STATUS_PLS;
+   else
+   value &= ~GMAC4_LPI_CTRL_STATUS_PLS;
+
+   writel(value, ioaddr + GMAC4_LPI_CTRL_STATUS);
+}
+
+static void dwmac4_set_eee_timer(struct mac_device_info *hw, int ls, int tw)
+{
+   void __iomem *ioaddr = hw->pcsr;
+   int value = ((tw & 0xffff)) | ((ls & 0x7ff) << 16);
+
+   /* Program the timers in the LPI timer control register:
+* LS: minimum time (ms) for which the link
+*  status from PHY should be ok before transmitting
+*  the LPI pattern.
+* TW: minimum time (us) for which the core waits
+*  after it has stopped transmitting the LPI pattern.
+*/
+   writel(value, ioaddr + GMAC4_LPI_TIMER_CTRL);
+}
+
 static void dwmac4_set_filter(struct mac_device_info *hw,
  struct net_device *dev)
 {
@@ -410,6 +465,10 @@ static const struct stmmac_ops dwmac4_ops = {
.pmt = dwmac4_pmt,
.set_umac_addr = dwmac4_set_umac_addr,
.get_umac_addr = dwmac4_get_umac_addr,
+   .set_eee_mode = dwmac4_set_eee_mode,
+   .reset_eee_mode = dwmac4_reset_eee_mode,
+   .set_eee_timer = dwmac4_set_eee_timer,
+   .set_eee_pls = dwmac4_set_eee_pls,
.pcs_ctrl_ane = dwmac4_ctrl_ane,
.pcs_rane = dwmac4_rane,
.pcs_get_adv_lp = dwmac4_get_adv_lp,
-- 
2.9.3



[PATCH net-next 10/14] bnxt_en: Add IPV6 hardware RFS support.

2016-12-29 Thread Michael Chan
Accept ipv6 flows in .ndo_rx_flow_steer() and support ETHTOOL_GRXCLSRULE
ipv6 flows.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 32 +++---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 53 +--
 2 files changed, 66 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0ca530e..4d478e7 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3316,10 +3316,26 @@ static int bnxt_hwrm_cfa_ntuple_filter_alloc(struct 
bnxt *bp,
req.ip_addr_type = CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4;
req.ip_protocol = keys->basic.ip_proto;
 
-   req.src_ipaddr[0] = keys->addrs.v4addrs.src;
-   req.src_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
-   req.dst_ipaddr[0] = keys->addrs.v4addrs.dst;
-   req.dst_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+   if (keys->basic.n_proto == htons(ETH_P_IPV6)) {
+   int i;
+
+   req.ethertype = htons(ETH_P_IPV6);
+   req.ip_addr_type =
+   CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6;
+   *(struct in6_addr *)&req.src_ipaddr[0] =
+   keys->addrs.v6addrs.src;
+   *(struct in6_addr *)&req.dst_ipaddr[0] =
+   keys->addrs.v6addrs.dst;
+   for (i = 0; i < 4; i++) {
+   req.src_ipaddr_mask[i] = cpu_to_be32(0xffffffff);
+   req.dst_ipaddr_mask[i] = cpu_to_be32(0xffffffff);
+   }
+   } else {
+   req.src_ipaddr[0] = keys->addrs.v4addrs.src;
+   req.src_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+   req.dst_ipaddr[0] = keys->addrs.v4addrs.dst;
+   req.dst_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+   }
 
req.src_port = keys->ports.src;
req.src_port_mask = cpu_to_be16(0xffff);
@@ -6588,12 +6604,18 @@ static int bnxt_rx_flow_steer(struct net_device *dev, 
const struct sk_buff *skb,
goto err_free;
}
 
-   if ((fkeys->basic.n_proto != htons(ETH_P_IP)) ||
+   if ((fkeys->basic.n_proto != htons(ETH_P_IP) &&
+fkeys->basic.n_proto != htons(ETH_P_IPV6)) ||
((fkeys->basic.ip_proto != IPPROTO_TCP) &&
 (fkeys->basic.ip_proto != IPPROTO_UDP))) {
rc = -EPROTONOSUPPORT;
goto err_free;
}
+   if (fkeys->basic.n_proto == htons(ETH_P_IPV6) &&
+   bp->hwrm_spec_code < 0x10601) {
+   rc = -EPROTONOSUPPORT;
+   goto err_free;
+   }
 
memcpy(new_fltr->dst_mac_addr, eth->h_dest, ETH_ALEN);
memcpy(new_fltr->src_mac_addr, eth->h_source, ETH_ALEN);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 1cfa7a6..e6b1196 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -524,24 +524,49 @@ static int bnxt_grxclsrule(struct bnxt *bp, struct 
ethtool_rxnfc *cmd)
 
 fltr_found:
fkeys = &fltr->fkeys;
-   if (fkeys->basic.ip_proto == IPPROTO_TCP)
-   fs->flow_type = TCP_V4_FLOW;
-   else if (fkeys->basic.ip_proto == IPPROTO_UDP)
-   fs->flow_type = UDP_V4_FLOW;
-   else
-   goto fltr_err;
+   if (fkeys->basic.n_proto == htons(ETH_P_IP)) {
+   if (fkeys->basic.ip_proto == IPPROTO_TCP)
+   fs->flow_type = TCP_V4_FLOW;
+   else if (fkeys->basic.ip_proto == IPPROTO_UDP)
+   fs->flow_type = UDP_V4_FLOW;
+   else
+   goto fltr_err;
+
+   fs->h_u.tcp_ip4_spec.ip4src = fkeys->addrs.v4addrs.src;
+   fs->m_u.tcp_ip4_spec.ip4src = cpu_to_be32(~0);
+
+   fs->h_u.tcp_ip4_spec.ip4dst = fkeys->addrs.v4addrs.dst;
+   fs->m_u.tcp_ip4_spec.ip4dst = cpu_to_be32(~0);
 
-   fs->h_u.tcp_ip4_spec.ip4src = fkeys->addrs.v4addrs.src;
-   fs->m_u.tcp_ip4_spec.ip4src = cpu_to_be32(~0);
+   fs->h_u.tcp_ip4_spec.psrc = fkeys->ports.src;
+   fs->m_u.tcp_ip4_spec.psrc = cpu_to_be16(~0);
 
-   fs->h_u.tcp_ip4_spec.ip4dst = fkeys->addrs.v4addrs.dst;
-   fs->m_u.tcp_ip4_spec.ip4dst = cpu_to_be32(~0);
+   fs->h_u.tcp_ip4_spec.pdst = fkeys->ports.dst;
+   fs->m_u.tcp_ip4_spec.pdst = cpu_to_be16(~0);
+   } else {
+   int i;
 
-   fs->h_u.tcp_ip4_spec.psrc = fkeys->ports.src;
-   fs->m_u.tcp_ip4_spec.psrc = cpu_to_be16(~0);
+   if (fkeys->basic.ip_proto == IPPROTO_TCP)
+   fs->flow_type = TCP_V6_FLOW;
+   else if (fkeys->basic.ip_proto == IPPROTO_UDP)
+   fs->flow_type = UDP_V6_FLOW;
+   else
+   got

[PATCH net-next 00/14] bnxt_en: updates for net-next.

2016-12-29 Thread Michael Chan
This patch series for net-next contains cleanups, new features and minor
fixes.  The driver specific busy polling code is removed to use busy
polling support in core networking.  Hardware RFS support is enhanced with
added ipv6 flows support and VF support.  A new scheme to allocate TX
rings from the firmware is implemented for newer chips and firmware.  Plus
some misc. cleanups, minor fixes, and to add the maintainer entry.  Please
review.

Michael Chan (14):
  bnxt_en: Remove busy poll logic in the driver.
  bnxt_en: Use napi_complete_done()
  bnxt_en: Improve the IRQ disable sequence during shutdown.
  bnxt_en: Fix and clarify link_info->advertising.
  bnxt_en: Refactor TPA code path.
  bnxt_en: Add function to get vnic capability.
  bnxt_en: Refactor code that determines RFS capability.
  bnxt_en: Add new hardware RFS mode.
  bnxt_en: Assign additional vnics to VFs.
  bnxt_en: Add IPV6 hardware RFS support.
  bnxt_en: Implement new scheme to reserve tx rings.
  bnxt_en: Set default completion ring for async events.
  bnxt_en: Handle no aggregation ring gracefully.
  MAINTAINERS: Add bnxt_en maintainer info.

 MAINTAINERS   |   6 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 385 +++---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 108 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  73 +++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h |  34 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c   |  15 +-
 6 files changed, 388 insertions(+), 233 deletions(-)

-- 
1.8.3.1



[PATCH net-next 05/14] bnxt_en: Refactor TPA code path.

2016-12-29 Thread Michael Chan
Call tcp_gro_complete() in the common code path instead of the chip-
specific method.  The newer 5731x method is missing the call.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 4a8059f..0654c3f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1127,7 +1127,6 @@ static struct sk_buff *bnxt_gro_func_5730x(struct 
bnxt_tpa_info *tpa_info,
dev_kfree_skb_any(skb);
return NULL;
}
-   tcp_gro_complete(skb);
 
if (nw_off) { /* tunnel */
struct udphdr *uh = NULL;
@@ -1177,6 +1176,8 @@ static inline struct sk_buff *bnxt_gro_skb(struct bnxt 
*bp,
   RX_TPA_END_CMP_PAYLOAD_OFFSET) >>
  RX_TPA_END_CMP_PAYLOAD_OFFSET_SHIFT;
skb = bp->gro_func(tpa_info, payload_off, TPA_END_GRO_TS(tpa_end), skb);
+   if (likely(skb))
+   tcp_gro_complete(skb);
 #endif
return skb;
 }
-- 
1.8.3.1



[PATCH net-next 02/14] bnxt_en: Use napi_complete_done()

2016-12-29 Thread Michael Chan
For better busy polling and GRO support.  Do not re-arm IRQ if
napi_complete_done() returns false.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index b53f958..3fbc842 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1778,8 +1778,9 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
break;
 
if (!bnxt_has_work(bp, cpr)) {
-   napi_complete(napi);
-   BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons);
+   if (napi_complete_done(napi, work_done))
+   BNXT_CP_DB_REARM(cpr->cp_doorbell,
+cpr->cp_raw_cons);
break;
}
}
-- 
1.8.3.1



[PATCH net-next 14/14] MAINTAINERS: Add bnxt_en maintainer info.

2016-12-29 Thread Michael Chan
Signed-off-by: Michael Chan 
---
 MAINTAINERS | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index cfff2c9..11904a9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2605,6 +2605,12 @@ L:   netdev@vger.kernel.org
 S: Supported
 F: drivers/net/ethernet/broadcom/bnx2x/
 
+BROADCOM BNXT_EN 50 GIGABIT ETHERNET DRIVER
+M: Michael Chan 
+L: netdev@vger.kernel.org
+S: Supported
+F: drivers/net/ethernet/broadcom/bnxt/
+
 BROADCOM BCM281XX/BCM11XXX/BCM216XX ARM ARCHITECTURE
 M: Florian Fainelli 
 M: Ray Jui 
-- 
1.8.3.1



[PATCH net-next 03/14] bnxt_en: Improve the IRQ disable sequence during shutdown.

2016-12-29 Thread Michael Chan
The IRQ is disabled by writing to the completion ring doorbell.  This
should be done before the hardware completion ring is freed for correctness.
The current code disables IRQs after all the completion rings are freed.

Fix it by calling bnxt_disable_int_sync() before freeing the completion
rings.  Rearrange the code to avoid forward declaration.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 89 ---
 1 file changed, 46 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 3fbc842..277573b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2953,6 +2953,45 @@ static int bnxt_alloc_mem(struct bnxt *bp, bool 
irq_re_init)
return rc;
 }
 
+static void bnxt_disable_int(struct bnxt *bp)
+{
+   int i;
+
+   if (!bp->bnapi)
+   return;
+
+   for (i = 0; i < bp->cp_nr_rings; i++) {
+   struct bnxt_napi *bnapi = bp->bnapi[i];
+   struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
+
+   BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
+   }
+}
+
+static void bnxt_disable_int_sync(struct bnxt *bp)
+{
+   int i;
+
+   atomic_inc(&bp->intr_sem);
+
+   bnxt_disable_int(bp);
+   for (i = 0; i < bp->cp_nr_rings; i++)
+   synchronize_irq(bp->irq_tbl[i].vector);
+}
+
+static void bnxt_enable_int(struct bnxt *bp)
+{
+   int i;
+
+   atomic_set(&bp->intr_sem, 0);
+   for (i = 0; i < bp->cp_nr_rings; i++) {
+   struct bnxt_napi *bnapi = bp->bnapi[i];
+   struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
+
+   BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons);
+   }
+}
+
 void bnxt_hwrm_cmd_hdr_init(struct bnxt *bp, void *request, u16 req_type,
u16 cmpl_ring, u16 target_id)
 {
@@ -3937,6 +3976,12 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool 
close_path)
}
}
 
+   /* The completion rings are about to be freed.  After that the
+* IRQ doorbell will not work anymore.  So we need to disable
+* IRQ here.
+*/
+   bnxt_disable_int_sync(bp);
+
for (i = 0; i < bp->cp_nr_rings; i++) {
struct bnxt_napi *bnapi = bp->bnapi[i];
struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
@@ -4658,34 +4703,6 @@ static int bnxt_init_nic(struct bnxt *bp, bool 
irq_re_init)
return bnxt_init_chip(bp, irq_re_init);
 }
 
-static void bnxt_disable_int(struct bnxt *bp)
-{
-   int i;
-
-   if (!bp->bnapi)
-   return;
-
-   for (i = 0; i < bp->cp_nr_rings; i++) {
-   struct bnxt_napi *bnapi = bp->bnapi[i];
-   struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
-
-   BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
-   }
-}
-
-static void bnxt_enable_int(struct bnxt *bp)
-{
-   int i;
-
-   atomic_set(&bp->intr_sem, 0);
-   for (i = 0; i < bp->cp_nr_rings; i++) {
-   struct bnxt_napi *bnapi = bp->bnapi[i];
-   struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
-
-   BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons);
-   }
-}
-
 static int bnxt_set_real_num_queues(struct bnxt *bp)
 {
int rc;
@@ -5640,19 +5657,6 @@ static int bnxt_open(struct net_device *dev)
return __bnxt_open_nic(bp, true, true);
 }
 
-static void bnxt_disable_int_sync(struct bnxt *bp)
-{
-   int i;
-
-   atomic_inc(&bp->intr_sem);
-   if (!netif_running(bp->dev))
-   return;
-
-   bnxt_disable_int(bp);
-   for (i = 0; i < bp->cp_nr_rings; i++)
-   synchronize_irq(bp->irq_tbl[i].vector);
-}
-
 int bnxt_close_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 {
int rc = 0;
@@ -5674,13 +5678,12 @@ int bnxt_close_nic(struct bnxt *bp, bool irq_re_init, 
bool link_re_init)
while (test_bit(BNXT_STATE_IN_SP_TASK, &bp->state))
msleep(20);
 
-   /* Flush rings before disabling interrupts */
+   /* Flush rings and disable interrupts */
bnxt_shutdown_nic(bp, irq_re_init);
 
/* TODO CHIMP_FW: Link/PHY related cleanup if (link_re_init) */
 
bnxt_disable_napi(bp);
-   bnxt_disable_int_sync(bp);
del_timer_sync(&bp->timer);
bnxt_free_skbs(bp);
 
-- 
1.8.3.1



[PATCH net-next 08/14] bnxt_en: Add new hardware RFS mode.

2016-12-29 Thread Michael Chan
The existing hardware RFS mode uses one hardware RSS context block
per ring just to calculate the RSS hash.  This is very wasteful and
prevents VF functions from using it.  The new hardware mode shares
the same hardware RSS context for RSS placement and RFS steering.
This allows VFs to enable RFS.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 27 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index f7ea99f..0ca530e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2630,6 +2630,10 @@ static int bnxt_alloc_vnic_attributes(struct bnxt *bp)
goto out;
}
 
+   if ((bp->flags & BNXT_FLAG_NEW_RSS_CAP) &&
+   !(vnic->flags & BNXT_VNIC_RSS_FLAG))
+   continue;
+
/* Allocate rss table and hash key */
vnic->rss_table = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
 &vnic->rss_table_dma_addr,
@@ -3562,6 +3566,12 @@ int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
req.rss_rule = cpu_to_le16(vnic->fw_rss_cos_lb_ctx[0]);
req.enables |= cpu_to_le32(VNIC_CFG_REQ_ENABLES_RSS_RULE |
   VNIC_CFG_REQ_ENABLES_MRU);
+   } else if (vnic->flags & BNXT_VNIC_RFS_NEW_RSS_FLAG) {
+   req.rss_rule =
+   cpu_to_le16(bp->vnic_info[0].fw_rss_cos_lb_ctx[0]);
+   req.enables |= cpu_to_le32(VNIC_CFG_REQ_ENABLES_RSS_RULE |
+  VNIC_CFG_REQ_ENABLES_MRU);
+   req.flags |= cpu_to_le32(VNIC_CFG_REQ_FLAGS_RSS_DFLT_CR_MODE);
} else {
req.rss_rule = cpu_to_le16(0x);
}
@@ -4490,8 +4500,12 @@ static void bnxt_hwrm_resource_free(struct bnxt *bp, 
bool close_path,
 
 static int bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id)
 {
+   struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id];
int rc;
 
+   if (vnic->flags & BNXT_VNIC_RFS_NEW_RSS_FLAG)
+   goto skip_rss_ctx;
+
/* allocate context for vnic */
rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic_id, 0);
if (rc) {
@@ -4511,6 +4525,7 @@ static int bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id)
bp->rsscos_nr_ctxs++;
}
 
+skip_rss_ctx:
/* configure default vnic, ring grp */
rc = bnxt_hwrm_vnic_cfg(bp, vnic_id);
if (rc) {
@@ -4545,13 +4560,17 @@ static int bnxt_alloc_rfs_vnics(struct bnxt *bp)
int i, rc = 0;
 
for (i = 0; i < bp->rx_nr_rings; i++) {
+   struct bnxt_vnic_info *vnic;
u16 vnic_id = i + 1;
u16 ring_id = i;
 
if (vnic_id >= bp->nr_vnics)
break;
 
-   bp->vnic_info[vnic_id].flags |= BNXT_VNIC_RFS_FLAG;
+   vnic = &bp->vnic_info[vnic_id];
+   vnic->flags |= BNXT_VNIC_RFS_FLAG;
+   if (bp->flags & BNXT_FLAG_NEW_RSS_CAP)
+   vnic->flags |= BNXT_VNIC_RFS_NEW_RSS_FLAG;
rc = bnxt_hwrm_vnic_alloc(bp, vnic_id, ring_id, 1);
if (rc) {
netdev_err(bp->dev, "hwrm vnic %d alloc failure rc: 
%x\n",
@@ -5985,6 +6004,8 @@ static bool bnxt_rfs_supported(struct bnxt *bp)
 {
if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp))
return true;
+   if (bp->flags & BNXT_FLAG_NEW_RSS_CAP)
+   return true;
return false;
 }
 
@@ -6000,6 +6021,10 @@ static bool bnxt_rfs_capable(struct bnxt *bp)
vnics = 1 + bp->rx_nr_rings;
max_vnics = bnxt_get_max_func_vnics(bp);
max_rss_ctxs = bnxt_get_max_func_rss_ctxs(bp);
+
+   /* RSS contexts not a limiting factor */
+   if (bp->flags & BNXT_FLAG_NEW_RSS_CAP)
+   max_rss_ctxs = max_vnics;
if (vnics > max_vnics || vnics > max_rss_ctxs) {
netdev_warn(bp->dev,
"Not enough resources to support NTUPLE filters, 
enough resources for up to %d rx rings\n",
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 80bf1ab..75803e5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -708,6 +708,7 @@ struct bnxt_vnic_info {
 #define BNXT_VNIC_RFS_FLAG 2
 #define BNXT_VNIC_MCAST_FLAG   4
 #define BNXT_VNIC_UCAST_FLAG   8
+#define BNXT_VNIC_RFS_NEW_RSS_FLAG 0x10
 };
 
 #if defined(CONFIG_BNXT_SRIOV)
-- 
1.8.3.1



[PATCH net-next 01/14] bnxt_en: Remove busy poll logic in the driver.

2016-12-29 Thread Michael Chan
Use native NAPI polling instead.  The next patch will complete the work
by switching to use napi_complete_done().

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 53 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 99 ---
 2 files changed, 3 insertions(+), 149 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 9608cb4..b53f958 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -39,9 +39,6 @@
 #include 
 #include 
 #include 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-#include 
-#endif
 #include 
 #include 
 #include 
@@ -1356,11 +1353,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi 
*bnapi, u32 *raw_cons,
rc = -ENOMEM;
if (likely(skb)) {
skb_record_rx_queue(skb, bnapi->index);
-   skb_mark_napi_id(skb, &bnapi->napi);
-   if (bnxt_busy_polling(bnapi))
-   netif_receive_skb(skb);
-   else
-   napi_gro_receive(&bnapi->napi, skb);
+   napi_gro_receive(&bnapi->napi, skb);
rc = 1;
}
goto next_rx_no_prod;
@@ -1460,11 +1453,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi 
*bnapi, u32 *raw_cons,
}
 
skb_record_rx_queue(skb, bnapi->index);
-   skb_mark_napi_id(skb, &bnapi->napi);
-   if (bnxt_busy_polling(bnapi))
-   netif_receive_skb(skb);
-   else
-   napi_gro_receive(&bnapi->napi, skb);
+   napi_gro_receive(&bnapi->napi, skb);
rc = 1;
 
 next_rx:
@@ -1782,9 +1771,6 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
int work_done = 0;
 
-   if (!bnxt_lock_napi(bnapi))
-   return budget;
-
while (1) {
work_done += bnxt_poll_work(bp, bnapi, budget - work_done);
 
@@ -1798,36 +1784,9 @@ static int bnxt_poll(struct napi_struct *napi, int 
budget)
}
}
mmiowb();
-   bnxt_unlock_napi(bnapi);
return work_done;
 }
 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-static int bnxt_busy_poll(struct napi_struct *napi)
-{
-   struct bnxt_napi *bnapi = container_of(napi, struct bnxt_napi, napi);
-   struct bnxt *bp = bnapi->bp;
-   struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
-   int rx_work, budget = 4;
-
-   if (atomic_read(&bp->intr_sem) != 0)
-   return LL_FLUSH_FAILED;
-
-   if (!bp->link_info.link_up)
-   return LL_FLUSH_FAILED;
-
-   if (!bnxt_lock_poll(bnapi))
-   return LL_FLUSH_BUSY;
-
-   rx_work = bnxt_poll_work(bp, bnapi, budget);
-
-   BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons);
-
-   bnxt_unlock_poll(bnapi);
-   return rx_work;
-}
-#endif
-
 static void bnxt_free_tx_skbs(struct bnxt *bp)
 {
int i, max_idx;
@@ -5094,10 +5053,8 @@ static void bnxt_disable_napi(struct bnxt *bp)
if (!bp->bnapi)
return;
 
-   for (i = 0; i < bp->cp_nr_rings; i++) {
+   for (i = 0; i < bp->cp_nr_rings; i++)
napi_disable(&bp->bnapi[i]->napi);
-   bnxt_disable_poll(bp->bnapi[i]);
-   }
 }
 
 static void bnxt_enable_napi(struct bnxt *bp)
@@ -5106,7 +5063,6 @@ static void bnxt_enable_napi(struct bnxt *bp)
 
for (i = 0; i < bp->cp_nr_rings; i++) {
bp->bnapi[i]->in_reset = false;
-   bnxt_enable_poll(bp->bnapi[i]);
napi_enable(&bp->bnapi[i]->napi);
}
 }
@@ -6765,9 +6721,6 @@ static void bnxt_udp_tunnel_del(struct net_device *dev,
 #endif
.ndo_udp_tunnel_add = bnxt_udp_tunnel_add,
.ndo_udp_tunnel_del = bnxt_udp_tunnel_del,
-#ifdef CONFIG_NET_RX_BUSY_POLL
-   .ndo_busy_poll  = bnxt_busy_poll,
-#endif
 };
 
 static void bnxt_remove_one(struct pci_dev *pdev)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 16defe9..fddc316 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -654,21 +654,9 @@ struct bnxt_napi {
struct bnxt_rx_ring_info*rx_ring;
struct bnxt_tx_ring_info*tx_ring;
 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-   atomic_tpoll_state;
-#endif
boolin_reset;
 };
 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-enum bnxt_poll_state_t {
-   BNXT_STATE_IDLE = 0,
-   BNXT_STATE_NAPI,
-   BNXT_STATE_POLL,
-   BNXT_STATE_DISABLE,
-};
-#endif
-
 struct bnxt_irq {
irq_handler_t   handler;
unsigned intvector;
@@ -1141,93 +1129,6 @@ struct bnxt {
((offsetof(struct tx_port_stats, counter) + \
  sizeof(struct rx_port_st

[PATCH net-next 13/14] bnxt_en: Handle no aggregation ring gracefully.

2016-12-29 Thread Michael Chan
The current code assumes that we will always have at least 2 rx rings, one
of which will be used as an aggregation ring for TPA and jumbo page placements.
However, it is possible, especially on a VF, that there is only 1 rx
ring available.  In this scenario, the current code will fail to initialize.
To handle it, we need to properly set up only 1 ring without aggregation.
Set a new flag BNXT_FLAG_NO_AGG_RINGS for this condition and add logic to
set up the chip to place RX data linearly into a single buffer per packet.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 25 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1f54a7a..98e9484 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2496,7 +2496,7 @@ void bnxt_set_ring_params(struct bnxt *bp)
agg_factor = min_t(u32, 4, 65536 / BNXT_RX_PAGE_SIZE);
 
bp->flags &= ~BNXT_FLAG_JUMBO;
-   if (rx_space > PAGE_SIZE) {
+   if (rx_space > PAGE_SIZE && !(bp->flags & BNXT_FLAG_NO_AGG_RINGS)) {
u32 jumbo_factor;
 
bp->flags |= BNXT_FLAG_JUMBO;
@@ -6174,6 +6174,9 @@ static int bnxt_set_features(struct net_device *dev, 
netdev_features_t features)
if (features & NETIF_F_LRO)
flags |= BNXT_FLAG_LRO;
 
+   if (bp->flags & BNXT_FLAG_NO_AGG_RINGS)
+   flags &= ~BNXT_FLAG_TPA;
+
if (features & NETIF_F_HW_VLAN_CTAG_RX)
flags |= BNXT_FLAG_STRIP_VLAN;
 
@@ -7040,8 +7043,17 @@ static int bnxt_get_dflt_rings(struct bnxt *bp, int 
*max_rx, int *max_tx,
int rc;
 
rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
-   if (rc)
-   return rc;
+   if (rc && (bp->flags & BNXT_FLAG_AGG_RINGS)) {
+   /* Not enough rings, try disabling agg rings. */
+   bp->flags &= ~BNXT_FLAG_AGG_RINGS;
+   rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
+   if (rc)
+   return rc;
+   bp->flags |= BNXT_FLAG_NO_AGG_RINGS;
+   bp->dev->hw_features &= ~NETIF_F_LRO;
+   bp->dev->features &= ~NETIF_F_LRO;
+   bnxt_set_ring_params(bp);
+   }
 
if (bp->flags & BNXT_FLAG_ROCE_CAP) {
int max_cp, max_stat, max_irq;
@@ -7236,7 +7248,12 @@ static int bnxt_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
bnxt_set_tpa_flags(bp);
bnxt_set_ring_params(bp);
bnxt_set_max_func_irqs(bp, max_irqs);
-   bnxt_set_dflt_rings(bp);
+   rc = bnxt_set_dflt_rings(bp);
+   if (rc) {
+   netdev_err(bp->dev, "Not enough rings available.\n");
+   rc = -ENOMEM;
+   goto init_err;
+   }
 
/* Default RSS hash cfg. */
bp->rss_hash_cfg = VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4 |
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index d174729d..f6b9b1c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -950,6 +950,7 @@ struct bnxt {
#define BNXT_FLAG_ROCEV2_CAP0x1
#define BNXT_FLAG_ROCE_CAP  (BNXT_FLAG_ROCEV1_CAP | \
 BNXT_FLAG_ROCEV2_CAP)
+   #define BNXT_FLAG_NO_AGG_RINGS  0x2
#define BNXT_FLAG_CHIP_NITRO_A0 0x100
 
#define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA | \
-- 
1.8.3.1



[PATCH net-next 12/14] bnxt_en: Set default completion ring for async events.

2016-12-29 Thread Michael Chan
With the added support for the bnxt_re RDMA driver, both drivers may
allocate completion rings in any order.  The firmware does not know
which completion ring should be receiving async events.  Add an
extra step to tell firmware the completion ring number for receiving
async events after bnxt_en allocates the completion rings.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 338dbd0..1f54a7a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3858,6 +3858,30 @@ static int hwrm_ring_alloc_send_msg(struct bnxt *bp,
return rc;
 }
 
+static int bnxt_hwrm_set_async_event_cr(struct bnxt *bp, int idx)
+{
+   int rc;
+
+   if (BNXT_PF(bp)) {
+   struct hwrm_func_cfg_input req = {0};
+
+   bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
+   req.fid = cpu_to_le16(0x);
+   req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_ASYNC_EVENT_CR);
+   req.async_event_cr = cpu_to_le16(idx);
+   rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+   } else {
+   struct hwrm_func_vf_cfg_input req = {0};
+
+   bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_CFG, -1, -1);
+   req.enables =
+   cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_ASYNC_EVENT_CR);
+   req.async_event_cr = cpu_to_le16(idx);
+   rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+   }
+   return rc;
+}
+
 static int bnxt_hwrm_ring_alloc(struct bnxt *bp)
 {
int i, rc = 0;
@@ -3874,6 +3898,12 @@ static int bnxt_hwrm_ring_alloc(struct bnxt *bp)
goto err_out;
BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
bp->grp_info[i].cp_fw_ring_id = ring->fw_ring_id;
+
+   if (!i) {
+   rc = bnxt_hwrm_set_async_event_cr(bp, ring->fw_ring_id);
+   if (rc)
+   netdev_warn(bp->dev, "Failed to set async event 
completion ring.\n");
+   }
}
 
for (i = 0; i < bp->tx_nr_rings; i++) {
-- 
1.8.3.1



[PATCH net-next 06/14] bnxt_en: Add function to get vnic capability.

2016-12-29 Thread Michael Chan
The new vnic RSS capability will be used to enhance NTUPLE support in
subsequent patches.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 22 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 34 +++
 3 files changed, 57 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0654c3f..9168326 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3665,6 +3665,27 @@ static int bnxt_hwrm_vnic_alloc(struct bnxt *bp, u16 
vnic_id,
return rc;
 }
 
+static int bnxt_hwrm_vnic_qcaps(struct bnxt *bp)
+{
+   struct hwrm_vnic_qcaps_output *resp = bp->hwrm_cmd_resp_addr;
+   struct hwrm_vnic_qcaps_input req = {0};
+   int rc;
+
+   if (bp->hwrm_spec_code < 0x10600)
+   return 0;
+
+   bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_QCAPS, -1, -1);
+   mutex_lock(&bp->hwrm_cmd_lock);
+   rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+   if (!rc) {
+   if (resp->flags &
+   cpu_to_le32(VNIC_QCAPS_RESP_FLAGS_RSS_DFLT_CR_CAP))
+   bp->flags |= BNXT_FLAG_NEW_RSS_CAP;
+   }
+   mutex_unlock(&bp->hwrm_cmd_lock);
+   return rc;
+}
+
 static int bnxt_hwrm_ring_grp_alloc(struct bnxt *bp)
 {
u16 i;
@@ -7070,6 +7091,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6;
}
 
+   bnxt_hwrm_vnic_qcaps(bp);
if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp)) {
dev->hw_features |= NETIF_F_NTUPLE;
if (bnxt_rfs_capable(bp)) {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 0eb6401..80bf1ab 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -944,6 +944,7 @@ struct bnxt {
#define BNXT_FLAG_PORT_STATS0x400
#define BNXT_FLAG_UDP_RSS_CAP   0x800
#define BNXT_FLAG_EEE_CAP   0x1000
+   #define BNXT_FLAG_NEW_RSS_CAP   0x2000
#define BNXT_FLAG_ROCEV1_CAP0x8000
#define BNXT_FLAG_ROCEV2_CAP0x1
#define BNXT_FLAG_ROCE_CAP  (BNXT_FLAG_ROCEV1_CAP | \
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
index 2ddfa51..d0d49ed 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
@@ -2797,6 +2797,40 @@ struct hwrm_vnic_cfg_output {
u8 valid;
 };
 
+/* hwrm_vnic_qcaps */
+/* Input (24 bytes) */
+struct hwrm_vnic_qcaps_input {
+   __le16 req_type;
+   __le16 cmpl_ring;
+   __le16 seq_id;
+   __le16 target_id;
+   __le64 resp_addr;
+   __le32 enables;
+   __le32 unused_0;
+};
+
+/* Output (24 bytes) */
+struct hwrm_vnic_qcaps_output {
+   __le16 error_code;
+   __le16 req_type;
+   __le16 seq_id;
+   __le16 resp_len;
+   __le16 mru;
+   u8 unused_0;
+   u8 unused_1;
+   __le32 flags;
+   #define VNIC_QCAPS_RESP_FLAGS_VLAN_STRIP_CAP0x2UL
+   #define VNIC_QCAPS_RESP_FLAGS_BD_STALL_CAP  0x4UL
+   #define VNIC_QCAPS_RESP_FLAGS_ROCE_DUAL_VNIC_CAP0x8UL
+   #define VNIC_QCAPS_RESP_FLAGS_ROCE_ONLY_VNIC_CAP0x10UL
+   #define VNIC_QCAPS_RESP_FLAGS_RSS_DFLT_CR_CAP   0x20UL
+   __le32 unused_2;
+   u8 unused_3;
+   u8 unused_4;
+   u8 unused_5;
+   u8 valid;
+};
+
 /* hwrm_vnic_tpa_cfg */
 /* Input (40 bytes) */
 struct hwrm_vnic_tpa_cfg_input {
-- 
1.8.3.1



[PATCH net-next 04/14] bnxt_en: Fix and clarify link_info->advertising.

2016-12-29 Thread Michael Chan
The advertising field is closely related to the auto_link_speeds field.
The former is the user setting while the latter is the firmware setting.
Both should be u16.  We should use the advertising field in
bnxt_get_link_ksettings because the auto_link_speeds field may not
be updated with the latest from the firmware yet.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 5 +++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 277573b..4a8059f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5363,7 +5363,7 @@ static void bnxt_hwrm_set_link_common(struct bnxt *bp,
 {
u8 autoneg = bp->link_info.autoneg;
u16 fw_link_speed = bp->link_info.req_link_speed;
-   u32 advertising = bp->link_info.advertising;
+   u16 advertising = bp->link_info.advertising;
 
if (autoneg & BNXT_AUTONEG_SPEED) {
req->auto_mode |=
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index fddc316..0eb6401 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -828,7 +828,7 @@ struct bnxt_link_info {
 #define BNXT_LINK_SPEED_40GB   PORT_PHY_QCFG_RESP_LINK_SPEED_40GB
 #define BNXT_LINK_SPEED_50GB   PORT_PHY_QCFG_RESP_LINK_SPEED_50GB
u16 support_speeds;
-   u16 auto_link_speeds;
+   u16 auto_link_speeds;   /* fw adv setting */
 #define BNXT_LINK_SPEED_MSK_100MB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100MB
 #define BNXT_LINK_SPEED_MSK_1GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_1GB
 #define BNXT_LINK_SPEED_MSK_2GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2GB
@@ -851,7 +851,7 @@ struct bnxt_link_info {
u8  req_duplex;
u8  req_flow_ctrl;
u16 req_link_speed;
-   u32 advertising;
+   u16 advertising;/* user adv setting */
boolforce_link_chng;
 
/* a copy of phy_qcfg output used to report link
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 784aa77..1cfa7a6 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -893,7 +893,7 @@ u32 _bnxt_fw_to_ethtool_adv_spds(u16 fw_speeds, u8 fw_pause)
 static void bnxt_fw_to_ethtool_advertised_spds(struct bnxt_link_info 
*link_info,
struct ethtool_link_ksettings *lk_ksettings)
 {
-   u16 fw_speeds = link_info->auto_link_speeds;
+   u16 fw_speeds = link_info->advertising;
u8 fw_pause = 0;
 
if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)
@@ -1090,8 +1090,9 @@ static int bnxt_set_link_ksettings(struct net_device *dev,
struct bnxt *bp = netdev_priv(dev);
struct bnxt_link_info *link_info = &bp->link_info;
const struct ethtool_link_settings *base = &lk_ksettings->base;
-   u32 speed, fw_advertising = 0;
bool set_pause = false;
+   u16 fw_advertising = 0;
+   u32 speed;
int rc = 0;
 
if (!BNXT_SINGLE_PF(bp))
-- 
1.8.3.1



[PATCH net-next 11/14] bnxt_en: Implement new scheme to reserve tx rings.

2016-12-29 Thread Michael Chan
In order to properly support TX rate limiting in SRIOV VF functions or
NPAR functions, firmware needs better control over tx ring allocations.
The new scheme requires the driver to reserve the number of tx rings
and then query whether the requested number of tx rings was actually reserved.
The driver will use the new scheme when the firmware interface spec is
1.6.1 or newer.
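
A simplified view of the caller-side check this adds (extracted from the
bnxt_setup_mq_tc() change below, with error handling reduced to the
essentials):

    int req_tx_rings = bp->tx_nr_rings_per_tc * tc;
    int rsv_tx_rings = req_tx_rings;

    /* bnxt_hwrm_reserve_tx_rings() sends HWRM_FUNC_CFG with the requested
     * count and reads back the granted count via HWRM_FUNC_QCFG.
     */
    if (bnxt_hwrm_reserve_tx_rings(bp, &rsv_tx_rings) ||
        rsv_tx_rings < req_tx_rings)
        return -ENOMEM;     /* firmware could not reserve enough rings */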

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 59 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  2 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 15 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c   | 10 +++-
 4 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 4d478e7..338dbd0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4045,6 +4045,50 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool 
close_path)
}
 }
 
+/* Caller must hold bp->hwrm_cmd_lock */
+int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings)
+{
+   struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr;
+   struct hwrm_func_qcfg_input req = {0};
+   int rc;
+
+   if (bp->hwrm_spec_code < 0x10601)
+   return 0;
+
+   bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1);
+   req.fid = cpu_to_le16(fid);
+   rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+   if (!rc)
+   *tx_rings = le16_to_cpu(resp->alloc_tx_rings);
+
+   return rc;
+}
+
+int bnxt_hwrm_reserve_tx_rings(struct bnxt *bp, int *tx_rings)
+{
+   struct hwrm_func_cfg_input req = {0};
+   int rc;
+
+   if (bp->hwrm_spec_code < 0x10601)
+   return 0;
+
+   if (BNXT_VF(bp))
+   return 0;
+
+   bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
+   req.fid = cpu_to_le16(0x);
+   req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS);
+   req.num_tx_rings = cpu_to_le16(*tx_rings);
+   rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+   if (rc)
+   return rc;
+
+   mutex_lock(&bp->hwrm_cmd_lock);
+   rc = __bnxt_hwrm_get_tx_rings(bp, 0x, tx_rings);
+   mutex_unlock(&bp->hwrm_cmd_lock);
+   return rc;
+}
+
 static void bnxt_hwrm_set_coal_params(struct bnxt *bp, u32 max_bufs,
u32 buf_tmrs, u16 flags,
struct hwrm_ring_cmpl_ring_cfg_aggint_params_input *req)
@@ -6509,10 +6553,16 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc)
sh = true;
 
if (tc) {
-   int max_rx_rings, max_tx_rings, rc;
+   int max_rx_rings, max_tx_rings, req_tx_rings, rsv_tx_rings, rc;
 
+   req_tx_rings = bp->tx_nr_rings_per_tc * tc;
rc = bnxt_get_max_rings(bp, &max_rx_rings, &max_tx_rings, sh);
-   if (rc || bp->tx_nr_rings_per_tc * tc > max_tx_rings)
+   if (rc || req_tx_rings > max_tx_rings)
+   return -ENOMEM;
+
+   rsv_tx_rings = req_tx_rings;
+   if (bnxt_hwrm_reserve_tx_rings(bp, &rsv_tx_rings) ||
+   rsv_tx_rings < req_tx_rings)
return -ENOMEM;
}
 
@@ -7000,6 +7050,11 @@ static int bnxt_set_dflt_rings(struct bnxt *bp)
return rc;
bp->rx_nr_rings = min_t(int, dflt_rings, max_rx_rings);
bp->tx_nr_rings_per_tc = min_t(int, dflt_rings, max_tx_rings);
+
+   rc = bnxt_hwrm_reserve_tx_rings(bp, &bp->tx_nr_rings_per_tc);
+   if (rc)
+   netdev_warn(bp->dev, "Unable to reserve tx rings\n");
+
bp->tx_nr_rings = bp->tx_nr_rings_per_tc;
bp->cp_nr_rings = sh ? max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) :
   bp->tx_nr_rings + bp->rx_nr_rings;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 75803e5..d174729d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1149,6 +1149,8 @@ struct bnxt {
 int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
 int bmap_size);
 int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id);
+int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings);
+int bnxt_hwrm_reserve_tx_rings(struct bnxt *bp, int *tx_rings);
 int bnxt_hwrm_set_coal(struct bnxt *);
 unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
 void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index e6b1196..dd21be4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -388,6 +388,7 @@ static int bnxt_set_c

[PATCH net-next 07/14] bnxt_en: Refactor code that determines RFS capability.

2016-12-29 Thread Michael Chan
Add function bnxt_rfs_supported() that determines if the chip supports
RFS.  Refactor the existing function bnxt_rfs_capable() that determines
if run-time conditions support RFS.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 38 +++
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 9168326..f7ea99f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4835,6 +4835,24 @@ static int bnxt_setup_int_mode(struct bnxt *bp)
return rc;
 }
 
+static unsigned int bnxt_get_max_func_rss_ctxs(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+   if (BNXT_VF(bp))
+   return bp->vf.max_rsscos_ctxs;
+#endif
+   return bp->pf.max_rsscos_ctxs;
+}
+
+static unsigned int bnxt_get_max_func_vnics(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+   if (BNXT_VF(bp))
+   return bp->vf.max_vnics;
+#endif
+   return bp->pf.max_vnics;
+}
+
 unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp)
 {
 #if defined(CONFIG_BNXT_SRIOV)
@@ -5962,20 +5980,30 @@ static int bnxt_cfg_rx_mode(struct bnxt *bp)
return rc;
 }
 
+/* If the chip and firmware supports RFS */
+static bool bnxt_rfs_supported(struct bnxt *bp)
+{
+   if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp))
+   return true;
+   return false;
+}
+
+/* If runtime conditions support RFS */
 static bool bnxt_rfs_capable(struct bnxt *bp)
 {
 #ifdef CONFIG_RFS_ACCEL
-   struct bnxt_pf_info *pf = &bp->pf;
-   int vnics;
+   int vnics, max_vnics, max_rss_ctxs;
 
if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_MSIX_CAP))
return false;
 
vnics = 1 + bp->rx_nr_rings;
-   if (vnics > pf->max_rsscos_ctxs || vnics > pf->max_vnics) {
+   max_vnics = bnxt_get_max_func_vnics(bp);
+   max_rss_ctxs = bnxt_get_max_func_rss_ctxs(bp);
+   if (vnics > max_vnics || vnics > max_rss_ctxs) {
netdev_warn(bp->dev,
"Not enough resources to support NTUPLE filters, 
enough resources for up to %d rx rings\n",
-   min(pf->max_rsscos_ctxs - 1, pf->max_vnics - 1));
+   min(max_rss_ctxs - 1, max_vnics - 1));
return false;
}
 
@@ -7092,7 +7120,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
}
 
bnxt_hwrm_vnic_qcaps(bp);
-   if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp)) {
+   if (bnxt_rfs_supported(bp)) {
dev->hw_features |= NETIF_F_NTUPLE;
if (bnxt_rfs_capable(bp)) {
bp->flags |= BNXT_FLAG_RFS;
-- 
1.8.3.1



[PATCH net-next 09/14] bnxt_en: Assign additional vnics to VFs.

2016-12-29 Thread Michael Chan
Assign additional vnics to VFs whenever possible so that NTUPLE can be
supported on the VFs.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index c696025..0c9f6c1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -429,6 +429,8 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
vf_rx_rings = (pf->max_rx_rings - bp->rx_nr_rings) / num_vfs;
vf_ring_grps = (bp->pf.max_hw_ring_grps - bp->rx_nr_rings) / num_vfs;
vf_tx_rings = (pf->max_tx_rings - bp->tx_nr_rings) / num_vfs;
+   vf_vnics = (pf->max_vnics - bp->nr_vnics) / num_vfs;
+   vf_vnics = min_t(u16, vf_vnics, vf_rx_rings);
 
req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_MTU |
  FUNC_CFG_REQ_ENABLES_MRU |
@@ -451,7 +453,6 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
req.num_rx_rings = cpu_to_le16(vf_rx_rings);
req.num_hw_ring_grps = cpu_to_le16(vf_ring_grps);
req.num_l2_ctxs = cpu_to_le16(4);
-   vf_vnics = 1;
 
req.num_vnics = cpu_to_le16(vf_vnics);
/* FIXME spec currently uses 1 bit for stats ctx */
@@ -506,6 +507,8 @@ static int bnxt_sriov_enable(struct bnxt *bp, int *num_vfs)
min_rx_rings)
rx_ok = 1;
}
+   if (bp->pf.max_vnics - bp->nr_vnics < min_rx_rings)
+   rx_ok = 0;
 
if (bp->pf.max_tx_rings - bp->tx_nr_rings >= min_tx_rings)
tx_ok = 1;
-- 
1.8.3.1



[PATCH v1 2/2] bpf: Add tests for the lpm trie map

2016-12-29 Thread Daniel Mack
From: David Herrmann 

The first part of this program runs randomized tests against the
lpm-bpf-map. It implements a "Trivial Longest Prefix Match" (tlpm)
based on simple, linear, single linked lists. The implementation
should be pretty straightforward.

Based on tlpm, this inserts randomized data into bpf-lpm-maps and
verifies the trie-based bpf-map implementation behaves the same way
as tlpm.

The second part uses 'real world' IPv4 and IPv6 addresses and tests
the trie with those.

Signed-off-by: David Herrmann 
Signed-off-by: Daniel Mack 
---
 tools/testing/selftests/bpf/.gitignore |   1 +
 tools/testing/selftests/bpf/Makefile   |   4 +-
 tools/testing/selftests/bpf/test_lpm_map.c | 348 +
 3 files changed, 351 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_lpm_map.c

diff --git a/tools/testing/selftests/bpf/.gitignore 
b/tools/testing/selftests/bpf/.gitignore
index 071431b..d3b1c9b 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -1,3 +1,4 @@
 test_verifier
 test_maps
 test_lru_map
+test_lpm_map
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 7a5f245..064a3e5 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,8 +1,8 @@
 CFLAGS += -Wall -O2 -I../../../../usr/include
 
-test_objs = test_verifier test_maps test_lru_map
+test_objs = test_verifier test_maps test_lru_map test_lpm_map
 
-TEST_PROGS := test_verifier test_maps test_lru_map test_kmod.sh
+TEST_PROGS := test_verifier test_maps test_lru_map test_lpm_map test_kmod.sh
 TEST_FILES := $(test_objs)
 
 all: $(test_objs)
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c 
b/tools/testing/selftests/bpf/test_lpm_map.c
new file mode 100644
index 000..08db750
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -0,0 +1,348 @@
+/*
+ * Randomized tests for eBPF longest-prefix-match maps
+ *
+ * This program runs randomized tests against the lpm-bpf-map. It implements a
+ * "Trivial Longest Prefix Match" (tlpm) based on simple, linear, singly linked
+ * lists. The implementation should be pretty straightforward.
+ *
+ * Based on tlpm, this inserts randomized data into bpf-lpm-maps and verifies
+ * the trie-based bpf-map implementation behaves the same way as tlpm.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "bpf_sys.h"
+#include "bpf_util.h"
+
+struct tlpm_node {
+   struct tlpm_node *next;
+   size_t n_bits;
+   uint8_t key[];
+};
+
+static struct tlpm_node *tlpm_add(struct tlpm_node *list,
+ const uint8_t *key,
+ size_t n_bits)
+{
+   struct tlpm_node *node;
+   size_t n;
+
+   /* add new entry with @key/@n_bits to @list and return new head */
+
+   n = (n_bits + 7) / 8;
+   node = malloc(sizeof(*node) + n);
+   assert(node);
+
+   node->next = list;
+   node->n_bits = n_bits;
+   memcpy(node->key, key, n);
+
+   return node;
+}
+
+static void tlpm_clear(struct tlpm_node *list)
+{
+   struct tlpm_node *node;
+
+   /* free all entries in @list */
+
+   while ((node = list)) {
+   list = list->next;
+   free(node);
+   }
+}
+
+static struct tlpm_node *tlpm_match(struct tlpm_node *list,
+   const uint8_t *key,
+   size_t n_bits)
+{
+   struct tlpm_node *best = NULL;
+   size_t i;
+
+   /*
+* Perform longest prefix-match on @key/@n_bits. That is, iterate all
+* entries and match each prefix against @key. Remember the "best"
+* entry we find (i.e., the longest prefix that matches) and return it
+* to the caller when done.
+*/
+
+   for ( ; list; list = list->next) {
+   for (i = 0; i < n_bits && i < list->n_bits; ++i) {
+   if ((key[i / 8] & (1 << (7 - i % 8))) !=
+   (list->key[i / 8] & (1 << (7 - i % 8
+   break;
+   }
+
+   if (i >= list->n_bits) {
+   if (!best || i > best->n_bits)
+   best = list;
+   }
+   }
+
+   return best;
+}
+
+static void test_lpm_basic(void)
+{
+   struct tlpm_node *list = NULL, *t1, *t2;
+
+   /* very basic, static tests to verify tlpm works as expected */
+
+   assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+
+   t1 = list = tlpm_add(list, (uint8_t[]){ 0xff }, 8);
+   assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+   assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+   assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0x00 }, 16));
+   assert(!tlpm_match(list, (uint8_t[]){ 0x7f }, 8));
+   assert(!tlpm_mat

[PATCH v1 0/2] bpf: add longest prefix match map

2016-12-29 Thread Daniel Mack
This patch set adds a longest prefix match algorithm that can be used
to match IP addresses to a stored set of ranges. It is exposed as a
bpf map type.
   
Internally, data is stored in an unbalanced tree of nodes that has a
maximum height of n, where n is the prefixlen the trie was created
with.
 
Note that this has nothing to do with fib or fib6 and is in no way meant
to replace or share code with it. It's rather a much simpler
implementation that is specifically written with bpf maps in mind.
 
Patch 1/2 adds the implementation, and 2/2 an extensive test suite.
 
Feedback is much appreciated.
 
 
Thanks,
Daniel

Changelog:

rfc -> v1:
* Add __rcu pointer annotations to make sparse happy
* Fold _lpm_trie_find_target_node() into its only caller
* Fix some minor documentation issues


Daniel Mack (1):
  bpf: add a longest prefix match trie map implementation

David Herrmann (1):
  bpf: Add tests for the lpm trie map

 include/uapi/linux/bpf.h   |   7 +
 kernel/bpf/Makefile|   2 +-
 kernel/bpf/lpm_trie.c  | 468 +
 tools/testing/selftests/bpf/.gitignore |   1 +
 tools/testing/selftests/bpf/Makefile   |   4 +-
 tools/testing/selftests/bpf/test_lpm_map.c | 348 +
 6 files changed, 827 insertions(+), 3 deletions(-)
 create mode 100644 kernel/bpf/lpm_trie.c
 create mode 100644 tools/testing/selftests/bpf/test_lpm_map.c

-- 
2.9.3



[PATCH v1 1/2] bpf: add a longest prefix match trie map implementation

2016-12-29 Thread Daniel Mack
This trie implements a longest prefix match algorithm that can be used
to match IP addresses to a stored set of ranges.

Internally, data is stored in an unbalanced trie of nodes that has a
maximum height of n, where n is the prefixlen the trie was created
with.

Tries may be created with prefix lengths that are multiples of 8, in
the range from 8 to 2048. The key used for lookup and update operations
is a struct bpf_lpm_trie_key, and the value is a uint64_t.

The code carries more information about the internal implementation.
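
As a purely illustrative userspace sketch (not part of this patch), such a
map could be created and populated through the raw bpf(2) syscall roughly as
follows; the map_flags and max_entries values here are assumptions, and the
192.168.0.0/16 entry mirrors the example used in the in-code documentation:

#include <linux/bpf.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    union bpf_attr attr;
    size_t key_size = sizeof(struct bpf_lpm_trie_key) + 4; /* IPv4 data */
    struct bpf_lpm_trie_key *key = malloc(key_size);
    uint64_t value = 1;
    int map_fd;

    memset(&attr, 0, sizeof(attr));
    attr.map_type = BPF_MAP_TYPE_LPM_TRIE;
    attr.key_size = key_size;
    attr.value_size = sizeof(value);
    attr.max_entries = 16;                  /* arbitrary for this sketch */
    attr.map_flags = BPF_F_NO_PREALLOC;     /* assumption, not mandated here */
    map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    if (map_fd < 0)
        return 1;

    /* Insert 192.168.0.0/16 -> 1; data[] is interpreted as big endian */
    key->prefixlen = 16;
    memcpy(key->data, "\xc0\xa8\x00\x00", 4);

    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key = (uint64_t)(unsigned long)key;
    attr.value = (uint64_t)(unsigned long)&value;
    syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));

    free(key);
    return 0;
}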

Signed-off-by: Daniel Mack 
Reviewed-by: David Herrmann 
---
 include/uapi/linux/bpf.h |   7 +
 kernel/bpf/Makefile  |   2 +-
 kernel/bpf/lpm_trie.c| 468 +++
 3 files changed, 476 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/lpm_trie.c

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0eb0e87..d564277 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -63,6 +63,12 @@ struct bpf_insn {
__s32   imm;/* signed immediate constant */
 };
 
+/* Key of a BPF_MAP_TYPE_LPM_TRIE entry */
+struct bpf_lpm_trie_key {
+   __u32   prefixlen;  /* up to 32 for AF_INET, 128 for AF_INET6 */
+   __u8data[0];/* Arbitrary size */
+};
+
 /* BPF syscall commands, see bpf(2) man-page for details. */
 enum bpf_cmd {
BPF_MAP_CREATE,
@@ -89,6 +95,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_CGROUP_ARRAY,
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
+   BPF_MAP_TYPE_LPM_TRIE,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1276474..e1ce4f4 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
 obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o 
bpf_lru_list.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o 
bpf_lru_list.o lpm_trie.o
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
new file mode 100644
index 000..8b6a61d
--- /dev/null
+++ b/kernel/bpf/lpm_trie.c
@@ -0,0 +1,468 @@
+/*
+ * Longest prefix match list implementation
+ *
+ * Copyright (c) 2016 Daniel Mack
+ * Copyright (c) 2016 David Herrmann
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* Intermediate node */
+#define LPM_TREE_NODE_FLAG_IM BIT(0)
+
+struct lpm_trie_node;
+
+struct lpm_trie_node {
+   struct rcu_head rcu;
+   struct lpm_trie_node __rcu  *child[2];
+   u32 prefixlen;
+   u32 flags;
+   u64 value;
+   u8  data[0];
+};
+
+struct lpm_trie {
+   struct bpf_map  map;
+   struct lpm_trie_node __rcu  *root;
+   size_t  n_entries;
+   size_t  max_prefixlen;
+   size_t  data_size;
+   spinlock_t  lock;
+};
+
+/*
+ * This trie implements a longest prefix match algorithm that can be used to
+ * match IP addresses to a stored set of ranges.
+ *
+ * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
+ * interpreted as big endian, so data[0] stores the most significant byte.
+ *
+ * Match ranges are internally stored in instances of struct lpm_trie_node
+ * which each contain their prefix length as well as two pointers that may
+ * lead to more nodes containing more specific matches. Each node also stores
+ * a value that is defined by and returned to userspace via the update_elem
+ * and lookup functions.
+ *
+ * For instance, let's start with a trie that was created with a prefix length
+ * of 32, so it can be used for IPv4 addresses, and one single element that
+ * matches 192.168.0.0/16. The data array would hence contain
+ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
+ * stick to IP-address notation for readability though.
+ *
+ * As the trie is empty initially, the new node (1) will be placed as the root
+ * node, denoted as (R) in the example below. As there are no other nodes, both
+ * child pointers are %NULL.
+ *
+ *  ++
+ *  |   (1)  (R) |
+ *  | 192.168.0.0/16 |
+ *  |value: 1|
+ *  |   [0][1]   |
+ *  ++
+ *
+ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). In chi

RE: [Open-FCoE] [PATCH RFC net-next 1/5] qed: Add support for hardware offloaded FCoE.

2016-12-29 Thread Mintz, Yuval
> > +struct fcoe_tstorm_fcoe_task_st_ctx_read_write {
> > +   union fcoe_cleanup_addr_exp_ro_union cleanup_addr_exp_ro_union;
> > +   __le16 flags;
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_MASK  0x7
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_SHIFT  0
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_MASK  0x1
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_SHIFT  3
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_MASK  0x1
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_SHIFT   4
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_MASK  0x1
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_SHIFT  5
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_MASK  0x1
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_SHIFT  6
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_MASK   0x1
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_SHIFT  7
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_MASK  0x3
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_SHIFT   8
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_MASK  0x3F
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_SHIFT  10
> 
> A very odd way of defining a bitfield ...
> Why not use a 'normal' bitfield here?

This is the format of our generated firmware HSI; we already have
thousands of definitions using this same format.
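
For reference, such generated MASK/SHIFT pairs are typically consumed through
small GET/SET helper macros rather than C bitfields; an illustrative,
made-up example (not taken from the qed code):

#define EXAMPLE_RX_SGL_MODE_MASK    0x7
#define EXAMPLE_RX_SGL_MODE_SHIFT   0

/* Extract a field from a host-order word using its MASK/SHIFT pair */
#define EXAMPLE_GET_FIELD(val, name) \
    (((val) >> name##_SHIFT) & name##_MASK)

/* Clear and then set a field in place */
#define EXAMPLE_SET_FIELD(val, name, field)                       \
    do {                                                          \
        (val) &= ~((name##_MASK) << (name##_SHIFT));              \
        (val) |= ((field) & (name##_MASK)) << (name##_SHIFT);     \
    } while (0)

/* e.g.: mode = EXAMPLE_GET_FIELD(le16_to_cpu(ctx->flags), EXAMPLE_RX_SGL_MODE); */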



[PATCH net-next] sctp: refactor sctp_datamsg_from_user

2016-12-29 Thread Marcelo Ricardo Leitner
This patch refactors sctp_datamsg_from_user() in an attempt to make it
easier to read and to avoid code duplication for handling the last
fragment.

It also avoids doing division and remainder operations. Even so, it
should still operate just as it did before this patch.
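
The resulting loop shape, as a small standalone sketch (the real code builds
SCTP chunks at each step; this only shows the remainder-free iteration):

static void example_fragment(size_t msg_len, size_t first_len, size_t max_data)
{
    size_t remaining, len;

    for (remaining = msg_len; remaining; remaining -= len) {
        if (remaining == msg_len)
            len = first_len;    /* first fragment may differ in size */
        else
            len = max_data;

        if (len > remaining)
            len = remaining;    /* last (possibly short) fragment */

        /* create and queue a chunk carrying 'len' bytes here */
    }
}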

Signed-off-by: Marcelo Ricardo Leitner 
---
 net/sctp/chunk.c | 107 +--
 1 file changed, 32 insertions(+), 75 deletions(-)

diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 
615f0ddd41dfb1ff46a9d4e564716de8e7b60ea6..e3621cb4827fadb5f5cb41ebe8455dfa3300a765
 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -165,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
sctp_association *asoc,
struct sctp_sndrcvinfo *sinfo,
struct iov_iter *from)
 {
-   int max, whole, i, offset, over, err;
-   int len, first_len;
-   int max_data;
+   size_t len, first_len, max_data, remaining;
+   size_t msg_len = iov_iter_count(from);
+   struct list_head *pos, *temp;
struct sctp_chunk *chunk;
struct sctp_datamsg *msg;
-   struct list_head *pos, *temp;
-   size_t msg_len = iov_iter_count(from);
-   __u8 frag;
+   int err;
 
msg = sctp_datamsg_new(GFP_KERNEL);
if (!msg)
@@ -185,7 +183,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
sctp_association *asoc,
(SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) ||
 !SCTP_PR_POLICY(sinfo->sinfo_flags)))
msg->expires_at = jiffies +
-   msecs_to_jiffies(sinfo->sinfo_timetolive);
+ msecs_to_jiffies(sinfo->sinfo_timetolive);
 
/* This is the biggest possible DATA chunk that can fit into
 * the packet
@@ -195,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
sctp_association *asoc,
   sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
max_data = SCTP_TRUNC4(max_data);
 
-   max = asoc->frag_point;
/* If the the peer requested that we authenticate DATA chunks
 * we need to account for bundling of the AUTH chunks along with
 * DATA.
@@ -208,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
sctp_association *asoc,
  hmac_desc->hmac_len);
}
 
-   /* Now, check if we need to reduce our max */
-   if (max > max_data)
-   max = max_data;
+   /* Check what's our max considering the above */
+   max_data = min_t(size_t, max_data, asoc->frag_point);
 
-   whole = 0;
-   first_len = max;
+   /* Set first_len and then account for possible bundles on first frag */
+   first_len = max_data;
 
/* Check to see if we have a pending SACK and try to let it be bundled
 * with this message.  Do this if we don't have any data queued already.
@@ -224,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
sctp_association *asoc,
if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
asoc->outqueue.out_qlen == 0 &&
list_empty(&asoc->outqueue.retransmit) &&
-   msg_len > max)
-   max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
+   msg_len > max_data)
+   first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
 
/* Encourage Cookie-ECHO bundling. */
if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
-   max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
-
-   /* Now that we adjusted completely, reset first_len */
-   if (first_len > max_data)
-   first_len = max_data;
+   first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
 
/* Account for a different sized first fragment */
if (msg_len >= first_len) {
-   msg_len -= first_len;
-   whole = 1;
msg->can_delay = 0;
-   }
-
-   /* How many full sized?  How many bytes leftover? */
-   whole += msg_len / max;
-   over = msg_len % max;
-   offset = 0;
-
-   if ((whole > 1) || (whole && over))
SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
+   } else {
+   /* Which may be the only one... */
+   first_len = msg_len;
+   }
 
-   /* Create chunks for all the full sized DATA chunks. */
-   for (i = 0, len = first_len; i < whole; i++) {
-   frag = SCTP_DATA_MIDDLE_FRAG;
+   /* Create chunks for all DATA chunks. */
+   for (remaining = msg_len; remaining; remaining -= len) {
+   u8 frag = SCTP_DATA_MIDDLE_FRAG;
 
-   if (0 == i)
+   if (remaining == msg_len) {
+   /* First frag, which may also be the last */
frag |= SCTP_DATA_FIRST_FRAG;
+   len = first_len;
+   } el

Re: [PATCH iproute2 net-next] tc: flower: support matching flags

2016-12-29 Thread Stephen Hemminger
On Wed, 28 Dec 2016 15:06:49 +0200
Paul Blakey  wrote:

> Enhance flower to support matching on flags.
> 
> The 1st flag allows to match on whether the packet is
> an IP fragment.
> 
> Example:
> 
>   # add a flower filter that will drop fragmented packets
>   # (bit 0 of control flags)
>   tc filter add dev ens4f0 protocol ip parent : \
>   flower \
>   src_mac e4:1d:2d:fd:8b:01 \
>   dst_mac e4:1d:2d:fd:8b:02 \
>   indev ens4f0 \
>   matching_flags 0x1/0x1 \
>   action drop
> 
> Signed-off-by: Paul Blakey 
> Signed-off-by: Or Gerlitz 
> Reviewed-by: Roi Dayan 

Applied. Had to manually fixup merge conflicts with other flower changes.



Re: [PATCH iproute2] fix typo in ip-xfrm man page, rmd610 -> rmd160

2016-12-29 Thread Stephen Hemminger
On Fri, 23 Dec 2016 14:03:16 +0300
Alexey Kodanev  wrote:

> Signed-off-by: Alexey Kodanev 


Applied.


Re: [PATCH] tc: add missing limits.h header

2016-12-29 Thread Stephen Hemminger
On Thu, 22 Dec 2016 20:52:48 +0200
Baruch Siach  wrote:

> This fixes under musl build issues like:
> 
> f_matchall.c: In function ‘matchall_parse_opt’:
> f_matchall.c:48:12: error: ‘LONG_MIN’ undeclared (first use in this function)
>if (h == LONG_MIN || h == LONG_MAX) {
> ^
> f_matchall.c:48:12: note: each undeclared identifier is reported only once 
> for each function it appears in
> f_matchall.c:48:29: error: ‘LONG_MAX’ undeclared (first use in this function)
>if (h == LONG_MIN || h == LONG_MAX) {
>  ^
> 
> Signed-off-by: Baruch Siach 

Sure, applied


Re: [PATCH net] rtnl: stats - add missing netlink message size checks

2016-12-29 Thread David Miller
From: Mathias Krause 
Date: Wed, 28 Dec 2016 17:52:15 +0100

> We miss to check if the netlink message is actually big enough to contain
> a struct if_stats_msg.
> 
> Add a check to prevent userland from sending us short messages that would
> make us access memory beyond the end of the message.
> 
> Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump...")
> Signed-off-by: Mathias Krause 

Looks good, applied and queued up for -stable.


Re: [PATCH v2] net: fix incorrect original ingress device index in PKTINFO

2016-12-29 Thread David Miller
From: Wei Zhang 
Date: Thu, 29 Dec 2016 16:45:04 +0800

> When we send a packet for our own local address on a non-loopback
> interface (e.g. eth0), due to the change had been introduced from
> commit 0b922b7a829c ("net: original ingress device index in PKTINFO"), the
> original ingress device index would be set as the loopback interface.
> However, the packet should be considered as if it is being arrived via the
> sending interface (eth0), otherwise it would break the expectation of the
> userspace application (e.g. the DHCPRELEASE message from dhcp_release
> binary would be ignored by the dnsmasq daemon, since it come from lo which
> is not the interface dnsmasq bind to)
> 
> Fixes: 0b922b7a829c ("net: original ingress device index in PKTINFO")
> Acked-by: David Ahern 
> Signed-off-by: Wei Zhang 

Applied and queued up for -stable.


Re: [PATCH v4] net: dev_weight: TX/RX orthogonality

2016-12-29 Thread David Miller
From: Matthias Tafelmeier 
Date: Thu, 29 Dec 2016 10:58:41 +0100

> Oftenly, introducing side effects on packet processing on the other half
> of the stack by adjusting one of TX/RX via sysctl is not desirable.
> There are cases of demand for asymmetric, orthogonal configurability.
> 
> This holds true especially for nodes where RPS for RFS usage on top is
> configured and therefore use the 'old dev_weight'. This is quite a
> common base configuration setup nowadays, even with NICs of superior 
> processing
> support (e.g. aRFS).
> 
> A good example use case are nodes acting as noSQL data bases with a
> large number of tiny requests and rather fewer but large packets as responses.
> It's affordable to have large budget and rx dev_weights for the
> requests. But as a side effect having this large a number on TX
> processed in one run can overwhelm drivers.
> 
> This patch therefore introduces an independent configurability via sysctl to
> userland.

This is missing a signoff.


Re: [PATCH net 0/5] mlx4 misc fixes

2016-12-29 Thread David Miller
From: Tariq Toukan 
Date: Thu, 29 Dec 2016 18:37:08 +0200

> This patchset contains several bug fixes from the team to the
> mlx4 Eth and Core drivers.
> 
> Series generated against net commit:
> 60133867f1f1 'net: wan: slic_ds26522: fix spelling mistake: "configurated" -> 
> "configured"'

Series applied, thank you.


[PATCH v4] net: dev_weight: TX/RX orthogonality

2016-12-29 Thread Matthias Tafelmeier
Often, introducing side effects on packet processing on the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases that demand asymmetric, orthogonal configurability.

This holds true especially for nodes where RPS for RFS usage on top is
configured and that therefore use the 'old dev_weight'. This is quite a
common base configuration nowadays, even with NICs of superior processing
support (e.g. aRFS).

A good example use case is nodes acting as noSQL databases with a
large number of tiny requests and rather fewer but larger packets as responses.
It's affordable to have large budgets and rx dev_weights for the
requests, but as a side effect, processing this large a number of packets
on TX in one run can overwhelm drivers.

This patch therefore introduces independent configurability via sysctl to
userland.

Signed-off-by: Matthias Tafelmeier 
---
 Documentation/sysctl/net.txt | 21 +
 include/linux/netdevice.h|  4 
 net/core/dev.c   |  6 +-
 net/core/sysctl_net_core.c   | 31 ++-
 net/sched/sch_generic.c  |  2 +-
 5 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7..53cef32 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a 
NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+dev_weight_rx_bias
+--
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll 
function
+of the driver for the per softirq cycle netdev_budget. This parameter 
influences
+the proportion of the configured netdev_budget that is spent on RPS based 
packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network 
stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is 
based
+on dev_weight and is calculated multiplicative (dev_weight * 
dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--
+
+Scales the maximum number of packets that can be processed during a TX softirq 
cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for 
asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a 
CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
 default_qdisc
 --
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..ecd78b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,10 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 
*stats64,
 extern int netdev_max_backlog;
 extern int netdev_tstamp_prequeue;
 extern int weight_p;
+extern int dev_weight_rx_bias;
+extern int dev_weight_tx_bias;
+extern int dev_rx_weight;
+extern int dev_tx_weight;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device 
*upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..f2fe98b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3428,6 +3428,10 @@ EXPORT_SYMBOL(netdev_max_backlog);
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
 int weight_p __read_mostly = 64;/* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1;/* bias for backlog 
weight */
+int dev_weight_tx_bias __read_mostly = 1;/* bias for output_queue 
quota */
+int dev_rx_weight __read_mostly = weight_p;
+int dev_tx_weight __read_mostly = weight_p;
 
 /* Called with irq disabled */
 static inline void napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4837,7 @@ static int process_backlog(struct napi_struct *napi, int 
quota)
net_rps_action_and_irq_enable(sd);
}
 
-   napi->weight = weight_p;
+   napi->weight = dev_rx_weight;
while (again) {
struct sk_buff *skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..698ddd7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int 
write,
 }
 #endif
 
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+   int ret;
+
+   ret = proc_dointvec(table, write, buffer, lenp, ppos);
+   if (ret != 0)
+   return ret;
+
+   dev_rx_weight = weight_p * dev_weight_rx_bias;
+   dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+   return ret;
+}
+
 static int proc_do_rss_key(struct ctl_table *table, int write,
   

Re: [PATCH net-next 00/14] bnxt_en: updates for net-next.

2016-12-29 Thread David Miller
From: Michael Chan 
Date: Thu, 29 Dec 2016 12:13:30 -0500

> This patch series for net-next contains cleanups, new features and minor
> fixes.  The driver specific busy polling code is removed to use busy
> polling support in core networking.  Hardware RFS support is enhanced with
> added ipv6 flows support and VF support.  A new scheme to allocate TX
> rings from the firmware is implemented for newer chips and firmware.  Plus
> some misc. cleanups, minor fixes, and to add the maintainer entry.  Please
> review.

Looks good, series applied, thanks Michael.


Re: [PATCH net-next] sctp: refactor sctp_datamsg_from_user

2016-12-29 Thread David Miller
From: Marcelo Ricardo Leitner 
Date: Thu, 29 Dec 2016 15:53:28 -0200

> This patch refactors sctp_datamsg_from_user() in an attempt to make it
> better to read and avoid code duplication for handling the last
> fragment.
> 
> It also avoids doing division and remaining operations. Even though, it
> should still operate similarly as before this patch.
> 
> Signed-off-by: Marcelo Ricardo Leitner 

Applied.


Re: [PATCH v4] net: dev_weight: TX/RX orthogonality

2016-12-29 Thread David Miller
From: Matthias Tafelmeier 
Date: Thu, 29 Dec 2016 20:23:18 +0100

> Oftenly, introducing side effects on packet processing on the other half
> of the stack by adjusting one of TX/RX via sysctl is not desirable.
> There are cases of demand for asymmetric, orthogonal configurability.
> 
> This holds true especially for nodes where RPS for RFS usage on top is
> configured and therefore use the 'old dev_weight'. This is quite a
> common base configuration setup nowadays, even with NICs of superior 
> processing
> support (e.g. aRFS).
> 
> A good example use case are nodes acting as noSQL data bases with a
> large number of tiny requests and rather fewer but large packets as responses.
> It's affordable to have large budget and rx dev_weights for the
> requests. But as a side effect having this large a number on TX
> processed in one run can overwhelm drivers.
> 
> This patch therefore introduces an independent configurability via sysctl to
> userland.
> 
> Signed-off-by: Matthias Tafelmeier 

Applied.


Re: [PATCH v4] net: dev_weight: TX/RX orthogonality

2016-12-29 Thread David Miller

Actually, reverted, you didn't even build test this:

net/core/dev.c:3433:35: error: initializer element is not constant
 int dev_rx_weight __read_mostly = weight_p;
   ^~~~
net/core/dev.c:3434:35: error: initializer element is not constant
 int dev_tx_weight __read_mostly = weight_p;
   ^~~~
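
For readers hitting the same thing: in C, a file-scope object must be
initialized with a constant expression, so copying another global's value
cannot compile; the v5 posting later in this thread initializes the new
globals to the literal default instead and lets the sysctl handler keep them
in sync. A minimal user-space illustration:

/* weight_p stands in for the existing backlog weight global. */
int weight_p = 64;

#if 0
int dev_rx_weight = weight_p;	/* error: initializer element is not constant */
#endif

int dev_rx_weight = 64;		/* what v5 does: start from the same literal */

int main(void)
{
	return dev_rx_weight - weight_p;	/* 0: both begin at the default */
}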


Re: [PATCH v4] net: dev_weight: TX/RX orthogonality

2016-12-29 Thread Matthias Tafelmeier

> Actually, reverted, you didn't even build test this:
>
> net/core/dev.c:3433:35: error: initializer element is not constant
>  int dev_rx_weight __read_mostly = weight_p;
>^~~~
> net/core/dev.c:3434:35: error: initializer element is not constant
>  int dev_tx_weight __read_mostly = weight_p;
>^~~~

Thought I would have ... let me check.





Re: [PATCH] stmmac: adding EEE to GMAC4

2016-12-29 Thread David Miller
From: Joao Pinto 
Date: Thu, 29 Dec 2016 17:10:27 +

> This patch adds Energy Efficiency Ethernet to GMAC4.
> 
> Signed-off-by: Joao Pinto 

Applied, thanks.


[PATCH] wlcore: fix spelling mistake in wl1271_warning

2016-12-29 Thread Colin King
From: Colin Ian King 

trivial fix to spelling mistake of function name in wl1271_warning,
should be dynamic_ps_timeout instead of dyanmic_ps_timeout.

Signed-off-by: Colin Ian King 
---
 drivers/net/wireless/ti/wlcore/debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ti/wlcore/debugfs.c 
b/drivers/net/wireless/ti/wlcore/debugfs.c
index 7f672f6..58e148d 100644
--- a/drivers/net/wireless/ti/wlcore/debugfs.c
+++ b/drivers/net/wireless/ti/wlcore/debugfs.c
@@ -281,7 +281,7 @@ static ssize_t dynamic_ps_timeout_write(struct file *file,
}
 
if (value < 1 || value > 65535) {
-   wl1271_warning("dyanmic_ps_timeout is not in valid range");
+   wl1271_warning("dynamic_ps_timeout is not in valid range");
return -ERANGE;
}
 
-- 
2.10.2



[PATCH] [media] gp8psk: fix spelling mistake: "firmare" -> "firmware"

2016-12-29 Thread Colin King
From: Colin Ian King 

trivial fix to spelling mistake in err message

Signed-off-by: Colin Ian King 
---
 drivers/media/usb/dvb-usb/gp8psk.c  | 2 +-
 drivers/net/wireless/realtek/rtlwifi/core.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/media/usb/dvb-usb/gp8psk.c 
b/drivers/media/usb/dvb-usb/gp8psk.c
index 2360e7e..26461f2 100644
--- a/drivers/media/usb/dvb-usb/gp8psk.c
+++ b/drivers/media/usb/dvb-usb/gp8psk.c
@@ -161,7 +161,7 @@ static int gp8psk_load_bcm4500fw(struct dvb_usb_device *d)
goto out_free;
}
if (buflen > 64) {
-   err("firmare chunk size bigger than 64 bytes.");
+   err("firmware chunk size bigger than 64 bytes.");
goto out_free;
}
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c 
b/drivers/net/wireless/realtek/rtlwifi/core.c
index ded1493..732de0a 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1532,7 +1532,7 @@ static int rtl_op_set_key(struct ieee80211_hw *hw, enum 
set_key_cmd cmd,
key_type = AESCMAC_ENCRYPTION;
RT_TRACE(rtlpriv, COMP_SEC, DBG_DMESG, "alg:CMAC\n");
RT_TRACE(rtlpriv, COMP_SEC, DBG_DMESG,
-"HW don't support CMAC encrypiton, use software CMAC 
encrypiton\n");
+"HW don't support CMAC encryption, use software CMAC 
encryption\n");
err = -EOPNOTSUPP;
goto out_unlock;
default:
-- 
2.10.2



[PATCH v5] net: dev_weight: TX/RX orthogonality

2016-12-29 Thread Matthias Tafelmeier
Often, introducing side effects on packet processing on the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases that demand asymmetric, orthogonal configurability.

This holds true especially for nodes where RPS for RFS usage on top is
configured and which therefore use the 'old dev_weight'. This is quite a
common base configuration nowadays, even with NICs of superior processing
support (e.g. aRFS).

A good example use case is nodes acting as NoSQL databases with a
large number of tiny requests and rather fewer but larger packets as responses.
It is affordable to have a large budget and rx dev_weight for the
requests. But as a side effect, having this large a number processed on TX
in one run can overwhelm drivers.

This patch therefore introduces an independent configurability via sysctl to
userland.

Signed-off-by: Matthias Tafelmeier 
---
 Documentation/sysctl/net.txt | 21 +
 include/linux/netdevice.h|  4 
 net/core/dev.c   |  8 ++--
 net/core/sysctl_net_core.c   | 31 ++-
 net/sched/sch_generic.c  |  2 +-
 5 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7..53cef32 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a 
NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+dev_weight_rx_bias
+--
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll 
function
+of the driver for the per softirq cycle netdev_budget. This parameter 
influences
+the proportion of the configured netdev_budget that is spent on RPS based 
packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network 
stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is 
based
+on dev_weight and is calculated multiplicative (dev_weight * 
dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--
+
+Scales the maximum number of packets that can be processed during a TX softirq 
cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for 
asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a 
CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
 default_qdisc
 --
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..ecd78b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,10 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 
*stats64,
 extern int netdev_max_backlog;
 extern int netdev_tstamp_prequeue;
 extern int weight_p;
+extern int dev_weight_rx_bias;
+extern int dev_weight_tx_bias;
+extern int dev_rx_weight;
+extern int dev_tx_weight;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device 
*upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..0d34e1c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3427,7 +3427,11 @@ EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
-int weight_p __read_mostly = 64;/* old backlog weight */
+int weight_p __read_mostly = 64;   /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = 64;
+int dev_tx_weight __read_mostly = 64;
 
 /* Called with irq disabled */
 static inline void napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4837,7 @@ static int process_backlog(struct napi_struct *napi, int 
quota)
net_rps_action_and_irq_enable(sd);
}
 
-   napi->weight = weight_p;
+   napi->weight = dev_rx_weight;
while (again) {
struct sk_buff *skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..698ddd7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int 
write,
 }
 #endif
 
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+   int ret;
+
+   ret = proc_dointvec(table, write, buffer, lenp, ppos);
+   if (ret != 0)
+   return ret;
+
+   dev_rx_weight = weight_p * dev_weight_rx_bias;
+   dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+   return ret;
+}
+
 static int proc_do_rss_key(struct ctl_table

Re: [PATCH] rtlwifi: fix spelling mistake: "contry" -> "country"

2016-12-29 Thread Larry Finger

On 12/29/2016 10:00 AM, Colin King wrote:

From: Colin Ian King 

trivial fix to spelling mistake in RT_TRACE message

Signed-off-by: Colin Ian King 


Acked-by: Larry Finger 

Larry


---
 drivers/net/wireless/realtek/rtlwifi/regd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/regd.c 
b/drivers/net/wireless/realtek/rtlwifi/regd.c
index 6ee6bf8..558c31b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/regd.c
+++ b/drivers/net/wireless/realtek/rtlwifi/regd.c
@@ -440,7 +440,7 @@ int rtl_regd_init(struct ieee80211_hw *hw,

if (rtlpriv->regd.country_code >= COUNTRY_CODE_MAX) {
RT_TRACE(rtlpriv, COMP_REGD, DBG_DMESG,
-"rtl: EEPROM indicates invalid contry code, world wide 13 
should be used\n");
+"rtl: EEPROM indicates invalid country code, world wide 13 
should be used\n");

rtlpriv->regd.country_code = COUNTRY_CODE_WORLD_WIDE_13;
}





[PATCH] sh_eth: fix branch prediction in sh_eth_interrupt()

2016-12-29 Thread Sergei Shtylyov
IIUC, likely()/unlikely() should apply to the whole *if* statement's
expression, not just a part of it -- fix such an expression in
sh_eth_interrupt() accordingly...

Fixes: 283e38db65e7 ("sh_eth: Fix serialisation of interrupt disable with 
interrupt & NAPI handlers")
Signed-off-by: Sergei Shtylyov 

---
The patch is against DaveM's 'net-next.git' repo; I'm not sure if it should
be  targeted to the 'net.git' repo instead...

 drivers/net/ethernet/renesas/sh_eth.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: net-next/drivers/net/ethernet/renesas/sh_eth.c
===
--- net-next.orig/drivers/net/ethernet/renesas/sh_eth.c
+++ net-next/drivers/net/ethernet/renesas/sh_eth.c
@@ -1656,7 +1656,7 @@ static irqreturn_t sh_eth_interrupt(int
else
goto out;
 
-   if (!likely(mdp->irq_enabled)) {
+   if (unlikely(!mdp->irq_enabled)) {
sh_eth_write(ndev, 0, EESIPR);
goto out;
}
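
For context, these annotations have the same shape as the kernel's
definitions in include/linux/compiler.h and expand to __builtin_expect(),
so the hint is clearest when it wraps the entire tested expression. A small
user-space sketch:

#include <stdio.h>

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

int main(void)
{
	int irq_enabled = 0;

	/* Tells the compiler the "interrupts disabled" branch is the rare one. */
	if (unlikely(!irq_enabled))
		printf("slow path: interrupts disabled\n");

	/* Writing "!likely(irq_enabled)" attaches the hint only to the inner
	 * sub-expression, so the reader (and the optimizer, through the
	 * negation) has to invert it to recover the intent -- hence the patch
	 * above annotates the whole condition instead.
	 */
	return 0;
}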



Re: [PATCH] [media] gp8psk: fix spelling mistake: "firmare" -> "firmware"

2016-12-29 Thread VDR User
> -   err("firmare chunk size bigger than 64 bytes.");
> +   err("firmware chunk size bigger than 64 bytes.");

Yup.

> -"HW don't support CMAC encrypiton, use software CMAC 
> encrypiton\n");
> +"HW don't support CMAC encryption, use software CMAC 
> encryption\n");

Should be: "HW doesn't support CMAC encryption, use software CMAC
encryption\n");


[net-next PATCH 0/6] i40e: Add VF port representator support for SR-IOV VFs

2016-12-29 Thread Sridhar Samudrala
- Patch 1 introduces devlink interface to get/set the mode of SRIOV switch.
- Patch 2 adds support to create VF port representor (VFPR) netdevs associated
  with SR-IOV VFs that can be used to control/configure VFs from the PF's namespace.
- Patch 3 enables syncing link state between VFs and VFPRs.
- Patch 4 adds a new type to metadata_dst to allow passing VF id to lower 
device.
- Patch 5 adds TX and RX support to VFPR netdevs.
- Patch 6 enables HW and SW VFPR statistics to be exposed via netlink on VFPR
  netdevs.

Jakub Kicinski (1):
  net: store port/representator id in metadata_dst

Sridhar Samudrala (5):
  i40e: Introduce devlink interface.
  i40e: Introduce VF Port Representator(VFPR) netdevs.
  i40e: Sync link state between VFs and VFPRs
  i40e: Add TX and RX support in switchdev mode.
  i40e: Add support for exposing VF port statistics via VFPR netdev on
the host.

 drivers/net/ethernet/intel/Kconfig |   1 +
 drivers/net/ethernet/intel/i40e/i40e.h |   4 +
 drivers/net/ethernet/intel/i40e/i40e_main.c|  96 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c| 132 -
 drivers/net/ethernet/intel/i40e/i40e_txrx.h|   1 +
 drivers/net/ethernet/intel/i40e/i40e_type.h|   3 +
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 321 -
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h |  25 ++
 include/net/dst_metadata.h |  35 ++-
 net/core/dst.c |  15 +-
 net/core/filter.c  |   1 +
 net/ipv4/ip_tunnel_core.c  |   6 +-
 net/openvswitch/flow_netlink.c |   4 +-
 13 files changed, 610 insertions(+), 34 deletions(-)

-- 
2.5.5



[net-next PATCH 1/6] i40e: Introduce devlink interface.

2016-12-29 Thread Sridhar Samudrala
Add initial devlink support to get/set the mode of the SR-IOV switch.
This patch sets the default mode as 'legacy' and enables getting the mode
and setting it to 'legacy'.

The switch mode can be get/set via the following 'devlink' commands.

# devlink dev eswitch show pci/:05:00.0
pci/:05:00.0: mode legacy
# devlink dev eswitch set pci/:05:00.0 mode switchdev
devlink answers: Operation not supported
# devlink dev eswitch set pci/:05:00.0 mode legacy
# devlink dev eswitch show pci/:05:00.0
pci/:05:00.0: mode legacy

Signed-off-by: Sridhar Samudrala 
---
 drivers/net/ethernet/intel/Kconfig  |  1 +
 drivers/net/ethernet/intel/i40e/i40e.h  |  3 ++
 drivers/net/ethernet/intel/i40e/i40e_main.c | 80 ++---
 3 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/Kconfig 
b/drivers/net/ethernet/intel/Kconfig
index 1349b45..0dbb87e 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -215,6 +215,7 @@ config I40E
tristate "Intel(R) Ethernet Controller XL710 Family support"
imply PTP_1588_CLOCK
depends on PCI
+   depends on MAY_USE_DEVLINK
---help---
  This driver supports Intel(R) Ethernet Controller XL710 Family of
  devices.  For more information on how to identify your adapter, go
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index ba8d309..410f83d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -54,6 +54,8 @@
 #include 
 #include 
 #include 
+#include 
+
 #include "i40e_type.h"
 #include "i40e_prototype.h"
 #ifdef I40E_FCOE
@@ -448,6 +450,7 @@ struct i40e_pf {
u32 ioremap_len;
u32 fd_inv;
u16 phy_led_val;
+   enum devlink_eswitch_mode eswitch_mode;
 };
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ad4cf63..c01a620 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -10910,6 +10910,57 @@ static void i40e_get_platform_mac_addr(struct pci_dev 
*pdev, struct i40e_pf *pf)
 }
 
 /**
+ * i40e_devlink_eswitch_mode_get
+ *
+ * @devlink: pointer to devlink struct
+ * @mode: sr-iov switch mode pointer
+ *
+ * Returns the switch mode of the associated PF in the @mode pointer.
+ */
+static int i40e_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
+{
+   struct i40e_pf *pf = devlink_priv(devlink);
+
+   *mode = pf->eswitch_mode;
+
+   return 0;
+}
+
+/**
+ * i40e_devlink_eswitch_mode_set
+ *
+ * @devlink: pointer to devlink struct
+ * @mode: sr-iov switch mode
+ *
+ * Set the switch mode of the associated PF.
+ * Returns 0 on success and -EOPNOTSUPP on error.
+ */
+static int i40e_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
+{
+   struct i40e_pf *pf = devlink_priv(devlink);
+   int err = 0;
+
+   if (mode == pf->eswitch_mode)
+   goto done;
+
+   switch (mode) {
+   case DEVLINK_ESWITCH_MODE_LEGACY:
+   pf->eswitch_mode = mode;
+   break;
+   default:
+   err = -EOPNOTSUPP;
+   break;
+   }
+done:
+   return err;
+}
+
+static const struct devlink_ops i40e_devlink_ops = {
+   .eswitch_mode_get = i40e_devlink_eswitch_mode_get,
+   .eswitch_mode_set = i40e_devlink_eswitch_mode_set,
+};
+
+/**
  * i40e_probe - Device initialization routine
  * @pdev: PCI device information struct
  * @ent: entry in i40e_pci_tbl
@@ -10926,6 +10977,7 @@ static int i40e_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
struct i40e_pf *pf;
struct i40e_hw *hw;
static u16 pfs_found;
+   struct devlink *devlink;
u16 wol_nvm_bits;
u16 link_status;
int err;
@@ -10959,20 +11011,28 @@ static int i40e_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
pci_enable_pcie_error_reporting(pdev);
pci_set_master(pdev);
 
+   devlink = devlink_alloc(&i40e_devlink_ops, sizeof(*pf));
+   if (!devlink) {
+   dev_err(&pdev->dev, "devlink_alloc failed\n");
+   err = -ENOMEM;
+   goto err_devlink_alloc;
+   }
+
/* Now that we have a PCI connection, we need to do the
 * low level device setup.  This is primarily setting up
 * the Admin Queue structures and then querying for the
 * device's current profile information.
 */
-   pf = kzalloc(sizeof(*pf), GFP_KERNEL);
-   if (!pf) {
-   err = -ENOMEM;
-   goto err_pf_alloc;
-   }
+   pf = devlink_priv(devlink);
pf->next_vsi = 0;
pf->pdev = pdev;
set_bit(__I40E_DOWN, &pf->state);
 
+   pf->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY;
+   err = devlink_register(devlink, &pdev->dev);
+   if (err)
+   go

[net-next PATCH 5/6] i40e: Add TX and RX support in switchdev mode.

2016-12-29 Thread Sridhar Samudrala
In switchdev mode, the broadcast filter is not enabled on VFs. Broadcast and
unknown frames from VFs are received by the PF and passed to the corresponding
VF port representor (VFPR) netdev.
A host-based switching entity like a Linux bridge or OVS redirects these frames
to the right VFs via VFPR netdevs. Any frames sent via VFPR netdevs are sent as
directed transmits to the corresponding VFs. To enable directed transmit, the skb
metadata dst is used to pass the VF id and the frame is requeued to call the PF's
transmit routine.

Small script to demonstrate inter VF pings in switchdev mode.
PF: enp5s0f0, VFs: enp5s2,enp5s2f1 VFPRs:enp5s0f0-vf0, enp5s0f0-vf1

# rmmod i40e; modprobe i40e
# devlink dev eswitch set pci/:05:00.0 mode switchdev
# echo 2 > /sys/class/net/enp5s0f0/device/sriov_numvfs
# ip link set enp5s0f0 vf 0 mac 00:11:22:33:44:55
# ip link set enp5s0f0 vf 1 mac 00:11:22:33:44:56
# rmmod i40evf; modprobe i40evf

/* Create 2 namespaces and move the VFs into them */
# ip netns add ns0
# ip link set enp5s2 netns ns0
# ip netns exec ns0 ip addr add 192.168.1.10/24 dev enp5s2
# ip netns exec ns0 ip link set enp5s2 up
# ip netns add ns1
# ip link set enp5s2f1 netns ns1
# ip netns exec ns1 ip addr add 192.168.1.11/24 dev enp5s2f1
# ip netns exec ns1 ip link set enp5s2f1 up

/* bring up pf and vfpr netdevs */
# ip link set enp5s0f0 up
# ip link set enp5s0f0-vf0 up
# ip link set enp5s0f0-vf1 up

/* Create a linux bridge and add vfpr netdevs to it. */
# ip link add vfpr-br type bridge
# ip link set enp5s0f0-vf0 master vfpr-br
# ip link set enp5s0f0-vf1 master vfpr-br
# ip addr add 192.168.1.1/24 dev vfpr-br
# ip link set vfpr-br up

# ip netns exec ns0 ping -c3 192.168.1.11
# ip netns exec ns1 ping -c3 192.168.1.10

Signed-off-by: Sridhar Samudrala 
---
 drivers/net/ethernet/intel/i40e/i40e.h |  1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c|  4 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c| 92 --
 drivers/net/ethernet/intel/i40e/i40e_txrx.h|  1 +
 drivers/net/ethernet/intel/i40e/i40e_type.h|  3 +
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 19 -
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h |  1 +
 7 files changed, 114 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 410f83d..1c88db5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "i40e_type.h"
 #include "i40e_prototype.h"
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 03d07dd..431694e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -10938,6 +10938,7 @@ static int i40e_devlink_eswitch_mode_get(struct devlink 
*devlink, u16 *mode)
 static int i40e_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
 {
struct i40e_pf *pf = devlink_priv(devlink);
+   struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi];
struct i40e_vf *vf;
int i, err = 0;
 
@@ -10951,6 +10952,8 @@ static int i40e_devlink_eswitch_mode_set(struct devlink 
*devlink, u16 mode)
i40e_free_vfpr_netdev(vf);
}
pf->eswitch_mode = mode;
+   vsi->netdev->priv_flags |=
+   (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
break;
case DEVLINK_ESWITCH_MODE_SWITCHDEV:
for (i = 0; i < pf->num_alloc_vfs; i++) {
@@ -10958,6 +10961,7 @@ static int i40e_devlink_eswitch_mode_set(struct devlink 
*devlink, u16 mode)
i40e_alloc_vfpr_netdev(vf, i);
}
 pf->eswitch_mode = mode;
+   netif_keep_dst(vsi->netdev);
break;
default:
err = -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 352cf7c..b46ddaa 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1176,16 +1176,37 @@ static bool i40e_alloc_mapped_page(struct i40e_ring 
*rx_ring,
  * @rx_ring:  rx ring in play
  * @skb: packet to send up
  * @vlan_tag: vlan tag for packet
+ * @lpbk: is it a loopback frame?
  **/
 static void i40e_receive_skb(struct i40e_ring *rx_ring,
-struct sk_buff *skb, u16 vlan_tag)
+struct sk_buff *skb, u16 vlan_tag, bool lpbk)
 {
struct i40e_q_vector *q_vector = rx_ring->q_vector;
+   struct i40e_pf *pf = rx_ring->vsi->back;
+   struct i40e_vf *vf;
+   struct ethhdr *eth;
+   int vf_id;
 
if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
(vlan_tag & VLAN_VID_MASK))
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
 
+   if ((pf->es

[net-next PATCH 6/6] i40e: Add support for exposing VF port statistics via VFPR netdev on the host.

2016-12-29 Thread Sridhar Samudrala
By default, stats counted by HW are returned via the original ndo_get_stats64()
API. Stats counted in SW are returned via the ndo_get_offload_stats() API.

Small script to demonstrate vfpr stats in switchdev mode.
PF: enp5s0f0, VFs: enp5s2,enp5s2f1 VFPRs:enp5s0f0-vf0, enp5s0f0-vf1

# rmmod i40e; modprobe i40e
# devlink dev eswitch set pci/:05:00.0 mode switchdev
# echo 2 > /sys/class/net/enp5s0f0/device/sriov_numvfs
# ip link set enp5s0f0 vf 0 mac 00:11:22:33:44:55
# ip link set enp5s0f0 vf 1 mac 00:11:22:33:44:56
# rmmod i40evf; modprobe i40evf

/* Create 2 namespaces and move the VFs into them */
# ip netns add ns0
# ip link set enp5s2 netns ns0
# ip netns exec ns0 ip addr add 192.168.1.10/24 dev enp5s2
# ip netns exec ns0 ip link set enp5s2 up
# ip netns add ns1
# ip link set enp5s2f1 netns ns1
# ip netns exec ns1 ip addr add 192.168.1.11/24 dev enp5s2f1
# ip netns exec ns1 ip link set enp5s2f1 up

/* bring up pf and vfpr netdevs */
# ip link set enp5s0f0 up
# ip link set enp5s0f0-vf0 up
# ip link set enp5s0f0-vf1 up

/* Create a linux bridge and add vfpr netdevs to it. */
# ip link add vfpr-br type bridge
# ip link set enp5s0f0-vf0 master vfpr-br
# ip link set enp5s0f0-vf1 master vfpr-br
# ip addr add 192.168.1.1/24 dev vfpr-br
# ip link set vfpr-br up

# ip netns exec ns0 ping -c3 192.168.1.11
# ip netns exec ns1 ping -c3 192.168.1.10

# ip netns exec ns0 ip -s l show enp5s2
56: enp5s2:  mtu 1500 qdisc mq state UP mode 
DEFAULT group default qlen 1000
link/ether 00:11:22:33:44:55 brd ff:ff:ff:ff:ff:ff
RX: bytes  packets  errors  dropped overrun mcast
1468   18   0   0   0   0
TX: bytes  packets  errors  dropped carrier collsns
1398   17   0   0   0   0
# ip -s l show enp5s0f0-vf0
52: enp5s0f0-vf0:  mtu 1500 qdisc fq_codel 
master vfpr-br state UP mode DEFAULT group default qlen 1000
link/ether 68:05:ca:2e:72:68 brd ff:ff:ff:ff:ff:ff
RX: bytes  packets  errors  dropped overrun mcast
1398   17   0   0   0   0
TX: bytes  packets  errors  dropped carrier collsns
1468   18   0   0   0   0
# ip netns exec ns1 ip -s l show enp5s2f1
57: enp5s2f1:  mtu 1500 qdisc mq state UP mode 
DEFAULT group default qlen 1000
link/ether 00:11:22:33:44:56 brd ff:ff:ff:ff:ff:ff
RX: bytes  packets  errors  dropped overrun mcast
1486   18   0   0   0   0
TX: bytes  packets  errors  dropped carrier collsns
1538   19   0   0   0   0
# ip -s l show enp5s0f0-vf1
53: enp5s0f0-vf1:  mtu 1500 qdisc fq_codel 
master vfpr-br state UP mode DEFAULT group default qlen 1000
link/ether 68:05:ca:2e:72:68 brd ff:ff:ff:ff:ff:ff
RX: bytes  packets  errors  dropped overrun mcast
1538   19   0   0   0   0
TX: bytes  packets  errors  dropped carrier collsns
1486   18   0   0   0   0

Signed-off-by: Sridhar Samudrala 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c|  44 +++-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 111 +
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h |  10 ++
 3 files changed, 163 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index b46ddaa..9f04337 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1172,6 +1172,32 @@ static bool i40e_alloc_mapped_page(struct i40e_ring 
*rx_ring,
 }
 
 /**
+ * i40e_vfpr_receive_skb
+ * @vf: pointer to VF
+ * @skb: packet to send up
+ *
+ * Update skb dev to vfpr netdev and rx stats.
+ **/
+static void i40e_vfpr_receive_skb(struct i40e_vf *vf, struct sk_buff *skb)
+{
+   struct i40e_vfpr_netdev_priv *priv;
+   struct vfpr_pcpu_stats *vfpr_stats;
+
+   if (!vf->vfpr_netdev)
+   return;
+
+   skb->dev = vf->vfpr_netdev;
+
+   priv = netdev_priv(vf->vfpr_netdev);
+   vfpr_stats = this_cpu_ptr(priv->vfpr_stats);
+
+   u64_stats_update_begin(&vfpr_stats->syncp);
+   vfpr_stats->rx_packets++;
+   vfpr_stats->rx_bytes += skb->len;
+   u64_stats_update_end(&vfpr_stats->syncp);
+}
+
+/**
  * i40e_receive_skb - Send a completed packet up the stack
  * @rx_ring:  rx ring in play
  * @skb: packet to send up
@@ -1201,7 +1227,7 @@ static void i40e_receive_skb(struct i40e_ring *rx_ring,
for (vf_id = 0; vf_id < pf->num_alloc_vfs; vf_id++) {
vf = &pf->vf[vf_id];
if (ether_addr_equal(eth->h_source, vf->default_lan_addr.addr)) 
{
-   skb->dev = vf->vfpr_netdev;
+   i40e_vfpr_receive_skb(vf, skb);
break;
}
}
@@ -3072,11 +3098,25 @@ netdev_tx_t i40e_vfpr_netdev_start_xmit(struct sk_buff 
*skb,
struct i40e_vf *vf = priv->vf;
struct i40e_pf *pf = vf->pf;
struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi];
+   

[net-next PATCH 3/6] i40e: Sync link state between VFs and VFPRs

2016-12-29 Thread Sridhar Samudrala
This patch enables:
- reflecting the link state of the VFPR based on the VF admin state, and the
  link state of the VF based on the admin state of the VFPR.
- bringing the VFPR up/down sends a notification to update the VF link state.
- bringing the VF up/down causes a link state update of the VFPR.
- enabling/disabling the VF link state via ndo_set_vf_link_state updates the
  admin state of the associated VFPR.

PF: enp5s0f0, VFs: enp5s2,enp5s2f1 VFPRs:enp5s0f0-vf0, enp5s0f0-vf1
# rmmod i40e; modprobe i40e
# devlink dev eswitch set pci/:05:00.0 mode switchdev
# echo 2 > /sys/class/net/enp5s0f0/device/sriov_numvfs

# ip link set enp5s2 up
# ip link set enp5s0f0-vf0 up
# ip link set enp5s0f0-vf1 up

/* enp5s2 UP -> enp5s0f0-vf0 CARRIER ON */
# ip link show enp5s0f0-vf0
215: enp5s0f0-vf0:  mtu 1500 qdisc fq_codel 
state UP mode DEFAULT group default qlen 1000
 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff

/* enp5s0f0-vf0 UP -> enp5s2 CARRIER ON */
# ip link show enp5s2
218: enp5s2:  mtu 1500 qdisc mq state UP mode 
DEFAULT group default qlen 1000
 link/ether ea:4d:60:bc:6f:85 brd ff:ff:ff:ff:ff:ff

/* enp5s2f1 DOWN -> enp5s0f0-vf1 NO-CARRIER */
# ip link show enp5s0f0-vf1
216: enp5s0f0-vf1:  mtu 1500 qdisc fq_codel 
state DOWN mode DEFAULT group default qlen 1000
 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff

# ip link set enp5s0f0-vf0 down
# ip link set enp5s2f1 up

/* enp5s0-vf0 DOWN -> enp5s2 NO_CARRIER */
# ip link show enp5s2
218: enp5s2:  mtu 1500 qdisc mq state DOWN 
mode DEFAULT group default qlen 1000
 link/ether ea:4d:60:bc:6f:85 brd ff:ff:ff:ff:ff:ff

# ip -d link show enp5s0f0
213: enp5s0f0:  mtu 1500 qdisc noop portid 6805ca27268 
state DOWN mode DEFAULT group default qlen 1000
 link/ether 68:05:ca:2e:72:68 brd ff:ff:ff:ff:ff:ff promiscuity 0 
addrgenmode eui64
 vf 0 MAC 00:00:00:00:00:00, spoof checking on, link-state disable, trust 
off
 vf 1 MAC 00:00:00:00:00:00, spoof checking on, link-state enable, trust off

Signed-off-by: Sridhar Samudrala 
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 44 ++
 1 file changed, 44 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 6c5b296..3ea7235 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -1031,6 +1031,13 @@ void i40e_reset_vf(struct i40e_vf *vf, bool flr)
  **/
 static int i40e_vfpr_netdev_open(struct net_device *dev)
 {
+   struct i40e_vfpr_netdev_priv *priv = netdev_priv(dev);
+   struct i40e_vf *vf = priv->vf;
+
+   vf->link_forced = true;
+   vf->link_up = true;
+   i40e_vc_notify_vf_link_state(vf);
+
return 0;
 }
 
@@ -1042,6 +1049,13 @@ static int i40e_vfpr_netdev_open(struct net_device *dev)
  **/
 static int i40e_vfpr_netdev_stop(struct net_device *dev)
 {
+   struct i40e_vfpr_netdev_priv *priv = netdev_priv(dev);
+   struct i40e_vf *vf = priv->vf;
+
+   vf->link_forced = true;
+   vf->link_up = false;
+   i40e_vc_notify_vf_link_state(vf);
+
return 0;
 }
 
@@ -1127,6 +1141,13 @@ int i40e_alloc_vfpr_netdev(struct i40e_vf *vf, u16 
vf_num)
/* Delete broadcast filter for VF */
i40e_update_vf_broadcast_filter(vf, false);
 
+   /* Reset VF link as we are changing the mode to 'switchdev'. VFPR netdev
+* needs to be brought up to enable VF link.
+*/
+   vf->link_forced = true;
+   vf->link_up = false;
+   i40e_vc_notify_vf_link_state(vf);
+
return 0;
 }
 
@@ -1151,6 +1172,10 @@ void i40e_free_vfpr_netdev(struct i40e_vf *vf)
 
/* Add broadcast filter to VF */
i40e_update_vf_broadcast_filter(vf, true);
+
+   /* In legacy mode, VF link is not controlled by VFPR */
+   vf->link_forced = false;
+   i40e_vc_notify_vf_link_state(vf);
 }
 
 /**
@@ -1907,6 +1932,10 @@ static int i40e_vc_enable_queues_msg(struct i40e_vf *vf, 
u8 *msg, u16 msglen)
 
if (i40e_vsi_start_rings(pf->vsi[vf->lan_vsi_idx]))
aq_ret = I40E_ERR_TIMEOUT;
+
+   if ((0 == aq_ret) && vf->vfpr_netdev)
+   netif_carrier_on(vf->vfpr_netdev);
+
 error_param:
/* send the response to the VF */
return i40e_vc_send_resp_to_vf(vf, I40E_VIRTCHNL_OP_ENABLE_QUEUES,
@@ -1946,6 +1975,9 @@ static int i40e_vc_disable_queues_msg(struct i40e_vf *vf, 
u8 *msg, u16 msglen)
 
i40e_vsi_stop_rings(pf->vsi[vf->lan_vsi_idx]);
 
+   if ((0 == aq_ret) && vf->vfpr_netdev)
+   netif_carrier_off(vf->vfpr_netdev);
+
 error_param:
/* send the response to the VF */
return i40e_vc_send_resp_to_vf(vf, I40E_VIRTCHNL_OP_DISABLE_QUEUES,
@@ -3186,6 +3218,7 @@ int i40e_ndo_set_vf_link_state(struct net_device *netdev, 
int vf_id, int link)
struct i40e_netdev_priv *np = netdev_priv(netdev);
struct i40e_pf *pf = np->vsi->back;
struct i40e_virtchnl_pf_event pfe;
+   struct ne

[net-next PATCH 2/6] i40e: Introduce VF Port Representator(VFPR) netdevs.

2016-12-29 Thread Sridhar Samudrala
VF Port Representor (VFPR) netdevs are created for each VF if the switch mode
is set to 'switchdev'. These netdevs can be used to control and configure
VFs from the PF's namespace. They enable exposing VF statistics and
configuring/monitoring link state, MTU, filters, FDB/VLAN entries, etc. of
the VFs. Broadcast filters are not enabled in switchdev mode.

Sample script to create VF port representors
# rmmod i40e; modprobe i40e
# devlink dev eswitch set pci/:05:00.0 mode switchdev
# echo 2 > /sys/class/net/enp5s0f0/device/sriov_numvfs
# ip l show
297: enp5s0f0:  mtu 1500 qdisc noop portid 6805ca2e7268 
state DOWN mode DEFAULT group default qlen 1000
 link/ether 68:05:ca:2e:72:68 brd ff:ff:ff:ff:ff:ff
 vf 0 MAC 00:00:00:00:00:00, spoof checking on, link-state auto, trust off
 vf 1 MAC 00:00:00:00:00:00, spoof checking on, link-state auto, trust off
299: enp5s0f0-vf0:  mtu 1500 qdisc noop state DOWN mode 
DEFAULT group default qlen 1000
 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff
300: enp5s0f0-vf1:  mtu 1500 qdisc noop state DOWN mode 
DEFAULT group default qlen 1000
 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff

Signed-off-by: Sridhar Samudrala 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c|  14 +-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 151 -
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h |  14 ++
 3 files changed, 172 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index c01a620..03d07dd 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -10938,15 +10938,27 @@ static int i40e_devlink_eswitch_mode_get(struct 
devlink *devlink, u16 *mode)
 static int i40e_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
 {
struct i40e_pf *pf = devlink_priv(devlink);
-   int err = 0;
+   struct i40e_vf *vf;
+   int i, err = 0;
 
if (mode == pf->eswitch_mode)
goto done;
 
switch (mode) {
case DEVLINK_ESWITCH_MODE_LEGACY:
+   for (i = 0; i < pf->num_alloc_vfs; i++) {
+   vf = &(pf->vf[i]);
+   i40e_free_vfpr_netdev(vf);
+   }
pf->eswitch_mode = mode;
break;
+   case DEVLINK_ESWITCH_MODE_SWITCHDEV:
+   for (i = 0; i < pf->num_alloc_vfs; i++) {
+   vf = &(pf->vf[i]);
+   i40e_alloc_vfpr_netdev(vf, i);
+   }
+pf->eswitch_mode = mode;
+   break;
default:
err = -EOPNOTSUPP;
break;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index a6198b7..6c5b296 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -697,12 +697,16 @@ static int i40e_alloc_vsi_res(struct i40e_vf *vf, enum 
i40e_vsi_type type)
 "Could not add MAC filter %pM for VF 
%d\n",
vf->default_lan_addr.addr, vf->vf_id);
}
-   eth_broadcast_addr(broadcast);
-   f = i40e_add_filter(vsi, broadcast,
-   vf->port_vlan_id ? vf->port_vlan_id : -1);
-   if (!f)
-   dev_info(&pf->pdev->dev,
-"Could not allocate VF broadcast filter\n");
+
+   /* Add VF broadcast filter only in 'legacy' mode */
+   if (vsi->back->eswitch_mode == DEVLINK_ESWITCH_MODE_LEGACY) {
+   eth_broadcast_addr(broadcast);
+   f = i40e_add_filter(vsi, broadcast,
+   vf->port_vlan_id ? vf->port_vlan_id 
: -1);
+   if (!f)
+   dev_info(&pf->pdev->dev,
+"Could not allocate VF broadcast 
filter\n");
+   }
spin_unlock_bh(&vsi->mac_filter_hash_lock);
i40e_write_rx_ctl(&pf->hw, I40E_VFQF_HENA1(0, vf->vf_id),
  (u32)hena);
@@ -1020,6 +1024,136 @@ void i40e_reset_vf(struct i40e_vf *vf, bool flr)
 }
 
 /**
+ * i40e_vfpr_netdev_open
+ * @dev: network interface device structure
+ *
+ * Called when vfpr netdevice is brought up.
+ **/
+static int i40e_vfpr_netdev_open(struct net_device *dev)
+{
+   return 0;
+}
+
+/**
+ * i40e_vfpr_netdev_stop
+ * @dev: network interface device structure
+ *
+ * Called when vfpr netdevice is brought down.
+ **/
+static int i40e_vfpr_netdev_stop(struct net_device *dev)
+{
+   return 0;
+}
+
+static const struct net_device_ops i40e_vfpr_netdev_ops = {
+   .ndo_open   = i40e_vfpr_netdev_open,
+   .ndo_stop   = i40e_vfpr_netdev_stop,
+};

Re: [PATCH] [media] gp8psk: fix spelling mistake: "firmare" -> "firmware"

2016-12-29 Thread Colin Ian King
On 29/12/16 21:23, VDR User wrote:
>> -   err("firmare chunk size bigger than 64 bytes.");
>> +   err("firmware chunk size bigger than 64 bytes.");
> 
> Yup.
> 
>> -"HW don't support CMAC encrypiton, use software 
>> CMAC encrypiton\n");
>> +"HW don't support CMAC encryption, use software 
>> CMAC encryption\n");
> 
> Should be: "HW doesn't support CMAC encryption, use software CMAC
> encryption\n");
> 
Very true, I was so focused on the spelling I overlooked the grammar.
I'll re-send with that fixed.

Colin


[PATCH][V2] [media] gp8psk: fix spelling mistake: "firmare" -> "firmware"

2016-12-29 Thread Colin King
From: Colin Ian King 

Trivial fix to spelling mistake in err message. Also change "don't" to
"does not".

Signed-off-by: Colin Ian King 
---
 drivers/media/usb/dvb-usb/gp8psk.c  | 2 +-
 drivers/net/wireless/realtek/rtlwifi/core.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/media/usb/dvb-usb/gp8psk.c 
b/drivers/media/usb/dvb-usb/gp8psk.c
index 2360e7e..26461f2 100644
--- a/drivers/media/usb/dvb-usb/gp8psk.c
+++ b/drivers/media/usb/dvb-usb/gp8psk.c
@@ -161,7 +161,7 @@ static int gp8psk_load_bcm4500fw(struct dvb_usb_device *d)
goto out_free;
}
if (buflen > 64) {
-   err("firmare chunk size bigger than 64 bytes.");
+   err("firmware chunk size bigger than 64 bytes.");
goto out_free;
}
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c 
b/drivers/net/wireless/realtek/rtlwifi/core.c
index ded1493..732de0a 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1532,7 +1532,7 @@ static int rtl_op_set_key(struct ieee80211_hw *hw, enum 
set_key_cmd cmd,
key_type = AESCMAC_ENCRYPTION;
RT_TRACE(rtlpriv, COMP_SEC, DBG_DMESG, "alg:CMAC\n");
RT_TRACE(rtlpriv, COMP_SEC, DBG_DMESG,
-"HW don't support CMAC encrypiton, use software CMAC 
encrypiton\n");
+"HW does not support CMAC encryption, use software 
CMAC encryption\n");
err = -EOPNOTSUPP;
goto out_unlock;
default:
-- 
2.10.2



[PATCH net-next] net: dsa: Implement ndo_get_phys_port_id

2016-12-29 Thread Florian Fainelli
Implement ndo_get_phys_port_id() by returning the physical port number
of the switch this per-port DSA created network interface corresponds
to.

Signed-off-by: Florian Fainelli 
---
 net/dsa/slave.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 68c9eea00518..ffd91969b830 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -984,6 +984,17 @@ static void dsa_slave_poll_controller(struct net_device 
*dev)
 }
 #endif
 
+static int dsa_slave_get_phys_port_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid)
+{
+   struct dsa_slave_priv *p = netdev_priv(dev);
+
+   ppid->id_len = sizeof(p->port);
+   memcpy(ppid->id, &p->port, ppid->id_len);
+
+   return 0;
+}
+
 void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
 {
ops->get_sset_count = dsa_cpu_port_get_sset_count;
@@ -1031,6 +1042,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = 
{
.ndo_bridge_getlink = switchdev_port_bridge_getlink,
.ndo_bridge_setlink = switchdev_port_bridge_setlink,
.ndo_bridge_dellink = switchdev_port_bridge_dellink,
+   .ndo_get_phys_port_id   = dsa_slave_get_phys_port_id,
 };
 
 static const struct switchdev_ops dsa_slave_switchdev_ops = {
-- 
2.9.3



[PATCH net] net: ipv4: dst for local input routes should use l3mdev if relevant

2016-12-29 Thread David Ahern
IPv4 output routes already use the l3mdev device instead of loopback for dsts
where applicable. Change local input routes to do the same.

This fixes ICMP responses for unreachable UDP ports which are directed
to the wrong table after commit 9d1a6c4ea43e4 because local_input
routes use the loopback device. Moving from the ingress device to loopback
loses the L3 domain, causing responses based on the dst to get lost.

Fixes: 9d1a6c4ea43e4 ("net: icmp_route_lookup should use rt dev to
   determine L3 domain")
Signed-off-by: David Ahern 
---
 net/ipv4/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a82a11747b3f..0fcac8e7a2b2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1914,7 +1914,8 @@ out:  return err;
}
}
 
-   rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
+   rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
+  flags | RTCF_LOCAL, res.type,
   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
-- 
2.1.4



[PATCH net-next] net: Allow IP_MULTICAST_IF to set index to L3 slave

2016-12-29 Thread David Ahern
IP_MULTICAST_IF fails if sk_bound_dev_if is already set and the new index
does not match it. e.g.,

ntpd[15381]: setsockopt IP_MULTICAST_IF 192.168.1.23 fails: Invalid argument

Relax the check in setsockopt to allow setting mc_index to an L3 slave if
sk_bound_dev_if points to an L3 master.

Make a similar change for IPv6. In this case change the device lookup to
take the rcu_read_lock avoiding a refcnt. The rcu lock is also needed for
the lookup of a potential L3 master device.

This really only silences a setsockopt failure since uses of mc_index are
secondary to sk_bound_dev_if if it is set. In both cases, if either index
is an L3 slave or master, lookups are directed to the same FIB table so
relaxing the check at setsockopt time causes no harm.

Patch is based on a suggested change by Darwin for a problem noted in
their code base.

Suggested-by: Darwin Dingel 
Signed-off-by: David Ahern 
---
 net/ipv4/ip_sockglue.c   |  7 ++-
 net/ipv6/ipv6_sockglue.c | 16 
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 8b13881ed064..72071a9a348d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -843,6 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
{
struct ip_mreqn mreq;
struct net_device *dev = NULL;
+   int midx;
 
if (sk->sk_type == SOCK_STREAM)
goto e_inval;
@@ -887,11 +888,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
err = -EADDRNOTAVAIL;
if (!dev)
break;
+
+   midx = l3mdev_master_ifindex(dev);
+
dev_put(dev);
 
err = -EINVAL;
if (sk->sk_bound_dev_if &&
-   mreq.imr_ifindex != sk->sk_bound_dev_if)
+   mreq.imr_ifindex != sk->sk_bound_dev_if &&
+   (!midx || midx != sk->sk_bound_dev_if))
break;
 
inet->mc_index = mreq.imr_ifindex;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3ba530373560..bdf1227fd433 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -595,16 +595,24 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
int optname,
 
if (val) {
struct net_device *dev;
+   int midx;
 
-   if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val)
-   goto e_inval;
+   rcu_read_lock();
 
-   dev = dev_get_by_index(net, val);
+   dev = dev_get_by_index_rcu(net, val);
if (!dev) {
+   rcu_read_unlock();
retv = -ENODEV;
break;
}
-   dev_put(dev);
+   midx = l3mdev_master_ifindex_rcu(dev);
+
+   rcu_read_unlock();
+
+   if (sk->sk_bound_dev_if &&
+   sk->sk_bound_dev_if != val &&
+   (!midx || midx != sk->sk_bound_dev_if))
+   goto e_inval;
}
np->mcast_oif = val;
retv = 0;
-- 
2.1.4
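
A user-space sketch of the case being relaxed (the VRF name "vrf-blue" and
slave "eth1" are illustrative assumptions): the socket is bound to an L3
master device, and IP_MULTICAST_IF then selects one of its slaves. Before
this patch the second setsockopt failed with EINVAL; with it, the call is
accepted since both indices resolve to the same FIB table.

#include <net/if.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct ip_mreqn mreq;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	/* sk_bound_dev_if -> the VRF (L3 master) device */
	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "vrf-blue",
		       strlen("vrf-blue")) < 0)
		perror("SO_BINDTODEVICE");

	/* mc_index -> an interface enslaved to that VRF */
	memset(&mreq, 0, sizeof(mreq));
	mreq.imr_ifindex = if_nametoindex("eth1");
	if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF, &mreq, sizeof(mreq)) < 0)
		perror("IP_MULTICAST_IF");

	close(fd);
	return 0;
}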



[PATCH net-next] liquidio: optimize reads from Octeon PCI console

2016-12-29 Thread Felix Manlunas
Reads from Octeon PCI console are inefficient because before each read
operation, a dynamic mapping to Octeon DRAM is set up.  This patch replaces
the repeated setup of a dynamic mapping with a one-time setup of a static
mapping.

Signed-off-by: Felix Manlunas 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
---
 .../net/ethernet/cavium/liquidio/octeon_config.h| 10 +++---
 .../net/ethernet/cavium/liquidio/octeon_console.c   | 10 ++
 .../net/ethernet/cavium/liquidio/octeon_device.h|  6 ++
 .../net/ethernet/cavium/liquidio/octeon_mem_ops.c   | 21 -
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_config.h 
b/drivers/net/ethernet/cavium/liquidio/octeon_config.h
index 1cb3514..b3dc2e9 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_config.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_config.h
@@ -429,15 +429,11 @@ struct octeon_config {
 
 /* The following config values are fixed and should not be modified. */
 
-/* Maximum address space to be mapped for Octeon's BAR1 index-based access. */
-#define  MAX_BAR1_MAP_INDEX 2
+#define  BAR1_INDEX_DYNAMIC_MAP  2
+#define  BAR1_INDEX_STATIC_MAP  15
 #define  OCTEON_BAR1_ENTRY_SIZE (4 * 1024 * 1024)
 
-/* BAR1 Index 0 to (MAX_BAR1_MAP_INDEX - 1) for normal mapped memory access.
- * Bar1 register at MAX_BAR1_MAP_INDEX used by driver for dynamic access.
- */
-#define  MAX_BAR1_IOREMAP_SIZE  ((MAX_BAR1_MAP_INDEX + 1) * \
-OCTEON_BAR1_ENTRY_SIZE)
+#define  MAX_BAR1_IOREMAP_SIZE  (16 * OCTEON_BAR1_ENTRY_SIZE)
 
 /* Response lists - 1 ordered, 1 unordered-blocking, 1 unordered-nonblocking
  * NoResponse Lists are now maintained with each IQ. (Dec' 2007).
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_console.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
index 3265e0b..42b673d 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_console.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
@@ -549,6 +549,16 @@ int octeon_init_consoles(struct octeon_device *oct)
return ret;
}
 
+   /* Dedicate one of Octeon's BAR1 index registers to create a static
+* mapping to a region of Octeon DRAM that contains the PCI console
+* named block.
+*/
+   oct->console_nb_info.bar1_index = BAR1_INDEX_STATIC_MAP;
+   oct->fn_list.bar1_idx_setup(oct, addr, oct->console_nb_info.bar1_index,
+   true);
+   oct->console_nb_info.dram_region_base = addr
+   & ~(OCTEON_BAR1_ENTRY_SIZE - 1ULL);
+
/* num_consoles > 0, is an indication that the consoles
 * are accessible
 */
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h 
b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
index 18f6836..c301a38 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
@@ -477,6 +477,12 @@ struct octeon_device {
/* Console caches */
struct octeon_console console[MAX_OCTEON_MAPS];
 
+   /* Console named block info */
+   struct {
+   u64 dram_region_base;
+   int bar1_index;
+   } console_nb_info;
+
/* Coprocessor clock rate. */
u64 coproc_clock_rate;
 
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c
index 13a18c9..5cd96e7 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c
@@ -23,7 +23,7 @@
 #include "response_manager.h"
 #include "octeon_device.h"
 
-#define MEMOPS_IDX   MAX_BAR1_MAP_INDEX
+#define MEMOPS_IDX   BAR1_INDEX_DYNAMIC_MAP
 
 #ifdef __BIG_ENDIAN_BITFIELD
 static inline void
@@ -96,6 +96,25 @@ __octeon_pci_rw_core_mem(struct octeon_device *oct, u64 addr,
u32 copy_len = 0, index_reg_val = 0;
unsigned long flags;
u8 __iomem *mapped_addr;
+   u64 static_mapping_base;
+
+   static_mapping_base = oct->console_nb_info.dram_region_base;
+
+   if (static_mapping_base &&
+   static_mapping_base == (addr & ~(OCTEON_BAR1_ENTRY_SIZE - 1ULL))) {
+   int bar1_index = oct->console_nb_info.bar1_index;
+
+   mapped_addr = oct->mmio[1].hw_addr
+   + (bar1_index << ilog2(OCTEON_BAR1_ENTRY_SIZE))
+   + (addr & (OCTEON_BAR1_ENTRY_SIZE - 1ULL));
+
+   if (op)
+   octeon_pci_fastread(oct, mapped_addr, hostbuf, len);
+   else
+   octeon_pci_fastwrite(oct, mapped_addr, hostbuf, len);
+
+   return;
+   }
 
spin_lock_irqsave(&oct->mem_access_lock, flags);
 


Re: [PATCH v5] net: dev_weight: TX/RX orthogonality

2016-12-29 Thread David Miller
From: Matthias Tafelmeier 
Date: Thu, 29 Dec 2016 21:37:21 +0100

> Oftenly, introducing side effects on packet processing on the other half
> of the stack by adjusting one of TX/RX via sysctl is not desirable.
> There are cases of demand for asymmetric, orthogonal configurability.
> 
> This holds true especially for nodes where RPS for RFS usage on top is
> configured and therefore use the 'old dev_weight'. This is quite a
> common base configuration setup nowadays, even with NICs of superior 
> processing
> support (e.g. aRFS).
> 
> A good example use case are nodes acting as noSQL data bases with a
> large number of tiny requests and rather fewer but large packets as responses.
> It's affordable to have large budget and rx dev_weights for the
> requests. But as a side effect having this large a number on TX
> processed in one run can overwhelm drivers.
> 
> This patch therefore introduces an independent configurability via sysctl to
> userland.
> 
> Signed-off-by: Matthias Tafelmeier 

Much better, applied, thanks.
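
For reference, the effective per-cycle quotas after this change are simply
the products recomputed in proc_do_dev_weight(); a stand-alone sketch of
that arithmetic with example bias values:

#include <stdio.h>

int main(void)
{
	int dev_weight = 64;		/* existing backlog weight, default 64 */
	int rx_bias = 1, tx_bias = 8;	/* example: scale only the TX quota up */

	printf("RX quota per softirq cycle: %d\n", dev_weight * rx_bias);
	printf("TX quota per softirq cycle: %d\n", dev_weight * tx_bias);
	return 0;
}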


RE: [PATCH net-next] liquidio: optimize reads from Octeon PCI console

2016-12-29 Thread Chickles, Derek
>  #ifdef __BIG_ENDIAN_BITFIELD
>  static inline void
> @@ -96,6 +96,25 @@ __octeon_pci_rw_core_mem(struct octeon_device
> *oct, u64 addr,
>   u32 copy_len = 0, index_reg_val = 0;
>   unsigned long flags;
>   u8 __iomem *mapped_addr;
> + u64 static_mapping_base;
> +
> + static_mapping_base = oct->console_nb_info.dram_region_base;
> +

Does this work when there are multiple cards?
 


Re: [PATCH 1/1] r8169: fix the typo

2016-12-29 Thread Yanjun Zhu

Hi,

Please comment on this patch.

Zhu Yanjun

On 2016/12/29 11:11, Zhu Yanjun wrote:

From the Realtek data sheet, PID0 should be bit 0.

Signed-off-by: Zhu Yanjun 
---
  drivers/net/ethernet/realtek/r8169.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c 
b/drivers/net/ethernet/realtek/r8169.c
index 44389c9..8f1623b 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -696,7 +696,7 @@ enum rtl_tx_desc_bit_1 {
  enum rtl_rx_desc_bit {
/* Rx private */
PID1= (1 << 18), /* Protocol ID bit 1/2 */
-   PID0= (1 << 17), /* Protocol ID bit 2/2 */
+   PID0= (1 << 17), /* Protocol ID bit 0/2 */
  
  #define RxProtoUDP	(PID1)

  #define RxProtoTCP(PID0)




Re: [PATCH net-next] liquidio: optimize reads from Octeon PCI console

2016-12-29 Thread Felix Manlunas
Chickles, Derek  wrote on Thu [2016-Dec-29 17:42:34 
-0800]:
> >  #ifdef __BIG_ENDIAN_BITFIELD
> >  static inline void
> > @@ -96,6 +96,25 @@ __octeon_pci_rw_core_mem(struct octeon_device
> > *oct, u64 addr,
> > u32 copy_len = 0, index_reg_val = 0;
> > unsigned long flags;
> > u8 __iomem *mapped_addr;
> > +   u64 static_mapping_base;
> > +
> > +   static_mapping_base = oct->console_nb_info.dram_region_base;
> > +
> 
> Does this work when there are multiple cards?

Yes, it works when there are multiple cards.
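
It works per card because the window base and BAR1 index are kept in each
octeon_device's console_nb_info; the hit test itself is plain address
arithmetic. A stand-alone sketch of that test with example addresses:

#include <stdint.h>
#include <stdio.h>

#define ENTRY_SIZE	(4ULL * 1024 * 1024)	/* OCTEON_BAR1_ENTRY_SIZE */

int main(void)
{
	uint64_t region_base = 0x20000000ULL & ~(ENTRY_SIZE - 1); /* example base */
	uint64_t addr        = 0x20001234ULL;                     /* example target */

	if (region_base && region_base == (addr & ~(ENTRY_SIZE - 1)))
		printf("static window hit, offset 0x%llx\n",
		       (unsigned long long)(addr & (ENTRY_SIZE - 1)));
	else
		printf("outside the window, fall back to the dynamic mapping\n");
	return 0;
}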


[PATCH v4 2/2] net: 3com: typhoon: typhoon_init_one: make return values more specific

2016-12-29 Thread Thomas Preisner
In some cases the return value of a failing function is not being used
and the function typhoon_init_one() returns another negative error code
instead.

Signed-off-by: Thomas Preisner 
Signed-off-by: Milan Stephan 
---
 drivers/net/ethernet/3com/typhoon.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/3com/typhoon.c 
b/drivers/net/ethernet/3com/typhoon.c
index 25f2e92..1986ad1 100644
--- a/drivers/net/ethernet/3com/typhoon.c
+++ b/drivers/net/ethernet/3com/typhoon.c
@@ -2370,9 +2370,9 @@ typhoon_init_one(struct pci_dev *pdev, const struct 
pci_device_id *ent)
 * 4) Get the hardware address.
 * 5) Put the card to sleep.
 */
-   if (typhoon_reset(ioaddr, WaitSleep) < 0) {
+   err = typhoon_reset(ioaddr, WaitSleep);
+   if (err < 0) {
err_msg = "could not reset 3XP";
-   err = -EIO;
goto error_out_dma;
}
 
@@ -2386,16 +2386,16 @@ typhoon_init_one(struct pci_dev *pdev, const struct 
pci_device_id *ent)
typhoon_init_interface(tp);
typhoon_init_rings(tp);
 
-   if(typhoon_boot_3XP(tp, TYPHOON_STATUS_WAITING_FOR_HOST) < 0) {
+   err = typhoon_boot_3XP(tp, TYPHOON_STATUS_WAITING_FOR_HOST);
+   if (err < 0) {
err_msg = "cannot boot 3XP sleep image";
-   err = -EIO;
goto error_out_reset;
}
 
INIT_COMMAND_WITH_RESPONSE(&xp_cmd, TYPHOON_CMD_READ_MAC_ADDRESS);
-   if(typhoon_issue_command(tp, 1, &xp_cmd, 1, xp_resp) < 0) {
+   err = typhoon_issue_command(tp, 1, &xp_cmd, 1, xp_resp);
+   if (err < 0) {
err_msg = "cannot read MAC address";
-   err = -EIO;
goto error_out_reset;
}
 
@@ -2430,9 +2430,9 @@ typhoon_init_one(struct pci_dev *pdev, const struct 
pci_device_id *ent)
if(xp_resp[0].numDesc != 0)
tp->capabilities |= TYPHOON_WAKEUP_NEEDS_RESET;
 
-   if(typhoon_sleep(tp, PCI_D3hot, 0) < 0) {
+   err = typhoon_sleep(tp, PCI_D3hot, 0);
+   if (err < 0) {
err_msg = "cannot put adapter to sleep";
-   err = -EIO;
goto error_out_reset;
}
 
-- 
2.7.4



[PATCH v4 1/2] net: 3com: typhoon: typhoon_init_one: fix incorrect return values

2016-12-29 Thread Thomas Preisner
In a few cases the err-variable is not set to a negative error code if a
function call in typhoon_init_one() fails and thus 0 is returned
instead.
It may be better to set err to the appropriate negative error
code before returning.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188841

Reported-by: Pan Bian 
Signed-off-by: Thomas Preisner 
Signed-off-by: Milan Stephan 
---
 drivers/net/ethernet/3com/typhoon.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/3com/typhoon.c 
b/drivers/net/ethernet/3com/typhoon.c
index 9fe3990..25f2e92 100644
--- a/drivers/net/ethernet/3com/typhoon.c
+++ b/drivers/net/ethernet/3com/typhoon.c
@@ -2402,8 +2402,9 @@ typhoon_init_one(struct pci_dev *pdev, const struct 
pci_device_id *ent)
*(__be16 *)&dev->dev_addr[0] = htons(le16_to_cpu(xp_resp[0].parm1));
*(__be32 *)&dev->dev_addr[2] = htonl(le32_to_cpu(xp_resp[0].parm2));
 
-   if(!is_valid_ether_addr(dev->dev_addr)) {
+   if (!is_valid_ether_addr(dev->dev_addr)) {
err_msg = "Could not obtain valid ethernet address, aborting";
+   err = -EIO;
goto error_out_reset;
}
 
@@ -2411,7 +2412,8 @@ typhoon_init_one(struct pci_dev *pdev, const struct 
pci_device_id *ent)
 * later when we print out the version reported.
 */
INIT_COMMAND_WITH_RESPONSE(&xp_cmd, TYPHOON_CMD_READ_VERSIONS);
-   if(typhoon_issue_command(tp, 1, &xp_cmd, 3, xp_resp) < 0) {
+   err = typhoon_issue_command(tp, 1, &xp_cmd, 3, xp_resp);
+   if (err < 0) {
err_msg = "Could not get Sleep Image version";
goto error_out_reset;
}
@@ -2453,7 +2455,8 @@ typhoon_init_one(struct pci_dev *pdev, const struct 
pci_device_id *ent)
dev->features = dev->hw_features |
NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_RXCSUM;
 
-   if(register_netdev(dev) < 0) {
+   err = register_netdev(dev);
+   if (err < 0) {
err_msg = "unable to register netdev";
goto error_out_reset;
}
-- 
2.7.4



Re: Re: [PATCH v3 2/2] drivers: net: ethernet: 3com: fix return value

2016-12-29 Thread Thomas Preisner
On Tue, 2016-12-27 at 22:17:35 +0100, David Dillow wrote:
>On Sun, 2016-12-25 at 01:30 +0100, Thomas Preisner wrote:
>> In some cases the return value of a failing function is not being used
>> and the function typhoon_init_one() returns another negative error
>> code instead.
>
>I'm not sure these changes are especially valuable, since we'll need to
>look at the dmesg log anyways to figure out what went wrong, but again I
>don't feel strongly.
>
>Fix up the subject issues and I'm happy to ack them.

As requested, here are the patch sets with the fixed subjects.
The subjects as well as the subject prefixes are more precise now.
Hopefully that is OK.

Patch 1:
Makes the function typhoon_init_one() return a negative error code instead of 0.

Patch 2 [Optional]:
Makes the function typhoon_init_one() return the return value of the
corresponding failing function calls instead of a "fixed" negative error code.

With Regards,
Milan and Thomas



Re: [net-next PATCH 2/6] i40e: Introduce VF Port Representor (VFPR) netdevs.

2016-12-29 Thread David Miller
From: Sridhar Samudrala 
Date: Thu, 29 Dec 2016 22:20:59 -0800

>   /* VF resources get allocated during reset */
>   i40e_reset_vf(&vfs[i], false);
>  
> + if (pf->eswitch_mode == DEVLINK_ESWITCH_MODE_SWITCHDEV)
> + i40e_alloc_vfpr_netdev(&vfs[i], i);

You aren't checking for, and unwinding from, errors signalled
by i40e_alloc_vfpr_netdev().
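
For readers following along, the shape of check-and-unwind being asked for
looks roughly like this; a sketch only, assuming i40e_alloc_vfpr_netdev()
returns a negative errno, and using a hypothetical i40e_free_vfpr_netdev()
teardown helper and err_alloc label that are not part of the posted series:

for (i = 0; i < pf->num_alloc_vfs; i++) {
	/* VF resources get allocated during reset */
	i40e_reset_vf(&vfs[i], false);

	if (pf->eswitch_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV)
		continue;

	ret = i40e_alloc_vfpr_netdev(&vfs[i], i);
	if (ret) {
		/* unwind the VFPR netdevs created so far */
		while (i--)
			i40e_free_vfpr_netdev(&vfs[i]);
		goto err_alloc;
	}
}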


Re: [net-next PATCH 0/6] i40e: Add VF port representor support for SR-IOV VFs

2016-12-29 Thread David Miller
From: Sridhar Samudrala 
Date: Thu, 29 Dec 2016 22:20:57 -0800

> - Patch 1 introduces devlink interface to get/set the mode of SRIOV switch.
> - Patch 2 adds support to create VF port representor(VFPR) netdevs associated
>   with SR-IOV VFs that can be used to control/configure VFs from PF name 
> space.
> - Patch 3 enables syncing link state between VFs and VFPRs.
> - Patch 4 adds a new type to metadata_dst to allow passing VF id to lower 
> device.
> - Patch 5 adds TX and RX support to VFPR netdevs.
> - Patch 6 enables HW and SW VFPR statistics to be exposed via netlink on VFPR
>   netdevs.

Patch 4 did not show up on the lists.

The computer you did this work on is configured with a time in the future.


Re: [PATCH] sh_eth: fix branch prediction in sh_eth_interrupt()

2016-12-29 Thread David Miller
From: Sergei Shtylyov 
Date: Fri, 30 Dec 2016 00:07:38 +0300

> IIUC, likely()/unlikely() should apply to the whole *if* statement's
> expression, not just a part of it -- fix such an expression in
> sh_eth_interrupt() accordingly...
> 
> Fixes: 283e38db65e7 ("sh_eth: Fix serialisation of interrupt disable with 
> interrupt & NAPI handlers")
> Signed-off-by: Sergei Shtylyov 
> 
> ---
> The patch is against DaveM's 'net-next.git' repo; I'm not sure if it should
> be  targeted to the 'net.git' repo instead...

I decided to apply this to 'net', thanks.
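
For readers skimming the digest, the pattern being corrected is a branch hint
that annotates only one operand of the condition; a generic before/after
sketch with placeholder names (not the actual sh_eth expression):

/* Before: the hint covers only part of the && expression. */
if (unlikely(intr_status & ERR_MASK) && napi_pending)
	handle_error();

/* After: the hint covers the whole controlling expression. */
if (unlikely((intr_status & ERR_MASK) && napi_pending))
	handle_error();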


Re: [PATCH net-next] net: dsa: Implement ndo_get_phys_port_id

2016-12-29 Thread David Miller
From: Florian Fainelli 
Date: Thu, 29 Dec 2016 14:20:56 -0800

> Implement ndo_get_phys_port_id() by returning the physical port number
> of the switch this per-port DSA created network interface corresponds
> to.
> 
> Signed-off-by: Florian Fainelli 

Applied, thanks.
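
For reference, such a callback usually just copies the switch port index into
the opaque ID; a minimal sketch, with a hypothetical private struct and field
name rather than the exact dsa_slave_priv layout:

/* Sketch only: report the per-port switch index as the phys port id. */
static int example_get_phys_port_id(struct net_device *dev,
				    struct netdev_phys_item_id *ppid)
{
	struct example_port_priv *p = netdev_priv(dev);	/* hypothetical priv */

	ppid->id_len = sizeof(p->port_index);		/* hypothetical field */
	memcpy(ppid->id, &p->port_index, ppid->id_len);

	return 0;
}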


Re: [PATCH net-next rfc 0/6] convert tc_verd to integer bitfields

2016-12-29 Thread David Miller
From: Willem de Bruijn 
Date: Wed, 28 Dec 2016 14:13:24 -0500

> The skb tc_verd field takes up two bytes but uses far fewer bits.
> Convert the remaining use cases to bitfields that fit in existing
> holes (depending on config options) and potentially save the two
> bytes in struct sk_buff.

I like the looks of this, for sure :-)


Re: [PATCH net-next] liquidio: optimize reads from Octeon PCI console

2016-12-29 Thread David Miller
From: Felix Manlunas 
Date: Thu, 29 Dec 2016 17:04:47 -0800

> Reads from Octeon PCI console are inefficient because before each read
> operation, a dynamic mapping to Octeon DRAM is set up.  This patch replaces
> the repeated setup of a dynamic mapping with a one-time setup of a static
> mapping.
> 
> Signed-off-by: Felix Manlunas 
> Signed-off-by: Raghu Vatsavayi 
> Signed-off-by: Derek Chickles 
> Signed-off-by: Satanand Burla 

Applied.


Re: [PATCH net] net: ipv4: dst for local input routes should use l3mdev if relevant

2016-12-29 Thread David Miller
From: David Ahern 
Date: Thu, 29 Dec 2016 15:29:03 -0800

> IPv4 output routes already use l3mdev device instead of loopback for dst's
> if it is applicable. Change local input routes to do the same.
> 
> This fixes icmp responses for unreachable UDP ports which are directed
> to the wrong table after commit 9d1a6c4ea43e4 because local_input
> routes use the loopback device. Moving from ingress device to loopback
> loses the L3 domain, causing responses based on the dst to get lost.
> 
> Fixes: 9d1a6c4ea43e4 ("net: icmp_route_lookup should use rt dev to
>  determine L3 domain")
> Signed-off-by: David Ahern 

Applied and queued up for -stable, thanks David.
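
The essence of the change being described is choosing the L3 master device
over loopback when picking the dst device for a local input route; a rough
sketch, assuming the l3mdev_master_dev_rcu() helper from include/net/l3mdev.h
and not necessarily matching the applied hunk:

/* Sketch only: if the ingress device is enslaved to an l3mdev (VRF),
 * use the master for the RTCF_LOCAL dst so the L3 domain is preserved;
 * otherwise fall back to loopback as before.
 */
static struct net_device *local_route_dst_dev(struct net *net,
					      struct net_device *dev)
{
	struct net_device *master = l3mdev_master_dev_rcu(dev);

	return master ? : net->loopback_dev;
}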


Re: [PATCH] igmp: Make igmp group member RFC 3376 compliant

2016-12-29 Thread David Miller
From: Michal Tesar 
Date: Wed, 7 Dec 2016 13:38:57 +0100

> would it be possible to have another look at this patch and reconsider
> its behavior? I really believe that current code does not behave
> correctly.

Please resubmit your patch.


Re: [PATCH net-next V2 3/3] tun: rx batching

2016-12-29 Thread Jason Wang



On 2016-12-30 00:35, David Miller wrote:

From: Jason Wang 
Date: Wed, 28 Dec 2016 16:09:31 +0800


+   spin_lock(&queue->lock);
+   qlen = skb_queue_len(queue);
+   if (qlen > rx_batched)
+   goto drop;
+   __skb_queue_tail(queue, skb);
+   if (!more || qlen + 1 > rx_batched) {
+   __skb_queue_head_init(&process_queue);
+   skb_queue_splice_tail_init(queue, &process_queue);
+   rcv = true;
+   }
+   spin_unlock(&queue->lock);

Since you always clear the 'queue' when you insert the skb that hits
the limit, I don't see how the "goto drop" path can be possibly taken.


True, will fix this.

Thanks


[PATCH net-next V3 0/3] vhost_net tx batching

2016-12-29 Thread Jason Wang
Hi:

This series tries to implement tx batching support for vhost. This was
done by using MSG_MORE as a hint for the underlying socket. The backend
(e.g. tap) can then batch the packets temporarily in a list and
submit them all once the number of batched packets exceeds a limit.

Tests show an obvious improvement with guest pktgen over
mlx4 (noqueue) on the host:

Mpps  -+%
rx_batched=0  0.90  +0%
rx_batched=4  0.97  +7.8%
rx_batched=8  0.97  +7.8%
rx_batched=16 0.98  +8.9%
rx_batched=32 1.03  +14.4%
rx_batched=48 1.09  +21.1%
rx_batched=64 1.02  +13.3%

Changes from V2:
- remove useless queue limitation check (and we don't drop any packets now)

Changes from V1:
- drop the NAPI handler since we don't use NAPI now
- fix the issue that may exceed the max pending count of zerocopy
- more improvements on available buffer detection
- move the limitation of batched packets from vhost to tuntap

Please review.

Thanks

Jason Wang (3):
  vhost: better detection of available buffers
  vhost_net: tx batching
  tun: rx batching

 drivers/net/tun.c | 50 --
 drivers/vhost/net.c   | 23 ---
 drivers/vhost/vhost.c |  8 ++--
 3 files changed, 70 insertions(+), 11 deletions(-)

-- 
2.7.4



[PATCH net-next V3 3/3] tun: rx batching

2016-12-29 Thread Jason Wang
We can only process one packet at a time during sendmsg(). This often
leads to bad cache utilization under heavy load. So this patch tries to
do some batching during rx before submitting packets to the host network
stack. This is done by accepting MSG_MORE as a hint from the sendmsg()
caller: if it is set, batch the packet temporarily in a linked list and
submit them all once MSG_MORE is cleared.

Tests were done by pktgen (burst=128) in guest over mlx4(noqueue) on host:

  Mpps  -+%
rx_batched=0  0.90  +0%
rx_batched=4  0.97  +7.8%
rx_batched=8  0.97  +7.8%
rx_batched=16 0.98  +8.9%
rx_batched=32 1.03  +14.4%
rx_batched=48 1.09  +21.1%
rx_batched=64 1.02  +13.3%

The maximum number of batched packets is specified through a module
parameter.

Signed-off-by: Jason Wang 
---
 drivers/net/tun.c | 50 --
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index cd8e02c..a268ed9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,10 @@
 
 #include 
 
+static int rx_batched;
+module_param(rx_batched, int, 0444);
+MODULE_PARM_DESC(rx_batched, "Number of packets batched in rx");
+
 /* Uncomment to enable debugging */
 /* #define TUN_DEBUG 1 */
 
@@ -522,6 +526,7 @@ static void tun_queue_purge(struct tun_file *tfile)
while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
kfree_skb(skb);
 
+   skb_queue_purge(&tfile->sk.sk_write_queue);
skb_queue_purge(&tfile->sk.sk_error_queue);
 }
 
@@ -1140,10 +1145,36 @@ static struct sk_buff *tun_alloc_skb(struct tun_file 
*tfile,
return skb;
 }
 
+static void tun_rx_batched(struct tun_file *tfile, struct sk_buff *skb,
+  int more)
+{
+   struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
+   struct sk_buff_head process_queue;
+   int qlen;
+   bool rcv = false;
+
+   spin_lock(&queue->lock);
+   qlen = skb_queue_len(queue);
+   __skb_queue_tail(queue, skb);
+   if (!more || qlen == rx_batched) {
+   __skb_queue_head_init(&process_queue);
+   skb_queue_splice_tail_init(queue, &process_queue);
+   rcv = true;
+   }
+   spin_unlock(&queue->lock);
+
+   if (rcv) {
+   local_bh_disable();
+   while ((skb = __skb_dequeue(&process_queue)))
+   netif_receive_skb(skb);
+   local_bh_enable();
+   }
+}
+
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
void *msg_control, struct iov_iter *from,
-   int noblock)
+   int noblock, bool more)
 {
struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
struct sk_buff *skb;
@@ -1283,10 +1314,15 @@ static ssize_t tun_get_user(struct tun_struct *tun, 
struct tun_file *tfile,
skb_probe_transport_header(skb, 0);
 
rxhash = skb_get_hash(skb);
+
 #ifndef CONFIG_4KSTACKS
-   local_bh_disable();
-   netif_receive_skb(skb);
-   local_bh_enable();
+   if (!rx_batched) {
+   local_bh_disable();
+   netif_receive_skb(skb);
+   local_bh_enable();
+   } else {
+   tun_rx_batched(tfile, skb, more);
+   }
 #else
netif_rx_ni(skb);
 #endif
@@ -1312,7 +1348,8 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
if (!tun)
return -EBADFD;
 
-   result = tun_get_user(tun, tfile, NULL, from, file->f_flags & 
O_NONBLOCK);
+   result = tun_get_user(tun, tfile, NULL, from,
+ file->f_flags & O_NONBLOCK, false);
 
tun_put(tun);
return result;
@@ -1570,7 +1607,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr 
*m, size_t total_len)
return -EBADFD;
 
ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
-  m->msg_flags & MSG_DONTWAIT);
+  m->msg_flags & MSG_DONTWAIT,
+  m->msg_flags & MSG_MORE);
tun_put(tun);
return ret;
 }
-- 
2.7.4



[PATCH net-next V3 1/3] vhost: better detection of available buffers

2016-12-29 Thread Jason Wang
This patch does several tweaks on vhost_vq_avail_empty() for better
performance:

- check the cached avail index first, which avoids a userspace memory access.
- use unlikely() for the failure of userspace access.
- check vq->last_avail_idx instead of the cached avail index as the last
  step.

This patch is needed for batching support, which needs to peek at
whether or not there are still available buffers in the ring.

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index d643260..9f11838 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2241,11 +2241,15 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
__virtio16 avail_idx;
int r;
 
+   if (vq->avail_idx != vq->last_avail_idx)
+   return false;
+
r = vhost_get_user(vq, avail_idx, &vq->avail->idx);
-   if (r)
+   if (unlikely(r))
return false;
+   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
 
-   return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
+   return vq->avail_idx == vq->last_avail_idx;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
-- 
2.7.4



[PATCH net-next V3 2/3] vhost_net: tx batching

2016-12-29 Thread Jason Wang
This patch tries to utilize tuntap rx batching by peeking at the tx
virtqueue during transmission: if there are more available buffers in
the virtqueue, set the MSG_MORE flag as a hint for the backend
(e.g. tuntap) to batch the packets.

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c | 23 ---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 5dc3465..c42e9c3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -351,6 +351,15 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
return r;
 }
 
+static bool vhost_exceeds_maxpend(struct vhost_net *net)
+{
+   struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+   struct vhost_virtqueue *vq = &nvq->vq;
+
+   return (nvq->upend_idx + vq->num - VHOST_MAX_PEND) % UIO_MAXIOV
+   == nvq->done_idx;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -394,8 +403,7 @@ static void handle_tx(struct vhost_net *net)
/* If more outstanding DMAs, queue the work.
 * Handle upend_idx wrap around
 */
-   if (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND)
- % UIO_MAXIOV == nvq->done_idx))
+   if (unlikely(vhost_exceeds_maxpend(net)))
break;
 
head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
@@ -454,6 +462,16 @@ static void handle_tx(struct vhost_net *net)
msg.msg_control = NULL;
ubufs = NULL;
}
+
+   total_len += len;
+   if (total_len < VHOST_NET_WEIGHT &&
+   !vhost_vq_avail_empty(&net->dev, vq) &&
+   likely(!vhost_exceeds_maxpend(net))) {
+   msg.msg_flags |= MSG_MORE;
+   } else {
+   msg.msg_flags &= ~MSG_MORE;
+   }
+
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(sock, &msg, len);
if (unlikely(err < 0)) {
@@ -472,7 +490,6 @@ static void handle_tx(struct vhost_net *net)
vhost_add_used_and_signal(&net->dev, vq, head, 0);
else
vhost_zerocopy_signal_used(net, vq);
-   total_len += len;
vhost_net_tx_packet(net);
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll);
-- 
2.7.4