[PATCH net-next 4/4] virtio_net: make use of extended ack message reporting

2017-04-30 Thread Jakub Kicinski
Try to carry error messages to the user via the netlink extended
ack message attribute.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/virtio_net.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 82f1c3a73345..046c60619c59 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1878,7 +1878,8 @@ static int virtnet_reset(struct virtnet_info *vi, int 
curr_qp, int xdp_qp)
return ret;
 }
 
-static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
+static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+  struct netlink_ext_ack *extack)
 {
unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
struct virtnet_info *vi = netdev_priv(dev);
@@ -1890,16 +1891,17 @@ static int virtnet_xdp_set(struct net_device *dev, 
struct bpf_prog *prog)
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
-   netdev_warn(dev, "can't set XDP while host is implementing LRO, 
disable LRO first\n");
+   NL_SET_ERR_MSG(extack, "can't set XDP while host is 
implementing LRO, disable LRO first");
return -EOPNOTSUPP;
}
 
if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
-   netdev_warn(dev, "XDP expects header/data in single page, 
any_header_sg required\n");
+   NL_SET_ERR_MSG(extack, "XDP expects header/data in single page, 
any_header_sg required");
return -EINVAL;
}
 
if (dev->mtu > max_sz) {
+   NL_SET_ERR_MSG(extack, "MTU too large to enable XDP");
netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
return -EINVAL;
}
@@ -1910,6 +1912,7 @@ static int virtnet_xdp_set(struct net_device *dev, struct 
bpf_prog *prog)
 
/* XDP requires extra queues for XDP_TX */
if (curr_qp + xdp_qp > vi->max_queue_pairs) {
+   NL_SET_ERR_MSG(extack, "Too few free TX rings available");
netdev_warn(dev, "request %i queues but max is %i\n",
curr_qp + xdp_qp, vi->max_queue_pairs);
return -ENOMEM;
@@ -1971,7 +1974,7 @@ static int virtnet_xdp(struct net_device *dev, struct 
netdev_xdp *xdp)
 {
switch (xdp->command) {
case XDP_SETUP_PROG:
-   return virtnet_xdp_set(dev, xdp->prog);
+   return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
case XDP_QUERY_PROG:
xdp->prog_attached = virtnet_xdp_query(dev);
return 0;
-- 
2.11.0



[PATCH net-next 2/4] xdp: propagate extended ack to XDP setup

2017-04-30 Thread Jakub Kicinski
Drivers usually have a number of restrictions for running XDP
- most common being buffer sizes, LRO and number of rings.
Even though some drivers try to be helpful and print error
messages, experience shows that users don't often consult
kernel logs on netlink errors.  Try to use the new extended
ack mechanism to carry the message back to user space.
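
For illustration, once this plumbing is in place a driver's XDP setup
path can report a constraint failure roughly like below (sketch only;
the foo_* names and FOO_XDP_MAX_MTU are made up and do not refer to an
in-tree driver):

  static int foo_xdp_setup(struct net_device *dev, struct netdev_xdp *xdp)
  {
          /* Reported to user space via extack instead of (or in
           * addition to) a netdev_warn() that only lands in dmesg.
           */
          if (dev->mtu > FOO_XDP_MAX_MTU) {
                  NL_SET_ERR_MSG(xdp->extack, "MTU too large to enable XDP");
                  return -EINVAL;
          }

          return foo_install_xdp_prog(netdev_priv(dev), xdp->prog);
  }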

Signed-off-by: Jakub Kicinski 
---
 include/linux/netdevice.h | 10 --
 net/core/dev.c|  5 -
 net/core/rtnetlink.c  | 13 -
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6847714a5ae3..9c23bd2efb56 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -813,11 +813,16 @@ enum xdp_netdev_command {
XDP_QUERY_PROG,
 };
 
+struct netlink_ext_ack;
+
 struct netdev_xdp {
enum xdp_netdev_command command;
union {
/* XDP_SETUP_PROG */
-   struct bpf_prog *prog;
+   struct {
+   struct bpf_prog *prog;
+   struct netlink_ext_ack *extack;
+   };
/* XDP_QUERY_PROG */
bool prog_attached;
};
@@ -3291,7 +3296,8 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
-int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags);
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+ int fd, u32 flags);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device 
*dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device 
*dev,
struct netdev_queue *txq, int *ret);
diff --git a/net/core/dev.c b/net/core/dev.c
index 8371a01eee87..35a06cebb282 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6854,12 +6854,14 @@ EXPORT_SYMBOL(dev_change_proto_down);
 /**
  * dev_change_xdp_fd - set or clear a bpf program for a device rx path
  * @dev: device
+ * @extack: netlink extended ack
  * @fd: new program fd or negative value to clear
  * @flags: xdp-related flags
  *
  * Set or clear a bpf program for a device
  */
-int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+ int fd, u32 flags)
 {
int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp);
const struct net_device_ops *ops = dev->netdev_ops;
@@ -6892,6 +6894,7 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 
flags)
 
memset(&xdp, 0, sizeof(xdp));
xdp.command = XDP_SETUP_PROG;
+   xdp.extack = extack;
xdp.prog = prog;
 
err = xdp_op(dev, &xdp);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9031a6c8bfa7..6e67315ec368 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1919,6 +1919,7 @@ static int do_set_master(struct net_device *dev, int 
ifindex)
 #define DO_SETLINK_NOTIFY  0x03
 static int do_setlink(const struct sk_buff *skb,
  struct net_device *dev, struct ifinfomsg *ifm,
+ struct netlink_ext_ack *extack,
  struct nlattr **tb, char *ifname, int status)
 {
const struct net_device_ops *ops = dev->netdev_ops;
@@ -2201,7 +2202,7 @@ static int do_setlink(const struct sk_buff *skb,
}
 
if (xdp[IFLA_XDP_FD]) {
-   err = dev_change_xdp_fd(dev,
+   err = dev_change_xdp_fd(dev, extack,
nla_get_s32(xdp[IFLA_XDP_FD]),
xdp_flags);
if (err)
@@ -2261,7 +2262,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct 
nlmsghdr *nlh,
if (err < 0)
goto errout;
 
-   err = do_setlink(skb, dev, ifm, tb, ifname, 0);
+   err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);
 errout:
return err;
 }
@@ -2423,6 +2424,7 @@ EXPORT_SYMBOL(rtnl_create_link);
 static int rtnl_group_changelink(const struct sk_buff *skb,
struct net *net, int group,
struct ifinfomsg *ifm,
+   struct netlink_ext_ack *extack,
struct nlattr **tb)
 {
struct net_device *dev, *aux;
@@ -2430,7 +2432,7 @@ static int rtnl_group_changelink(const struct sk_buff 
*skb,
 
for_each_netdev_safe(net, dev, aux) {
if (dev->group == group) {
-   err = do_setlink(skb, dev, ifm, tb, NULL, 0);
+   err = do_setlink(skb, dev, ifm, extack, tb, NULL, 0);
if (err < 0)
return err;
}
@@ -2576,14 +

[PATCH net-next 3/4] nfp: make use of extended ack message reporting

2017-04-30 Thread Jakub Kicinski
Try to carry error messages to the user via the netlink extended
ack message attribute.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  3 ++-
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 22 +-
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  4 ++--
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 38b41fdeaa8f..fcf81b3be830 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -818,7 +818,8 @@ nfp_net_irqs_assign(struct nfp_net *nn, struct msix_entry 
*irq_entries,
unsigned int n);
 
 struct nfp_net_dp *nfp_net_clone_dp(struct nfp_net *nn);
-int nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_dp *new);
+int nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_dp *new,
+ struct netlink_ext_ack *extack);
 
 bool nfp_net_link_changed_read_clear(struct nfp_net *nn);
 int nfp_net_refresh_eth_port(struct nfp_net *nn);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index b9f3548bb65f..db20376260f5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2524,24 +2524,27 @@ struct nfp_net_dp *nfp_net_clone_dp(struct nfp_net *nn)
return new;
 }
 
-static int nfp_net_check_config(struct nfp_net *nn, struct nfp_net_dp *dp)
+static int
+nfp_net_check_config(struct nfp_net *nn, struct nfp_net_dp *dp,
+struct netlink_ext_ack *extack)
 {
/* XDP-enabled tests */
if (!dp->xdp_prog)
return 0;
if (dp->fl_bufsz > PAGE_SIZE) {
-   nn_warn(nn, "MTU too large w/ XDP enabled\n");
+   NL_MOD_TRY_SET_ERR_MSG(extack, "MTU too large w/ XDP enabled");
return -EINVAL;
}
if (dp->num_tx_rings > nn->max_tx_rings) {
-   nn_warn(nn, "Insufficient number of TX rings w/ XDP enabled\n");
+   NL_MOD_TRY_SET_ERR_MSG(extack, "Insufficient number of TX rings 
w/ XDP enabled");
return -EINVAL;
}
 
return 0;
 }
 
-int nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_dp *dp)
+int nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_dp *dp,
+ struct netlink_ext_ack *extack)
 {
int r, err;
 
@@ -2553,7 +2556,7 @@ int nfp_net_ring_reconfig(struct nfp_net *nn, struct 
nfp_net_dp *dp)
 
dp->num_r_vecs = max(dp->num_rx_rings, dp->num_stack_tx_rings);
 
-   err = nfp_net_check_config(nn, dp);
+   err = nfp_net_check_config(nn, dp, extack);
if (err)
goto exit_free_dp;
 
@@ -2628,7 +2631,7 @@ static int nfp_net_change_mtu(struct net_device *netdev, 
int new_mtu)
 
dp->mtu = new_mtu;
 
-   return nfp_net_ring_reconfig(nn, dp);
+   return nfp_net_ring_reconfig(nn, dp, NULL);
 }
 
 static void nfp_net_stat64(struct net_device *netdev,
@@ -2944,9 +2947,10 @@ static int nfp_net_xdp_offload(struct nfp_net *nn, 
struct bpf_prog *prog)
return ret;
 }
 
-static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog)
+static int nfp_net_xdp_setup(struct nfp_net *nn, struct netdev_xdp *xdp)
 {
struct bpf_prog *old_prog = nn->dp.xdp_prog;
+   struct bpf_prog *prog = xdp->prog;
struct nfp_net_dp *dp;
int err;
 
@@ -2969,7 +2973,7 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct 
bpf_prog *prog)
dp->rx_dma_off = prog ? XDP_PACKET_HEADROOM - nn->dp.rx_offset : 0;
 
/* We need RX reconfig to remap the buffers (BIDIR vs FROM_DEV) */
-   err = nfp_net_ring_reconfig(nn, dp);
+   err = nfp_net_ring_reconfig(nn, dp, xdp->extack);
if (err)
return err;
 
@@ -2987,7 +2991,7 @@ static int nfp_net_xdp(struct net_device *netdev, struct 
netdev_xdp *xdp)
 
switch (xdp->command) {
case XDP_SETUP_PROG:
-   return nfp_net_xdp_setup(nn, xdp->prog);
+   return nfp_net_xdp_setup(nn, xdp);
case XDP_QUERY_PROG:
xdp->prog_attached = !!nn->dp.xdp_prog;
return 0;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index a704efd4e314..abbb47e60cc3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -309,7 +309,7 @@ static int nfp_net_set_ring_size(struct nfp_net *nn, u32 
rxd_cnt, u32 txd_cnt)
dp->rxd_cnt = rxd_cnt;
dp->txd_cnt = txd_cnt;
 
-   return nfp_net_ring_reconfig(nn, dp);
+   return nfp_net_ring_reconfig(nn, dp, NULL);
 }
 
 static int nfp_net_set_ringparam(struct net_device *netdev,
@@ -880,7 +880,7 @@ static int nfp_net_set

[PATCH net-next 1/4] netlink: add NULL-friendly helper for setting extended ACK message

2017-04-30 Thread Jakub Kicinski
As we propagate extended ack reporting throughout various paths in
the kernel it may be that the same function is called with the
extended ack parameter passed as NULL.  One place where that happens
is in drivers which have a centralized reconfiguration function
called both from ndos and from ethtool_ops.  Add a new helper for
setting the error message in such conditions.

The existing helper is left as-is to encourage propagating the ext ack
fully wherever possible.  It also makes it clear in the code which
messages may be lost due to ext ack being NULL.
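
A rough example of the intended use, for a driver whose reconfiguration
helper is reached both from an ndo (extack provided by the core) and
from ethtool_ops (no extack available), could look like this -- sketch
only, the foo_* names are made up:

  static int foo_ring_reconfig(struct foo_priv *priv,
                               struct netlink_ext_ack *extack)
  {
          if (priv->rx_buf_len > PAGE_SIZE) {
                  /* Safe even when extack == NULL; when it isn't, the
                   * message gets a KBUILD_MODNAME prefix.
                   */
                  NL_MOD_TRY_SET_ERR_MSG(extack, "MTU too large w/ XDP enabled");
                  return -EINVAL;
          }

          return 0;
  }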

Signed-off-by: Jakub Kicinski 
---
 include/linux/netlink.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 8d2a8924705c..c20395edf2de 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -92,6 +92,14 @@ struct netlink_ext_ack {
(extack)->_msg = _msg;  \
 } while (0)
 
+#define NL_MOD_TRY_SET_ERR_MSG(extack, msg) do {   \
+   static const char _msg[] = KBUILD_MODNAME ": " msg; \
+   struct netlink_ext_ack *_extack = (extack); \
+   \
+   if (_extack)\
+   _extack->_msg = _msg;   \
+} while (0)
+
 extern void netlink_kernel_release(struct sock *sk);
 extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
-- 
2.11.0



[PATCH net-next 0/4] xdp: use netlink extended ACK reporting

2017-04-30 Thread Jakub Kicinski
Hi!

This series is an attempt to make XDP more user friendly by
exploiting the recently added netlink extended ACK reporting
to carry messages to user space.

David Ahern's iproute2 ext ack patches for ip link are sufficient
to show the errors like this:

# ip link set dev p4p1 xdp obj ipip_prepend.o sec ".text"
Error: nfp: MTU too large w/ XDP enabled

Where the message is coming directly from the driver.  There could
still be a bit of a leap for a complete novice from the message 
above to the right settings, but it's a big improvement over the
standard "Invalid argument" message.

v1/non-rfc:
 - add a separate macro in patch 1;
 - add KBUILD_MODNAME as part of the message (Daniel);
 - don't print the error to logs in patch 1.

Jakub Kicinski (4):
  netlink: add NULL-friendly helper for setting extended ACK message
  xdp: propagate extended ack to XDP setup
  nfp: make use of extended ack message reporting
  virtio_net: make use of extended ack message reporting

 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  3 ++-
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 22 +-
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  4 ++--
 drivers/net/virtio_net.c   | 11 +++
 include/linux/netdevice.h  | 10 --
 include/linux/netlink.h|  8 
 net/core/dev.c |  5 -
 net/core/rtnetlink.c   | 13 -
 8 files changed, 52 insertions(+), 24 deletions(-)

-- 
2.11.0



Re: [PATCH net-next] mlxsw: spectrum_router: Simplify VRF enslavement

2017-04-30 Thread David Miller
From: 
Date: Sun, 30 Apr 2017 19:47:14 +0300

> From: Ido Schimmel 
> 
> When a netdev is enslaved to a VRF master, its router interface (RIF)
> needs to be destroyed (if exists) and a new one created using the
> corresponding virtual router (VR).
> 
> From the driver's perspective, the above is equivalent to an inetaddr
> event sent for this netdev. Therefore, when a port netdev (or its
> uppers) is enslaved to a VRF master, call the same function that
> would've been called had a NETDEV_UP been sent for this netdev in the
> inetaddr notification chain.
> 
> This patch also fixes a bug when a LAG netdev with an existing RIF is
> enslaved to a VRF. Before this patch, each LAG port would drop the
> reference on the RIF, but would re-join the same one (in the wrong VR)
> soon after. With this patch, the corresponding RIF is first destroyed
> and a new one is created using the correct VR.
> 
> Fixes: 7179eb5acd59 ("mlxsw: spectrum_router: Add support for VRFs")
> Signed-off-by: Ido Schimmel 
> Reviewed-by: Jiri Pirko 

Applied, thanks.


Re: pull request: bluetooth-next 2017-04-30

2017-04-30 Thread David Miller
From: Johan Hedberg 
Date: Sun, 30 Apr 2017 17:09:28 +0300

> Here's one last batch of Bluetooth patches in the bluetooth-next tree
> targeting the 4.12 kernel.
> 
>  - Remove custom ECDH implementation and use new KPP API instead
>  - Add protocol checks to hci_ldisc
>  - Add module license to HCI UART Nokia H4+ driver
>  - Minor fix for 32bit user space - 64 bit kernel combination
> 
> Please let me know if there are any issues pulling. Thanks.

Pulled, thanks Johan.


Re: [pull request][net-next 00/15] Mellanox, mlx5 updates 2017-04-30

2017-04-30 Thread David Miller
From: Saeed Mahameed 
Date: Sun, 30 Apr 2017 16:20:01 +0300

> This series contains two sets of patches to the mlx5 driver,
> 1. Nine patches (mostly from Hadar) to add 'mlx5 neigh update' feature.
> 2. Six misc patches.
> 
> For more details please see below.
> 
> Sorry for the last minute submission, originally I planned to submit before
> weekend, but in order to provide clean patches, we had to deal with some
> auto build issues first.
> 
> Please pull and let me know if there's any problem.

Pulled, thanks.


Re: [PATCH net-next] qed: Prevent warning without CONFIG_RFS_ACCEL

2017-04-30 Thread David Miller
From: Yuval Mintz 
Date: Sun, 30 Apr 2017 12:14:44 +0300

> After removing the PTP related initialization from slowpath start,
> the remaining PTT entry is required only in case CONFIG_RFS_ACCEL is set.
> Otherwise, it leads to a warning due to it being unused.
> 
> Fixes: d179bd1699fc ("qed: Acquire/release ptt_ptp lock when 
> enabling/disabling PTP")
> Signed-off-by: Yuval Mintz 

Also applied, thanks.


Re: [PATCH net-next 0/6] qed: RoCE related pseudo-fixes

2017-04-30 Thread David Miller
From: Yuval Mintz 
Date: Sun, 30 Apr 2017 11:49:04 +0300

> This series contains multiple small corrections to the RoCE logic
> in qed plus some debug information and inter-module parameter
> meant to prevent issues further along.
> 
>  - #1, #6 Share information with protocol driver
>[either new or filling missing bits in existing API].
>  - #2, #3 correct error flows in qed.
>  - #4 add debug related information.
>  - #5 fixes a minor issue in the HW configuration.

Series applied, thanks.


Re: [PATCH net-next] bpf: enhance verifier to understand stack pointer arithmetic

2017-04-30 Thread David Miller
From: Alexei Starovoitov 
Date: Sat, 29 Apr 2017 22:52:42 -0700

> From: Yonghong Song 
> 
> llvm 4.0 and above generates the code like below:
> 
> 440: (b7) r1 = 15
> 441: (05) goto pc+73
> 515: (79) r6 = *(u64 *)(r10 -152)
> 516: (bf) r7 = r10
> 517: (07) r7 += -112
> 518: (bf) r2 = r7
> 519: (0f) r2 += r1
> 520: (71) r1 = *(u8 *)(r8 +0)
> 521: (73) *(u8 *)(r2 +45) = r1
> 
> and the verifier complains "R2 invalid mem access 'inv'" for insn #521.
> This is because verifier marks register r2 as unknown value after #519
> where r2 is a stack pointer and r1 holds a constant value.
> 
> Teach verifier to recognize "stack_ptr + imm" and
> "stack_ptr + reg with const val" as valid stack_ptr with new offset.
> 
> Signed-off-by: Yonghong Song 
> Acked-by: Martin KaFai Lau 
> Acked-by: Daniel Borkmann 
> Signed-off-by: Alexei Starovoitov 
> ---
> technically it's 'net' material, but it's too late for 'net',
> hence 'net-next' tag.
> No 'Fixes' tag, since it's only seen with newer llvm.

Applied to net-next, but I'll queue this up to -stable.


Re: [PATCH] net: phy: Allow BCM5481x PHYs to setup internal TX/RX clock delay

2017-04-30 Thread David Miller
From: Abhishek Shah 
Date: Sun, 30 Apr 2017 11:04:21 +0530

> This patch allows users to enable/disable internal TX and/or RX
> clock delay for BCM5481x series PHYs so as to satisfy RGMII timing
> specifications.
> 
> On a particular platform, whether TX and/or RX clock delay is required
> depends on how PHY connected to the MAC IP. This requirement can be
> specified through "phy-mode" property in the platform device tree.
> 
> Signed-off-by: Abhishek Shah 

Applied.


Re: [PATCH] net: sunhme: fix spelling mistakes: "ParityErro" -> "ParityError"

2017-04-30 Thread David Miller
From: Colin King 
Date: Sat, 29 Apr 2017 22:38:57 +0100

> From: Colin Ian King 
> 
> trivial fix to spelling mistakes in printk message.
> 
> Signed-off-by: Colin Ian King 

Applied.


Re: [PATCH net v3] driver: dummy: Fix one possbile memleak when fail to register_netdevice

2017-04-30 Thread David Miller

Please, Gao, submit this as a proper, numbered, patch series
with a proper header posting.

That way you can explain why you took this strategy to fix
this problem, compared to your original approach.

Thanks.


Re: [PATCH v2] iov_iter: don't revert iov buffer if csum error

2017-04-30 Thread David Miller
From: Al Viro 
Date: Sat, 29 Apr 2017 21:48:23 +0100

> On Sat, Apr 29, 2017 at 05:37:38PM +0800, Ding Tianhong wrote:
> 
>> Looks good, if so, we don't need the csum_error any more,
> 
> Acked-by: Al Viro 
> 
> Dave, I could put that through my tree, but I think it would be better off
> in net.git; either way, it needs to go into mainline before -final...

Please just send it directly to Linus, thanks.


Re: [PATCH net] bnx2x: Align RX buffers

2017-04-30 Thread David Miller
From: Scott Wood 
Date: Fri, 28 Apr 2017 19:17:41 -0500

> The bnx2x driver is not providing proper alignment on the receive buffers it
> passes to build_skb(), causing skb_shared_info to be misaligned.
> skb_shared_info contains an atomic, and while PPC normally supports
> unaligned accesses, it does not support unaligned atomics.
> 
> Aligning the size of rx buffers will ensure that page_frag_alloc() returns
> aligned addresses.
> 
> This can be reproduced on PPC by setting the network MTU to 1450 (or other
> non-multiple-of-4) and then generating sufficient inbound network traffic
> (one or two large "wget"s usually does it), producing the following oops:
 ...
> Fixes: d46d132cc021 ("bnx2x: use netdev_alloc_frag()")
> Signed-off-by: Scott Wood 

Applied, thanks.


Re: [PATCH v3 1/2] net: dsa: b53: Add compatible strings for the Cygnus-family BCM11360.

2017-04-30 Thread David Miller
From: Eric Anholt 
Date: Fri, 28 Apr 2017 15:22:03 -0700

> Cygnus is a small family of SoCs, of which we currently have
> devicetree for BCM11360 and BCM58300.  The 11360's B53 is mostly the
> same as 58xx, just requiring a tiny bit of setup that was previously
> missing.
> 
> v2: Reorder the entry in the docs (suggestion by Scott Branden), add
> missing '"'
> 
> Signed-off-by: Eric Anholt 
> Reviewed-by: Florian Fainelli 
> Acked-by: Rob Herring 

The second patch with the DTS file update doesn't apply cleanly
at all to net-next.

So I'm dropping this series.


Re: [PATCH net-next v3] net: bridge: Fix improper taking over HW learned FDB

2017-04-30 Thread David Miller
From: Arkadi Sharshevsky 
Date: Fri, 28 Apr 2017 22:39:07 +0300

> Commit 7e26bf45e4cb ("net: bridge: allow SW learn to take over HW fdb
> entries") added the ability to "take over an entry which was previously
> learned via HW when it shows up from a SW port".
> 
> However, if an entry was learned via HW and then a control packet
> (e.g., ARP request) was trapped to the CPU, the bridge driver will
> update the entry and remove the externally learned flag, although the
> entry is still present in HW. Instead, only clear the externally learned
> flag in case of roaming.
> 
> Fixes: 7e26bf45e4cb ("net: bridge: allow SW learn to take over HW fdb 
> entries")
> Signed-off-by: Ido Schimmel 
> Signed-off-by: Arkadi Sharashevsky 

Applied, thanks.


Re: [PATCH net-next] rtnetlink: Remove NETDEV_CHANGEINFODATA

2017-04-30 Thread David Miller
From: David Ahern 
Date: Fri, 28 Apr 2017 11:06:25 -0700

> NETDEV_CHANGEINFODATA was added by d4261e5650004 ("bonding: create
> netlink event when bonding option is changed"). RTM_NEWLINK
> messages are already created on changelink events, so this event
> is just a duplicate. Remove it.
> 
> Cc: Jiri Pirko 
> Signed-off-by: David Ahern 

I think you need to respin this, I get rejects when I try to
apply this to net-next.


Re: [Patch net-next v2] ipv4: get rid of ip_ra_lock

2017-04-30 Thread David Miller
From: Cong Wang 
Date: Fri, 28 Apr 2017 10:04:29 -0700

> After commit 1215e51edad1 ("ipv4: fix a deadlock in ip_ra_control")
> we always take RTNL lock for ip_ra_control() which is the only place
> we update the list ip_ra_chain, so the ip_ra_lock is no longer needed.
> 
> As Eric points out, BH does not need to disable either, RCU readers
> don't care.
> 
> Signed-off-by: Cong Wang 

Applied.


Re: [PATCH net-next] lwtunnel: fix error path in lwtunnel_fill_encap()

2017-04-30 Thread David Miller
From: Dan Carpenter 
Date: Fri, 28 Apr 2017 16:03:48 +0300

> We recently added a check to see if nla_nest_start() fails.  There are
> two issues with that.  First, if it fails then I don't think we should
> call nla_nest_cancel().  Second, it's slightly convoluted but the
> current code returns success but we should return -EMSGSIZE instead.
> 
> Fixes: a50fe0ffd76f ("lwtunnel: check return value of nla_nest_start")
> Signed-off-by: Dan Carpenter 

Applied.


Re: [net-next PATCH V1] samples/bpf: bpf_load.c detect and abort if ELF maps section size is wrong

2017-04-30 Thread David Miller
From: Jesper Dangaard Brouer 
Date: Fri, 28 Apr 2017 16:25:04 +0200

> The struct bpf_map_def was extended in commit fb30d4b71214 ("bpf: Add tests
> for map-in-map") with member unsigned int inner_map_idx.  This changed the 
> size
> of the maps section in the generated ELF _kern.o files.
> 
> Unfortunately the loader in bpf_load.c does not detect or handle this.  Thus,
> older _kern.o files became incompatible, and caused hard-to-debug errors
> where the syscall validation rejected BPF_MAP_CREATE request.
> 
> This patch only detect the situation and aborts load_bpf_file(). It also
> add code comments warning people that read this loader for inspiration
> for these pitfalls.
> 
> Fixes: fb30d4b71214 ("bpf: Add tests for map-in-map")
> Signed-off-by: Jesper Dangaard Brouer 

Applied.


Re: [PATCH net] liquidio: silence a locking static checker warning

2017-04-30 Thread David Miller
From: Dan Carpenter 
Date: Fri, 28 Apr 2017 15:57:15 +0300

> Presumably we never hit this return, but static checkers complain that
> we need to unlock so we may as well fix that.
> 
> Signed-off-by: Dan Carpenter 

Applied.


Re: [PATCH net] qed: Unlock on error in qed_vf_pf_acquire()

2017-04-30 Thread David Miller
From: Dan Carpenter 
Date: Fri, 28 Apr 2017 15:56:09 +0300

> My static checker complains that we're holding a mutex on this error
> path.  Let's goto exit instead of returning directly.
> 
> Fixes: b0bccb69eba3 ("qed: Change locking scheme for VF channel")
> Signed-off-by: Dan Carpenter 

Applied.


Re: [PATCH net-next v5 0/2] net: hns: bug fix for HNS driver

2017-04-30 Thread David Miller
From: Yankejian 
Date: Fri, 28 Apr 2017 14:49:45 +0800

> From: lipeng 
> 
> This patchset add support defered dsaf probe when mdio and
> mbigen module is not insmod.
> 
> For more details, please refer to individual patch.

Series applied, thanks.


Re: [PATCH net-next 0/6] nfp: optimize XDP TX and small fixes

2017-04-30 Thread David Miller
From: Jakub Kicinski 
Date: Thu, 27 Apr 2017 21:06:14 -0700

> This series optimizes the nfp XDP TX performance a little bit.  
> I run quick tests on an Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz.  
> Single core/queue performance for both touch and drop and touch and
> forward is above 20Mpps @64B packets, drop being 2Mpps faster.  
> I think this is max for a single queue on the low power NFPs.
> 
> There are also a few minor fixes included for code in net-next.

Series applied, thanks.


Re: [PATCH] drivers:net:ethernet:emulex:benet: Use time_before_eq for time comparison

2017-04-30 Thread David Miller
From: Karim Eshapa 
Date: Fri, 28 Apr 2017 03:48:59 +0200

> Use time_before_eq for time comparison more safe and dealing
> with timer wrapping to be future-proof.
> 
> Signed-off-by: Karim Eshapa 

Subject line has way too many subsystem prefixes, simply
"benet: " is sufficient.

And in situations where multiple subsystem prefixes are
appropriate, one must put a space after each one like
"one: two: three: ".

Thanks.


Re: [PATCH net-next] geneve: fix incorrect setting of UDP checksum flag

2017-04-30 Thread David Miller
From: Girish Moodalbail 
Date: Thu, 27 Apr 2017 14:11:53 -0700

> Creating a geneve link with 'udpcsum' set results in a creation of link
> for which UDP checksum will NOT be computed on outbound packets, as can
> be seen below.
> 
> 11: gen0:  mtu 1500 qdisc noop state DOWN
> link/ether c2:85:27:b6:b4:15 brd ff:ff:ff:ff:ff:ff promiscuity 0
> geneve id 200 remote 192.168.13.1 dstport 6081 noudpcsum
> 
> Similarly, creating a link with 'noudpcsum' set results in a creation
> of link for which UDP checksum will be computed on outbound packets.
> 
> Fixes: 9b4437a5b870 ("geneve: Unify LWT and netdev handling.")
> Signed-off-by: Girish Moodalbail 

Applied and queued up for -stable.


Re: [PATCH net-next] virtio-net: use netif_tx_napi_add for tx napi

2017-04-30 Thread David Miller
From: Willem de Bruijn 
Date: Thu, 27 Apr 2017 20:37:58 -0400

> From: Willem de Bruijn 
> 
> Avoid hashing the tx napi struct into napi_hash[], which is used for
> busy polling receive queues.
> 
> Signed-off-by: Willem de Bruijn 

Applied.


Re: [PATCH net-next] net: Initialise init_net.count to 1

2017-04-30 Thread David Miller
From: David Howells 
Date: Thu, 27 Apr 2017 22:40:23 +0100

> Initialise init_net.count to 1 for its pointer from init_nsproxy lest
> someone tries to do a get_net() and a put_net() in a process in which
> current->ns_proxy->net_ns points to the initial network namespace.
> 
> Signed-off-by: David Howells 

Applied.


Re: [PATCH net 0/2] vxlan: do not error out on disabled IPv6

2017-04-30 Thread David Miller
From: Jiri Benc 
Date: Thu, 27 Apr 2017 21:24:34 +0200

> This patchset fixes a bug with metadata based tunnels when booted with
> ipv6.disable=1.

Series applied, thanks.


Re: [PATCH v1 3/3] bnx2x: Get rid of useless temporary variable

2017-04-30 Thread David Miller
From: Andy Shevchenko 
Date: Thu, 27 Apr 2017 16:37:01 +0300

> From: Andy Shevchenko 
> 
> Replace pattern
> 
>  int status;
>  ...
>  status = func(...);
>  return status;
> 
> by
> 
>  return func(...);
> 
> No functional change intented.
> 
> Signed-off-by: Andy Shevchenko 

Applied.


Re: [PATCH v1 1/3] bnx2x: Replace custom scnprintf()

2017-04-30 Thread David Miller
From: Andy Shevchenko 
Date: Thu, 27 Apr 2017 16:36:59 +0300

> From: Andy Shevchenko 
> 
> Use scnprintf() when printing version instead of custom open coded variants.
> 
> Signed-off-by: Andy Shevchenko 

Applied.


Re: [PATCH v1 2/3] bnx2x: Reuse bnx2x_null_format_ver()

2017-04-30 Thread David Miller
From: Andy Shevchenko 
Date: Thu, 27 Apr 2017 16:37:00 +0300

> From: Andy Shevchenko 
> 
> Reuse bnx2x_null_format_ver() in functions where it's appropriated
> instead of open coded variant.
> 
> Signed-off-by: Andy Shevchenko 

Applied.


Re: pull-request: can-next 2017-04-25

2017-04-30 Thread David Miller
From: Marc Kleine-Budde 
Date: Thu, 27 Apr 2017 09:54:33 +0200

> this is a pull request of 1 patch for net-next/master.
> 
> This patch by Oliver Hartkopp fixes the build of the broadcast manager
> with CONFIG_PROC_FS disabled.

Pulled, thanks!


Re: [PATCH net-next] Fix inaccurate helper function description

2017-04-30 Thread David Miller
From: Chenbo Feng 
Date: Wed, 26 Apr 2017 16:41:23 -0700

> From: Chenbo Feng 
> 
> The description inside uapi/linux/bpf.h about bpf_get_socket_uid
> helper function is no longer valid. It returns overflowuid rather
> than 0 when failed.
> 
> Signed-off-by: Chenbo Feng 

Applied.


Re: [PATCH net] tcp: fix access to sk->sk_state in tcp_poll()

2017-04-30 Thread David Miller
From: Davide Caratti 
Date: Wed, 26 Apr 2017 19:07:35 +0200

> avoid direct access to sk->sk_state when tcp_poll() is called on a socket
> using active TCP fastopen with deferred connect. Use local variable
> 'state', which stores the result of sk_state_load(), like it was done in
> commit 00fd38d938db ("tcp: ensure proper barriers in lockless contexts").
> 
> Fixes: 19f6d3f3c842 ("net/tcp-fastopen: Add new API support")
> Signed-off-by: Davide Caratti 

Applied.


Re: [PATCH net-next] bpf: restore skb->sk before pskb_trim() call

2017-04-30 Thread David Miller
From: Eric Dumazet 
Date: Wed, 26 Apr 2017 09:09:23 -0700

> From: Eric Dumazet 
> 
> While testing a fix [1] in ___pskb_trim(), addressing the WARN_ON_ONCE()
> in skb_try_coalesce() reported by Andrey, I found that we had an skb
> with skb->sk set but no skb->destructor.
> 
> This invalidated heuristic found in commit 158f323b9868 ("net: adjust
> skb->truesize in pskb_expand_head()") and in cited patch.
> 
> Considering the BUG_ON(skb->sk) we have in skb_orphan(), we should
> restrain the temporary setting to a minimal section.
> 
> [1] https://patchwork.ozlabs.org/patch/755570/ 
> net: adjust skb->truesize in ___pskb_trim()
> 
> Fixes: 8f917bba0042 ("bpf: pass sk to helper functions")
> Signed-off-by: Eric Dumazet 

Applied, thanks Eric.


Re: [PATCH] net: macb: fix phy interrupt parsing

2017-04-30 Thread David Miller
From: Alexandre Belloni 
Date: Wed, 26 Apr 2017 12:06:28 +0200

> Since 83a77e9ec415, the phydev irq is explicitly set to PHY_POLL when
> there is no pdata. It doesn't work on DT enabled platforms because the
> phydev irq is already set by libphy before.
> 
> Fixes: 83a77e9ec415 ("net: macb: Added PCI wrapper for Platform Driver.")
> Signed-off-by: Alexandre Belloni 

Applied and queued up for -stable, thanks.


[PATCH net-next] bpf, arm64: implement jiting of BPF_XADD

2017-04-30 Thread Daniel Borkmann
This work adds BPF_XADD for BPF_W/BPF_DW to the arm64 JIT and therefore
completes JITing of all BPF instructions, meaning we can thus also remove
the 'notyet' label and do not need to fall back to the interpreter when
BPF_XADD is used in a program!

This now also brings arm64 JIT in line with x86_64, s390x, ppc64, sparc64,
where all current eBPF features are supported.

BPF_W example from test_bpf:

  .u.insns_int = {
BPF_ALU32_IMM(BPF_MOV, R0, 0x12),
BPF_ST_MEM(BPF_W, R10, -40, 0x10),
BPF_STX_XADD(BPF_W, R10, R0, -40),
BPF_LDX_MEM(BPF_W, R0, R10, -40),
BPF_EXIT_INSN(),
  },

  [...]
  0020:  52800247  mov w7, #0x12 // #18
  0024:  928004eb  mov x11, #0xffd8 // #-40
  0028:  d280020a  mov x10, #0x10 // #16
  002c:  b82b6b2a  str w10, [x25,x11]
  // start of xadd mapping:
  0030:  928004ea  mov x10, #0xffd8 // #-40
  0034:  8b19014a  add x10, x10, x25
  0038:  f9800151  prfm pstl1strm, [x10]
  003c:  885f7d4b  ldxr w11, [x10]
  0040:  0b07016b  add w11, w11, w7
  0044:  880b7d4b  stxr w11, w11, [x10]
  0048:  35ab  cbnz w11, 0x003c
  // end of xadd mapping:
  [...]

BPF_DW example from test_bpf:

  .u.insns_int = {
BPF_ALU32_IMM(BPF_MOV, R0, 0x12),
BPF_ST_MEM(BPF_DW, R10, -40, 0x10),
BPF_STX_XADD(BPF_DW, R10, R0, -40),
BPF_LDX_MEM(BPF_DW, R0, R10, -40),
BPF_EXIT_INSN(),
  },

  [...]
  0020:  52800247  mov w7,  #0x12 // #18
  0024:  928004eb  mov x11, #0xffd8 // #-40
  0028:  d280020a  mov x10, #0x10 // #16
  002c:  f82b6b2a  str x10, [x25,x11]
  // start of xadd mapping:
  0030:  928004ea  mov x10, #0xffd8 // #-40
  0034:  8b19014a  add x10, x10, x25
  0038:  f9800151  prfm pstl1strm, [x10]
  003c:  c85f7d4b  ldxr x11, [x10]
  0040:  8b07016b  add x11, x11, x7
  0044:  c80b7d4b  stxr w11, x11, [x10]
  0048:  35ab  cbnz w11, 0x003c
  // end of xadd mapping:
  [...]

Tested on Cavium ThunderX ARMv8, test suite results after the patch:

  No JIT:   [ 3751.855362] test_bpf: Summary: 311 PASSED, 0 FAILED, [0/303 
JIT'ed]
  With JIT: [ 3573.759527] test_bpf: Summary: 311 PASSED, 0 FAILED, [303/303 
JIT'ed]

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 ( Based against net-next where BPF related patches are usually
   routed, if something else is preferred please let me know. )

 arch/arm64/include/asm/insn.h |  30 
 arch/arm64/kernel/insn.c  | 106 ++
 arch/arm64/net/bpf_jit.h  |  19 
 arch/arm64/net/bpf_jit_comp.c |  16 +--
 lib/test_bpf.c| 105 +
 5 files changed, 271 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index aecc07e..29cb2ca 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -80,6 +80,7 @@ enum aarch64_insn_register_type {
AARCH64_INSN_REGTYPE_RM,
AARCH64_INSN_REGTYPE_RD,
AARCH64_INSN_REGTYPE_RA,
+   AARCH64_INSN_REGTYPE_RS,
 };
 
 enum aarch64_insn_register {
@@ -188,6 +189,8 @@ enum aarch64_insn_ldst_type {
AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX,
AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX,
AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX,
+   AARCH64_INSN_LDST_LOAD_EX,
+   AARCH64_INSN_LDST_STORE_EX,
 };
 
 enum aarch64_insn_adsb_type {
@@ -240,6 +243,23 @@ enum aarch64_insn_logic_type {
AARCH64_INSN_LOGIC_BIC_SETFLAGS
 };
 
+enum aarch64_insn_prfm_type {
+   AARCH64_INSN_PRFM_TYPE_PLD,
+   AARCH64_INSN_PRFM_TYPE_PLI,
+   AARCH64_INSN_PRFM_TYPE_PST,
+};
+
+enum aarch64_insn_prfm_target {
+   AARCH64_INSN_PRFM_TARGET_L1,
+   AARCH64_INSN_PRFM_TARGET_L2,
+   AARCH64_INSN_PRFM_TARGET_L3,
+};
+
+enum aarch64_insn_prfm_policy {
+   AARCH64_INSN_PRFM_POLICY_KEEP,
+   AARCH64_INSN_PRFM_POLICY_STRM,
+};
+
 #define__AARCH64_INSN_FUNCS(abbr, mask, val)   \
 static __always_inline bool aarch64_insn_is_##abbr(u32 code) \
 { return (code & (mask)) == (val); } \
@@ -248,6 +268,7 @@ enum aarch64_insn_logic_type {
 
 __AARCH64_INSN_FUNCS(adr,  0x9F00, 0x1000)
 __AARCH64_INSN_FUNCS(adrp, 0x9F00, 0x9000)
+__AARCH64_INSN_FUNCS(prfm, 0x3FC0, 0x3980)
 __AARCH64_INSN_FUNCS(prfm_lit, 0xFF00, 0xD800)
 __AARCH64_INSN_FUNCS(str_reg,  0x3FE0EC00, 0x38206800)
 __AARCH64_INSN_FUNCS(ldr_reg,  0x3FE0EC00, 0x38606800)
@@ -357,6 +378,11 @@ u32 aarch64_insn_gen_load_store_pair(enum 
aarch64_insn_register reg1,
 int offset,
 enum aarch64_insn_variant variant,
 enum aarch64_insn_ldst_type type);
+u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg,
+  enum aarch64_insn_register base,
+  enum 

Re: [PATCH] net: phy: Allow BCM5481x PHYs to setup internal TX/RX clock delay

2017-04-30 Thread Florian Fainelli


On 04/29/2017 10:34 PM, Abhishek Shah wrote:
> This patch allows users to enable/disable internal TX and/or RX
> clock delay for BCM5481x series PHYs so as to satisfy RGMII timing
> specifications.
> 
> On a particular platform, whether TX and/or RX clock delay is required
> depends on how PHY connected to the MAC IP. This requirement can be
> specified through "phy-mode" property in the platform device tree.
> 
> Signed-off-by: Abhishek Shah 

Reviewed-by: Florian Fainelli 

> ---
>  drivers/net/phy/broadcom.c | 69 
> ++
>  1 file changed, 33 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
> index 9cd8b27..a32dc5d 100644
> --- a/drivers/net/phy/broadcom.c
> +++ b/drivers/net/phy/broadcom.c
> @@ -74,27 +74,40 @@ static int bcm54612e_config_init(struct phy_device 
> *phydev)
>   return 0;
>  }
>  
> -static int bcm54810_config(struct phy_device *phydev)
> +static int bcm5481x_config(struct phy_device *phydev)
>  {
>   int rc, val;
>  
> - val = bcm_phy_read_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL);
> - val &= ~BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN;
> - rc = bcm_phy_write_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL,
> -val);
> - if (rc < 0)
> - return rc;
> -
> + /* handling PHY's internal RX clock delay */
>   val = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
> - val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN;
>   val |= MII_BCM54XX_AUXCTL_MISC_WREN;
> + if (phydev->interface == PHY_INTERFACE_MODE_RGMII ||
> + phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID) {
> + /* Disable RGMII RXC-RXD skew */
> + val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN;
> + }
> + if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
> + phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID) {
> + /* Enable RGMII RXC-RXD skew */
> + val |= MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN;
> + }
>   rc = bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
> val);
>   if (rc < 0)
>   return rc;
>  
> + /* handling PHY's internal TX clock delay */
>   val = bcm_phy_read_shadow(phydev, BCM54810_SHD_CLK_CTL);
> - val &= ~BCM54810_SHD_CLK_CTL_GTXCLK_EN;
> + if (phydev->interface == PHY_INTERFACE_MODE_RGMII ||
> + phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID) {
> + /* Disable internal TX clock delay */
> + val &= ~BCM54810_SHD_CLK_CTL_GTXCLK_EN;
> + }
> + if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
> + phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID) {
> + /* Enable internal TX clock delay */
> + val |= BCM54810_SHD_CLK_CTL_GTXCLK_EN;
> + }
>   rc = bcm_phy_write_shadow(phydev, BCM54810_SHD_CLK_CTL, val);
>   if (rc < 0)
>   return rc;
> @@ -244,7 +257,7 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device 
> *phydev)
>  
>  static int bcm54xx_config_init(struct phy_device *phydev)
>  {
> - int reg, err;
> + int reg, err, val;
>  
>   reg = phy_read(phydev, MII_BCM54XX_ECR);
>   if (reg < 0)
> @@ -283,8 +296,14 @@ static int bcm54xx_config_init(struct phy_device *phydev)
>   if (err)
>   return err;
>   } else if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) {
> - err = bcm54810_config(phydev);
> - if (err)
> + /* For BCM54810, we need to disable BroadR-Reach function */
> + val = bcm_phy_read_exp(phydev,
> +BCM54810_EXP_BROADREACH_LRE_MISC_CTL);
> + val &= ~BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN;
> + err = bcm_phy_write_exp(phydev,
> + BCM54810_EXP_BROADREACH_LRE_MISC_CTL,
> + val);
> + if (err < 0)
>   return err;
>   }
>  
> @@ -392,29 +411,7 @@ static int bcm5481_config_aneg(struct phy_device *phydev)
>   ret = genphy_config_aneg(phydev);
>  
>   /* Then we can set up the delay. */
> - if (phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID) {
> - u16 reg;
> -
> - /*
> -  * There is no BCM5481 specification available, so down
> -  * here is everything we know about "register 0x18". This
> -  * at least helps BCM5481 to successfully receive packets
> -  * on MPC8360E-RDK board. Peter Barada 
> -  * says: "This sets delay between the RXD and RXC signals
> -  * instead of using trace lengths to achieve timing".
> -  */
> -
> - /* Set RDX clk delay. */
> - reg = 0x7 | (0x7 << 12);
> - phy_write(phydev, 0x18, reg);
> -
> -   

Re: [PATCH net-next] samples/bpf: Add support for SKB_MODE to xdp1 and xdp_tx_iptunnel

2017-04-30 Thread David Ahern
On 4/28/17 3:40 PM, Jesper Dangaard Brouer wrote:
> [...]
>> diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
>> index 0d449d8032d1..d4433a47e6c3 100644
>> --- a/samples/bpf/bpf_load.c
>> +++ b/samples/bpf/bpf_load.c
>> @@ -563,7 +563,7 @@ struct ksym *ksym_search(long key)
>>  return &syms[0];
>>  }
>>  
>> -int set_link_xdp_fd(int ifindex, int fd)
>> +int set_link_xdp_fd(int ifindex, int fd, int flags)
> Shouldn't the flags be a unsigned int, actually a __u32 ?
> 

sure. I'll send a patch


Re: [PATCH net-next iproute2] ip: increase number of MPLS labels

2017-04-30 Thread David Ahern
On 4/30/17 12:04 AM, Stephen Hemminger wrote:
> On Sat, 29 Apr 2017 20:48:50 -0700
> David Ahern  wrote:
> 
>> Kernel now supports more than 2 labels. Increase ip to
>> handle up to 16 labels.
>>
>> Signed-off-by: David Ahern 
>> ---
>>  include/utils.h | 8 
>>  lib/utils.c | 2 +-
>>  2 files changed, 5 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/utils.h b/include/utils.h
>> index 8c12e1e2a60c..a69e176c260d 100644
>> --- a/include/utils.h
>> +++ b/include/utils.h
>> @@ -54,6 +54,9 @@ void incomplete_command(void) __attribute__((noreturn));
>>  #define NEXT_ARG_FWD() do { argv++; argc--; } while(0)
>>  #define PREV_ARG() do { argv--; argc++; } while(0)
>>  
>> +/* Maximum number of labels the mpls helpers support */
>> +#define MPLS_MAX_LABELS 16
>> +
> 
> Why is the kernel limit not in include/uapi/ header file?
> 

I believe Eric had reasons, but not sure why.


[PATCH 5/6] Drivers: hv: vmbus: Fix rescind handling

2017-04-30 Thread kys
From: K. Y. Srinivasan 

Fix the rescind handling. This patch addresses the following rescind
scenario that is currently not handled correctly:

If a rescind were to be received while the offer is still being
processed, we will be blocked indefinitely since the rescind message
is handled on the same work element as the offer message. Fix this
issue.

I would like to thank Dexuan Cui  and
Long Li  for working with me on this patch.

Signed-off-by: K. Y. Srinivasan 
---
 drivers/hv/channel.c  |8 -
 drivers/hv/channel_mgmt.c |   69 ++--
 drivers/hv/connection.c   |7 +++-
 drivers/hv/hyperv_vmbus.h |7 
 drivers/hv/vmbus_drv.c|   29 ++-
 5 files changed, 99 insertions(+), 21 deletions(-)

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 736ac76..e9bf0bb 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -630,9 +630,13 @@ void vmbus_close(struct vmbus_channel *channel)
 */
list_for_each_safe(cur, tmp, &channel->sc_list) {
cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
-   if (cur_channel->state != CHANNEL_OPENED_STATE)
-   continue;
vmbus_close_internal(cur_channel);
+   if (cur_channel->rescind) {
+   mutex_lock(&vmbus_connection.channel_mutex);
+   hv_process_channel_removal(cur_channel,
+  cur_channel->offermsg.child_relid);
+   mutex_unlock(&vmbus_connection.channel_mutex);
+   }
}
/*
 * Now close the primary.
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index eec616a..06529b3 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -428,7 +428,6 @@ void vmbus_free_channels(void)
 {
struct vmbus_channel *channel, *tmp;
 
-   mutex_lock(&vmbus_connection.channel_mutex);
list_for_each_entry_safe(channel, tmp, &vmbus_connection.chn_list,
listentry) {
/* hv_process_channel_removal() needs this */
@@ -436,7 +435,6 @@ void vmbus_free_channels(void)
 
vmbus_device_unregister(channel->device_obj);
}
-   mutex_unlock(&vmbus_connection.channel_mutex);
 }
 
 /*
@@ -483,8 +481,10 @@ static void vmbus_process_offer(struct vmbus_channel 
*newchannel)
list_add_tail(&newchannel->sc_list, &channel->sc_list);
channel->num_sc++;
spin_unlock_irqrestore(&channel->lock, flags);
-   } else
+   } else {
+   atomic_dec(&vmbus_connection.offer_in_progress);
goto err_free_chan;
+   }
}
 
dev_type = hv_get_dev_type(newchannel);
@@ -511,6 +511,7 @@ static void vmbus_process_offer(struct vmbus_channel 
*newchannel)
if (!fnew) {
if (channel->sc_creation_callback != NULL)
channel->sc_creation_callback(newchannel);
+   atomic_dec(&vmbus_connection.offer_in_progress);
return;
}
 
@@ -532,9 +533,7 @@ static void vmbus_process_offer(struct vmbus_channel 
*newchannel)
 * binding which eventually invokes the device driver's AddDevice()
 * method.
 */
-   mutex_lock(&vmbus_connection.channel_mutex);
ret = vmbus_device_register(newchannel->device_obj);
-   mutex_unlock(&vmbus_connection.channel_mutex);
 
if (ret != 0) {
pr_err("unable to add child device object (relid %d)\n",
@@ -542,6 +541,8 @@ static void vmbus_process_offer(struct vmbus_channel 
*newchannel)
kfree(newchannel->device_obj);
goto err_deq_chan;
}
+
+   atomic_dec(&vmbus_connection.offer_in_progress);
return;
 
 err_deq_chan:
@@ -799,6 +800,7 @@ static void vmbus_onoffer(struct 
vmbus_channel_message_header *hdr)
newchannel = alloc_channel();
if (!newchannel) {
vmbus_release_relid(offer->child_relid);
+   atomic_dec(&vmbus_connection.offer_in_progress);
pr_err("Unable to allocate channel object\n");
return;
}
@@ -845,16 +847,38 @@ static void vmbus_onoffer_rescind(struct 
vmbus_channel_message_header *hdr)
 
rescind = (struct vmbus_channel_rescind_offer *)hdr;
 
+   /*
+* The offer msg and the corresponding rescind msg
+* from the host are guaranteed to be ordered -
+* offer comes in first and then the rescind.
+* Since we process these events in work elements,
+* and with preemption, we may end up processing
+* the events out of order. Given that we handle these
+* work elements on the same CPU, this is possible only
+* in the case of preemption. In any case wait here
+* until the offer proces

[PATCH 4/6] Drivers: hv: util: Make hv_poll_channel() a little more efficient

2017-04-30 Thread kys
From: K. Y. Srinivasan 

The current code unconditionally sends an IPI. If we are running on the
correct CPU and are in interrupt level, we don't need an IPI.
Make this adjustment.

Signed-off-by: K. Y. Srinivasan 
---
 drivers/hv/hyperv_vmbus.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 6113e91..fa514be 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -411,6 +411,10 @@ static inline void hv_poll_channel(struct vmbus_channel 
*channel,
if (!channel)
return;
 
+   if (in_interrupt() && (channel->target_cpu == smp_processor_id())) {
+   cb(channel);
+   return;
+   }
smp_call_function_single(channel->target_cpu, cb, channel, true);
 }
 
-- 
1.7.1



[PATCH 3/6] Drivers: hv: vmbus: Fix error code returned by vmbus_post_msg()

2017-04-30 Thread kys
From: K. Y. Srinivasan 

ENOBUFS is a more appropriate error code to be returned
when the hypervisor cannot post the message because of
insufficient buffers. Make the adjustment.

Signed-off-by: K. Y. Srinivasan 
---
 drivers/hv/connection.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index fce27fb..a938fcf 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -370,7 +370,7 @@ int vmbus_post_msg(void *buffer, size_t buflen, bool 
can_sleep)
break;
case HV_STATUS_INSUFFICIENT_MEMORY:
case HV_STATUS_INSUFFICIENT_BUFFERS:
-   ret = -ENOMEM;
+   ret = -ENOBUFS;
break;
case HV_STATUS_SUCCESS:
return ret;
-- 
1.7.1



[PATCH 2/6] tools: hv: properly handle long paths

2017-04-30 Thread kys
From: Vitaly Kuznetsov 

Paths can be up to PATH_MAX long and PATH_MAX is usually greater than 256.
While at it, simplify path reconstruction to a simple snprintf(), define
and reuse KVP_NET_DIR.

Suggested-by: Tomas Hozza 
Signed-off-by: Vitaly Kuznetsov 
Signed-off-by: K. Y. Srinivasan 
---
 tools/hv/hv_kvp_daemon.c |   44 ++--
 1 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
index f1758fc..88b20e0 100644
--- a/tools/hv/hv_kvp_daemon.c
+++ b/tools/hv/hv_kvp_daemon.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /*
@@ -97,6 +98,8 @@ enum {
 #define KVP_SCRIPTS_PATH "/usr/libexec/hypervkvpd/"
 #endif
 
+#define KVP_NET_DIR "/sys/class/net/"
+
 #define MAX_FILE_NAME 100
 #define ENTRIES_PER_BLOCK 50
 
@@ -596,26 +599,21 @@ void kvp_get_os_info(void)
DIR *dir;
struct dirent *entry;
FILE*file;
-   char*p, *q, *x;
+   char*p, *x;
char*if_name = NULL;
charbuf[256];
-   char *kvp_net_dir = "/sys/class/net/";
-   char dev_id[256];
+   char dev_id[PATH_MAX];
 
-   dir = opendir(kvp_net_dir);
+   dir = opendir(KVP_NET_DIR);
if (dir == NULL)
return NULL;
 
-   snprintf(dev_id, sizeof(dev_id), "%s", kvp_net_dir);
-   q = dev_id + strlen(kvp_net_dir);
-
while ((entry = readdir(dir)) != NULL) {
/*
 * Set the state for the next pass.
 */
-   *q = '\0';
-   strcat(dev_id, entry->d_name);
-   strcat(dev_id, "/device/device_id");
+   snprintf(dev_id, sizeof(dev_id), "%s%s/device/device_id",
+KVP_NET_DIR, entry->d_name);
 
file = fopen(dev_id, "r");
if (file == NULL)
@@ -653,12 +651,12 @@ void kvp_get_os_info(void)
FILE*file;
char*p, *x;
charbuf[256];
-   char addr_file[256];
+   char addr_file[PATH_MAX];
unsigned int i;
char *mac_addr = NULL;
 
-   snprintf(addr_file, sizeof(addr_file), "%s%s%s", "/sys/class/net/",
-   if_name, "/address");
+   snprintf(addr_file, sizeof(addr_file), "%s%s%s", KVP_NET_DIR,
+if_name, "/address");
 
file = fopen(addr_file, "r");
if (file == NULL)
@@ -688,28 +686,22 @@ void kvp_get_os_info(void)
DIR *dir;
struct dirent *entry;
FILE*file;
-   char*p, *q, *x;
+   char*p, *x;
char*if_name = NULL;
charbuf[256];
-   char *kvp_net_dir = "/sys/class/net/";
-   char dev_id[256];
+   char dev_id[PATH_MAX];
unsigned int i;
 
-   dir = opendir(kvp_net_dir);
+   dir = opendir(KVP_NET_DIR);
if (dir == NULL)
return NULL;
 
-   snprintf(dev_id, sizeof(dev_id), "%s", kvp_net_dir);
-   q = dev_id + strlen(kvp_net_dir);
-
while ((entry = readdir(dir)) != NULL) {
/*
 * Set the state for the next pass.
 */
-   *q = '\0';
-
-   strcat(dev_id, entry->d_name);
-   strcat(dev_id, "/address");
+   snprintf(dev_id, sizeof(dev_id), "%s%s/address", KVP_NET_DIR,
+entry->d_name);
 
file = fopen(dev_id, "r");
if (file == NULL)
@@ -1218,9 +1210,9 @@ static int process_ip_string(FILE *f, char *ip_string, 
int type)
 static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val)
 {
int error = 0;
-   char if_file[128];
+   char if_file[PATH_MAX];
FILE *file;
-   char cmd[512];
+   char cmd[PATH_MAX];
char *mac_addr;
 
/*
-- 
1.7.1



[PATCH 6/6] HV: properly delay KVP packets when negotiation is in progress

2017-04-30 Thread kys
From: Long Li 

The host may send multiple negotiation packets
(due to timeout) before the KVP user-mode daemon
is connected. We need to defer processing those
packets until the daemon has negotiated and connected.
It's okay for the guest to respond to all negotiation
packets.

In addition, the host may send multiple staged
KVP requests as soon as negotiation is done.
We need to properly process those packets using one
tasklet for exclusive access to the ring buffer.

This patch is based on the work of
Nick Meier .

Signed-off-by: Long Li 
Signed-off-by: K. Y. Srinivasan 
---
 drivers/hv/hv_kvp.c |   14 --
 1 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index e99ff2d..9a90b91 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -112,7 +112,7 @@ static void kvp_poll_wrapper(void *channel)
 {
/* Transaction is finished, reset the state here to avoid races. */
kvp_transaction.state = HVUTIL_READY;
-   hv_kvp_onchannelcallback(channel);
+   tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event);
 }
 
 static void kvp_register_done(void)
@@ -159,7 +159,7 @@ static void kvp_timeout_func(struct work_struct *dummy)
 
 static void kvp_host_handshake_func(struct work_struct *dummy)
 {
-   hv_poll_channel(kvp_transaction.recv_channel, hv_kvp_onchannelcallback);
+   tasklet_schedule(&kvp_transaction.recv_channel->callback_event);
 }
 
 static int kvp_handle_handshake(struct hv_kvp_msg *msg)
@@ -625,16 +625,17 @@ void hv_kvp_onchannelcallback(void *context)
 NEGO_IN_PROGRESS,
 NEGO_FINISHED} host_negotiatied = NEGO_NOT_STARTED;
 
-   if (host_negotiatied == NEGO_NOT_STARTED &&
-   kvp_transaction.state < HVUTIL_READY) {
+   if (kvp_transaction.state < HVUTIL_READY) {
/*
 * If userspace daemon is not connected and host is asking
 * us to negotiate we need to delay to not lose messages.
 * This is important for Failover IP setting.
 */
-   host_negotiatied = NEGO_IN_PROGRESS;
-   schedule_delayed_work(&kvp_host_handshake_work,
+   if (host_negotiatied == NEGO_NOT_STARTED) {
+   host_negotiatied = NEGO_IN_PROGRESS;
+   schedule_delayed_work(&kvp_host_handshake_work,
  HV_UTIL_NEGO_TIMEOUT * HZ);
+   }
return;
}
if (kvp_transaction.state > HVUTIL_READY)
@@ -702,6 +703,7 @@ void hv_kvp_onchannelcallback(void *context)
   VM_PKT_DATA_INBAND, 0);
 
host_negotiatied = NEGO_FINISHED;
+   hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper);
}
 
 }
-- 
1.7.1



[PATCH 1/6] Tools: hv: vss: Thaw the filesystem and continue if freeze call has timed out

2017-04-30 Thread kys
From: Alex Ng 

If a FREEZE operation takes too long, the driver may time out and move on
to another operation. The daemon is unaware of this and attempts to
notify the driver that the FREEZE succeeded. This results in an error from
the driver and the daemon leaves the filesystem in frozen state.

Fix this by thawing the filesystem and continuing.

Signed-off-by: Michael Gissing 
Signed-off-by: Alex Ng 
Signed-off-by: K. Y. Srinivasan 
---
 tools/hv/hv_vss_daemon.c |4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/tools/hv/hv_vss_daemon.c b/tools/hv/hv_vss_daemon.c
index e082980..7ba5419 100644
--- a/tools/hv/hv_vss_daemon.c
+++ b/tools/hv/hv_vss_daemon.c
@@ -261,7 +261,9 @@ int main(int argc, char *argv[])
if (len != sizeof(struct hv_vss_msg)) {
syslog(LOG_ERR, "write failed; error: %d %s", errno,
   strerror(errno));
-   exit(EXIT_FAILURE);
+
+   if (op == VSS_OP_FREEZE)
+   vss_operate(VSS_OP_THAW);
}
}
 
-- 
1.7.1



[PATCH 0/6] Drivers: hv: Miscellaneous fixes

2017-04-30 Thread kys
From: K. Y. Srinivasan 

Miscellaneous fixes to vmbus and util drivers.

Alex Ng (1):
  Tools: hv: vss: Thaw the filesystem and continue if freeze call has
timed out

K. Y. Srinivasan (3):
  Drivers: hv: vmbus: Fix error code returned by vmbus_post_msg()
  Drivers: hv: util: Make hv_poll_channel() a little more efficient
  Drivers: hv: vmbus: Fix rescind handling

Long Li (1):
  HV: properly delay KVP packets when negotiation is in progress

Vitaly Kuznetsov (1):
  tools: hv: properly handle long paths

 drivers/hv/channel.c  |8 -
 drivers/hv/channel_mgmt.c |   69 ++--
 drivers/hv/connection.c   |9 --
 drivers/hv/hv_kvp.c   |   14 +
 drivers/hv/hyperv_vmbus.h |   11 +++
 drivers/hv/vmbus_drv.c|   29 ++-
 tools/hv/hv_kvp_daemon.c  |   44 -
 tools/hv/hv_vss_daemon.c  |4 ++-
 8 files changed, 133 insertions(+), 55 deletions(-)



Re: xdp_redirect ifindex vs port. Was: best API for returning/setting egress port?

2017-04-30 Thread John Fastabend
On 17-04-29 06:04 PM, Alexei Starovoitov wrote:
> On 4/28/17 3:58 AM, Jesper Dangaard Brouer wrote:
>> On Thu, 27 Apr 2017 16:31:14 -0700
>> Alexei Starovoitov  wrote:
>>
>>> On 4/27/17 1:41 AM, Jesper Dangaard Brouer wrote:
 When registering/attaching a XDP/bpf program, we would just send the
 file-descriptor for this port-map along (like we do with the bpf_prog
 FD). Plus, its own ingress-port number, i.e. where this program sits in the port-map.

 It is not clear to me, in-which-data-structure on the kernel-side we
 store this reference to the port-map and ingress-port. As today we only
 have the "raw" struct bpf_prog pointer. I see several options:

 1. Create a new xdp_prog struct that contains existing bpf_prog,
 a port-map pointer and ingress-port. (IMHO easiest solution)

 2. Just create a new pointer to port-map and store it in driver rx-ring
 struct (like existing bpf_prog), but this create a race-challenge
 replacing (cmpxchg) the program (or perhaps it's not a problem as it
 runs under rcu and RTNL-lock).

 3. Extend bpf_prog to store this port-map and ingress-port, and have a
 fast-way to access it.  I assume it will be accessible via
 bpf_prog->bpf_prog_aux->used_maps[X] but it will be too slow for XDP.
>>>
>>> I'm not sure I completely follow the 3 proposals.
>>> Are you suggesting to have only one netdev_array per program?
>>
>> Yes, but I can see you have a more clever idea below.
>>
>>> Why not to allow any number like we do for tailcall+prog_array, etc?
>>
>>> We can teach verifier to allow new helper
>>>  bpf_tx_port(netdev_array, port_num);
>>> to only be used with netdev_array map type.
>>> It will fetch netdevice pointer from netdev_array[port_num]
>>> and will tx the packet into it.
>>
>> I love it.
>>
>> I just don't like the "netdev" part of the name "netdev_array", as one of the
>> basic ideas of a port table is that a port can be anything that can
>> consume an XDP_buff packet.  This generalization allows us to move code
>> out of the drivers.  We might be on the same page, as I do imagine that
>> netdev_array or port_array is just a struct bpf_map pointer, and the
>> bpf_map->map_type will tell us that this bpf_map contains net_device
>> pointers.  Thus, when later introducing a new type of redirect (like to
>> a socket or remote-CPU) then we just add a new bpf_map_type for this,
>> without needing to change anything in the drivers, right?
> 
> In theory, yes, but in practice I doubt it will be so easy.
> We probably shouldn't allow very different types of netdev
> into the same netdev_array or port_array (whatever the name).
> They need to be similar enough, otherwise we'd have to do run-time
> checks. If they're all the same, these checks can be done at
> insertion time instead.
> 

I think we can just have different map types for each redirect type. So
a netdev_map_array, socket_map_array, etc.
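
For illustration only, here is a rough sketch of how an XDP program could use
such a port map together with the proposed bpf_tx_port() helper. Everything
here is hypothetical: the map type, the helper and its id are assumptions
taken from this thread, not an existing kernel API.

/* Sketch only -- BPF_MAP_TYPE_NETDEV_ARRAY and bpf_tx_port() do not exist
 * yet; the names follow the proposal being discussed in this thread.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"	/* samples/bpf style SEC() and bpf_map_def */

#define BPF_MAP_TYPE_NETDEV_ARRAY	64	/* hypothetical new map type */

static int (*bpf_tx_port)(void *port_map, int port) =
	(void *) 99;				/* hypothetical helper id */

struct bpf_map_def SEC("maps") tx_ports = {
	.type        = BPF_MAP_TYPE_NETDEV_ARRAY,
	.key_size    = sizeof(int),		/* port number */
	.value_size  = sizeof(int),		/* e.g. netdev ifindex */
	.max_entries = 64,
};

SEC("xdp_fwd")
int xdp_fwd_prog(struct xdp_md *ctx)
{
	int egress_port = 1;	/* normally the result of a lookup on the packet */

	/* Whether this returns XDP_REDIRECT or an error code is exactly
	 * the open question discussed just below.
	 */
	return bpf_tx_port(&tx_ports, egress_port);
}

char _license[] SEC("license") = "GPL";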

>> Do you imagine that bpf-side bpf_tx_port() returns XDP_REDIRECT?
>> Or does it return if the call was successful (e.g validate port_num
>> existed in map)?
> 
> don't know :)
> we need to brainstorm pros and cons.
> 
>> On the kernel side, we need to receive this info "port_array" and
>> "port_num", given you don't provide the call a xdp_buff/ctx, then I
>> assume you want the per-CPU temp-store solution.  Then during the
>> XDP_REDIRECT action we call a core redirect function that based on the
>> bpf_map_type does a lookup, and find the net_device ptr.
> 
> hmm. didn't think that far either :)
> indeed makes sense to pass 'ctx' into such helper as well,
> so it's easier to deal with original netdev.
> 
>>> We can make it similar to bpf_tail_call(), so that program will
>>> finish on successful bpf_tx_port() or
>>> make it into 'delayed' tx which will be executed when program finishes.
>>> Not sure which approach is better.
>>
>> I know you are talking about something slightly different, about
>> delaying TX.
>>
>> But I want to mention (as I've done before) that it is important (for
>> me) that we get bulking working/integrated.   I imagine the driver will
>> call a function that will delay the TX/redirect action and at the end
>> of the NAPI cycle have a function that flush packets, bulk per
>> destination port.
>>
>> I was wondering where to store these delayed TX packets, but now that
>> we have an associated bpf_map data-structure (netdev_array), I'm thinking
>> about storing packets (ordered by port) inside that.  And then have a
>> bpf_tx_flush(netdev_array) call in the driver (for every port-table-map
>> seen, which will likely be small).
> 
> makes sense to me as well.
> Ideally we should try to make an API such that batching or no-batching
> can be a kernel choice. The program will just do
> xdp_tx_port(...something here...)
> and the kernel does the best for performance. If it needs to delay
> the result to do batching the api should allow that transparently.
> Like right now xdp program does 'return XDP_TX;' and
> on t

Re: [PATCH v3 binutils] Add BPF support to binutils...

2017-04-30 Thread David Miller
From: Alexei Starovoitov 
Date: Sat, 29 Apr 2017 23:44:59 -0700

> '-g' still doesn't seem to work:
> /w/binutils-gdb/bld/binutils/objdump: invalid relocation type 10
> /w/binutils-gdb/bld/binutils/objdump: BFD (GNU Binutils)
> 2.28.51.20170429 assertion fail ../../bfd/elf64-bpf.c:139
>0: 18 01 00 00 39 47 98 83 ldimm64 r0, 590618314553

Ok, I can look at the debug info in little endian objects created by
clang now, but something is up with the dwarf information in
big-endian objects.

Your test program:

int bpf_prog1(void *ign)
{
volatile unsigned long t = 0x8983984739ull;
return *(unsigned long *)((0x8fff0002ull) + t);
}

built with:

clang -O2 -target bpfel -g -c x.c -o x.o

readelf can see it just fine:

[davem@localhost binutils]$ ./readelf --debug-dump=loc ./xel.o 
Contents of the .debug_loc section:

Offset   BeginEnd  Expression
  0010 (DW_OP_reg1 (r1))
0013 
0023 0010 0020 (DW_OP_constu: 590618314553; 
DW_OP_stack_value)
003d 0020 0030 (DW_OP_reg1 (r1))
0050 

But with big-endian:

[davem@localhost binutils]$ ./readelf --debug-dump=loc ./xeb.o 
readelf: Warning: Invalid pointer size (0) in compunit header, using 4 instead
readelf: Warning: Bogus end-of-siblings marker detected at offset 27 in 
.debug_info section
readelf: Warning: Bogus end-of-siblings marker detected at offset 28 in 
.debug_info section
readelf: Warning: DIE at offset 0x29 refers to abbreviation number 48 which 
does not exist
readelf: Warning: Unable to load/parse the .debug_info section, so cannot 
interpret the .debug_loc section.

GDB behaves similarly, xel.o works fine but for the big-endian object:

Reading symbols from ./xeb.o./../binutils-gdb/gdb/dwarf2read.c:16933: 
internal-error: read_address: bad switch, unsigned [in module 
/home/davem/src/GIT/BINUTILS/build-bpf/binutils/xeb.o]

It is entirely possible that the problem is on the LLVM side.
Can you double check that the dwarf2 emission code in LLVM is
using the correct endianness?

Here is my working diff against v4:

diff --git a/bfd/elf64-bpf.c b/bfd/elf64-bpf.c
index a42f768..1d8085e 100644
--- a/bfd/elf64-bpf.c
+++ b/bfd/elf64-bpf.c
@@ -15,6 +15,7 @@
 static reloc_howto_type _bfd_bpf_elf_howto_table[] =
 {
   HOWTO(R_BPF_NONE,  0,3, 0,FALSE,0,complain_overflow_dont,
bfd_elf_generic_reloc,  "R_BPF_NONE",FALSE,0,0x,TRUE),
+  HOWTO(R_BPF_DATA_64,   
0,4,64,FALSE,0,complain_overflow_bitfield,bfd_elf_generic_reloc,  
"R_BPF_DATA_64", FALSE,0,MINUS_ONE,TRUE),
 
   /* XXX these are wrong XXX */
   HOWTO(R_BPF_INSN_64,   
0,4,64,FALSE,0,complain_overflow_bitfield,bfd_elf_generic_reloc,  
"R_BPF_INSN_64", FALSE,0,MINUS_ONE,TRUE),
@@ -22,13 +23,11 @@ static reloc_howto_type _bfd_bpf_elf_howto_table[] =
   HOWTO(R_BPF_INSN_16,   
0,1,16,FALSE,0,complain_overflow_bitfield,bfd_elf_generic_reloc,  
"R_BPF_INSN_16", FALSE,0,0x,TRUE),
   HOWTO(R_BPF_WDISP16,   0,1,16,TRUE, 0,complain_overflow_signed,  
bfd_elf_generic_reloc,  "R_BPF_WDISP16", FALSE,0,0x,TRUE),
 
-  EMPTY_HOWTO(5),
   EMPTY_HOWTO(6),
   EMPTY_HOWTO(7),
   HOWTO(R_BPF_DATA_8,0,0, 
8,FALSE,0,complain_overflow_bitfield,bfd_elf_generic_reloc,  "R_BPF_DATA_8",  
FALSE,0,0x00ff,TRUE),
   HOWTO(R_BPF_DATA_16,   
0,1,16,FALSE,0,complain_overflow_bitfield,bfd_elf_generic_reloc,  
"R_BPF_DATA_16", FALSE,0,0x,TRUE),
   HOWTO(R_BPF_DATA_32,   
0,2,32,FALSE,0,complain_overflow_bitfield,bfd_elf_generic_reloc,  
"R_BPF_DATA_32", FALSE,0,0x,TRUE),
-  HOWTO(R_BPF_DATA_64,   
0,4,64,FALSE,0,complain_overflow_bitfield,bfd_elf_generic_reloc,  
"R_BPF_DATA_64", FALSE,0,MINUS_ONE,TRUE),
 };
 
 reloc_howto_type *
diff --git a/binutils/readelf.c b/binutils/readelf.c
index b4013fb..0e9716b 100644
--- a/binutils/readelf.c
+++ b/binutils/readelf.c
@@ -12254,7 +12254,7 @@ is_64bit_abs_reloc (unsigned int reloc_type)
 case EM_ALPHA:
   return reloc_type == 2; /* R_ALPHA_REFQUAD.  */
 case EM_BPF:
-  return reloc_type == 11; /* R_BPF_DATA_64 */
+  return reloc_type == 1; /* R_BPF_DATA_64 */
 case EM_IA_64:
   return reloc_type == 0x27; /* R_IA64_DIR64LSB.  */
 case EM_PARISC:
diff --git a/include/elf/bpf.h b/include/elf/bpf.h
index 4aa38cc..0d6fddc 100644
--- a/include/elf/bpf.h
+++ b/include/elf/bpf.h
@@ -26,14 +26,14 @@
 /* Relocation types.  */
 START_RELOC_NUMBERS (elf_bpf_reloc_type)
   RELOC_NUMBER (R_BPF_NONE, 0)
-  RELOC_NUMBER (R_BPF_INSN_64, 1)
-  RELOC_NUMBER (R_BPF_INSN_32, 2)
-  RELOC_NUMBER (R_BPF_INSN_16, 3)
-  RELOC_NUMBER (R_BPF_WDISP16, 4)
+  RELOC_NUMBER (R_BPF_DATA_64, 1)
+  RELOC_NUMBER (R_BPF_INSN_64, 2)
+  RELOC_NUMBER (R_BPF_INSN_32, 3)
+  RELOC_NUMBER (R_BPF_INSN_16, 4)
+  RELOC_NUMBER (R_BPF_WDISP16, 5)
   RELOC_NUMBER (R_BPF_DATA_8,  8)
   RELOC_NUMBER (R_BPF_DATA_16, 9)
   RELOC_NUMBER (R_BPF_DATA_32, 10)
-  RELOC_NUMBER (R_BPF_DATA

[PATCH net-next] mlxsw: spectrum_router: Simplify VRF enslavement

2017-04-30 Thread idosch
From: Ido Schimmel 

When a netdev is enslaved to a VRF master, its router interface (RIF)
needs to be destroyed (if exists) and a new one created using the
corresponding virtual router (VR).

From the driver's perspective, the above is equivalent to an inetaddr
event sent for this netdev. Therefore, when a port netdev (or one of its
uppers) is enslaved to a VRF master, call the same function that
would've been called had a NETDEV_UP been sent for this netdev in the
inetaddr notification chain.

This patch also fixes a bug when a LAG netdev with an existing RIF is
enslaved to a VRF. Before this patch, each LAG port would drop the
reference on the RIF, but would re-join the same one (in the wrong VR)
soon after. With this patch, the corresponding RIF is first destroyed
and a new one is created using the correct VR.

Fixes: 7179eb5acd59 ("mlxsw: spectrum_router: Add support for VRFs")
Signed-off-by: Ido Schimmel 
Reviewed-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c |  77 +++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |  10 +-
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 107 +
 3 files changed, 63 insertions(+), 131 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 20c1b6c..88357ce 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4106,7 +4106,6 @@ static int mlxsw_sp_netdevice_port_upper_event(struct 
net_device *dev,
if (!is_vlan_dev(upper_dev) &&
!netif_is_lag_master(upper_dev) &&
!netif_is_bridge_master(upper_dev) &&
-   !netif_is_l3_master(upper_dev) &&
!netif_is_ovs_master(upper_dev))
return -EINVAL;
if (!info->linking)
@@ -4151,11 +4150,6 @@ static int mlxsw_sp_netdevice_port_upper_event(struct 
net_device *dev,
else
mlxsw_sp_port_lag_leave(mlxsw_sp_port,
upper_dev);
-   } else if (netif_is_l3_master(upper_dev)) {
-   if (info->linking)
-   err = mlxsw_sp_port_vrf_join(mlxsw_sp_port);
-   else
-   mlxsw_sp_port_vrf_leave(mlxsw_sp_port);
} else if (netif_is_ovs_master(upper_dev)) {
if (info->linking)
err = mlxsw_sp_port_ovs_join(mlxsw_sp_port);
@@ -4275,7 +4269,7 @@ static int mlxsw_sp_netdevice_bridge_event(struct 
net_device *br_dev,
switch (event) {
case NETDEV_PRECHANGEUPPER:
upper_dev = info->upper_dev;
-   if (!is_vlan_dev(upper_dev) && !netif_is_l3_master(upper_dev))
+   if (!is_vlan_dev(upper_dev))
return -EINVAL;
if (is_vlan_dev(upper_dev) &&
br_dev != mlxsw_sp->master_bridge.dev)
@@ -4290,12 +4284,6 @@ static int mlxsw_sp_netdevice_bridge_event(struct 
net_device *br_dev,
else
mlxsw_sp_master_bridge_vlan_unlink(mlxsw_sp,
   upper_dev);
-   } else if (netif_is_l3_master(upper_dev)) {
-   if (info->linking)
-   err = mlxsw_sp_bridge_vrf_join(mlxsw_sp,
-  br_dev);
-   else
-   mlxsw_sp_bridge_vrf_leave(mlxsw_sp, br_dev);
} else {
err = -EINVAL;
WARN_ON(1);
@@ -4529,8 +4517,7 @@ static int mlxsw_sp_netdevice_vport_event(struct 
net_device *dev,
switch (event) {
case NETDEV_PRECHANGEUPPER:
upper_dev = info->upper_dev;
-   if (!netif_is_bridge_master(upper_dev) &&
-   !netif_is_l3_master(upper_dev))
+   if (!netif_is_bridge_master(upper_dev))
return -EINVAL;
if (!info->linking)
break;
@@ -4550,11 +4537,6 @@ static int mlxsw_sp_netdevice_vport_event(struct 
net_device *dev,
 upper_dev);
else
mlxsw_sp_vport_bridge_leave(mlxsw_sp_vport);
-   } else if (netif_is_l3_master(upper_dev)) {
-   if (info->linking)
-   err = mlxsw_sp_vport_vrf_join(mlxsw_sp_vport);
-   else
-   mlxsw_sp_vport_vrf_leave(mlxsw_sp_vport);
} else {
err = -EINVAL;
WARN_ON(1);
@@ -4585,47 +4567,6 @@ static int mlxsw_sp_netd

Re: [Patch net-next v2] ipv4: get rid of ip_ra_lock

2017-04-30 Thread Eric Dumazet
On Fri, 2017-04-28 at 10:04 -0700, Cong Wang wrote:
> After commit 1215e51edad1 ("ipv4: fix a deadlock in ip_ra_control")
> we always take RTNL lock for ip_ra_control() which is the only place
> we update the list ip_ra_chain, so the ip_ra_lock is no longer needed.
> 
> As Eric points out, BH does not need to disable either, RCU readers
> don't care.
> 
> Signed-off-by: Cong Wang 
> ---
>  net/ipv4/ip_sockglue.c | 9 +
>  1 file changed, 1 insertion(+), 8 deletions(-)
> 
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 1d46d05..4c25458 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -330,7 +330,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, 
> struct ipcm_cookie *ipc,
> sent to multicast group to reach destination designated router.
>   */
>  struct ip_ra_chain __rcu *ip_ra_chain;
> -static DEFINE_SPINLOCK(ip_ra_lock);
>  
> 
>  static void ip_ra_destroy_rcu(struct rcu_head *head)
> @@ -352,21 +351,17 @@ int ip_ra_control(struct sock *sk, unsigned char on,
>  
>   new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
>  
> - spin_lock_bh(&ip_ra_lock);
>   for (rap = &ip_ra_chain;
> -  (ra = rcu_dereference_protected(*rap,
> - lockdep_is_held(&ip_ra_lock))) != NULL;
> +  (ra = rtnl_dereference(*rap)) != NULL;
>rap = &ra->next) {
>   if (ra->sk == sk) {
>   if (on) {
> - spin_unlock_bh(&ip_ra_lock);
>   kfree(new_ra);
>   return -EADDRINUSE;
>   }
>   /* dont let ip_call_ra_chain() use sk again */
>   ra->sk = NULL;
>   RCU_INIT_POINTER(*rap, ra->next);
> - spin_unlock_bh(&ip_ra_lock);
>  
>   if (ra->destructor)
>   ra->destructor(sk);
> @@ -381,7 +376,6 @@ int ip_ra_control(struct sock *sk, unsigned char on,
>   }
>   }
>   if (!new_ra) {
> - spin_unlock_bh(&ip_ra_lock);
>   return -ENOBUFS;
>   }

Minor point : You could have removed the {}
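
(For readers following along: with the unlock gone the branch is a single
statement, so the hunk Eric refers to could simply read as below.)

	if (!new_ra)
		return -ENOBUFS;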

Acked-by: Eric Dumazet 

Thanks !





Re: [PATCH] netvsc: make sure napi enabled before vmbus_open

2017-04-30 Thread Stephen Hemminger
On Fri, 28 Apr 2017 16:09:13 -0400 (EDT)
David Miller  wrote:

> From: Stephen Hemminger 
> Date: Wed, 26 Apr 2017 16:58:30 -0700
> 
> > This fixes a race where vmbus callback for new packet arriving
> > could occur before NAPI is initialized. Happens more on WS2008
> > which takes longer to setup channel.
> > 
> > Signed-off-by: Stephen Hemminger   
> 
> This doesn't apply cleanly to 'net'.

Patch was for net-next. Will resend in next group


[PATCH v4 binutils] Add BPF support to binutils...

2017-04-30 Thread David Miller

This is mainly a synchronization point; I still need to look
more deeply into Alexei's -g issue.

New in this version from v3:
 - Remove tailcall from opcode table
 - Rearrange relocations so that numbers match with LLVM ones
 - Emit relocs properly so that dwarf2 debug info tests pass
 - Handle negative load/store offsets properly, add tests

Signed-off-by: David S. Miller 
---
 bfd/Makefile.am |   2 +
 bfd/Makefile.in |   3 +
 bfd/archures.c  |   3 +
 bfd/bfd-in2.h   |   8 +
 bfd/config.bfd  |   6 +
 bfd/configure   |   2 +
 bfd/configure.ac|   2 +
 bfd/cpu-bpf.c   |  41 +++
 bfd/elf64-bpf.c | 159 ++
 bfd/elf64-bpf.h |  24 ++
 bfd/libbfd.h|   4 +
 bfd/reloc.c |  11 +
 bfd/targets.c   |   5 +
 binutils/readelf.c  |  11 +
 config.sub  |   5 +-
 gas/Makefile.am |   2 +
 gas/Makefile.in |  17 ++
 gas/config/tc-bpf.c | 639 
 gas/config/tc-bpf.h |  45 +++
 gas/configure.tgt   |   3 +
 gas/testsuite/gas/bpf/arith.d   |  61 
 gas/testsuite/gas/bpf/arith.s   |  53 
 gas/testsuite/gas/bpf/atomics.d |  12 +
 gas/testsuite/gas/bpf/atomics.s |   4 +
 gas/testsuite/gas/bpf/bpf.exp   |  28 ++
 gas/testsuite/gas/bpf/call.d|  14 +
 gas/testsuite/gas/bpf/call.s|   6 +
 gas/testsuite/gas/bpf/imm64.d   |  30 ++
 gas/testsuite/gas/bpf/imm64.s   |  12 +
 gas/testsuite/gas/bpf/jump.d|  43 +++
 gas/testsuite/gas/bpf/jump.s|  35 +++
 gas/testsuite/gas/bpf/loads.d   |  27 ++
 gas/testsuite/gas/bpf/loads.s   |  19 ++
 gas/testsuite/gas/bpf/move.d|  19 ++
 gas/testsuite/gas/bpf/move.s|  11 +
 gas/testsuite/gas/bpf/stores.d  |  21 ++
 gas/testsuite/gas/bpf/stores.s  |  13 +
 gdb/bpf-tdep.c  | 229 ++
 gdb/bpf-tdep.h  |  40 +++
 gdb/configure.tgt   |   4 +
 include/dis-asm.h   |   1 +
 include/elf/bpf.h   |  39 +++
 include/opcode/bpf.h|  16 +
 ld/Makefile.am  |   4 +
 ld/Makefile.in  |   5 +
 ld/configure.tgt|   2 +
 ld/emulparams/elf64_bpf.sh  |   8 +
 opcodes/Makefile.am |   2 +
 opcodes/bpf-dis.c   | 161 ++
 opcodes/bpf-opc.c   | 147 +
 opcodes/configure   |   1 +
 opcodes/configure.ac|   1 +
 opcodes/disassemble.c   |   6 +
 sim/configure.tgt   |   3 +
 54 files changed, 2067 insertions(+), 2 deletions(-)
 create mode 100644 bfd/cpu-bpf.c
 create mode 100644 bfd/elf64-bpf.c
 create mode 100644 bfd/elf64-bpf.h
 create mode 100644 gas/config/tc-bpf.c
 create mode 100644 gas/config/tc-bpf.h
 create mode 100644 gas/testsuite/gas/bpf/arith.d
 create mode 100644 gas/testsuite/gas/bpf/arith.s
 create mode 100644 gas/testsuite/gas/bpf/atomics.d
 create mode 100644 gas/testsuite/gas/bpf/atomics.s
 create mode 100644 gas/testsuite/gas/bpf/bpf.exp
 create mode 100644 gas/testsuite/gas/bpf/call.d
 create mode 100644 gas/testsuite/gas/bpf/call.s
 create mode 100644 gas/testsuite/gas/bpf/imm64.d
 create mode 100644 gas/testsuite/gas/bpf/imm64.s
 create mode 100644 gas/testsuite/gas/bpf/jump.d
 create mode 100644 gas/testsuite/gas/bpf/jump.s
 create mode 100644 gas/testsuite/gas/bpf/loads.d
 create mode 100644 gas/testsuite/gas/bpf/loads.s
 create mode 100644 gas/testsuite/gas/bpf/move.d
 create mode 100644 gas/testsuite/gas/bpf/move.s
 create mode 100644 gas/testsuite/gas/bpf/stores.d
 create mode 100644 gas/testsuite/gas/bpf/stores.s
 create mode 100644 gdb/bpf-tdep.c
 create mode 100644 gdb/bpf-tdep.h
 create mode 100644 include/elf/bpf.h
 create mode 100644 include/opcode/bpf.h
 create mode 100644 ld/emulparams/elf64_bpf.sh
 create mode 100644 opcodes/bpf-dis.c
 create mode 100644 opcodes/bpf-opc.c

diff --git a/bfd/Makefile.am b/bfd/Makefile.am
index 97b608c..911655a 100644
--- a/bfd/Makefile.am
+++ b/bfd/Makefile.am
@@ -95,6 +95,7 @@ ALL_MACHINES = \
cpu-arm.lo \
cpu-avr.lo \
cpu-bfin.lo \
+   cpu-bpf.lo \
cpu-cr16.lo \
cpu-cr16c.lo \
cpu-cris.lo \
@@ -185,6 +186,7 @@ ALL_MACHINES_CFILES = \
cpu-arm.c \
cpu-avr.c \
cpu-bfin.c \
+   cpu-bpf.c \
cpu-cr16.c \
cpu-cr16c.c \
cpu-cris.c \
diff --git a/bfd/Makefile.in b/bfd/Makefile.in
index e48abaf..930aa09 100644
--- a/bfd/Makefile.in
+++ b/bfd/Makefile.in
@@ -428,6 +428,7 @@ ALL_MACHINES = \
cpu-arm.lo \
cpu-avr.lo \
cpu-bfin.lo \
+   cpu-bpf.lo \
cpu-cr16.lo \
cpu-cr16c.lo \
cpu-cris.lo \
@@ -518,6 +519,7 @@ ALL_MACHINES_CFILES = \
cpu-arm.c \
cpu-avr.c \
cpu-bfin.c \
+   cpu-bpf.c \
  

Re: [PATCH 0/4] TI Bluetooth serdev support

2017-04-30 Thread Sebastian Reichel
Hi,

On Sun, Apr 30, 2017 at 10:14:20AM -0500, Adam Ford wrote:
> On Wed, Apr 5, 2017 at 1:30 PM, Rob Herring  wrote:
> > This series adds serdev support to the HCI LL protocol used on TI BT
> > modules and enables support on the HiKey board with the WL1835 module.
> > With this the custom TI UIM daemon and btattach are no longer needed.
> 
> Without UIM daemon, what instruction do you use to load the BT firmware?
> 
> I was thinking 'hciattach' but I was having trouble.  I was hoping you
> might have some insight.
> 
>  hciattach -t 30 -s 115200 /dev/ttymxc1 texas 300 flow  Just
> returns a timeout.
> 
> I modified my i.MX6 device tree per the binding documentation and
> setup the regulators and enable GPIO pins.

If you configured everything correctly no userspace interaction is
required. The driver should request the firmware automatically once
you power up the bluetooth device.

Apart from DT changes make sure, that the following options are
enabled and check dmesg for any hints.

CONFIG_SERIAL_DEV_BUS
CONFIG_SERIAL_DEV_CTRL_TTYPORT
CONFIG_BT_HCIUART
CONFIG_BT_HCIUART_LL

-- Sebastian




Re: [PATCH v3 binutils] Add BPF support to binutils...

2017-04-30 Thread David Miller
From: Alexei Starovoitov 
Date: Sat, 29 Apr 2017 23:44:59 -0700

> On 4/29/17 7:37 PM, David Miller wrote:
>> BTW, should I just remove tailcall from the opcode table altogether?
> 
> yeah. tailcall is not a special opcode from user space point of view.
> Only after normal call with func_id=bpf_tail_call passes verifier
> then verifier will change insn->code into CALL|X
> It's done only to have two 'case' statement in the interpreter,
> so that normal calls and tailcalls don't interfere.
> From user space pov CALL|X opcode is reserved and we can use it
> for something in the future. Just need to change the interpreter and JITs.

Ok, I've removed it from my tree.

Thanks.


Re: [net-next 00/13][pull request] 40GbE Intel Wired LAN Driver Updates 2017-04-30

2017-04-30 Thread David Miller
From: Jeff Kirsher 
Date: Sun, 30 Apr 2017 06:24:38 -0700

> This series contains updates to i40e and i40evf only.
...
> The following are changes since commit 
> c08bac03d2894113bdb114e66e6ada009defb120:
>   Merge branch '10GbE' of 
> git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue
> and are available in the git repository at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 40GbE

Also pulled, thanks Jeff.


Re: [net-next 0/4][pull request] 1GbE Intel Wired LAN Driver Updates 2017-04-30

2017-04-30 Thread David Miller
From: Jeff Kirsher 
Date: Sun, 30 Apr 2017 05:36:10 -0700

> This series contains updates to e1000e only.
> 
> Jarod Wilson fixes an issue where the workaround for 82574 & 82583
> is needed for i218 as well, so set the appropriate flags.
> 
> Sasha adds support for the upcoming new i219 devices for the client
> platform (CannonLake), which includes the support for 38.4MHz frequency
> to support PTP on CannonLake.
> 
> The following are changes since commit 
> c08bac03d2894113bdb114e66e6ada009defb120:
>   Merge branch '10GbE' of 
> git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue
> and are available in the git repository at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 1GbE

Pulled.


Re: [PATCH v1 1/3] bnx2x: Replace custom scnprintf()

2017-04-30 Thread David Miller
From: Andy Shevchenko 
Date: Sun, 30 Apr 2017 15:58:18 +0300

> On Sun, Apr 30, 2017 at 11:16 AM, Mintz, Yuval  wrote:
>>> From: Andy Shevchenko 
>>>
>>> Use scnprintf() when printing version instead of custom open coded variants.
>>>
>>> Signed-off-by: Andy Shevchenko 
>>
>> Hi Andy this seems correct.
>> Was there a cover-letter for your series? I've failed to find it.
> 
> There was none since patches are quite straight forward.

All patch series, regardless of how simple, should provide
a proper cover letter.

It is an essential part of all patch series.


Re: [PATCH v3 binutils] Add BPF support to binutils...

2017-04-30 Thread David Miller
From: Alexei Starovoitov 
Date: Sat, 29 Apr 2017 23:44:59 -0700

> On 4/29/17 7:37 PM, David Miller wrote:
>> From: David Miller 
>> Date: Sat, 29 Apr 2017 22:24:50 -0400 (EDT)
>>
>>> Some of your bugs should be fixed by this patch below, I'll add
>>> test cases soon:
>>
>> Ok, here are all the local changes in my tree.  I made the relocs
>> match LLVM and I fixed some dwarf debugging stuff.
>>
>> With this we are also down to one test case failure under binutils/
>> and it's something weird with merging 64-bit notes which I should be
>> able to fix soon.
>>
>> I can fix these bugs fast, keep reporting.
>>
>> BTW, should I just remove tailcall from the opcode table altogether?
> 
> yeah. tailcall is not a special opcode from user space point of view.
> Only after normal call with func_id=bpf_tail_call passes verifier
> then verifier will change insn->code into CALL|X
> It's done only to have two 'case' statement in the interpreter,
> so that normal calls and tailcalls don't interfere.
> From user space pov CALL|X opcode is reserved and we can use it
> for something in the future. Just need to change the interpreter and JITs.
> 
>>  case 'O':
>> -  (*info->fprintf_func) (stream, "%d", off);
>> +  (*info->fprintf_func) (stream, "%d", (int) off);
> 
> tried this diff. It looks better
>   10: 7b 1a f8 ff 00 00 00 00 stdw[r1+-8], r10
>   18: 79 a1 f8 ff 00 00 00 00 lddwr10, [r1+-8]
> I wonder if '+' can be removed as well.

All disassemblers in binutils print it this way, sparc, x86, etc.

> '-g' still doesn't seem to work:
> /w/binutils-gdb/bld/binutils/objdump: invalid relocation type 10
> /w/binutils-gdb/bld/binutils/objdump: BFD (GNU Binutils)
> 2.28.51.20170429 assertion fail ../../bfd/elf64-bpf.c:139
>0: 18 01 00 00 39 47 98 83 ldimm64 r0, 590618314553

Hmm, I defined a relocation type 10 in the patch, make sure BFD got
rebuilt properly...

I'll double check here too.


Re: assembler mnenomics for call/tailcall plus maps...

2017-04-30 Thread David Miller
From: Alexei Starovoitov 
Date: Sat, 29 Apr 2017 23:35:30 -0700

> On 4/29/17 11:38 AM, David Miller wrote:
>> or, taking it one step further, do the following since we know this
>> maps to a 32-bit FD:
>>
>>  mov32   r1, %map(hash_map)
> 
> hence this approach won't work without serious elf loader hacks.
> The kernel needs to see ldimm64 because after it validated map_fd,
> it will store real 'struct bpf_map *' pointer into this ldimm64
> instruction and it will clear 'src_reg' markings.

I didn't see this part, now it all makes sense why ldimm64 is used
and I therefore think we should keep it this way.
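
For readers who have not looked at this part of the loader before, here is
roughly what the two-slot ldimm64 looks like at the instruction level (a
simplified sketch of what loaders such as samples/bpf emit, not the exact
macro):

#include <linux/bpf.h>

/* Embed a map fd in a ldimm64 (BPF_LD | BPF_DW | BPF_IMM) pair.  The
 * verifier later validates the fd, writes the struct bpf_map pointer
 * into the two imm fields and clears src_reg -- which is why the
 * instruction has to stay ldimm64, as explained above.
 */
static void emit_ld_map_fd(struct bpf_insn insn[2], int dst_reg, int map_fd)
{
	insn[0] = (struct bpf_insn) {
		.code    = BPF_LD | BPF_DW | BPF_IMM,
		.dst_reg = dst_reg,
		.src_reg = BPF_PSEUDO_MAP_FD,	/* marks imm as a map fd */
		.imm     = map_fd,		/* lower 32 bits */
	};
	insn[1] = (struct bpf_insn) {		/* second half of the insn */
		.imm     = 0,			/* upper 32 bits of the imm */
	};
}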

> So from interpreter and from JITs point of view there are no
> special ldimm64 instructions. All ldimm64 are moving 64-bit
> constant into a register. It's only verifier that knows that
> some of these constants are real pointers.
> 
>> In GCC it will be simple to get the backend to emit this, various
>> options exist.  We can make it a special "__attribute__((map))", or
>> use address spaces to annotate the map object.  And then when the
>> ldimm64 or whatever instruction is emitted, and it sees the symbol
>> referenced has this special type, it will emit "%%map(%s)" instead of
>> just "%s" for the symbol name in the asembler output.
> 
> I like the %map(symbol) idea.
> I think it fits the whole thing quite well.
> Not sure though how gcc will know that it needs to emit %map(..)

I just explained it in that paragraph above :-)

struct bpf_map_def SEC("maps") jmp_table __attribute__((map)) = {

And when referenced by an instruction the bpf gcc backend can see that
the "map" attribute is set and emit the appropriate %map() string into
the assembler.

We can even make the special map attribute do the SEC("") part too.
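
Purely to illustrate the idea (the attribute, the SEC() folding and the
%map() operand syntax are all hypothetical at this point), the C side and
the assembler the backend would emit might look roughly like this:

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* samples/bpf style SEC()/bpf_map_def */

/* __attribute__((map)) is the hypothetical marker discussed above. */
struct bpf_map_def SEC("maps") jmp_table __attribute__((map)) = {
	.type        = BPF_MAP_TYPE_PROG_ARRAY,
	.key_size    = sizeof(__u32),
	.value_size  = sizeof(__u32),
	.max_entries = 8,
};

/* A reference such as:
 *
 *	bpf_tail_call(ctx, &jmp_table, idx);
 *
 * would then come out of the backend as something like:
 *
 *	ldimm64	r2, %map(jmp_table)
 *
 * so the loader knows which ldimm64 slots need a map fd patched in.
 */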

> I take all the blame for not documenting this thing properly.
> The elf loader in samples/bpf/bpf_load.c should have been temporary.
> Its only purpose was to have minimal demo to parse elf and load it.
> I didn't expect the .o approach to come that far.
> My bet was on iovisor/bcc approach where elf file is never generated.
> C->bpf is compiled in memory and loaded into the kernel completely
> without elf and without relocations.

I think it is better to have real objects for introspection (even
after session is complete) and for testing under simulators (one of
which I plan to write).

And if we linked a real final static object, elf header would be all
that would be needed to find execution entry point.


Re: [PATCH 0/4] TI Bluetooth serdev support

2017-04-30 Thread Adam Ford
On Wed, Apr 5, 2017 at 1:30 PM, Rob Herring  wrote:
> This series adds serdev support to the HCI LL protocol used on TI BT
> modules and enables support on the HiKey board with the WL1835 module.
> With this the custom TI UIM daemon and btattach are no longer needed.

Without UIM daemon, what instruction do you use to load the BT firmware?

I was thinking 'hciattach' but I was having trouble.  I was hoping you
might have some insight.

 hciattach -t 30 -s 115200 /dev/ttymxc1 texas 300 flow  Just
returns a timeout.

I modified my i.MX6 device tree per the binding documentation and
setup the regulators and enable GPIO pins.

adam
>
> The series is available on this git branch[1]. Patch 2 is just clean-up
> and can be applied independently. Patch 3 is dependent on the series
> "Nokia H4+ support". I'd suggest both series are merged thru the BT tree.
>
> Rob
>
> [1] git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git ti-bluetooth
>
> Rob Herring (4):
>   dt-bindings: net: Add TI WiLink shared transport binding
>   bluetooth: hci_uart: remove unused hci_uart_init_tty
>   bluetooth: hci_uart: add LL protocol serdev driver support
>   arm64: dts: hikey: add WL1835 Bluetooth device node
>
>  .../devicetree/bindings/net/ti,wilink-st.txt   |  35 +++
>  arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts |   5 +
>  drivers/bluetooth/hci_ldisc.c  |  19 --
>  drivers/bluetooth/hci_ll.c | 261 
> -
>  drivers/bluetooth/hci_uart.h   |   1 -
>  5 files changed, 300 insertions(+), 21 deletions(-)
>  create mode 100644 Documentation/devicetree/bindings/net/ti,wilink-st.txt
>
> --
> 2.10.1
>
>
> ___
> linux-arm-kernel mailing list
> linux-arm-ker...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel


Re: [PATCH/RFC net-next 0/4] net/sched: cls_flower: avoid false matching of truncated packets

2017-04-30 Thread Jamal Hadi Salim

On 17-04-30 09:51 AM, Jamal Hadi Salim wrote:

[..]


1. As things stand, without this patch-set, flower does not differentiate
   between a packet truncated at the end of the IP header and a packet with
   zero ports. Likewise for icmp type and code of zero.

   The first three patches of this series address that so that a match for
   port == zero only matches if ports are present in the packet. Again,
   likewise for ICMP.

   This is a bug-fix to my way of thinking.



Agreed to bug fix. I would have said there is never a legit packet with TCP/UDP


Meant:
"never a legit packet with TCP/UDP port 0 on the wire".

cheers,
jamal


[PATCH net-next RFC 1/1] net netlink: Add new type NLA_FLAG_BITS

2017-04-30 Thread Jamal Hadi Salim
From: Jamal Hadi Salim 

Generic bitflags attribute content sent to the kernel by user.
With this type the user can either set or unset a flag in the
kernel.

The nla_flag_values field is a bitmap that carries the values being set.
The nla_flag_selector field is a bitmask that selects which of those bits are valid.

A check is made to ensure that the user only sets bit flags that the
kernel subsystem already knows about. For example, if the user tries to
set a bit flag that is not understood, then it will be _rejected_.
The user specifies the attribute policy as:
[ATTR_GOO] = { .type = NLA_FLAG_BITS, .validation_data = &myvalidflags },

where myvalidflags is the bit mask of the flags the kernel understands.

If the user _does not_ provide myvalidflags then the attribute will
also be rejected.

Examples:
nla_flag_values = 0x0, and nla_flag_selector = 0x1
implies we are selecting bit 1 and we want to set its value to 0.

nla_flag_values = 0x2, and nla_flag_selector = 0x2
implies we are selecting bit 2 and we want to set its value to 1.
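
To spell the semantics out in code (a sketch, not part of the patch;
everything except the two nla_bit_flags fields is made up for
illustration), a kernel consumer of such an attribute could apply it as:

static int apply_bit_flags(const struct nla_bit_flags *nbf,
			   u32 valid_flags, u32 *current_flags)
{
	/* reject selections of bits the kernel does not know about */
	if (nbf->nla_flag_selector & ~valid_flags)
		return -EINVAL;

	/* clear the selected bits, then set them to the requested values */
	*current_flags &= ~nbf->nla_flag_selector;
	*current_flags |= nbf->nla_flag_values & nbf->nla_flag_selector;
	return 0;
}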

This patch also provides an extra feature (which should be a separate
patch): a validation callback that could be specialized for other types.

So a kernel subsystem could specify validation rules of the following
nature:

[ATTR_GOO] = { .type = MYTYPE,
   .validation_data = &myvalidation_data,
   .validate_content = mycontent_validator },

With validator callback looking like:

int mycontent_validator(const struct nlattr *nla, void *valid_data)
{
   const struct myattribute *user_data = nla_data(nla);
   struct myvalidation_struct *valid_data_constraint = valid_data;

  ... validate user_data against valid_data_constraint ...
  ... return appropriate error code etc ...
}

Only compile tested to float the idea.

Signed-off-by: Jamal Hadi Salim 
---
 include/net/netlink.h  | 11 +++
 include/uapi/linux/rtnetlink.h | 17 +
 lib/nlattr.c   | 25 +
 3 files changed, 53 insertions(+)

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 0170917..8ab9784 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -6,6 +6,11 @@
 #include 
 #include 
 
+struct nla_bit_flags {
+   u32 nla_flag_values;
+   u32 nla_flag_selector;
+};
+
 /* 
  * Netlink Messages and Attributes Interface (As Seen On TV)
  * 
@@ -178,6 +183,7 @@ enum {
NLA_S16,
NLA_S32,
NLA_S64,
+   NLA_FLAG_BITS,
__NLA_TYPE_MAX,
 };
 
@@ -206,6 +212,7 @@ enum {
  *NLA_MSECSLeaving the length field zero will verify the
  * given type fits, using it verifies minimum length
  * just like "All other"
+ *NLA_FLAG_BITSA bitmap/bitselector attribute
  *All otherMinimum length of attribute payload
  *
  * Example:
@@ -213,11 +220,15 @@ enum {
  * [ATTR_FOO] = { .type = NLA_U16 },
  * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
  * [ATTR_BAZ] = { .len = sizeof(struct mystruct) },
+ * [ATTR_GOO] = { .type = NLA_FLAG_BITS, .validation_data = &myvalidflags 
},
  * };
  */
 struct nla_policy {
u16 type;
u16 len;
+   void*validation_data;
+   int (*validate_content)(const struct nlattr *nla,
+   const void *validation_data);
 };
 
 /**
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index cce0613..3691d8d 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -179,6 +179,23 @@ struct rtattr {
 #define RTA_DATA(rta)   ((void*)(((char*)(rta)) + RTA_LENGTH(0)))
 #define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0))
 
+/* Generic bitflags attribute content sent to the kernel.
+ *
+ * The nla_flag_values is a bitmap that defines the values being set
+ * The nla_flag_selector is a bitmask that defines which value is legit
+ *
+ * Examples:
+ *  nla_flag_values = 0x0, and nla_flag_selector = 0x1
+ *  implies we are selecting bit 1 and we want to set its value to 0.
+ *
+ *  nla_flag_values = 0x2, and nla_flag_selector = 0x2
+ *  implies we are selecting bit 2 and we want to set its value to 1.
+ *
+ */
+struct __nla_bit_flags {
+   __u32 nla_flag_values;
+   __u32 nla_flag_selector;
+};
 
 
 
diff --git a/lib/nlattr.c b/lib/nlattr.c
index a7e0b16..78fed43 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -27,6 +27,21 @@
[NLA_S64]   = sizeof(s64),
 };
 
+static int validate_nla_bit_flags(const struct nlattr *nla, void *valid_data)
+{
+   const struct nla_bit_flags *nbf = nla_data(nla);
+   u32 *valid_flags_mask = valid_data;
+
+   if (!valid_data)
+   return -EINVAL;
+
+
+   if (nbf->nla_flag_values & 

[PATCH iproute2 net-next] ip xfrm: Add xfrm state crypto offload

2017-04-30 Thread ilant
From: Boris Pismenny 

syntax:
ip xfrm state  offload dev  dir 

Example to add inbound offload:
  ip xfrm state  offload dev mlx0 dir in
Example to add outbound offload:
  ip xfrm state  offload dev mlx0 dir out

Signed-off-by: Boris Pismenny 
Signed-off-by: Ilan Tayari 
---
 ip/ipxfrm.c | 19 +++
 ip/xfrm_state.c | 49 +
 2 files changed, 68 insertions(+)

diff --git a/ip/ipxfrm.c b/ip/ipxfrm.c
index b0cfac17..d5eb22e2 100644
--- a/ip/ipxfrm.c
+++ b/ip/ipxfrm.c
@@ -862,6 +862,25 @@ void xfrm_xfrma_print(struct rtattr *tb[], __u16 family,
}
fprintf(fp, "%s", _SL_);
}
+   if (tb[XFRMA_OFFLOAD_DEV]) {
+   struct xfrm_user_offload *xuo;
+
+   if (prefix)
+   fputs(prefix, fp);
+   fprintf(fp, "crypto offload parameters: ");
+
+   if (RTA_PAYLOAD(tb[XFRMA_OFFLOAD_DEV]) < sizeof(*xuo)) {
+   fprintf(fp, "(ERROR truncated)");
+   fprintf(fp, "%s", _SL_);
+   return;
+   }
+
+   xuo = (struct xfrm_user_offload *)
+   RTA_DATA(tb[XFRMA_OFFLOAD_DEV]);
+   fprintf(fp, "dev %s dir %s", ll_index_to_name(xuo->ifindex),
+   (xuo->flags & XFRM_OFFLOAD_INBOUND) ? "in" : "out");
+   fprintf(fp, "%s", _SL_);
+   }
 }
 
 static int xfrm_selector_iszero(struct xfrm_selector *s)
diff --git a/ip/xfrm_state.c b/ip/xfrm_state.c
index ea7d4f34..e11c93bf 100644
--- a/ip/xfrm_state.c
+++ b/ip/xfrm_state.c
@@ -60,6 +60,7 @@ static void usage(void)
fprintf(stderr, "[ replay-seq-hi SEQ ] [ replay-oseq-hi SEQ 
]\n");
fprintf(stderr, "[ flag FLAG-LIST ] [ sel SELECTOR ] [ 
LIMIT-LIST ] [ encap ENCAP ]\n");
fprintf(stderr, "[ coa ADDR[/PLEN] ] [ ctx CTX ] [ extra-flag 
EXTRA-FLAG-LIST ]\n");
+   fprintf(stderr, "[ offload [dev DEV] dir DIR ]\n");
fprintf(stderr, "Usage: ip xfrm state allocspi ID [ mode MODE ] [ mark 
MARK [ mask MASK ] ]\n");
fprintf(stderr, "[ reqid REQID ] [ seq SEQ ] [ min SPI max SPI 
]\n");
fprintf(stderr, "Usage: ip xfrm state { delete | get } ID [ mark MARK [ 
mask MASK ] ]\n");
@@ -108,6 +109,7 @@ static void usage(void)
fprintf(stderr, "LIMIT := { time-soft | time-hard | time-use-soft | 
time-use-hard } SECONDS |\n");
fprintf(stderr, " { byte-soft | byte-hard } SIZE | { 
packet-soft | packet-hard } COUNT\n");
fprintf(stderr, "ENCAP := { espinudp | espinudp-nonike } SPORT DPORT 
OADDR\n");
+   fprintf(stderr, "DIR := in | out\n");
 
exit(-1);
 }
@@ -264,6 +266,24 @@ static int xfrm_state_extra_flag_parse(__u32 *extra_flags, 
int *argcp, char ***a
return 0;
 }
 
+static int xfrm_offload_dir_parse(__u8 *dir, int *argcp, char ***argvp)
+{
+   int argc = *argcp;
+   char **argv = *argvp;
+
+   if (strcmp(*argv, "in") == 0)
+   *dir = XFRM_OFFLOAD_INBOUND;
+   else if (strcmp(*argv, "out") == 0)
+   *dir = 0;
+   else
+   invarg("DIR value is invalid", *argv);
+
+   *argcp = argc;
+   *argvp = argv;
+
+   return 0;
+}
+
 static int xfrm_state_modify(int cmd, unsigned int flags, int argc, char 
**argv)
 {
struct rtnl_handle rth;
@@ -283,6 +303,10 @@ static int xfrm_state_modify(int cmd, unsigned int flags, 
int argc, char **argv)
};
struct xfrm_replay_state replay = {};
struct xfrm_replay_state_esn replay_esn = {};
+   struct xfrm_user_offload xuo = {};
+   unsigned int ifindex = 0;
+   __u8 dir = 0;
+   bool is_offload = false;
__u32 replay_window = 0;
__u32 seq = 0, oseq = 0, seq_hi = 0, oseq_hi = 0;
char *idp = NULL;
@@ -394,6 +418,25 @@ static int xfrm_state_modify(int cmd, unsigned int flags, 
int argc, char **argv)
xfrm_sctx_parse((char *)&ctx.str, context, &ctx.sctx);
addattr_l(&req.n, sizeof(req.buf), XFRMA_SEC_CTX,
  (void *)&ctx, ctx.sctx.len);
+   } else if (strcmp(*argv, "offload") == 0) {
+   is_offload = true;
+   NEXT_ARG();
+   if (strcmp(*argv, "dev") == 0) {
+   NEXT_ARG();
+   ifindex = ll_name_to_index(*argv);
+   if (!ifindex) {
+   invarg("value after \"offload dev\" is 
invalid", *argv);
+   is_offload = false;
+   }
+   NEXT_ARG();
+   }
+   if (strcmp(*argv, "dir") == 0) {
+   NEXT_ARG();
+   xfrm_offload_dir_parse(&dir, &argc, &

pull request: bluetooth-next 2017-04-30

2017-04-30 Thread Johan Hedberg
Hi Dave,

Here's one last batch of Bluetooth patches in the bluetooth-next tree
targeting the 4.12 kernel.

 - Remove custom ECDH implementation and use new KPP API instead
 - Add protocol checks to hci_ldisc
 - Add module license to HCI UART Nokia H4+ driver
 - Minor fix for 32bit user space - 64 bit kernel combination

Please let me know if there are any issues pulling. Thanks.

Johan

---
The following changes since commit e3a724edeec3836ed44675a6587a6db7b6b68dbe:

  sparc64: Support cbcond instructions in eBPF JIT. (2017-04-24 15:56:21 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git 
for-upstream

for you to fetch changes up to 71653eb64bcca6110c42aadfd50b8d54d3a88079:

  Bluetooth: Add selftest for ECDH key generation (2017-04-30 16:52:43 +0300)


Dean Jenkins (3):
  Bluetooth: hci_ldisc: Add protocol check to hci_uart_send_frame()
  Bluetooth: hci_ldisc: Add protocol check to hci_uart_dequeue()
  Bluetooth: hci_ldisc: Add protocol check to hci_uart_tx_wakeup()

Frédéric Danis (1):
  Bluetooth: Add module license for HCI UART Nokia H4+

Marcel Holtmann (2):
  Bluetooth: zero kpp input for key generation
  Bluetooth: Add selftest for ECDH key generation

Salvatore Benedetto (2):
  Bluetooth: convert smp and selftest to crypto kpp API
  Bluetooth: allocate data for kpp on heap

Szymon Janc (1):
  Bluetooth: Fix user channel for 32bit userspace on 64bit kernel

 drivers/bluetooth/hci_ldisc.c |  14 +-
 drivers/bluetooth/hci_nokia.c |   7 +
 net/bluetooth/Kconfig |   1 +
 net/bluetooth/Makefile|   2 +-
 net/bluetooth/ecc.c   | 816 --
 net/bluetooth/ecc.h   |  54 ---
 net/bluetooth/ecdh_helper.c   | 231 
 net/bluetooth/ecdh_helper.h   |  27 ++
 net/bluetooth/hci_sock.c  |   3 +-
 net/bluetooth/selftest.c  |  28 +-
 net/bluetooth/smp.c   |  46 ++-
 11 files changed, 342 insertions(+), 887 deletions(-)
 delete mode 100644 net/bluetooth/ecc.c
 delete mode 100644 net/bluetooth/ecc.h
 create mode 100644 net/bluetooth/ecdh_helper.c
 create mode 100644 net/bluetooth/ecdh_helper.h




Re: [patch net-next] net: sched: add helpers to handle extended actions

2017-04-30 Thread Jamal Hadi Salim

Jiri,

With "goto chain X" this will have to be more generalized. Maybe
we use 0xAXXX, where "A" identifies the extension (current values
being ACT_JUMP (0x1) and perhaps GOTO_CHAIN (0x2)) and the remaining
"XXX" bits are a free-floating parameter value, carrying the jump
count for ACT_JUMP and the chain-id for GOTO_CHAIN.
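
A sketch of what that could look like on top of the helpers in Jiri's patch
below (TC_ACT_GOTO_CHAIN and its opcode value are hypothetical here, just
following the suggestion above):

/* Hypothetical follow-up to the patch quoted below. */
#define TC_ACT_GOTO_CHAIN	__TC_ACT_EXT(2)

static u32 tcf_goto_chain_index(int ret)
{
	/* top nibble carries the opcode, low 28 bits the chain id */
	if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN))
		return ret & TC_ACT_EXT_VAL_MASK;
	return 0;
}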

cheers,
jamal

On 17-04-28 12:13 PM, Jiri Pirko wrote:

From: Jiri Pirko 

Jump is now the only one using value action opcode. This is going to
change soon. So introduce helpers to work with this. Convert TC_ACT_JUMP.

Signed-off-by: Jiri Pirko 
---
 include/uapi/linux/pkt_cls.h | 15 ++-
 net/sched/act_api.c  |  2 +-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index f1129e3..d613be3 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -37,7 +37,20 @@ enum {
 #define TC_ACT_QUEUED  5
 #define TC_ACT_REPEAT  6
 #define TC_ACT_REDIRECT7
-#define TC_ACT_JUMP0x1000
+
+/* There is a special kind of actions called "extended actions",
+ * which need a value parameter. These have a local opcode located in
+ * the highest nibble, starting from 1. The rest of the bits
+ * are used to carry the value. These two parts together make
+ * a combined opcode.
+ */
+#define __TC_ACT_EXT_SHIFT 28
+#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT)
+#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1)
+#define TC_ACT_EXT_CMP(combined, opcode) \
+   (((combined) & (~TC_ACT_EXT_VAL_MASK)) == opcode)
+
+#define TC_ACT_JUMP __TC_ACT_EXT(1)

 /* Action type identifiers*/
 enum {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 7f2cd70..a90e8f3 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -453,7 +453,7 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action 
**actions,
if (ret == TC_ACT_REPEAT)
goto repeat;/* we need a ttl - JHS */

-   if (ret & TC_ACT_JUMP) {
+   if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) {
jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK;
if (!jmp_prgcnt || (jmp_prgcnt > nr_actions)) {
/* faulty opcode, stop pipeline */





Re: [PATCH/RFC net-next 0/4] net/sched: cls_flower: avoid false matching of truncated packets

2017-04-30 Thread Jamal Hadi Salim

On 17-04-28 10:14 AM, Simon Horman wrote:

On Fri, Apr 28, 2017 at 09:41:00AM -0400, Jamal Hadi Salim wrote:

On 17-04-28 09:11 AM, Simon Horman wrote:

[..]

A default lower prio match all on udp or icmp?


I'm certainly not opposed to exploring ideas here.

The way that flower currently works is that a match on ip_proto ==
UDP/TCP/SCTP/ICMP but not on fields in the L4 header itself would not result in
the dissector dissecting the packet's L4 header, and thus it would not
discover (or, as is currently the case, would silently ignore) the absence of the
ports/ICMP type and code in the L4 header.

What my patch attempts to do is to describe a policy of what to do if
a given classifier invokes the dissector (to pull out the headers needed for
the match in question) and that dissection fails. Its basically describing
the error-path.



Understood - I was struggling with whether error-path is the same as
"didn't match".



There are two issues:

1. As things stand, without this patch-set, flower does not differentiate
   between a packet truncated at the end of the IP header and a packet with
   zero ports. Likewise for icmp type and code of zero.

   The first three patches of this series address that so that a match for
   port == zero only matches if ports are present in the packet. Again,
   likewise for ICMP.

   This is a bug-fix to my way of thinking.



Agreed to bug fix. I would have said there is never a legit packet with
TCP/UDP but I think some fingerprinting apps use it. And one would need
to distinguish between the two at classification time.
ICMP type 0 is certainly used.

Minimally, some flag should qualify it as "truncated".


2. The behaviour described above, prior to this patchset, might have been
   utilised to f.e. drop packets that are either truncated or have port == 0
   (because flower didn't differentiate between these cases).

   So the question becomes if/how to provide such a feature.
   The last patch is my attempt to answer that question.


It almost feels like you need metadata matching as well - one being
"truncated".

cheers,
jamal


[PATCH net-next] xfrm: Indicate xfrm_state offload errors

2017-04-30 Thread ilant
From: Ilan Tayari 

Current code silently ignores driver errors when configuring
IPSec offload xfrm_state, and falls back to host-based crypto.

Fail the xfrm_state creation if the driver has an error, because
the NIC offloading was explicitly requested by the user program.

This will communicate back to the user that there was an error.

Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API")
Signed-off-by: Ilan Tayari 
---
 net/xfrm/xfrm_user.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ba74e5ef..c4cceddac9db 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -595,9 +595,12 @@ static struct xfrm_state *xfrm_state_construct(struct net 
*net,
goto error;
}
 
-   if (attrs[XFRMA_OFFLOAD_DEV] &&
-   xfrm_dev_state_add(net, x, nla_data(attrs[XFRMA_OFFLOAD_DEV])))
-   goto error;
+   if (attrs[XFRMA_OFFLOAD_DEV]) {
+   err = xfrm_dev_state_add(net, x,
+nla_data(attrs[XFRMA_OFFLOAD_DEV]));
+   if (err)
+   goto error;
+   }
 
if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
   attrs[XFRMA_REPLAY_ESN_VAL])))
-- 
2.11.0



[PATCH net-next] net/esp4: Fix invalid esph pointer crash

2017-04-30 Thread ilant
From: Ilan Tayari 

Both esp_output and esp_xmit take a pointer to the ESP header
and place it in the esp_info struct prior to calling esp_output_head.

Inside esp_output_head, the call to esp_output_udp_encap
makes sure to update the pointer if it gets invalid.
However, if esp_output_head itself calls skb_cow_data, the
pointer is not updated and stays invalid, causing a crash
after esp_output_head returns.

Update the pointer if it becomes invalid in esp_output_head.

Fixes: fca11ebde3f0 ("esp4: Reorganize esp_output")
Signed-off-by: Ilan Tayari 
---
 net/ipv4/esp4.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 7f2caf71212b..65cc02bd82bc 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -317,6 +317,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff 
*skb, struct esp_info *
if (nfrags < 0)
goto out;
tail = skb_tail_pointer(trailer);
+   esp->esph = ip_esp_hdr(skb);
 
 skip_cow:
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
-- 
2.11.0



[net-next 11/13] i40evf: remove I40E_FLAG_FDIR_ATR_ENABLED

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

The flag used by the common code and PF code is I40E_FLAG_FD_ATR_ENABLED,
not *FDIR*. It turns out that none of the txrx code shared with the
VF driver actually checks the ATR flag. This is made even more obvious
by the typo in the VF header file.

Let's just remove the flag from the VF driver since it's not needed.

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40evf/i40evf.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h 
b/drivers/net/ethernet/intel/i40evf/i40evf.h
index 4681c63ee7e3..b8ada6d8d890 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf.h
+++ b/drivers/net/ethernet/intel/i40evf/i40evf.h
@@ -223,7 +223,6 @@ struct i40evf_adapter {
 #define I40EVF_FLAG_ALLMULTI_ONBIT(19)
 #define I40EVF_FLAG_LEGACY_RX  BIT(20)
 /* duplicates for common code */
-#define I40E_FLAG_FDIR_ATR_ENABLED 0
 #define I40E_FLAG_DCB_ENABLED  0
 #define I40E_FLAG_RX_CSUM_ENABLED  I40EVF_FLAG_RX_CSUM_ENABLED
 #define I40E_FLAG_WB_ON_ITR_CAPABLEI40EVF_FLAG_WB_ON_ITR_CAPABLE
-- 
2.12.2



[net-next 10/13] i40e: remove hw_disabled_flags in favor of using separate flag bits

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

The hw_disabled_flags field was added as a way of signifying that
a feature was automatically or temporarily disabled. However, we
actually only use this for FDir features. Replace its use with new
_AUTO_DISABLED flags instead. This is more readable, because you aren't
setting an *_ENABLED flag to *disable* the feature.

Additionally, clean up a few areas where we used these bits. First, we
don't really need to set the auto-disable flag for ATR if we're fully
disabling the feature via ethtool.

Second, we should always clear the auto-disable bits in case they somehow
got set when the feature was disabled. However, avoid displaying
a message that we've re-enabled the feature.

Third, we shouldn't be re-enabling ATR in the SB ntuple add flow,
because it might have been disabled due to space constraints. Instead,
we should just wait for the fdir_check_and_reenable to be called by the
watchdog.

Overall, this change allows us to simplify some code by removing an
extra field we didn't need, and the result should make it more clear as
to what we're actually doing with these flags.

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h |  9 +
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  6 +--
 drivers/net/ethernet/intel/i40e/i40e_main.c| 53 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c| 22 ---
 4 files changed, 38 insertions(+), 52 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 6eb21abdc60e..cdde3cc28fb5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -408,6 +408,8 @@ struct i40e_pf {
 #define I40E_FLAG_DCB_ENABLED  BIT_ULL(20)
 #define I40E_FLAG_FD_SB_ENABLEDBIT_ULL(21)
 #define I40E_FLAG_FD_ATR_ENABLED   BIT_ULL(22)
+#define I40E_FLAG_FD_SB_AUTO_DISABLED  BIT_ULL(23)
+#define I40E_FLAG_FD_ATR_AUTO_DISABLED BIT_ULL(24)
 #define I40E_FLAG_PTP  BIT_ULL(25)
 #define I40E_FLAG_MFP_ENABLED  BIT_ULL(26)
 #define I40E_FLAG_UDP_FILTER_SYNC  BIT_ULL(27)
@@ -440,13 +442,6 @@ struct i40e_pf {
 #define I40E_FLAG_WOL_MC_MAGIC_PKT_WAKEBIT_ULL(57)
 #define I40E_FLAG_LEGACY_RXBIT_ULL(58)
 
-   /* Tracks features that are disabled due to hw limitations.
-* If a bit is set here, it means that the corresponding
-* bit in the 'flags' field is cleared i.e that feature
-* is disabled
-*/
-   u64 hw_disabled_flags;
-
struct i40e_client_instance *cinst;
bool stat_offsets_loaded;
struct i40e_hw_port_stats stats;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index b1064c6468c2..7a8eb486b9ea 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -3643,7 +3643,7 @@ static int i40e_add_fdir_ethtool(struct i40e_vsi *vsi,
if (!(pf->flags & I40E_FLAG_FD_SB_ENABLED))
return -EOPNOTSUPP;
 
-   if (pf->hw_disabled_flags & I40E_FLAG_FD_SB_ENABLED)
+   if (pf->flags & I40E_FLAG_FD_SB_AUTO_DISABLED)
return -ENOSPC;
 
if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state) ||
@@ -4086,12 +4086,12 @@ static int i40e_set_priv_flags(struct net_device *dev, 
u32 flags)
/* Flush current ATR settings if ATR was disabled */
if ((changed_flags & I40E_FLAG_FD_ATR_ENABLED) &&
!(pf->flags & I40E_FLAG_FD_ATR_ENABLED)) {
-   pf->hw_disabled_flags |= I40E_FLAG_FD_ATR_ENABLED;
+   pf->flags |= I40E_FLAG_FD_ATR_AUTO_DISABLED;
set_bit(__I40E_FD_FLUSH_REQUESTED, pf->state);
}
 
/* Only allow ATR evict on hardware that is capable of handling it */
-   if (pf->hw_disabled_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE)
+   if (pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE)
pf->flags &= ~I40E_FLAG_HW_ATR_EVICT_CAPABLE;
 
if (changed_flags & I40E_FLAG_TRUE_PROMISC_SUPPORT) {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 38772e49bb84..d5c9c9e06ff5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1050,13 +1050,13 @@ static void i40e_update_pf_stats(struct i40e_pf *pf)
   &osd->rx_lpi_count, &nsd->rx_lpi_count);
 
if (pf->flags & I40E_FLAG_FD_SB_ENABLED &&
-   !(pf->hw_disabled_flags & I40E_FLAG_FD_SB_ENABLED))
+   !(pf->flags & I40E_FLAG_FD_SB_AUTO_DISABLED))
nsd->fd_sb_status = true;
else
nsd->fd_sb_status = false;
 
if (pf->flags & I40E_FLAG_FD_ATR_ENABLED &&
-   !(pf->hw_disabled_f

[net-next 04/13] i40e: Reprogram port offloads after reset

2017-04-30 Thread Jeff Kirsher
From: Alexander Duyck 

This patch corrects a major oversight in that we were not reprogramming the
ports after a reset.  As a result we completely lost all of the Rx tunnel
offloads on receive including Rx checksum, RSS on inner headers, and ATR.

The fix for this is pretty standard as all we needed to do is reset the
filter bits to pending for all active filters and schedule the sync event.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 063044268170..f44affc7e08c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7331,6 +7331,23 @@ static void i40e_handle_mdd_event(struct i40e_pf *pf)
 }
 
 /**
+ * i40e_sync_udp_filters - Trigger a sync event for existing UDP filters
+ * @pf: board private structure
+ **/
+static void i40e_sync_udp_filters(struct i40e_pf *pf)
+{
+   int i;
+
+   /* loop through and set pending bit for all active UDP filters */
+   for (i = 0; i < I40E_MAX_PF_UDP_OFFLOAD_PORTS; i++) {
+   if (pf->udp_ports[i].port)
+   pf->pending_udp_bitmap |= BIT_ULL(i);
+   }
+
+   pf->flags |= I40E_FLAG_UDP_FILTER_SYNC;
+}
+
+/**
  * i40e_sync_udp_filters_subtask - Sync the VSI filter list with HW
  * @pf: board private structure
  **/
@@ -10738,6 +10755,9 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, 
bool reinit)
 
i40e_ptp_init(pf);
 
+   /* repopulate tunnel port filters */
+   i40e_sync_udp_filters(pf);
+
return ret;
 }
 
-- 
2.12.2



[net-next 02/13] i40e: make use of i40e_reset_all_vfs when initializing new VFs

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

When allocating a large number of VFs, the driver previously used
i40e_reset_vf in a sequence. Just as when performing a normal reset,
this accumulates a large amount of delay for handling all of the VFs in
sequence. This delay is mainly due to a hardware requirement to wait
after initiating a reset on the VF.

We recently added a new function, i40e_reset_all_vfs() which can be used
to amortize the delay time, by first triggering all VF resets, then
waiting once, and finally cleaning up and allocating the VFs. This is
almost as good as truly running the resets in parallel.

In order to avoid sending a spurious reset message to a client
interface, we have a check to see whether we've assigned
pf->num_alloc_vfs yet. This was originally intended as a way to
distinguish the "initialization" case from the regular reset case.

Unfortunately, this means that we can't directly use i40e_reset_all_vfs
yet. Let's avoid this check of pf->num_alloc_vfs by replacing it with
a proper VF state bit which we can use instead. This makes the
intention much clearer and allows us to re-use the i40e_reset_all_vfs
function directly.

Change-ID: I694279b37eb6b5a91b6670182d0c15d10244fd6e
Signed-off-by: Jacob Keller 
Reviewed-by: Mitch Williams 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 10 +++---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index a46c07799384..74977a295987 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -1007,7 +1007,8 @@ static void i40e_cleanup_reset_vf(struct i40e_vf *vf)
set_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states);
clear_bit(I40E_VF_STATE_DISABLED, &vf->vf_states);
/* Do not notify the client during VF init */
-   if (vf->pf->num_alloc_vfs)
+   if (test_and_clear_bit(I40E_VF_STATE_PRE_ENABLE,
+  &vf->vf_states))
i40e_notify_client_of_vf_reset(pf, abs_vf_id);
vf->num_vlan = 0;
}
@@ -1280,12 +1281,15 @@ int i40e_alloc_vfs(struct i40e_pf *pf, u16 
num_alloc_vfs)
/* assign default capabilities */
set_bit(I40E_VIRTCHNL_VF_CAP_L2, &vfs[i].vf_caps);
vfs[i].spoofchk = true;
-   /* VF resources get allocated during reset */
-   i40e_reset_vf(&vfs[i], false);
+
+   set_bit(I40E_VF_STATE_PRE_ENABLE, &vfs[i].vf_states);
 
}
pf->num_alloc_vfs = num_alloc_vfs;
 
+   /* VF resources get allocated during reset */
+   i40e_reset_all_vfs(pf, false);
+
i40e_notify_client_of_vf_enable(pf, num_alloc_vfs);
 
 err_alloc:
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
index 8c7c08489612..20d7c8160e9e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
@@ -63,6 +63,7 @@ enum i40e_vf_states {
I40E_VF_STATE_DISABLED,
I40E_VF_STATE_MC_PROMISC,
I40E_VF_STATE_UC_PROMISC,
+   I40E_VF_STATE_PRE_ENABLE,
 };
 
 /* VF capabilities */
-- 
2.12.2



[net-next 09/13] i40evf: remove needless min_t() on num_online_cpus()*2

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

We already set pairs to the value of adapter->num_active_queues. This
value is limited by vsi_res->num_queue_pairs and num_online_cpus(). This
means that pairs by definition is already smaller than
num_online_cpus()*2, so we don't even need to bother with this check.

Let's just remove it and update the comment.
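
As a quick sanity check of the arithmetic (purely illustrative numbers,
not taken from the driver): with 8 online CPUs and 4 granted queue pairs,

	pairs = 4;                 /* already capped by num_online_cpus()    */
	min_t(int, pairs, 8 * 2);  /* min(4, 16) == 4, the clamp never bites */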

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 8e6276d864e6..89035ee01679 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -1271,13 +1271,13 @@ static int i40evf_set_interrupt_capability(struct 
i40evf_adapter *adapter)
}
pairs = adapter->num_active_queues;
 
-   /* It's easy to be greedy for MSI-X vectors, but it really
-* doesn't do us much good if we have a lot more vectors
-* than CPU's.  So let's be conservative and only ask for
-* (roughly) twice the number of vectors as there are CPU's.
+   /* It's easy to be greedy for MSI-X vectors, but it really doesn't do
+* us much good if we have more vectors than CPUs. However, we already
+* limit the total number of queues by the number of CPUs so we do not
+* need any further limiting here.
 */
-   v_budget = min_t(int, pairs, (int)(num_online_cpus() * 2)) + NONQ_VECS;
-   v_budget = min_t(int, v_budget, (int)adapter->vf_res->max_vectors);
+   v_budget = min_t(int, pairs + NONQ_VECS,
+(int)adapter->vf_res->max_vectors);
 
adapter->msix_entries = kcalloc(v_budget,
sizeof(struct msix_entry), GFP_KERNEL);
-- 
2.12.2



[net-next 13/13] i40evf: hide unused variable

2017-04-30 Thread Jeff Kirsher
From: Arnd Bergmann 

On architectures with larger pages, we get a warning about an unused variable:

drivers/net/ethernet/intel/i40evf/i40evf_main.c: In function 
'i40evf_configure_rx':
drivers/net/ethernet/intel/i40evf/i40evf_main.c:690:21: error: unused variable 
'netdev' [-Werror=unused-variable]

This moves the declaration into the #ifdef to avoid the warning.

Fixes: dab86afdbbd1 ("i40e/i40evf: Change the way we limit the maximum frame 
size for Rx")
Signed-off-by: Arnd Bergmann 
Acked-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 445a97a57853..ea110a730e16 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -694,13 +694,14 @@ static void i40evf_configure_tx(struct i40evf_adapter 
*adapter)
 static void i40evf_configure_rx(struct i40evf_adapter *adapter)
 {
unsigned int rx_buf_len = I40E_RXBUFFER_2048;
-   struct net_device *netdev = adapter->netdev;
struct i40e_hw *hw = &adapter->hw;
int i;
 
/* Legacy Rx will always default to a 2048 buffer size. */
 #if (PAGE_SIZE < 8192)
if (!(adapter->flags & I40EVF_FLAG_LEGACY_RX)) {
+   struct net_device *netdev = adapter->netdev;
+
/* For jumbo frames on systems with 4K pages we have to use
 * an order 1 page, so we might as well increase the size
 * of our Rx buffer to make better use of the available space
-- 
2.12.2



[net-next 06/13] i40e: remove unnecessary msleep() delay in i40e_free_vfs

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

The delay was added out of a desire to ensure that the VF driver can
finish being removed. However, pci_disable_sriov already has its own
ssleep() call that will sleep for an entire second, so there is no
reason to add extra delay on top of this by using msleep here. In
practice, an msleep() won't have a huge impact on timing, but there is no
real value in keeping it, so let's just simplify the code and remove it.

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c| 2 +-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 2 --
 drivers/net/ethernet/intel/i40evf/i40evf_main.c| 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f44affc7e08c..20850a646e6c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -47,7 +47,7 @@ static const char i40e_driver_string[] =
 
 #define DRV_VERSION_MAJOR 2
 #define DRV_VERSION_MINOR 1
-#define DRV_VERSION_BUILD 7
+#define DRV_VERSION_BUILD 14
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 __stringify(DRV_VERSION_MINOR) "." \
 __stringify(DRV_VERSION_BUILD)DRV_KERN
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 2a47a6474366..29f53f032c3c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -1219,8 +1219,6 @@ void i40e_free_vfs(struct i40e_pf *pf)
else
dev_warn(&pf->pdev->dev, "VFs are assigned - not disabling 
SR-IOV\n");
 
-   msleep(20); /* let any messages in transit get finished up */
-
/* free up VF resources */
tmp = pf->num_alloc_vfs;
pf->num_alloc_vfs = 0;
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 5915273c372f..3ea81bd0db32 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -46,7 +46,7 @@ static const char i40evf_driver_string[] =
 
 #define DRV_VERSION_MAJOR 2
 #define DRV_VERSION_MINOR 1
-#define DRV_VERSION_BUILD 7
+#define DRV_VERSION_BUILD 14
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 __stringify(DRV_VERSION_MINOR) "." \
 __stringify(DRV_VERSION_BUILD) \
-- 
2.12.2



[net-next 07/13] i40e: separate PF and VSI state flags

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

Avoid using the same named flags for both vsi->state and pf->state. This
makes code review easier, as it is more likely that future authors will
use the correct state field when checking bits. Previous commits already
found issues with at least one check, and possibly others may be
incorrect.

This reduces confusion as it is more clear what each flag represents,
and which flags are valid for which state field.

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h  | 12 -
 drivers/net/ethernet/intel/i40e/i40e_client.c   |  4 +-
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c  |  4 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c | 66 -
 drivers/net/ethernet/intel/i40e/i40e_txrx.c |  8 +--
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c   |  8 +--
 drivers/net/ethernet/intel/i40evf/i40evf.h  |  7 ++-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 10 ++--
 8 files changed, 64 insertions(+), 55 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 0f22c03ec726..ac2a4850a30b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -125,7 +125,6 @@ enum i40e_state_t {
__I40E_CONFIG_BUSY,
__I40E_CONFIG_DONE,
__I40E_DOWN,
-   __I40E_NEEDS_RESTART,
__I40E_SERVICE_SCHED,
__I40E_ADMINQ_EVENT_PENDING,
__I40E_MDD_EVENT_PENDING,
@@ -138,7 +137,6 @@ enum i40e_state_t {
__I40E_GLOBAL_RESET_REQUESTED,
__I40E_EMP_RESET_REQUESTED,
__I40E_EMP_RESET_INTR_RECEIVED,
-   __I40E_FILTER_OVERFLOW_PROMISC,
__I40E_SUSPENDED,
__I40E_PTP_TX_IN_PROGRESS,
__I40E_BAD_EEPROM,
@@ -149,6 +147,16 @@ enum i40e_state_t {
__I40E_VF_DISABLE,
 };
 
+/* VSI state flags */
+enum i40e_vsi_state_t {
+   __I40E_VSI_DOWN,
+   __I40E_VSI_NEEDS_RESTART,
+   __I40E_VSI_SYNCING_FILTERS,
+   __I40E_VSI_OVERFLOW_PROMISC,
+   __I40E_VSI_REINIT_REQUESTED,
+   __I40E_VSI_DOWN_REQUESTED,
+};
+
 enum i40e_interrupt_policy {
I40E_INTERRUPT_BEST_CASE,
I40E_INTERRUPT_MEDIUM,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c 
b/drivers/net/ethernet/intel/i40e/i40e_client.c
index eb2896fd52a6..75e528a6943f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.c
@@ -382,7 +382,7 @@ void i40e_client_subtask(struct i40e_pf *pf)
 * the netdev is up, then open the client.
 */
if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) {
-   if (!test_bit(__I40E_DOWN, &vsi->state) &&
+   if (!test_bit(__I40E_VSI_DOWN, &vsi->state) &&
client->ops && client->ops->open) {
set_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state);
ret = client->ops->open(&cdev->lan_info, client);
@@ -397,7 +397,7 @@ void i40e_client_subtask(struct i40e_pf *pf)
/* Likewise for client close. If the client is up, but the netdev
 * is down, then close the client.
 */
-   if (test_bit(__I40E_DOWN, &vsi->state) &&
+   if (test_bit(__I40E_VSI_DOWN, &vsi->state) &&
client->ops && client->ops->close) {
clear_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state);
client->ops->close(&cdev->lan_info, client, false);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c 
b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index a3d7ec62b76c..5408dbf04a00 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -174,7 +174,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int 
seid)
}
dev_info(&pf->pdev->dev, "active_filters %u, promisc_threshold %u, 
overflow promisc %s\n",
 vsi->active_filters, vsi->promisc_threshold,
-(test_bit(__I40E_FILTER_OVERFLOW_PROMISC, &vsi->state) ?
+(test_bit(__I40E_VSI_OVERFLOW_PROMISC, &vsi->state) ?
  "ON" : "OFF"));
nstat = i40e_get_vsi_stats_struct(vsi);
dev_info(&pf->pdev->dev,
@@ -1706,7 +1706,7 @@ static ssize_t i40e_dbg_netdev_ops_write(struct file 
*filp,
} else if (!vsi->netdev) {
dev_info(&pf->pdev->dev, "tx_timeout: no netdev for VSI 
%d\n",
 vsi_seid);
-   } else if (test_bit(__I40E_DOWN, &vsi->state)) {
+   } else if (test_bit(__I40E_VSI_DOWN, &vsi->state)) {
dev_info(&pf->pdev->dev, "tx_timeout: VSI %d not UP\n",
 vsi_seid);
} else if (rtnl_trylock()) {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
ind

[net-next 05/13] i40e: amortize wait time when disabling lots of VFs

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

Just as we do in i40e_reset_all_vfs, save some time when freeing VFs by
amortizing the wait time for stopping queues. We can use
i40e_vsi_stop_rings_no_wait() to begin the process of stopping all the
VF rings at once. Then, once we've started the process on each VF we can
begin waiting for the VFs to stop. This helps reduce the total wait time
by a large factor.

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 74977a295987..2a47a6474366 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -1194,9 +1194,21 @@ void i40e_free_vfs(struct i40e_pf *pf)
usleep_range(1000, 2000);
 
i40e_notify_client_of_vf_enable(pf, 0);
-   for (i = 0; i < pf->num_alloc_vfs; i++)
+
+   /* Amortize wait time by stopping all VFs at the same time */
+   for (i = 0; i < pf->num_alloc_vfs; i++) {
+   if (test_bit(I40E_VF_STATE_INIT, &pf->vf[i].vf_states))
+   continue;
+
+   i40e_vsi_stop_rings_no_wait(pf->vsi[pf->vf[i].lan_vsi_idx]);
+   }
+
+   for (i = 0; i < pf->num_alloc_vfs; i++) {
if (test_bit(I40E_VF_STATE_INIT, &pf->vf[i].vf_states))
-   i40e_vsi_stop_rings(pf->vsi[pf->vf[i].lan_vsi_idx]);
+   continue;
+
+   i40e_vsi_wait_queues_disabled(pf->vsi[pf->vf[i].lan_vsi_idx]);
+   }
 
/* Disable IOV before freeing resources. This lets any VF drivers
 * running in the host get themselves cleaned up before we yank
-- 
2.12.2



[net-next 12/13] i40evf: allocate queues before we setup the interrupts and q_vectors

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

This matches the ordering of how we free stuff during reset and remove.
It also makes logical sense because we set the interrupts based on the
number of queues. Currently this doesn't really matter in practice.
However, a future patch moves the assignment of num_active_queues into
i40evf_alloc_queues, which is required by
i40evf_set_interrupt_capability.

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 89035ee01679..445a97a57853 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -1508,6 +1508,13 @@ int i40evf_init_interrupt_scheme(struct i40evf_adapter 
*adapter)
 {
int err;
 
+   err = i40evf_alloc_queues(adapter);
+   if (err) {
+   dev_err(&adapter->pdev->dev,
+   "Unable to allocate memory for queues\n");
+   goto err_alloc_queues;
+   }
+
rtnl_lock();
err = i40evf_set_interrupt_capability(adapter);
rtnl_unlock();
@@ -1524,23 +1531,16 @@ int i40evf_init_interrupt_scheme(struct i40evf_adapter 
*adapter)
goto err_alloc_q_vectors;
}
 
-   err = i40evf_alloc_queues(adapter);
-   if (err) {
-   dev_err(&adapter->pdev->dev,
-   "Unable to allocate memory for queues\n");
-   goto err_alloc_queues;
-   }
-
dev_info(&adapter->pdev->dev, "Multiqueue %s: Queue pair count = %u",
 (adapter->num_active_queues > 1) ? "Enabled" : "Disabled",
 adapter->num_active_queues);
 
return 0;
-err_alloc_queues:
-   i40evf_free_q_vectors(adapter);
 err_alloc_q_vectors:
i40evf_reset_interrupt_capability(adapter);
 err_set_interrupt:
+   i40evf_free_queues(adapter);
+err_alloc_queues:
return err;
 }
 
-- 
2.12.2



[net-next 08/13] i40e: use DECLARE_BITMAP for state fields

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

Instead of assuming our flags fit within an unsigned long, use
DECLARE_BITMAP which will ensure that we always allocate enough space.
Additionally, use __I40E_STATE_SIZE__ markers as the last element of the
enumeration so that the size of the BITMAP is compile-time assigned
rather than programmer-time assigned. This ensures that potential future
flag additions do not actually overrun the array. This is especially
important as 32bit systems would only have 32bit longs instead of 64bit
longs as we generally have assumed in the prior code.

This change also drops the address-of operator ('&') on the state fields
throughout the code, so it does have a bit of code churn. The conversions
were automated using sed replacements with an alternation

  s/&(vsi->back|vsi|pf)->state/\1->state/
  s/&adapter->vsi.state/adapter->vsi.state/

For debugfs, we modify the printing so that we can display chunks of the
state value on new lines. This ensures that we can print the entire set
of state values. Additionally, we now print them with %08lx to ensure that
they display nicely.
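
Schematically, the conversion for each state field looks like the
following sketch (simplified, not the literal driver hunks):

	/* before: flags silently limited to one unsigned long */
	unsigned long state;
	set_bit(__I40E_DOWN, &pf->state);

	/* after: the array size comes from the enum terminator at compile time */
	DECLARE_BITMAP(state, __I40E_STATE_SIZE__);
	set_bit(__I40E_DOWN, pf->state);	/* no '&': state is already an array */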

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h |   8 +-
 drivers/net/ethernet/intel/i40e/i40e_client.c  |  16 +-
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c |  13 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  42 ++--
 drivers/net/ethernet/intel/i40e/i40e_main.c| 233 ++---
 drivers/net/ethernet/intel/i40e/i40e_ptp.c |   4 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c|  14 +-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |  18 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c  |   8 +-
 drivers/net/ethernet/intel/i40evf/i40evf.h |   4 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c|  10 +-
 11 files changed, 189 insertions(+), 181 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index ac2a4850a30b..6eb21abdc60e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -145,6 +145,8 @@ enum i40e_state_t {
__I40E_RESET_FAILED,
__I40E_PORT_SUSPENDED,
__I40E_VF_DISABLE,
+   /* This must be last as it determines the size of the BITMAP */
+   __I40E_STATE_SIZE__,
 };
 
 /* VSI state flags */
@@ -155,6 +157,8 @@ enum i40e_vsi_state_t {
__I40E_VSI_OVERFLOW_PROMISC,
__I40E_VSI_REINIT_REQUESTED,
__I40E_VSI_DOWN_REQUESTED,
+   /* This must be last as it determines the size of the BITMAP */
+   __I40E_VSI_STATE_SIZE__,
 };
 
 enum i40e_interrupt_policy {
@@ -330,7 +334,7 @@ struct i40e_flex_pit {
 struct i40e_pf {
struct pci_dev *pdev;
struct i40e_hw hw;
-   unsigned long state;
+   DECLARE_BITMAP(state, __I40E_STATE_SIZE__);
struct msix_entry *msix_entries;
bool fc_autoneg_status;
 
@@ -601,7 +605,7 @@ struct i40e_vsi {
bool stat_offsets_loaded;
 
u32 current_netdev_flags;
-   unsigned long state;
+   DECLARE_BITMAP(state, __I40E_VSI_STATE_SIZE__);
 #define I40E_VSI_FLAG_FILTER_CHANGED   BIT(0)
 #define I40E_VSI_FLAG_VEB_OWNERBIT(1)
unsigned long flags;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c 
b/drivers/net/ethernet/intel/i40e/i40e_client.c
index 75e528a6943f..c3b81a97558e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.c
@@ -371,8 +371,8 @@ void i40e_client_subtask(struct i40e_pf *pf)
cdev = pf->cinst;
 
/* If we're down or resetting, just bail */
-   if (test_bit(__I40E_DOWN, &pf->state) ||
-   test_bit(__I40E_CONFIG_BUSY, &pf->state))
+   if (test_bit(__I40E_DOWN, pf->state) ||
+   test_bit(__I40E_CONFIG_BUSY, pf->state))
return;
 
if (!client || !cdev)
@@ -382,7 +382,7 @@ void i40e_client_subtask(struct i40e_pf *pf)
 * the netdev is up, then open the client.
 */
if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) {
-   if (!test_bit(__I40E_VSI_DOWN, &vsi->state) &&
+   if (!test_bit(__I40E_VSI_DOWN, vsi->state) &&
client->ops && client->ops->open) {
set_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state);
ret = client->ops->open(&cdev->lan_info, client);
@@ -397,7 +397,7 @@ void i40e_client_subtask(struct i40e_pf *pf)
/* Likewise for client close. If the client is up, but the netdev
 * is down, then close the client.
 */
-   if (test_bit(__I40E_VSI_DOWN, &vsi->state) &&
+   if (test_bit(__I40E_VSI_DOWN, vsi->state) &&
client->ops && client->ops->close) {
clear_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state);
client->ops->close(&cdev->lan_info, client, false);

[net-next 00/13][pull request] 40GbE Intel Wired LAN Driver Updates 2017-04-30

2017-04-30 Thread Jeff Kirsher
This series contains updates to i40e and i40evf only.

Jake provides the majority of the changes in this series, starting with
the renaming of a flag to avoid confusion.  He then renames a variable to
a more meaningful name to clarify what is actually being done and to
reduce confusion.  He amortizes the wait time when initializing or
disabling lots of VFs by using i40e_reset_all_vfs() and
i40e_vsi_stop_rings_no_wait().  He also cleans up an unnecessary delay;
since pci_disable_sriov() already has its own delay, there is no need to
add an additional delay when removing VFs.  He avoids using the same
named flags for both vsi->state and pf->state, to make code review easier
and to help future work use the correct state field when checking bits.
He uses DECLARE_BITMAP() to ensure that we always allocate enough space
for flags, and replaces hw_disabled_flags with the new *_AUTO_DISABLED
flags, which are more readable because we are no longer setting an
*_ENABLED flag to disable the feature.

Alex corrects an oversight where we were not reprogramming the ports
after a reset, which was causing us to lose all of the receive tunnel
offloads.

Arnd Bergmann moves the declaration of a local variable to avoid an
unused-variable warning seen on architectures with larger pages.

The following are changes since commit c08bac03d2894113bdb114e66e6ada009defb120:
  Merge branch '10GbE' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 40GbE

Alexander Duyck (1):
  i40e: Reprogram port offloads after reset

Arnd Bergmann (1):
  i40evf: hide unused variable

Jacob Keller (11):
  i40e: properly spell I40E_VF_STATE_* flags
  i40e: make use of i40e_reset_all_vfs when initializing new VFs
  i40e: rename index to port to avoid confusion
  i40e: amortize wait time when disabling lots of VFs
  i40e: remove unnecessary msleep() delay in i40e_free_vfs
  i40e: separate PF and VSI state flags
  i40e: use DECLARE_BITMAP for state fields
  i40evf: remove needless min_t() on num_online_cpus()*2
  i40e: remove hw_disabled_flags in favor of using separate flag bits
  i40evf: remove I40E_FLAG_FDIR_ATR_ENABLED
  i40evf: allocate queues before we setup the interrupts and q_vectors

 drivers/net/ethernet/intel/i40e/i40e.h |  31 +-
 drivers/net/ethernet/intel/i40e/i40e_client.c  |  16 +-
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c |  13 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  50 ++--
 drivers/net/ethernet/intel/i40e/i40e_main.c| 332 +++--
 drivers/net/ethernet/intel/i40e/i40e_ptp.c |   4 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c|  36 +--
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 144 +
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h |  15 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c  |   8 +-
 drivers/net/ethernet/intel/i40evf/i40evf.h |  12 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c|  45 +--
 12 files changed, 372 insertions(+), 334 deletions(-)

-- 
2.12.2



[net-next 01/13] i40e: properly spell I40E_VF_STATE_* flags

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

These flags represent the state of the VF at various times. Do not
spell them as _STAT_ which can be confusing to readers who may think
these refer to statistics.

Change-ID: I6bc092cd472e8276896a1fd7498aced2084312df
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c|  2 +-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 98 +++---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 14 ++--
 4 files changed, 58 insertions(+), 58 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 08035c4389cd..523dd81d76b7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1826,7 +1826,7 @@ static inline bool i40e_active_vfs(struct i40e_pf *pf)
int i;
 
for (i = 0; i < pf->num_alloc_vfs; i++)
-   if (test_bit(I40E_VF_STAT_ACTIVE, &vfs[i].vf_states))
+   if (test_bit(I40E_VF_STATE_ACTIVE, &vfs[i].vf_states))
return true;
return false;
 }
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index c001562f19b2..8f47a31cb2c8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7318,7 +7318,7 @@ static void i40e_handle_mdd_event(struct i40e_pf *pf)
 "Too many MDD events on VF %d, disabled\n", i);
dev_info(&pf->pdev->dev,
 "Use PF Control I/F to re-enable the VF\n");
-   set_bit(I40E_VF_STAT_DISABLED, &vf->vf_states);
+   set_bit(I40E_VF_STATE_DISABLED, &vf->vf_states);
}
}
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 350cba70490c..a46c07799384 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -50,8 +50,8 @@ static void i40e_vc_vf_broadcast(struct i40e_pf *pf,
for (i = 0; i < pf->num_alloc_vfs; i++, vf++) {
int abs_vf_id = vf->vf_id + (int)hw->func_caps.vf_base_id;
/* Not all vfs are enabled so skip the ones that are not */
-   if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states) &&
-   !test_bit(I40E_VF_STAT_ACTIVE, &vf->vf_states))
+   if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states) &&
+   !test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states))
continue;
 
/* Ignore return value on purpose - a given VF may fail, but
@@ -137,8 +137,8 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf)
return;
 
/* verify if the VF is in either init or active before proceeding */
-   if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states) &&
-   !test_bit(I40E_VF_STAT_ACTIVE, &vf->vf_states))
+   if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states) &&
+   !test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states))
return;
 
abs_vf_id = vf->vf_id + (int)vf->pf->hw.func_caps.vf_base_id;
@@ -812,7 +812,7 @@ static void i40e_free_vf_res(struct i40e_vf *vf)
/* Start by disabling VF's configuration API to prevent the OS from
 * accessing the VF's VSI after it's freed / invalidated.
 */
-   clear_bit(I40E_VF_STAT_INIT, &vf->vf_states);
+   clear_bit(I40E_VF_STATE_INIT, &vf->vf_states);
 
/* free vsi & disconnect it from the parent uplink */
if (vf->lan_vsi_idx) {
@@ -884,7 +884,7 @@ static int i40e_alloc_vf_res(struct i40e_vf *vf)
vf->num_queue_pairs = total_queue_pairs;
 
/* VF is now completely initialized */
-   set_bit(I40E_VF_STAT_INIT, &vf->vf_states);
+   set_bit(I40E_VF_STATE_INIT, &vf->vf_states);
 
 error_alloc:
if (ret)
@@ -938,7 +938,7 @@ static void i40e_trigger_vf_reset(struct i40e_vf *vf, bool 
flr)
u32 reg, reg_idx, bit_idx;
 
/* warn the VF */
-   clear_bit(I40E_VF_STAT_ACTIVE, &vf->vf_states);
+   clear_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states);
 
/* Disable VF's configuration API during reset. The flag is re-enabled
 * in i40e_alloc_vf_res(), when it's safe again to access VF's VSI.
@@ -946,7 +946,7 @@ static void i40e_trigger_vf_reset(struct i40e_vf *vf, bool 
flr)
 * to do it earlier to give some time to finish to any VF config
 * functions that may still be running at this point.
 */
-   clear_bit(I40E_VF_STAT_INIT, &vf->vf_states);
+   clear_bit(I40E_VF_STATE_INIT, &vf->vf_states);
 
/* In the case of a VFLR, the HW has already reset the VF and we
 * just need to clean up, so don'

[net-next 03/13] i40e: rename index to port to avoid confusion

2017-04-30 Thread Jeff Kirsher
From: Jacob Keller 

The .index field of i40e_udp_port_config represents the udp port number.
Rename this variable to port so that it is more obvious.

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h  |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c | 10 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 70f9458f7a01..0f22c03ec726 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -245,7 +245,7 @@ struct i40e_tc_configuration {
 
 struct i40e_udp_port_config {
/* AdminQ command interface expects port number in Host byte order */
-   u16 index;
+   u16 port;
u8 type;
 };
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 8f47a31cb2c8..063044268170 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7349,7 +7349,7 @@ static void i40e_sync_udp_filters_subtask(struct i40e_pf 
*pf)
for (i = 0; i < I40E_MAX_PF_UDP_OFFLOAD_PORTS; i++) {
if (pf->pending_udp_bitmap & BIT_ULL(i)) {
pf->pending_udp_bitmap &= ~BIT_ULL(i);
-   port = pf->udp_ports[i].index;
+   port = pf->udp_ports[i].port;
if (port)
ret = i40e_aq_add_udp_tunnel(hw, port,
pf->udp_ports[i].type,
@@ -7366,7 +7366,7 @@ static void i40e_sync_udp_filters_subtask(struct i40e_pf 
*pf)
i40e_stat_str(&pf->hw, ret),
i40e_aq_str(&pf->hw,
pf->hw.aq.asq_last_status));
-   pf->udp_ports[i].index = 0;
+   pf->udp_ports[i].port = 0;
}
}
}
@@ -8953,7 +8953,7 @@ static u8 i40e_get_udp_port_idx(struct i40e_pf *pf, u16 
port)
u8 i;
 
for (i = 0; i < I40E_MAX_PF_UDP_OFFLOAD_PORTS; i++) {
-   if (pf->udp_ports[i].index == port)
+   if (pf->udp_ports[i].port == port)
return i;
}
 
@@ -9006,7 +9006,7 @@ static void i40e_udp_tunnel_add(struct net_device *netdev,
}
 
/* New port: add it and mark its index in the bitmap */
-   pf->udp_ports[next_idx].index = port;
+   pf->udp_ports[next_idx].port = port;
pf->pending_udp_bitmap |= BIT_ULL(next_idx);
pf->flags |= I40E_FLAG_UDP_FILTER_SYNC;
 }
@@ -9047,7 +9047,7 @@ static void i40e_udp_tunnel_del(struct net_device *netdev,
/* if port exists, set it to 0 (mark for deletion)
 * and make it pending
 */
-   pf->udp_ports[idx].index = 0;
+   pf->udp_ports[idx].port = 0;
pf->pending_udp_bitmap |= BIT_ULL(idx);
pf->flags |= I40E_FLAG_UDP_FILTER_SYNC;
 
-- 
2.12.2



[net-next 13/15] net/mlx5e: Use u8 as ownership type in mlx5e_get_cqe()

2017-04-30 Thread Saeed Mahameed
From: Tariq Toukan 

CQE ownership indication is as small as a single bit.
Use u8 to speed up the comparison.

Signed-off-by: Tariq Toukan 
Cc: kernel-t...@fb.com
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 491e83d09b58..5ca6714e3e02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -37,8 +37,8 @@ struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq)
struct mlx5_cqwq *wq = &cq->wq;
u32 ci = mlx5_cqwq_get_ci(wq);
struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(wq, ci);
-   int cqe_ownership_bit = cqe->op_own & MLX5_CQE_OWNER_MASK;
-   int sw_ownership_val = mlx5_cqwq_get_wrap_cnt(wq) & 1;
+   u8 cqe_ownership_bit = cqe->op_own & MLX5_CQE_OWNER_MASK;
+   u8 sw_ownership_val = mlx5_cqwq_get_wrap_cnt(wq) & 1;
 
if (cqe_ownership_bit != sw_ownership_val)
return NULL;
-- 
2.11.0



[net-next 15/15] net/mlx5: E-Switch, Avoid redundant memory allocation

2017-04-30 Thread Saeed Mahameed
From: Eli Cohen 

struct esw_mc_addr is a small struct that can be embedded directly in
struct mlx5_eswitch. Define it as a field rather than a pointer, saving
the kzalloc call and the associated error flow handling.

Signed-off-by: Eli Cohen 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 20 ++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h |  9 -
 2 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 21bed3c3334d..2e34d95ea776 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -53,13 +53,6 @@ struct esw_uc_addr {
u32vport;
 };
 
-/* E-Switch MC FDB table hash node */
-struct esw_mc_addr { /* SRIOV only */
-   struct l2addr_node node;
-   struct mlx5_flow_handle *uplink_rule; /* Forward to uplink rule */
-   u32refcnt;
-};
-
 /* Vport UC/MC hash node */
 struct vport_addr {
struct l2addr_node node;
@@ -817,7 +810,7 @@ static void esw_update_vport_mc_promisc(struct mlx5_eswitch 
*esw, u32 vport_num)
 static void esw_apply_vport_rx_mode(struct mlx5_eswitch *esw, u32 vport_num,
bool promisc, bool mc_promisc)
 {
-   struct esw_mc_addr *allmulti_addr = esw->mc_promisc;
+   struct esw_mc_addr *allmulti_addr = &esw->mc_promisc;
struct mlx5_vport *vport = &esw->vports[vport_num];
 
if (IS_ERR_OR_NULL(vport->allmulti_rule) != mc_promisc)
@@ -1688,7 +1681,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
esw_info(esw->dev, "disable SRIOV: active vports(%d) mode(%d)\n",
 esw->enabled_vports, esw->mode);
 
-   mc_promisc = esw->mc_promisc;
+   mc_promisc = &esw->mc_promisc;
nvports = esw->enabled_vports;
 
for (i = 0; i < esw->total_vports; i++)
@@ -1732,7 +1725,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 {
int l2_table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table);
int total_vports = MLX5_TOTAL_VPORTS(dev);
-   struct esw_mc_addr *mc_promisc;
struct mlx5_eswitch *esw;
int vport_num;
int err;
@@ -1761,13 +1753,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
}
esw->l2_table.size = l2_table_size;
 
-   mc_promisc = kzalloc(sizeof(*mc_promisc), GFP_KERNEL);
-   if (!mc_promisc) {
-   err = -ENOMEM;
-   goto abort;
-   }
-   esw->mc_promisc = mc_promisc;
-
esw->work_queue = create_singlethread_workqueue("mlx5_esw_wq");
if (!esw->work_queue) {
err = -ENOMEM;
@@ -1835,7 +1820,6 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)
esw->dev->priv.eswitch = NULL;
destroy_workqueue(esw->work_queue);
kfree(esw->l2_table.bitmap);
-   kfree(esw->mc_promisc);
kfree(esw->offloads.vport_reps);
kfree(esw->vports);
kfree(esw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 55beda6bf134..b746f62c8c79 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -212,6 +212,13 @@ struct mlx5_esw_offload {
u8 encap;
 };
 
+/* E-Switch MC FDB table hash node */
+struct esw_mc_addr { /* SRIOV only */
+   struct l2addr_node node;
+   struct mlx5_flow_handle *uplink_rule; /* Forward to uplink rule */
+   u32refcnt;
+};
+
 struct mlx5_eswitch {
struct mlx5_core_dev*dev;
struct mlx5_l2_tablel2_table;
@@ -225,7 +232,7 @@ struct mlx5_eswitch {
 * and async SRIOV admin state changes
 */
struct mutexstate_lock;
-   struct esw_mc_addr  *mc_promisc;
+   struct esw_mc_addr  mc_promisc;
 
struct {
boolenabled;
-- 
2.11.0



[net-next 06/15] net/mlx5e: Read neigh parameters with proper locking

2017-04-30 Thread Saeed Mahameed
From: Hadar Hen Zion 

The nud_state and hardware address fields are protected by the neighbour
lock, so we should acquire it before accessing those fields.

Use this lock to avoid inconsistency between the neighbour validity state
and its hardware address.

Signed-off-by: Hadar Hen Zion 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 2a9289b8a33b..ae07fe6473bb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1223,6 +1223,7 @@ static int mlx5e_create_encap_header_ipv4(struct 
mlx5e_priv *priv,
struct flowi4 fl4 = {};
char *encap_header;
int ttl, err;
+   u8 nud_state;
 
if (max_encap_size < ipv4_encap_size) {
mlx5_core_warn(priv->mdev, "encap size %d too big, max 
supported is %d\n",
@@ -1252,7 +1253,12 @@ static int mlx5e_create_encap_header_ipv4(struct 
mlx5e_priv *priv,
if (err)
goto out;
 
-   if (!(n->nud_state & NUD_VALID)) {
+   read_lock_bh(&n->lock);
+   nud_state = n->nud_state;
+   ether_addr_copy(e->h_dest, n->ha);
+   read_unlock_bh(&n->lock);
+
+   if (!(nud_state & NUD_VALID)) {
pr_warn("%s: can't offload, neighbour to %pI4 invalid\n", 
__func__, &fl4.daddr);
err = -EOPNOTSUPP;
goto out;
@@ -1261,8 +1267,6 @@ static int mlx5e_create_encap_header_ipv4(struct 
mlx5e_priv *priv,
e->n = n;
e->out_dev = out_dev;
 
-   neigh_ha_snapshot(e->h_dest, n, out_dev);
-
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
gen_vxlan_header_ipv4(out_dev, encap_header,
@@ -1297,6 +1301,7 @@ static int mlx5e_create_encap_header_ipv6(struct 
mlx5e_priv *priv,
struct flowi6 fl6 = {};
char *encap_header;
int err, ttl = 0;
+   u8 nud_state;
 
if (max_encap_size < ipv6_encap_size) {
mlx5_core_warn(priv->mdev, "encap size %d too big, max 
supported is %d\n",
@@ -1327,7 +1332,12 @@ static int mlx5e_create_encap_header_ipv6(struct 
mlx5e_priv *priv,
if (err)
goto out;
 
-   if (!(n->nud_state & NUD_VALID)) {
+   read_lock_bh(&n->lock);
+   nud_state = n->nud_state;
+   ether_addr_copy(e->h_dest, n->ha);
+   read_unlock_bh(&n->lock);
+
+   if (!(nud_state & NUD_VALID)) {
pr_warn("%s: can't offload, neighbour to %pI6 invalid\n", 
__func__, &fl6.daddr);
err = -EOPNOTSUPP;
goto out;
@@ -1336,8 +1346,6 @@ static int mlx5e_create_encap_header_ipv6(struct 
mlx5e_priv *priv,
e->n = n;
e->out_dev = out_dev;
 
-   neigh_ha_snapshot(e->h_dest, n, out_dev);
-
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
gen_vxlan_header_ipv6(out_dev, encap_header,
-- 
2.11.0



[net-next 07/15] net/mlx5e: Add neighbour hash table to the representors

2017-04-30 Thread Saeed Mahameed
From: Hadar Hen Zion 

Add a hash table to the representors, to be used by the next patch
to save neighbour information in the driver.

In order to offload IP tunnel encapsulation rules, the driver must find
the tunnel dst neighbour according to the output device and the
destination address given by the user. The next patch will cache the
neighbour information in the driver to support the neigh update
flow for tunnel encap rules.

The neighbour entries are also saved in a list so we can easily iterate
over them when querying statistics, in order to provide 'used' feedback to
the kernel neighbour NUD core.

Signed-off-by: Hadar Hen Zion 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 107 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h |  30 +++
 2 files changed, 129 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 8e82b11afd99..52ea7f1c0973 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -224,6 +224,68 @@ void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv)
mlx5_eswitch_sqs2vport_stop(esw, rep);
 }
 
+static const struct rhashtable_params mlx5e_neigh_ht_params = {
+   .head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node),
+   .key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh),
+   .key_len = sizeof(struct mlx5e_neigh),
+   .automatic_shrinking = true,
+};
+
+static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv)
+{
+   struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+
+   INIT_LIST_HEAD(&neigh_update->neigh_list);
+   return rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params);
+}
+
+static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv)
+{
+   struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+
+   rhashtable_destroy(&neigh_update->neigh_ht);
+}
+
+static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv,
+   struct mlx5e_neigh_hash_entry *nhe)
+{
+   struct mlx5e_rep_priv *rpriv = priv->ppriv;
+   int err;
+
+   err = rhashtable_insert_fast(&rpriv->neigh_update.neigh_ht,
+&nhe->rhash_node,
+mlx5e_neigh_ht_params);
+   if (err)
+   return err;
+
+   list_add(&nhe->neigh_list, &rpriv->neigh_update.neigh_list);
+
+   return err;
+}
+
+static void mlx5e_rep_neigh_entry_remove(struct mlx5e_priv *priv,
+struct mlx5e_neigh_hash_entry *nhe)
+{
+   struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+   list_del(&nhe->neigh_list);
+
+   rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht,
+  &nhe->rhash_node,
+  mlx5e_neigh_ht_params);
+}
+
+static struct mlx5e_neigh_hash_entry *
+mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
+struct mlx5e_neigh *m_neigh)
+{
+   struct mlx5e_rep_priv *rpriv = priv->ppriv;
+   struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+
+   return rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh,
+ mlx5e_neigh_ht_params);
+}
+
 static int mlx5e_rep_open(struct net_device *dev)
 {
struct mlx5e_priv *priv = netdev_priv(dev);
@@ -540,19 +602,33 @@ static struct mlx5e_profile mlx5e_rep_profile = {
 static int
 mlx5e_nic_rep_load(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep)
 {
-   struct net_device *netdev = rep->netdev;
-   struct mlx5e_priv *priv = netdev_priv(netdev);
+   struct mlx5e_priv *priv = netdev_priv(rep->netdev);
+   struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+   int err;
+
+   if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+   err = mlx5e_add_sqs_fwd_rules(priv);
+   if (err)
+   return err;
+   }
+
+   err = mlx5e_rep_neigh_init(rpriv);
+   if (err)
+   goto err_remove_sqs;
 
-   if (test_bit(MLX5E_STATE_OPENED, &priv->state))
-   return mlx5e_add_sqs_fwd_rules(priv);
return 0;
+
+err_remove_sqs:
+   mlx5e_remove_sqs_fwd_rules(priv);
+   return err;
 }
 
 static void
 mlx5e_nic_rep_unload(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep)
 {
-   struct net_device *netdev = rep->netdev;
-   struct mlx5e_priv *priv = netdev_priv(netdev);
+   struct mlx5e_priv *priv = netdev_priv(rep->netdev);
+   struct mlx5e_rep_priv *rpriv = priv->ppriv;
 
if (test_bit(MLX5E_STATE_OPENED, &priv->state))
mlx5e_remove_sqs_fwd_rules(priv);
@@ -560,6 +636,8 @@ mlx5e_nic_rep_unload(struct mlx5_eswitch *esw, struct 
mlx5_eswitch_rep *rep)
   

[net-next 09/15] net/mlx5e: Update neighbour 'used' state using HW flow rules counters

2017-04-30 Thread Saeed Mahameed
From: Hadar Hen Zion 

When IP tunnel encapsulation rules are offloaded, the kernel can't see
the traffic of the offloaded flow. The neighbour for the IP tunnel
destination of the offloaded flow can mistakenly become STALE and be
deleted by the kernel, since its 'used' value is never updated.

To make sure that a neighbour which is used by the HW won't become
STALE, we proactively update the neighbour 'used' value every
DELAY_PROBE_TIME period, when packets were matched and counted by the HW
for one of the tunnel encap flows related to this neighbour.
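
Per neighbour, the check amounts to roughly the following sketch
(mlx5_fc_query_cached() and neigh_event_send() are existing kernel helpers;
the counter/nhe/n names and the reported_lastuse field are illustrative):

	u64 bytes, packets, lastuse;

	mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
	if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
		nhe->reported_lastuse = (unsigned long)lastuse;
		/* refresh 'used' in the neigh core so the entry won't go STALE */
		neigh_event_send(n, NULL);
	}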

The periodic task that updates the used neighbours is scheduled when a
tunnel encap rule is successfully offloaded into HW and keeps re-scheduling
itself as long as the representor's neighbours list isn't empty.

Add, remove, lookup and status change operations done over the
representor's neighbours list or the neighbour hash entry encaps list
are all serialized by RTNL lock.

Signed-off-by: Hadar Hen Zion 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 52 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h   | 11 
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c| 58 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h|  3 ++
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  5 ++
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  | 24 -
 include/linux/mlx5/driver.h|  1 +
 7 files changed, 152 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 730de6b7e46e..af61b10b85bf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -41,6 +41,7 @@
 #include "en.h"
 #include "en_rep.h"
 #include "en_tc.h"
+#include "fs_core.h"
 
 static const char mlx5e_rep_driver_name[] = "mlx5e_rep";
 
@@ -226,6 +227,51 @@ void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv)
mlx5_eswitch_sqs2vport_stop(esw, rep);
 }
 
+static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+   unsigned long ipv6_interval = NEIGH_VAR(&ipv6_stub->nd_tbl->parms,
+   DELAY_PROBE_TIME);
+#else
+   unsigned long ipv6_interval = ~0UL;
+#endif
+   unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms,
+   DELAY_PROBE_TIME);
+   struct net_device *netdev = rpriv->rep->netdev;
+   struct mlx5e_priv *priv = netdev_priv(netdev);
+
+   rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, 
ipv4_interval);
+   mlx5_fc_update_sampling_interval(priv->mdev, 
rpriv->neigh_update.min_interval);
+}
+
+void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv)
+{
+   struct mlx5e_rep_priv *rpriv = priv->ppriv;
+   struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+
+   mlx5_fc_queue_stats_work(priv->mdev,
+&neigh_update->neigh_stats_work,
+neigh_update->min_interval);
+}
+
+static void mlx5e_rep_neigh_stats_work(struct work_struct *work)
+{
+   struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv,
+   
neigh_update.neigh_stats_work.work);
+   struct net_device *netdev = rpriv->rep->netdev;
+   struct mlx5e_priv *priv = netdev_priv(netdev);
+   struct mlx5e_neigh_hash_entry *nhe;
+
+   rtnl_lock();
+   if (!list_empty(&rpriv->neigh_update.neigh_list))
+   mlx5e_rep_queue_neigh_stats_work(priv);
+
+   list_for_each_entry(nhe, &rpriv->neigh_update.neigh_list, neigh_list)
+   mlx5e_tc_update_neigh_used_value(nhe);
+
+   rtnl_unlock();
+}
+
 static void mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe)
 {
refcount_inc(&nhe->refcnt);
@@ -325,6 +371,7 @@ static int mlx5e_rep_netevent_event(struct notifier_block 
*nb,
return NOTIFY_DONE;
 
m_neigh.dev = n->dev;
+   m_neigh.family = n->ops->family;
memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
 
/* We are in atomic context and can't take RTNL mutex, so use
@@ -378,6 +425,9 @@ static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv 
*rpriv)
 
INIT_LIST_HEAD(&neigh_update->neigh_list);
spin_lock_init(&neigh_update->encap_lock);
+   INIT_DELAYED_WORK(&neigh_update->neigh_stats_work,
+ mlx5e_rep_neigh_stats_work);
+   mlx5e_rep_neigh_update_init_interval(rpriv);
 
rpriv->neigh_update.netevent_nb.notifier_call = 
mlx5e_rep_netevent_event;
err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb);
@@ -399,6 +449,8 @@ static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv 
*rpriv)

[net-next 08/15] net/mlx5e: Add support to neighbour update flow

2017-04-30 Thread Saeed Mahameed
From: Hadar Hen Zion 

In order to offload TC encap rules, the driver does a lookup for the IP
tunnel neighbour according to the output device and the destination IP
given by the user.

To keep track of the validity state of such neighbours, we keep
the neighbour information (a pair of device pointer and destination IP)
in a hash table maintained at the relevant egress representor and
register to get NETEVENT_NEIGH_UPDATE events. When a neighbour update
netevent arrives, we search for a match among the cached neighbour entries
used for encapsulation.

In case the neighbour isn't valid, we can't offload the flow into the
HW. We cache the flow (requested matching and actions) in the driver and
offload the rule later, when the neighbour is resolved and becomes
valid.

When a flow is only cached in the driver and not offloaded into HW
yet, we use the EAGAIN return value to mark it internally; the TC ndo still
returns success.
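
In the ndo path this convention boils down to roughly the following
(try_to_offload_flow() is a made-up name standing in for the flow add
helper, shown only to illustrate the EAGAIN handling):

	err = try_to_offload_flow(priv, flow);
	if (err == -EAGAIN)
		err = 0;	/* flow is cached; offloaded later, once the neigh resolves */
	return err;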

Listen to kernel neighbour update netevents to track the relevant
neighbours' validity state:

1. If a neighbour becomes valid, offload the related rules to HW.

2. If the neighbour becomes invalid, remove the related rules from HW.

3. If the neighbour mac address was changed, update the encap header.
   Remove all the offloaded rules using the old encap header from the HW
   and insert new rules to HW with updated encap header.

Access to the neighbours hash table is protected by the RTNL lock of its
caller or by the table's spinlock.

Details of the locking/synchronization among the different actions
applied on the neighbour table:

Add/remove operations - protected by RTNL lock of its caller (all TC
commands are protected by RTNL lock). Add and remove operations are
initiated only when the user inserts/removes a TC rule into/from the driver.

Lookup/remove operations - since the lookup operation is done from the
netevent notifier block, the RTNL lock can't be used (atomic context).
Use the table's spin lock to protect lookups from the TC user removal
operation. The _bh variants are used since netevent can be called from
softirq context.

Lookup/add operations - the hash table access functions take care of the
protection between lookup and add operations.

When adding/removing encap headers and rules to/from the HW, the RTNL lock
is used. This can happen when:

1. The user inserts/removes a TC rule into/from the driver (TC commands
are protected by the RTNL lock of its caller).

2. The driver gets a neighbour notification event, which reports a
neighbour validity status change. Before adding/removing encap headers
and rules to/from the HW, the RTNL lock is taken.

A neighbour hash table entry should be freed when its encap list is empty.
Since the neighbour update netevent notification schedules a neighbour
update work item that uses the neighbour hash entry, it can't be freed
unconditionally when the encap list becomes empty during the TC delete rule
flow. Use a reference count to protect the neighbour hash table entry from
being freed while it's still in use.

When the user asks to unregister a netdevice used by one of the neighbours,
a neighbour removal notification is received. We then take a reference on the
neighbour and don't free it until the relevant encap entries (and flows) are
marked as invalid (not offloaded) and removed from HW.
As long as the encap entry is still valid (checked under RTNL lock) we
can safely access the neighbour device saved on mlx5e_neigh struct.
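
As a minimal sketch of the lookup discipline described above (the helper
names follow this series, but the snippet is illustrative rather than the
exact notifier code):

	/* atomic context: netevent notifier, so the table spinlock, not RTNL */
	spin_lock_bh(&neigh_update->encap_lock);
	nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh);
	if (nhe)
		mlx5e_rep_neigh_entry_hold(nhe);	/* keep it alive past the lock */
	spin_unlock_bh(&neigh_update->encap_lock);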

Signed-off-by: Hadar Hen Zion 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  | 230 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h  |  38 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   | 202 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h   |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h |   1 +
 5 files changed, 434 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 52ea7f1c0973..730de6b7e46e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -34,6 +34,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "eswitch.h"
 #include "en.h"
@@ -224,6 +226,140 @@ void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv)
mlx5_eswitch_sqs2vport_stop(esw, rep);
 }
 
+static void mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe)
+{
+   refcount_inc(&nhe->refcnt);
+}
+
+static void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe)
+{
+   if (refcount_dec_and_test(&nhe->refcnt))
+   kfree(nhe);
+}
+
+static void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
+  struct mlx5e_encap_entry *e,
+  bool neigh_connected,
+  unsigned char ha[ETH_ALEN])
+{
+   struct ethhdr *eth = (struct ethhdr *)e->encap_header;
+
+   A

[net-next 12/15] net/mlx5e: Use prefetchw when a write is to follow

2017-04-30 Thread Saeed Mahameed
From: Tariq Toukan 

"prefetchw()" prefetches the cacheline for write. Use it for
skb->data, as we will soon be copying the packet header there.

Performance:
Single-stream packet-rate tested with pktgen.
Packets are dropped in tc level to zoom into driver data-path.
Larger gain is expected for smaller packets, as less time
is spent on handling SKB fragments, making the path shorter
and the improvement more significant.

---------------------------------------------------
packet size | before [pps] | after [pps] | gain  |
---------------------------------------------------
64B         | 4,113,306    | 4,778,720   | 16%   |
1024B       | 3,633,819    | 3,950,593   | 8.7%  |

Signed-off-by: Tariq Toukan 
Cc: kernel-t...@fb.com
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index d717573b73da..7b1566f0ae58 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -906,7 +906,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct 
mlx5_cqe64 *cqe)
goto mpwrq_cqe_out;
}
 
-   prefetch(skb->data);
+   prefetchw(skb->data);
cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
 
mlx5e_mpwqe_fill_rx_skb(rq, cqe, wi, cqe_bcnt, skb);
-- 
2.11.0



[net-next 11/15] net/mlx5e: Optimize poll ICOSQ completion queue

2017-04-30 Thread Saeed Mahameed
From: Tariq Toukan 

UMR operations are more frequent and important.
Check them first, and add a compiler branch predictor hint.

According to the current design, the ICOSQ CQ can contain at most one
pending CQE per napi cycle. The poll function is optimized accordingly.

Performance:
Single-stream packet-rate tested with pktgen.
Packets are dropped in tc level to zoom into driver data-path.
Larger gain is expected for larger packet sizes, as BW is higher
and UMR posts are more frequent.

---------------------------------------------------
packet size | before [pps] | after [pps] | gain  |
---------------------------------------------------
64B         | 4,092,370    | 4,113,306   | 0.5%  |
1024B       | 3,421,435    | 3,633,819   | 6.2%  |

Signed-off-by: Tariq Toukan 
Cc: kernel-t...@fb.com
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 62 ---
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 43729ec35dfc..491e83d09b58 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -49,10 +49,40 @@ struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq)
return cqe;
 }
 
+static inline void mlx5e_poll_ico_single_cqe(struct mlx5e_cq *cq,
+struct mlx5e_icosq *sq,
+struct mlx5_cqe64 *cqe,
+u16 *sqcc)
+{
+   struct mlx5_wq_cyc *wq = &sq->wq;
+   u16 ci = be16_to_cpu(cqe->wqe_counter) & wq->sz_m1;
+   struct mlx5e_sq_wqe_info *icowi = &sq->db.ico_wqe[ci];
+   struct mlx5e_rq *rq = &sq->channel->rq;
+
+   prefetch(rq);
+   mlx5_cqwq_pop(&cq->wq);
+   *sqcc += icowi->num_wqebbs;
+
+   if (unlikely((cqe->op_own >> 4) != MLX5_CQE_REQ)) {
+   WARN_ONCE(true, "mlx5e: Bad OP in ICOSQ CQE: 0x%x\n",
+ cqe->op_own);
+   return;
+   }
+
+   if (likely(icowi->opcode == MLX5_OPCODE_UMR)) {
+   mlx5e_post_rx_mpwqe(rq);
+   return;
+   }
+
+   if (unlikely(icowi->opcode != MLX5_OPCODE_NOP))
+   WARN_ONCE(true,
+ "mlx5e: Bad OPCODE in ICOSQ WQE info: 0x%x\n",
+ icowi->opcode);
+}
+
 static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
 {
struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq);
-   struct mlx5_wq_cyc *wq;
struct mlx5_cqe64 *cqe;
u16 sqcc;
 
@@ -63,39 +93,13 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
if (likely(!cqe))
return;
 
-   wq = &sq->wq;
-
/* sq->cc must be updated only after mlx5_cqwq_update_db_record(),
 * otherwise a cq overrun may occur
 */
sqcc = sq->cc;
 
-   do {
-   u16 ci = be16_to_cpu(cqe->wqe_counter) & wq->sz_m1;
-   struct mlx5e_sq_wqe_info *icowi = &sq->db.ico_wqe[ci];
-
-   mlx5_cqwq_pop(&cq->wq);
-   sqcc += icowi->num_wqebbs;
-
-   if (unlikely((cqe->op_own >> 4) != MLX5_CQE_REQ)) {
-   WARN_ONCE(true, "mlx5e: Bad OP in ICOSQ CQE: 0x%x\n",
- cqe->op_own);
-   break;
-   }
-
-   switch (icowi->opcode) {
-   case MLX5_OPCODE_NOP:
-   break;
-   case MLX5_OPCODE_UMR:
-   mlx5e_post_rx_mpwqe(&sq->channel->rq);
-   break;
-   default:
-   WARN_ONCE(true,
- "mlx5e: Bad OPCODE in ICOSQ WQE info: 0x%x\n",
- icowi->opcode);
-   }
-
-   } while ((cqe = mlx5e_get_cqe(cq)));
+   /* by design, there's only a single cqe */
+   mlx5e_poll_ico_single_cqe(cq, sq, cqe, &sqcc);
 
mlx5_cqwq_update_db_record(&cq->wq);
 
-- 
2.11.0



[net-next 10/15] net/mlx5e: Act on delay probe time updates

2017-04-30 Thread Saeed Mahameed
From: Hadar Hen Zion 

The user can change the delay_first_probe_time parameter through sysctl.
Listen to NETEVENT_DELAY_PROBE_TIME_UPDATE notifications and update the
intervals of the periodic task that updates the neighbours' 'used' value
and of the periodic task that queries the flow HW counters.
Both intervals are updated only when the new delay probe time value is
lower than the current interval.

Since the driver saves only one minimal interval value, and not one per
device, users can set a lower interval for the neighbour 'used' value
periodic task but cannot schedule a higher interval for it.
The interval used for scheduling the neighbour 'used' value periodic task
is the minimal delay probe time parameter ever seen by the driver.
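
To illustrate the one-way behaviour described above, a minimal stand-alone
sketch (not driver code; names and units are made up) of an interval that
can only shrink, never grow:

    /* A single minimal interval is kept across all devices. */
    static unsigned long min_interval_msec = 1000;  /* made-up start value */

    static void update_min_interval(unsigned long new_delay_probe_time)
    {
            /* Takes effect only when the new delay probe time is
             * lower than the interval currently in use.
             */
            if (new_delay_probe_time < min_interval_msec)
                    min_interval_msec = new_delay_probe_time;
    }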

Signed-off-by: Hadar Hen Zion 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 39 
 1 file changed, 39 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index af61b10b85bf..79462c0368a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -358,7 +358,9 @@ static int mlx5e_rep_netevent_event(struct notifier_block 
*nb,
struct mlx5e_priv *priv = netdev_priv(netdev);
struct mlx5e_neigh_hash_entry *nhe = NULL;
struct mlx5e_neigh m_neigh = {};
+   struct neigh_parms *p;
struct neighbour *n;
+   bool found = false;
 
switch (event) {
case NETEVENT_NEIGH_UPDATE:
@@ -403,6 +405,43 @@ static int mlx5e_rep_netevent_event(struct notifier_block 
*nb,
}
spin_unlock_bh(&neigh_update->encap_lock);
break;
+
+   case NETEVENT_DELAY_PROBE_TIME_UPDATE:
+   p = ptr;
+
+   /* We check the device is present since we don't care about
+* changes in the default table, we only care about changes
+* done to the per-device delay probe time parameter.
+*/
+#if IS_ENABLED(CONFIG_IPV6)
+   if (!p->dev || (p->tbl != ipv6_stub->nd_tbl && p->tbl != 
&arp_tbl))
+#else
+   if (!p->dev || p->tbl != &arp_tbl)
+#endif
+   return NOTIFY_DONE;
+
+   /* We are in atomic context and can't take RTNL mutex,
+* so use spin_lock_bh to walk the neigh list and look for
+* the relevant device. bh is used since netevent can be
+* called from a softirq context.
+*/
+   spin_lock_bh(&neigh_update->encap_lock);
+   list_for_each_entry(nhe, &neigh_update->neigh_list, neigh_list) 
{
+   if (p->dev == nhe->m_neigh.dev) {
+   found = true;
+   break;
+   }
+   }
+   spin_unlock_bh(&neigh_update->encap_lock);
+   if (!found)
+   return NOTIFY_DONE;
+
+   neigh_update->min_interval = min_t(unsigned long,
+  NEIGH_VAR(p, 
DELAY_PROBE_TIME),
+  neigh_update->min_interval);
+   mlx5_fc_update_sampling_interval(priv->mdev,
+neigh_update->min_interval);
+   break;
}
return NOTIFY_DONE;
 }
-- 
2.11.0



[net-next 14/15] net/mlx5e: Disable HW LRO when PCI is slower than link on striding RQ

2017-04-30 Thread Saeed Mahameed
From: Eran Ben Elisha 

We will activate HW LRO only on servers with PCI BW > max link BW,
or when PCI BW > 16 Gbps. In other cases we do not want LRO by default,
as LRO sessions might time out and add redundant software overhead.
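
The heuristic added below can be exercised stand-alone; a small sketch
with made-up numbers, assuming both values are in Mbps (so 16000 is
16 Gbps, matching the driver's threshold):

    #include <stdbool.h>

    /* Enable HW LRO unless the PCI bus is the bottleneck: both values
     * known, PCI BW at most 16 Gbps and PCI BW below the link BW.
     */
    static bool hw_lro_heuristic(unsigned int link_speed, unsigned int pci_bw)
    {
            return !(link_speed && pci_bw &&
                     pci_bw <= 16000 && pci_bw < link_speed);
    }

    /* Made-up examples:
     *   hw_lro_heuristic(25000, 15000)  -> false (PCI slower than the link)
     *   hw_lro_heuristic(10000, 15000)  -> true  (PCI faster than the link)
     *   hw_lro_heuristic(100000, 64000) -> true  (PCI BW above 16 Gbps)
     */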

Tested:
ethtool -k  | grep large-receive-offload
On systems with and without the limitations.

Signed-off-by: Eran Ben Elisha 
Cc: kernel-t...@fb.com
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 1afaca96a30d..a61b71b6fff3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3785,6 +3785,12 @@ static bool cqe_compress_heuristic(u32 link_speed, u32 
pci_bw)
(pci_bw < 4) && (pci_bw < link_speed));
 }
 
+static bool hw_lro_heuristic(u32 link_speed, u32 pci_bw)
+{
+   return !(link_speed && pci_bw &&
+(pci_bw <= 16000) && (pci_bw < link_speed));
+}
+
 void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 
cq_period_mode)
 {
params->rx_cq_period_mode = cq_period_mode;
@@ -3829,6 +3835,11 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
params->num_channels = max_channels;
params->num_tc   = 1;
 
+   mlx5e_get_max_linkspeed(mdev, &link_speed);
+   mlx5e_get_pci_bw(mdev, &pci_bw);
+   mlx5_core_dbg(mdev, "Max link speed = %d, PCI BW = %d\n",
+ link_speed, pci_bw);
+
/* SQ */
params->log_sq_size = is_kdump_kernel() ?
MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE :
@@ -3837,13 +3848,9 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
/* set CQE compression */
params->rx_cqe_compress_def = false;
if (MLX5_CAP_GEN(mdev, cqe_compression) &&
-MLX5_CAP_GEN(mdev, vport_group_manager)) {
-   mlx5e_get_max_linkspeed(mdev, &link_speed);
-   mlx5e_get_pci_bw(mdev, &pci_bw);
-   mlx5_core_dbg(mdev, "Max link speed = %d, PCI BW = %d\n",
-  link_speed, pci_bw);
+MLX5_CAP_GEN(mdev, vport_group_manager))
params->rx_cqe_compress_def = 
cqe_compress_heuristic(link_speed, pci_bw);
-   }
+
MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS, 
params->rx_cqe_compress_def);
 
/* RQ */
@@ -3852,7 +3859,7 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
/* HW LRO */
/* TODO: && MLX5_CAP_ETH(mdev, lro_cap) */
if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
-   params->lro_en = true;
+   params->lro_en = hw_lro_heuristic(link_speed, pci_bw);
params->lro_timeout = mlx5e_choose_lro_timeout(mdev, 
MLX5E_DEFAULT_LRO_TIMEOUT);
 
/* CQ moderation params */
-- 
2.11.0



[pull request][net-next 00/15] Mellanox, mlx5 updates 2017-04-30

2017-04-30 Thread Saeed Mahameed
Hi Dave,

This series contains two sets of patches for the mlx5 driver:
1. Nine patches (mostly from Hadar) adding the 'mlx5 neigh update' feature.
2. Six misc patches.

For more details please see below.

Sorry for the last-minute submission; originally I planned to submit
before the weekend, but in order to provide clean patches we had to deal
with some auto-build issues first.

Please pull and let me know if there's any problem.

---

The following changes since commit c08bac03d2894113bdb114e66e6ada009defb120:

  Merge branch '10GbE' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue (2017-04-29 
23:16:20 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5-updates-2017-04-30

for you to fetch changes up to 0a0ab1d2cc5d5e68191488235074b5b30d793bb7:

  net/mlx5: E-Switch, Avoid redundant memory allocation (2017-04-30 16:03:21 
+0300)


mlx5-updates-2017-04-30

Or says:

mlx5 neigh update

This series (code name 'neigh update') from Hadar enhances the
mlx5 TC IP tunnel offloads to deal with changes to tunnel destination
neighbours used in offloaded flows that involve encapsulation.

In order to keep track of the validity state of such neighbours, we register
a netevent notifier callback and act on NEIGH_UPDATE events: when a neighbour
becomes valid, offload the related flows to HW (and the other way around when
it becomes invalid), and similarly when a neighbour's MAC address changes.

Since this traffic is offloaded from the host OS, the neighbour for the IP
tunnel destination can mistakenly become STALE and be deleted by the kernel,
since its 'used' value isn't being updated. To address that, we proactively
update the neighbour's 'used' value every DELAY_PROBE_TIME seconds, using
time stamps generated by the existing driver code for HW flow counters.
We use the DELAY_PROBE_TIME_UPDATE event to adjust the frequency of the updates.
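
For readers unfamiliar with netevents, the rough shape of such a notifier is
sketched below (illustrative only, not the driver code; locking and error
handling omitted, names made up):

    #include <linux/notifier.h>
    #include <net/netevent.h>
    #include <net/neighbour.h>

    static int my_netevent_cb(struct notifier_block *nb,
                              unsigned long event, void *ptr)
    {
            struct neighbour *n;

            switch (event) {
            case NETEVENT_NEIGH_UPDATE:
                    n = ptr;        /* the neighbour that changed */
                    /* offload/un-offload flows based on n->nud_state
                     * (NUD_VALID) and a snapshot of n->ha
                     */
                    break;
            case NETEVENT_DELAY_PROBE_TIME_UPDATE:
                    /* adjust the 'used'-update / counter query interval */
                    break;
            }
            return NOTIFY_DONE;
    }

    static struct notifier_block my_netevent_nb = {
            .notifier_call = my_netevent_cb,
    };
    /* registered once with register_netevent_notifier(&my_netevent_nb) */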

Prior to the core of the series, there's a patch from Saeed that introduces an
extendable vport representor implementation scheme. It provides a separation
between the eswitch and the netdev related aspects of the representors.

We would like to thank Ido Schimmel and Ilya Lesokhin for their coaching and
advice through the long design and review cycles while we struggled to
understand and (hopefully correctly) implement the locking around the
different driver flows.

- Or.
=

Misc Updates:

From Tariq:
Some small performance and trivial code optimization for mlx5 netdev driver
- Optimize poll ICOSQ completion queue
- Use prefetchw when a write is to follow
- Use u8 as ownership type in mlx5e_get_cqe()

From Eran:
- Disable LRO by default on specific setups

From Eli:
- Small cleanup for E-Switch to avoid redundant allocation

Thanks,
Saeed.


Eli Cohen (1):
  net/mlx5: E-Switch, Avoid redundant memory allocation

Eran Ben Elisha (1):
  net/mlx5e: Disable HW LRO when PCI is slower than link on striding RQ

Hadar Hen Zion (7):
  net/mlx5e: Remove output device parameter from create encap header 
helpers definition
  net/mlx5e: Use flag to properly monitor a flow rule offloading state
  net/mlx5e: Read neigh parameters with proper locking
  net/mlx5e: Add neighbour hash table to the representors
  net/mlx5e: Add support to neighbour update flow
  net/mlx5e: Update neighbour 'used' state using HW flow rules counters
  net/mlx5e: Act on delay probe time updates

Or Gerlitz (2):
  net/mlx5: Remove encap entry pointer from the eswitch flow attributes
  net/mlx5e: Move the encap entry structure from the eswitch header

Saeed Mahameed (1):
  net/mlx5e: Extendable vport representor netdev private data

Tariq Toukan (3):
  net/mlx5e: Optimize poll ICOSQ completion queue
  net/mlx5e: Use prefetchw when a write is to follow
  net/mlx5e: Use u8 as ownership type in mlx5e_get_cqe()

 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  20 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  98 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 574 +++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h   | 145 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c|   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c| 341 +---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h|   9 +
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |  66 +--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  20 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  25 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |   5 +
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  |  24 +-
 include/linux/mlx5/driver.h|   1 +
 14 files changed, 1074 insertions(+), 262 deletions(-)

[net-next 03/15] net/mlx5e: Move the encap entry structure from the eswitch header

2017-04-30 Thread Saeed Mahameed
From: Or Gerlitz 

The encap entry structure isn't manipulated by the eswitch code,
hence it can and should be removed from the eswitch header.

Do that, and change it to use the mlx5e_ prefix.

This patch doesn't change any functionality.

Signed-off-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h  | 13 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   | 11 +--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 13 -
 3 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index b6595a699dc1..425cb1b0bf02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -33,6 +33,7 @@
 #ifndef __MLX5E_REP_H__
 #define __MLX5E_REP_H__
 
+#include 
 #include "eswitch.h"
 #include "en.h"
 
@@ -40,6 +41,18 @@ struct mlx5e_rep_priv {
struct mlx5_eswitch_rep *rep;
 };
 
+struct mlx5e_encap_entry {
+   struct hlist_node encap_hlist;
+   struct list_head flows;
+   u32 encap_id;
+   struct neighbour *n;
+   struct ip_tunnel_info tun_info;
+   unsigned char h_dest[ETH_ALEN]; /* destination eth addr */
+
+   struct net_device *out_dev;
+   int tunnel_type;
+};
+
 void mlx5e_register_vport_reps(struct mlx5e_priv *priv);
 void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv);
 bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index c7b034eeb149..3582ebcd4173 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -264,9 +264,9 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv,
 
list_del(&flow->encap);
if (list_empty(next)) {
-   struct mlx5_encap_entry *e;
+   struct mlx5e_encap_entry *e;
 
-   e = list_entry(next, struct mlx5_encap_entry, flows);
+   e = list_entry(next, struct mlx5e_encap_entry, flows);
if (e->n) {
mlx5_encap_dealloc(priv->mdev, e->encap_id);
neigh_release(e->n);
@@ -1211,7 +1211,7 @@ static void gen_vxlan_header_ipv6(struct net_device 
*out_dev,
 
 static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
  struct net_device *mirred_dev,
- struct mlx5_encap_entry *e,
+ struct mlx5e_encap_entry *e,
  struct net_device **out_dev)
 {
int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
@@ -1285,9 +1285,8 @@ static int mlx5e_create_encap_header_ipv4(struct 
mlx5e_priv *priv,
 
 static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
  struct net_device *mirred_dev,
- struct mlx5_encap_entry *e,
+ struct mlx5e_encap_entry *e,
  struct net_device **out_dev)
-
 {
int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
int ipv6_encap_size = ETH_HLEN + sizeof(struct ipv6hdr) + VXLAN_HLEN;
@@ -1371,7 +1370,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
struct mlx5e_priv *up_priv = netdev_priv(up_dev);
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
struct ip_tunnel_key *key = &tun_info->key;
-   struct mlx5_encap_entry *e;
+   struct mlx5e_encap_entry *e;
struct net_device *out_dev;
int tunnel_type, err = 0;
uintptr_t hash_key;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 9056961689fa..751a673de97a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -36,7 +36,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #define MLX5_MAX_UC_PER_VPORT(dev) \
@@ -289,18 +288,6 @@ enum {
 #define MLX5_FLOW_CONTEXT_ACTION_VLAN_POP  0x4000
 #define MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH 0x8000
 
-struct mlx5_encap_entry {
-   struct hlist_node encap_hlist;
-   struct list_head flows;
-   u32 encap_id;
-   struct neighbour *n;
-   struct ip_tunnel_info tun_info;
-   unsigned char h_dest[ETH_ALEN]; /* destination eth addr */
-
-   struct net_device *out_dev;
-   int tunnel_type;
-};
-
 struct mlx5_esw_flow_attr {
struct mlx5_eswitch_rep *in_rep;
struct mlx5_eswitch_rep *out_rep;
-- 
2.11.0



[net-next 05/15] net/mlx5e: Use flag to properly monitor a flow rule offloading state

2017-04-30 Thread Saeed Mahameed
From: Hadar Hen Zion 

Instead of relying on the 'flow->rule' pointer value, which can be
valid or invalid (in case the FW returns an error while trying to offload
the rule), monitor the rule state using a flag.

In a downstream patch, which adds support for the IP tunneling neigh update
flow, a TC rule can be cached in the driver without being offloaded into the
HW. In this case, the flow handle pointer stays NULL.

Check the offloaded flag to properly deal with rules which are currently
not offloaded when querying rule statistics.
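
The general pattern looks roughly like the sketch below (illustrative only,
with made-up names; the real flag and structures are in the diff):

    /* Track offload state in a bit flag instead of inferring it
     * from a pointer that may hold an error value.
     */
    enum {
            MY_FLOW_OFFLOADED = 1 << 0,
    };

    struct my_flow {
            unsigned int flags;
            void *rule;             /* valid only when offloaded */
    };

    static void my_flow_stats(struct my_flow *flow)
    {
            if (!(flow->flags & MY_FLOW_OFFLOADED))
                    return;         /* cached but not in HW: nothing to query */
            /* ... safe to use flow->rule and read the HW counter ... */
    }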

This patch doesn't add any new functionality.

Signed-off-by: Hadar Hen Zion 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 25ecffa1a3df..2a9289b8a33b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -59,6 +59,7 @@ struct mlx5_nic_flow_attr {
 enum {
MLX5E_TC_FLOW_ESWITCH   = BIT(0),
MLX5E_TC_FLOW_NIC   = BIT(1),
+   MLX5E_TC_FLOW_OFFLOADED = BIT(2),
 };
 
 struct mlx5e_tc_flow {
@@ -245,7 +246,8 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
 
-   mlx5_eswitch_del_offloaded_rule(esw, flow->rule, flow->esw_attr);
+   if (flow->flags & MLX5E_TC_FLOW_OFFLOADED)
+   mlx5_eswitch_del_offloaded_rule(esw, flow->rule, 
flow->esw_attr);
 
mlx5_eswitch_del_vlan_action(esw, flow->esw_attr);
 
@@ -1591,6 +1593,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, 
__be16 protocol,
goto err_free;
}
 
+   flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
err = rhashtable_insert_fast(&tc->ht, &flow->node,
 tc->ht_params);
if (err)
@@ -1646,6 +1649,9 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
if (!flow)
return -EINVAL;
 
+   if (!(flow->flags & MLX5E_TC_FLOW_OFFLOADED))
+   return 0;
+
counter = mlx5_flow_rule_counter(flow->rule);
if (!counter)
return 0;
-- 
2.11.0



[net-next 04/15] net/mlx5e: Remove output device parameter from create encap header helpers definition

2017-04-30 Thread Saeed Mahameed
From: Hadar Hen Zion 

Passing output device parameter to the helper functions that deal with
creation of encapsulation headers is redundant. Output device parameter
can be defined inside those helpers, no need to pass it. Refactor the code by
removing the parameter from the function signature.

This patch doesn't change any functionality.

Signed-off-by: Hadar Hen Zion 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 29 -
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 3582ebcd4173..25ecffa1a3df 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1211,12 +1211,12 @@ static void gen_vxlan_header_ipv6(struct net_device 
*out_dev,
 
 static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
  struct net_device *mirred_dev,
- struct mlx5e_encap_entry *e,
- struct net_device **out_dev)
+ struct mlx5e_encap_entry *e)
 {
int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
int ipv4_encap_size = ETH_HLEN + sizeof(struct iphdr) + VXLAN_HLEN;
struct ip_tunnel_key *tun_key = &e->tun_info.key;
+   struct net_device *out_dev;
struct neighbour *n = NULL;
struct flowi4 fl4 = {};
char *encap_header;
@@ -1245,7 +1245,7 @@ static int mlx5e_create_encap_header_ipv4(struct 
mlx5e_priv *priv,
fl4.daddr = tun_key->u.ipv4.dst;
fl4.saddr = tun_key->u.ipv4.src;
 
-   err = mlx5e_route_lookup_ipv4(priv, mirred_dev, out_dev,
+   err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev,
  &fl4, &n, &ttl);
if (err)
goto out;
@@ -1257,13 +1257,13 @@ static int mlx5e_create_encap_header_ipv4(struct 
mlx5e_priv *priv,
}
 
e->n = n;
-   e->out_dev = *out_dev;
+   e->out_dev = out_dev;
 
-   neigh_ha_snapshot(e->h_dest, n, *out_dev);
+   neigh_ha_snapshot(e->h_dest, n, out_dev);
 
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
-   gen_vxlan_header_ipv4(*out_dev, encap_header,
+   gen_vxlan_header_ipv4(out_dev, encap_header,
  ipv4_encap_size, e->h_dest, ttl,
  fl4.daddr,
  fl4.saddr, tun_key->tp_dst,
@@ -1285,12 +1285,12 @@ static int mlx5e_create_encap_header_ipv4(struct 
mlx5e_priv *priv,
 
 static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
  struct net_device *mirred_dev,
- struct mlx5e_encap_entry *e,
- struct net_device **out_dev)
+ struct mlx5e_encap_entry *e)
 {
int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
int ipv6_encap_size = ETH_HLEN + sizeof(struct ipv6hdr) + VXLAN_HLEN;
struct ip_tunnel_key *tun_key = &e->tun_info.key;
+   struct net_device *out_dev;
struct neighbour *n = NULL;
struct flowi6 fl6 = {};
char *encap_header;
@@ -1320,7 +1320,7 @@ static int mlx5e_create_encap_header_ipv6(struct 
mlx5e_priv *priv,
fl6.daddr = tun_key->u.ipv6.dst;
fl6.saddr = tun_key->u.ipv6.src;
 
-   err = mlx5e_route_lookup_ipv6(priv, mirred_dev, out_dev,
+   err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev,
  &fl6, &n, &ttl);
if (err)
goto out;
@@ -1332,13 +1332,13 @@ static int mlx5e_create_encap_header_ipv6(struct 
mlx5e_priv *priv,
}
 
e->n = n;
-   e->out_dev = *out_dev;
+   e->out_dev = out_dev;
 
-   neigh_ha_snapshot(e->h_dest, n, *out_dev);
+   neigh_ha_snapshot(e->h_dest, n, out_dev);
 
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
-   gen_vxlan_header_ipv6(*out_dev, encap_header,
+   gen_vxlan_header_ipv6(out_dev, encap_header,
  ipv6_encap_size, e->h_dest, ttl,
  &fl6.daddr,
  &fl6.saddr, tun_key->tp_dst,
@@ -1371,7 +1371,6 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
struct ip_tunnel_key *key = &tun_info->key;
struct mlx5e_encap_entry *e;
-   struct net_device *out_dev;
int tunnel_type, err = 0;
uintptr_t hash_key;
bool found = false;
@@ -1419,9 +1418,9 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
INIT_LI
