date:20180518

[PATCHv2 net-next] erspan: set bso bit based on mirrored packet's len

2018-05-18 Thread William Tu

Before the patch, the erspan BSO bit (Bad/Short/Oversized) is not
handled.  BSO has 4 possible values:
  00 --> Good frame with no error, or unknown integrity
  11 --> Payload is a Bad Frame with CRC or Alignment Error
  01 --> Payload is a Short Frame
  10 --> Payload is an Oversized Frame

Based on the short/oversized definitions in RFC1757, the patch sets
the bso bit based on the mirrored packet's size.

Reported-by: Xiaoyan Jin 
Signed-off-by: William Tu 
---
v1->v2
  Improve code comments, make enum erspan_bso clearer
---
 include/net/erspan.h | 28 
 1 file changed, 28 insertions(+)

diff --git a/include/net/erspan.h b/include/net/erspan.h
index d044aa60cc76..b39643ef4c95 100644
--- a/include/net/erspan.h
+++ b/include/net/erspan.h
@@ -219,6 +219,33 @@ static inline __be32 erspan_get_timestamp(void)
return htonl((u32)h_usecs);
 }
 
+/* ERSPAN BSO (Bad/Short/Oversized), see RFC1757
+ *   00b --> Good frame with no error, or unknown integrity
+ *   01b --> Payload is a Short Frame
+ *   10b --> Payload is an Oversized Frame
+ *   11b --> Payload is a Bad Frame with CRC or Alignment Error
+ */
+enum erspan_bso {
+   BSO_NOERROR = 0x0,
+   BSO_SHORT = 0x1,
+   BSO_OVERSIZED = 0x2,
+   BSO_BAD = 0x3,
+};
+
+static inline u8 erspan_detect_bso(struct sk_buff *skb)
+{
+   /* BSO_BAD is not handled because the frame CRC
+* or alignment error information is in FCS.
+*/
+   if (skb->len < ETH_ZLEN)
+   return BSO_SHORT;
+
+   if (skb->len > ETH_FRAME_LEN)
+   return BSO_OVERSIZED;
+
+   return BSO_NOERROR;
+}
+
 static inline void erspan_build_header_v2(struct sk_buff *skb,
  u32 id, u8 direction, u16 hwid,
  bool truncate, bool is_ipv4)
@@ -248,6 +275,7 @@ static inline void erspan_build_header_v2(struct sk_buff 
*skb,
vlan_tci = ntohs(qp->tci);
}
 
+   bso = erspan_detect_bso(skb);
skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE);
ershdr = (struct erspan_base_hdr *)skb->data;
memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE);
-- 
2.7.4

[PATCH net] net: ip6_gre: fix tunnel metadata device sharing.

2018-05-18 Thread William Tu

Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'.  Thus, when doing:
  ip link add dev ip6gre11 type ip6gretap external
  ip link add dev ip6erspan12 type ip6erspan external
  RTNETLINK answers: File exists
the second command fails because it tries to create the same collect_md_tun.

The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'.  As a result, a couple
of places need to be refactored/split up in order to distinguish ip6gre
and ip6erspan.

First, move the collect_md check out of ip6gre_tunnel_{unlink,link} and
create separate functions {ip6gre,ip6erspan}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan.  Tested it
using the samples/bpf/test_tunnel_bpf.sh.

Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu 
---
 net/ipv6/ip6_gre.c | 101 +
 1 file changed, 79 insertions(+), 22 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 5162ecc45c20..458de353f5d9 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -71,6 +71,7 @@ struct ip6gre_net {
struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
 
struct ip6_tnl __rcu *collect_md_tun;
+   struct ip6_tnl __rcu *collect_md_tun_erspan;
struct net_device *fb_tunnel_dev;
 };
 
@@ -233,7 +234,12 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct 
net_device *dev,
if (cand)
return cand;
 
-   t = rcu_dereference(ign->collect_md_tun);
+   if (gre_proto == htons(ETH_P_ERSPAN) ||
+   gre_proto == htons(ETH_P_ERSPAN2))
+   t = rcu_dereference(ign->collect_md_tun_erspan);
+   else
+   t = rcu_dereference(ign->collect_md_tun);
+
if (t && t->dev->flags & IFF_UP)
return t;
 
@@ -262,6 +268,31 @@ static struct ip6_tnl __rcu **__ip6gre_bucket(struct 
ip6gre_net *ign,
return >tunnels[prio][h];
 }
 
+static void ip6gre_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+   if (t->parms.collect_md)
+   rcu_assign_pointer(ign->collect_md_tun, t);
+}
+
+static void ip6erspan_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+   if (t->parms.collect_md)
+   rcu_assign_pointer(ign->collect_md_tun_erspan, t);
+}
+
+static void ip6gre_tunnel_unlink_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+   if (t->parms.collect_md)
+   rcu_assign_pointer(ign->collect_md_tun, NULL);
+}
+
+static void ip6erspan_tunnel_unlink_md(struct ip6gre_net *ign,
+  struct ip6_tnl *t)
+{
+   if (t->parms.collect_md)
+   rcu_assign_pointer(ign->collect_md_tun_erspan, NULL);
+}
+
 static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign,
const struct ip6_tnl *t)
 {
@@ -272,9 +303,6 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, 
struct ip6_tnl *t)
 {
struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t);
 
-   if (t->parms.collect_md)
-   rcu_assign_pointer(ign->collect_md_tun, t);
-
rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
 }
@@ -284,9 +312,6 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, 
struct ip6_tnl *t)
struct ip6_tnl __rcu **tp;
struct ip6_tnl *iter;
 
-   if (t->parms.collect_md)
-   rcu_assign_pointer(ign->collect_md_tun, NULL);
-
for (tp = ip6gre_bucket(ign, t);
 (iter = rtnl_dereference(*tp)) != NULL;
 tp = >next) {
@@ -375,11 +400,23 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net 
*net,
return NULL;
 }
 
+static void ip6erspan_tunnel_uninit(struct net_device *dev)
+{
+   struct ip6_tnl *t = netdev_priv(dev);
+   struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
+
+   ip6erspan_tunnel_unlink_md(ign, t);
+   ip6gre_tunnel_unlink(ign, t);
+   dst_cache_reset(>dst_cache);
+   dev_put(dev);
+}
+
 static void ip6gre_tunnel_uninit(struct net_device *dev)
 {
struct ip6_tnl *t = netdev_priv(dev);
struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
 
+   ip6gre_tunnel_unlink_md(ign, t);
ip6gre_tunnel_unlink(ign, t);
dst_cache_reset(>dst_cache);
dev_put(dev);
@@ -1806,7 +1843,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
 
 static const struct net_device_ops ip6erspan_netdev_ops = {
.ndo_init = ip6erspan_tap_init,
-   .ndo_uninit =   ip6gre_tunnel_uninit,
+   .ndo_uninit =   ip6erspan_tunnel_uninit,
.ndo_start_xmit =   ip6erspan_tunnel_xmit,
.ndo_set_mac_address =  eth_mac_addr,
.ndo_validate_addr =eth_validate_addr,
@@ -1875,8 +1912,6 @@ static int

Re: [patch net-next 0/5] devlink: introduce port flavours and common phys_port_name generation

2018-05-18 Thread Florian Fainelli



On 05/18/2018 12:28 AM, Jiri Pirko wrote:
> From: Jiri Pirko 
> 
> This patchset resolves 2 issues we have right now:
> 1) There are many netdevices / ports in the system, for port, pf, vf
>representation but the user has no way to see which is which
> 2) The ndo_get_phys_port_name is implemented in each driver separately,
>which may lead to inconsistent names between drivers.
> 
> This patchset introduces port flavours which should address the first
> problem. In this initial patchset, I focus on DSA and their port
> flavours. As a follow-up, I plan to add PF and VF representor flavours.
> However, that needs additional dependencies in drivers (nfp, mlx5).
> 
> The common phys_port_name generation is used by mlxsw. An example output
> for mlxsw looks like this:
> 
> # devlink port
> ...
> pci/:03:00.0/59: type eth netdev enp3s0np4 flavour physical number 4
> pci/:03:00.0/61: type eth netdev enp3s0np1 flavour physical number 1
> pci/:03:00.0/63: type eth netdev enp3s0np2 flavour physical number 2
> pci/:03:00.0/49: type eth netdev enp3s0np8s0 flavour physical number 8 
> split_group 8 subport 0
> pci/:03:00.0/50: type eth netdev enp3s0np8s1 flavour physical number 8 
> split_group 8 subport 1
> pci/:03:00.0/51: type eth netdev enp3s0np8s2 flavour physical number 8 
> split_group 8 subport 2
> pci/:03:00.0/52: type eth netdev enp3s0np8s3 flavour physical number 8 
> split_group 8 subport 3
> 
> As you can see, the netdev names are generated according to the flavour
> and port number. In case the port is split, the split subnumber is also
> included.
> 
> An example output for dsa_loop testing module looks like this:
> # devlink port
> mdio_bus/fixed-0:1f/0: type eth netdev lan1 flavour physical number 0
> mdio_bus/fixed-0:1f/1: type eth netdev lan2 flavour physical number 1
> mdio_bus/fixed-0:1f/2: type eth netdev lan3 flavour physical number 2
> mdio_bus/fixed-0:1f/3: type eth netdev lan4 flavour physical number 3
> mdio_bus/fixed-0:1f/4: type notset
> mdio_bus/fixed-0:1f/5: type notset flavour cpu number 5
> mdio_bus/fixed-0:1f/6: type notset
> mdio_bus/fixed-0:1f/7: type notset
> mdio_bus/fixed-0:1f/8: type notset
> mdio_bus/fixed-0:1f/9: type notset
> mdio_bus/fixed-0:1f/10: type notset
> mdio_bus/fixed-0:1f/11: type notset

Reviewed-by: Florian Fainelli 
Tested-by: Florian Fainelli 

Thanks!
-- 
Florian

Re: [patch net-next RFC 04/12] dsa: set devlink port attrs for dsa ports

2018-05-18 Thread Florian Fainelli



On 05/18/2018 06:45 AM, Andrew Lunn wrote:
>> What benefit does it have to register unused ports? What is a usecase
>> for them. Like Florian, I also think they should not be registered.
> 
> Hi Jiri
> 
> They physically exist, so we are accurately describing the hardware by
> registering them.

You are right that the driver is advertising a number of ports that does
not match what is being expected. We unfortunately do not have a good
API for specifying e.g: a sparse port allocation.
-- 
Florian

Re: [PATCH] net: sched: don't disable bh when accessing action idr

2018-05-18 Thread Cong Wang

On Fri, May 18, 2018 at 8:45 AM, Vlad Buslov  wrote:
> Underlying implementation of action map has changed and doesn't require
> disabling bh anymore. Replace all action idr spinlock usage with regular
> calls that do not disable bh.

Please explain explicitly why it is not required, don't let people
dig, this would save everyone's time.

Also, this should be targeted for net-next, right?

Thanks.

Re: [RFC v4 3/5] virtio_ring: add packed ring support

2018-05-18 Thread Tiwei Bie

On Sat, May 19, 2018 at 09:12:30AM +0800, Jason Wang wrote:
> On 2018年05月18日 22:33, Tiwei Bie wrote:
> > On Fri, May 18, 2018 at 09:17:05PM +0800, Jason Wang wrote:
> > > On 2018年05月18日 19:29, Tiwei Bie wrote:
> > > > On Thu, May 17, 2018 at 08:01:52PM +0800, Jason Wang wrote:
> > > > > On 2018年05月16日 22:33, Tiwei Bie wrote:
> > > > > > On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:
> > > > > > > On 2018年05月16日 21:45, Tiwei Bie wrote:
> > > > > > > > On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
> > > > > > > > > On 2018年05月16日 20:39, Tiwei Bie wrote:
> > > > > > > > > > On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
> > > > > > > > > > > On 2018年05月16日 16:37, Tiwei Bie wrote:
> > > > > > [...]
> > > > > > > > > > > > +static void detach_buf_packed(struct vring_virtqueue 
> > > > > > > > > > > > *vq, unsigned int head,
> > > > > > > > > > > > + unsigned int id, void 
> > > > > > > > > > > > **ctx)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +   struct vring_packed_desc *desc;
> > > > > > > > > > > > +   unsigned int i, j;
> > > > > > > > > > > > +
> > > > > > > > > > > > +   /* Clear data ptr. */
> > > > > > > > > > > > +   vq->desc_state[id].data = NULL;
> > > > > > > > > > > > +
> > > > > > > > > > > > +   i = head;
> > > > > > > > > > > > +
> > > > > > > > > > > > +   for (j = 0; j < vq->desc_state[id].num; j++) {
> > > > > > > > > > > > +   desc = >vring_packed.desc[i];
> > > > > > > > > > > > +   vring_unmap_one_packed(vq, desc);
> > > > > > > > > > > As mentioned in previous discussion, this probably won't 
> > > > > > > > > > > work for the case
> > > > > > > > > > > of out of order completion since it depends on the 
> > > > > > > > > > > information in the
> > > > > > > > > > > descriptor ring. We probably need to extend ctx to record 
> > > > > > > > > > > such information.
> > > > > > > > > > Above code doesn't depend on the information in the 
> > > > > > > > > > descriptor
> > > > > > > > > > ring. The vq->desc_state[] is the extended ctx.
> > > > > > > > > > 
> > > > > > > > > > Best regards,
> > > > > > > > > > Tiwei Bie
> > > > > > > > > Yes, but desc is a pointer to descriptor ring I think so
> > > > > > > > > vring_unmap_one_packed() still depends on the content of 
> > > > > > > > > descriptor ring?
> > > > > > > > > 
> > > > > > > > I got your point now. I think it makes sense to reserve
> > > > > > > > the bits of the addr field. Driver shouldn't try to get
> > > > > > > > addrs from the descriptors when cleanup the descriptors
> > > > > > > > no matter whether we support out-of-order or not.
> > > > > > > Maybe I was wrong, but I remember spec mentioned something like 
> > > > > > > this.
> > > > > > You're right. Spec mentioned this. I was just repeating
> > > > > > the spec to emphasize that it does make sense. :)
> > > > > > 
> > > > > > > > But combining it with the out-of-order support, it will
> > > > > > > > mean that the driver still needs to maintain a desc/ctx
> > > > > > > > list that is very similar to the desc ring in the split
> > > > > > > > ring. I'm not quite sure whether it's something we want.
> > > > > > > > If it is true, I'll do it. So do you think we also want
> > > > > > > > to maintain such a desc/ctx list for packed ring?
> > > > > > > To make it work for OOO backends I think we need something like 
> > > > > > > this
> > > > > > > (hardware NIC drivers are usually have something like this).
> > > > > > Which hardware NIC drivers have this?
> > > > > It's quite common I think, e.g driver track e.g dma addr and page frag
> > > > > somewhere. e.g the ring->rx_info in mlx4 driver.
> > > > It seems that I had a misunderstanding on your
> > > > previous comments. I know it's quite common for
> > > > drivers to track e.g. DMA addrs somewhere (and
> > > > I think one reason behind this is that they want
> > > > to reuse the bits of addr field).
> > > Yes, we may want this for virtio-net as well in the future.
> > > 
> > > >But tracking
> > > > addrs somewhere doesn't means supporting OOO.
> > > > I thought you were saying it's quite common for
> > > > hardware NIC drivers to support OOO (i.e. NICs
> > > > will return the descriptors OOO):
> > > > 
> > > > I'm not familiar with mlx4, maybe I'm wrong.
> > > > I just had a quick glance. And I found below
> > > > comments in mlx4_en_process_rx_cq():
> > > > 
> > > > ```
> > > > /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
> > > >* descriptor offset can be deduced from the CQE index instead of
> > > >* reading 'cqe->index' */
> > > > index = cq->mcq.cons_index & ring->size_mask;
> > > > cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
> > > > ```
> > > > 
> > > > It seems that although they have a completion
> > > > queue, they are still using the ring in order.
> > > I guess so (at least from the above bits). Git grep -i "out of order" in
> > >

[PATCH bpf-next 2/5] bpf: Sync bpf.h to tools/

2018-05-18 Thread Andrey Ignatov

Sync new `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG`
attach types to tools/.

Signed-off-by: Andrey Ignatov 
Acked-by: Alexei Starovoitov 
---
 tools/include/uapi/linux/bpf.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 97446bb..b70ad2c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -158,6 +158,8 @@ enum bpf_attach_type {
BPF_CGROUP_INET6_CONNECT,
BPF_CGROUP_INET4_POST_BIND,
BPF_CGROUP_INET6_POST_BIND,
+   BPF_CGROUP_UDP4_SENDMSG,
+   BPF_CGROUP_UDP6_SENDMSG,
__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2247,6 +2249,12 @@ struct bpf_sock_addr {
__u32 family;   /* Allows 4-byte read, but no write */
__u32 type; /* Allows 4-byte read, but no write */
__u32 protocol; /* Allows 4-byte read, but no write */
+   __u32 msg_src_ip4;  /* Allows 1,2,4-byte read an 4-byte write.
+* Stored in network byte order.
+*/
+   __u32 msg_src_ip6[4];   /* Allows 1,2,4-byte read an 4-byte write.
+* Stored in network byte order.
+*/
 };
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
-- 
2.9.5

[PATCH bpf-next 5/5] selftests/bpf: Selftest for sys_sendmsg hooks

2018-05-18 Thread Andrey Ignatov

Add selftest for BPF_CGROUP_UDP4_SENDMSG and BPF_CGROUP_UDP6_SENDMSG
attach types.

Try to sendmsg(2) to specific IP:port and test that:
* source IP is overridden as expected;
* remote IP:port pair is overridden as expected.

Both UDPv4 and UDPv6 are tested.

Output:
  # test_sock_addr.sh 2>/dev/null
  Wait for testing IPv4/IPv6 to become available ... OK
  ... pre-existing test-cases skipped ...
  Test case: sendmsg4: load prog with wrong expected attach type .. [PASS]
  Test case: sendmsg4: attach prog with wrong attach type .. [PASS]
  Test case: sendmsg4: rewrite IP & port (asm) .. [PASS]
  Test case: sendmsg4: rewrite IP & port (C) .. [PASS]
  Test case: sendmsg4: deny call .. [PASS]
  Test case: sendmsg6: load prog with wrong expected attach type .. [PASS]
  Test case: sendmsg6: attach prog with wrong attach type .. [PASS]
  Test case: sendmsg6: rewrite IP & port (asm) .. [PASS]
  Test case: sendmsg6: rewrite IP & port (C) .. [PASS]
  Test case: sendmsg6: deny call .. [PASS]
  Summary: 26 PASSED, 0 FAILED

Signed-off-by: Andrey Ignatov 
Acked-by: Alexei Starovoitov 
---
 tools/testing/selftests/bpf/Makefile |   3 +-
 tools/testing/selftests/bpf/sendmsg4_prog.c  |  49 +++
 tools/testing/selftests/bpf/sendmsg6_prog.c  |  60 
 tools/testing/selftests/bpf/test_sock_addr.c | 481 +++
 4 files changed, 592 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/sendmsg4_prog.c
 create mode 100644 tools/testing/selftests/bpf/sendmsg6_prog.c

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 1eb0fa2..d87277a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -33,7 +33,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o 
test_tcp_estats.o test
sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o 
test_adjust_tail.o \
test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o 
\
-   test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o
+   test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
+   sendmsg4_prog.o sendmsg6_prog.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/sendmsg4_prog.c 
b/tools/testing/selftests/bpf/sendmsg4_prog.c
new file mode 100644
index 000..a91536b
--- /dev/null
+++ b/tools/testing/selftests/bpf/sendmsg4_prog.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include 
+#include 
+#include 
+
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define SRC1_IP4   0xAC11U /* 172.16.0.1 */
+#define SRC2_IP4   0xU
+#define SRC_REWRITE_IP40x7f04U
+#define DST_IP40xC0A801FEU /* 192.168.1.254 */
+#define DST_REWRITE_IP40x7f01U
+#define DST_PORT   4040
+#define DST_REWRITE_PORT4  
+
+int _version SEC("version") = 1;
+
+SEC("cgroup/sendmsg4")
+int sendmsg_v4_prog(struct bpf_sock_addr *ctx)
+{
+   if (ctx->type != SOCK_DGRAM)
+   return 0;
+
+   /* Rewrite source. */
+   if (ctx->msg_src_ip4 == bpf_htonl(SRC1_IP4) ||
+   ctx->msg_src_ip4 == bpf_htonl(SRC2_IP4)) {
+   ctx->msg_src_ip4 = bpf_htonl(SRC_REWRITE_IP4);
+   } else {
+   /* Unexpected source. Reject sendmsg. */
+   return 0;
+   }
+
+   /* Rewrite destination. */
+   if ((ctx->user_ip4 >> 24) == (bpf_htonl(DST_IP4) >> 24) &&
+ctx->user_port == bpf_htons(DST_PORT)) {
+   ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4);
+   ctx->user_port = bpf_htons(DST_REWRITE_PORT4);
+   } else {
+   /* Unexpected source. Reject sendmsg. */
+   return 0;
+   }
+
+   return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/sendmsg6_prog.c 
b/tools/testing/selftests/bpf/sendmsg6_prog.c
new file mode 100644
index 000..5aeaa28
--- /dev/null
+++ b/tools/testing/selftests/bpf/sendmsg6_prog.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include 
+#include 
+#include 
+
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define SRC_REWRITE_IP6_0  0
+#define SRC_REWRITE_IP6_1  0
+#define SRC_REWRITE_IP6_2  0
+#define SRC_REWRITE_IP6_3  6
+
+#define DST_REWRITE_IP6_0  0
+#define DST_REWRITE_IP6_1  0
+#define DST_REWRITE_IP6_2  0
+#define DST_REWRITE_IP6_3  1
+
+#define DST_REWRITE_PORT6  
+
+int _version SEC("version") = 1;
+
+SEC("cgroup/sendmsg6")
+int sendmsg_v6_prog(struct bpf_sock_addr *ctx)
+{
+   if (ctx->type != SOCK_DGRAM)
+   return 0;
+
+   /* Rewrite source. */
+   if (ctx->msg_src_ip6[3] ==

[PATCH bpf-next 3/5] libbpf: Support guessing sendmsg{4,6} progs

2018-05-18 Thread Andrey Ignatov

libbpf can guess prog type and expected attach type based on section
name. Add hints for "cgroup/sendmsg4" and "cgroup/sendmsg6" section
names.

Signed-off-by: Andrey Ignatov 
Acked-by: Alexei Starovoitov 
---
 tools/lib/bpf/libbpf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 3dbe217..f5238c5 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2042,6 +2042,8 @@ static const struct {
BPF_SA_PROG_SEC("cgroup/bind6", BPF_CGROUP_INET6_BIND),
BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT),
BPF_SA_PROG_SEC("cgroup/connect6", BPF_CGROUP_INET6_CONNECT),
+   BPF_SA_PROG_SEC("cgroup/sendmsg4", BPF_CGROUP_UDP4_SENDMSG),
+   BPF_SA_PROG_SEC("cgroup/sendmsg6", BPF_CGROUP_UDP6_SENDMSG),
BPF_S_PROG_SEC("cgroup/post_bind4", BPF_CGROUP_INET4_POST_BIND),
BPF_S_PROG_SEC("cgroup/post_bind6", BPF_CGROUP_INET6_POST_BIND),
 };
-- 
2.9.5

[PATCH bpf-next 0/5] bpf: Hooks for sys_sendmsg

2018-05-18 Thread Andrey Ignatov

This patch set adds BPF hooks for sys_sendmsg similar to existing hooks for
sys_bind and sys_connect.

Hooks allow to override source IP (including the case when it's set via
cmsg(3)) and destination IP:port for unconnected UDP (slow path). TCP and
connected UDP (fast path) are not affected. This makes UDP support
complete: connected UDP is handled by sys_connect hooks, unconnected by
sys_sendmsg ones.

Similar to sys_connect hooks, sys_sendmsg ones can be used to make system
calls such as sendmsg(2) and sendto(2) return EPERM.

Please see patch 0001 for more details.

Andrey Ignatov (5):
  bpf: Hooks for sys_sendmsg
  bpf: Sync bpf.h to tools/
  libbpf: Support guessing sendmsg{4,6} progs
  selftests/bpf: Prepare test_sock_addr for extension
  selftests/bpf: Selftest for sys_sendmsg hooks

 include/linux/bpf-cgroup.h   |   23 +-
 include/linux/filter.h   |1 +
 include/uapi/linux/bpf.h |8 +
 kernel/bpf/cgroup.c  |   11 +-
 kernel/bpf/syscall.c |8 +
 net/core/filter.c|   39 +
 net/ipv4/udp.c   |   20 +-
 net/ipv6/udp.c   |   17 +
 tools/include/uapi/linux/bpf.h   |8 +
 tools/lib/bpf/libbpf.c   |2 +
 tools/testing/selftests/bpf/Makefile |3 +-
 tools/testing/selftests/bpf/sendmsg4_prog.c  |   49 ++
 tools/testing/selftests/bpf/sendmsg6_prog.c  |   60 ++
 tools/testing/selftests/bpf/test_sock_addr.c | 1118 +-
 14 files changed, 1171 insertions(+), 196 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/sendmsg4_prog.c
 create mode 100644 tools/testing/selftests/bpf/sendmsg6_prog.c

-- 
2.9.5

[PATCH bpf-next 1/5] bpf: Hooks for sys_sendmsg

2018-05-18 Thread Andrey Ignatov

In addition to already existing BPF hooks for sys_bind and sys_connect,
the patch provides new hooks for sys_sendmsg.

It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR`
that provides access to socket itself (properties like family, type,
protocol) and user-passed `struct sockaddr *` so that BPF program can
override destination IP and port for system calls such as sendto(2) or
sendmsg(2) and/or assign source IP to the socket.

The hooks are implemented as two new attach types:
`BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and
UDPv6 correspondingly.

UDPv4 and UDPv6 have separate attach types for the same reason as the
sys_bind and sys_connect hooks, i.e. to prevent reading from / writing to e.g.
user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound.

The difference from the already existing hooks is that the sys_sendmsg
hooks are implemented only for unconnected UDP.

For TCP it doesn't make sense to change user-provided `struct sockaddr *`
at sendto(2)/sendmsg(2) time since socket either was already connected
and has source/destination set or wasn't connected and call to
sendto(2)/sendmsg(2) would lead to ENOTCONN anyway.

Connected UDP is already handled by sys_connect hooks that can override
source/destination at connect time and use fast-path later, i.e. these
hooks don't affect UDP fast-path.

Rewriting source IP is implemented differently than that in sys_connect
hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to
just bind socket to desired local IP address since source IP can be set
on per-packet basis by using ancillary data (cmsg(3)). So no matter if
socket is bound or not, source IP has to be rewritten on every call to
sys_sendmsg.

To do so, two new fields are added to UAPI `struct bpf_sock_addr`:
* `msg_src_ip4` to set source IPv4 for UDPv4;
* `msg_src_ip6` to set source IPv6 for UDPv6.

Signed-off-by: Andrey Ignatov 
Acked-by: Alexei Starovoitov 
---
 include/linux/bpf-cgroup.h | 23 +--
 include/linux/filter.h |  1 +
 include/uapi/linux/bpf.h   |  8 
 kernel/bpf/cgroup.c| 11 ++-
 kernel/bpf/syscall.c   |  8 
 net/core/filter.c  | 39 +++
 net/ipv4/udp.c | 20 ++--
 net/ipv6/udp.c | 17 +
 8 files changed, 118 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 30d15e6..46f01ba 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
  struct sockaddr *uaddr,
- enum bpf_attach_type type);
+ enum bpf_attach_type type,
+ void *t_ctx);
 
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 struct bpf_sock_ops_kern *sock_ops,
@@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 
major, u32 minor,
 ({\
int __ret = 0; \
if (cgroup_bpf_enabled)\
-   __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);\
+   __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \
+ NULL);   \
__ret; \
 })
 
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type)  \
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)   \
 ({\
int __ret = 0; \
if (cgroup_bpf_enabled) {  \
lock_sock(sk); \
-   __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);\
+   __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \
+ t_ctx);  \
release_sock(sk);  \
}  \
__ret; \
@@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 
major, u32 minor,
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
 
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \
-

[PATCH bpf-next 4/5] selftests/bpf: Prepare test_sock_addr for extension

2018-05-18 Thread Andrey Ignatov

test_sock_addr was not easy to extend since it was focused on sys_bind
and sys_connect quite a bit.

Reorganized it so that it'll be easier to cover new test-cases for
`BPF_PROG_TYPE_CGROUP_SOCK_ADDR`:

- decouple test-cases so that only one BPF prog is tested at a time;

- check programmatically that local IP:port for sys_bind, source IP and
  destination IP:port for sys_connect are rewritten properly by tested
  BPF programs.

The output of new version:
  # test_sock_addr.sh 2>/dev/null
  Wait for testing IPv4/IPv6 to become available ... OK
  Test case: bind4: load prog with wrong expected attach type .. [PASS]
  Test case: bind4: attach prog with wrong attach type .. [PASS]
  Test case: bind4: rewrite IP & TCP port in .. [PASS]
  Test case: bind4: rewrite IP & UDP port in .. [PASS]
  Test case: bind6: load prog with wrong expected attach type .. [PASS]
  Test case: bind6: attach prog with wrong attach type .. [PASS]
  Test case: bind6: rewrite IP & TCP port in .. [PASS]
  Test case: bind6: rewrite IP & UDP port in .. [PASS]
  Test case: connect4: load prog with wrong expected attach type .. [PASS]
  Test case: connect4: attach prog with wrong attach type .. [PASS]
  Test case: connect4: rewrite IP & TCP port .. [PASS]
  Test case: connect4: rewrite IP & UDP port .. [PASS]
  Test case: connect6: load prog with wrong expected attach type .. [PASS]
  Test case: connect6: attach prog with wrong attach type .. [PASS]
  Test case: connect6: rewrite IP & TCP port .. [PASS]
  Test case: connect6: rewrite IP & UDP port .. [PASS]
  Summary: 16 PASSED, 0 FAILED

(stderr contains errors from libbpf when testing load/attach with
invalid arguments)

Signed-off-by: Andrey Ignatov 
Acked-by: Alexei Starovoitov 
---
 tools/testing/selftests/bpf/test_sock_addr.c | 655 +++
 1 file changed, 460 insertions(+), 195 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sock_addr.c 
b/tools/testing/selftests/bpf/test_sock_addr.c
index 2950f80..ed3e397 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -17,34 +17,292 @@
 #include "cgroup_helpers.h"
 #include "bpf_rlimit.h"
 
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
 #define CG_PATH"/foo"
 #define CONNECT4_PROG_PATH "./connect4_prog.o"
 #define CONNECT6_PROG_PATH "./connect6_prog.o"
 
 #define SERV4_IP   "192.168.1.254"
 #define SERV4_REWRITE_IP   "127.0.0.1"
+#define SRC4_REWRITE_IP"127.0.0.4"
 #define SERV4_PORT 4040
 #define SERV4_REWRITE_PORT 
 
 #define SERV6_IP   "face:b00c:1234:5678::abcd"
 #define SERV6_REWRITE_IP   "::1"
+#define SRC6_REWRITE_IP"::6"
 #define SERV6_PORT 6060
 #define SERV6_REWRITE_PORT 
 
 #define INET_NTOP_BUF  40
 
-typedef int (*load_fn)(enum bpf_attach_type, const char *comment);
+struct sock_addr_test;
+
+typedef int (*load_fn)(const struct sock_addr_test *test);
 typedef int (*info_fn)(int, struct sockaddr *, socklen_t *);
 
-struct program {
-   enum bpf_attach_type type;
-   load_fn loadfn;
-   int fd;
-   const char *name;
-   enum bpf_attach_type invalid_type;
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+struct sock_addr_test {
+   const char *descr;
+   /* BPF prog properties */
+   load_fn loadfn;
+   enum bpf_attach_type expected_attach_type;
+   enum bpf_attach_type attach_type;
+   /* Socket properties */
+   int domain;
+   int type;
+   /* IP:port pairs for BPF prog to override */
+   const char *requested_ip;
+   unsigned short requested_port;
+   const char *expected_ip;
+   unsigned short expected_port;
+   const char *expected_src_ip;
+   /* Expected test result */
+   enum {
+   LOAD_REJECT,
+   ATTACH_REJECT,
+   SUCCESS,
+   } expected_result;
 };
 
-char bpf_log_buf[BPF_LOG_BUF_SIZE];
+static int bind4_prog_load(const struct sock_addr_test *test);
+static int bind6_prog_load(const struct sock_addr_test *test);
+static int connect4_prog_load(const struct sock_addr_test *test);
+static int connect6_prog_load(const struct sock_addr_test *test);
+
+static struct sock_addr_test tests[] = {
+   /* bind */
+   {
+   "bind4: load prog with wrong expected attach type",
+   bind4_prog_load,
+   BPF_CGROUP_INET6_BIND,
+   BPF_CGROUP_INET4_BIND,
+   AF_INET,
+   SOCK_STREAM,
+   NULL,
+   0,
+   NULL,
+   0,
+   NULL,
+   LOAD_REJECT,
+   },
+   {
+   "bind4: attach prog with wrong attach type",
+   bind4_prog_load,
+   BPF_CGROUP_INET4_BIND,
+   BPF_CGROUP_INET6_BIND,
+   AF_INET,
+   SOCK_STREAM,
+

Re: [RFC v4 3/5] virtio_ring: add packed ring support

2018-05-18 Thread Jason Wang




On 2018年05月18日 22:33, Tiwei Bie wrote:

On Fri, May 18, 2018 at 09:17:05PM +0800, Jason Wang wrote:

On 2018年05月18日 19:29, Tiwei Bie wrote:

On Thu, May 17, 2018 at 08:01:52PM +0800, Jason Wang wrote:

On 2018年05月16日 22:33, Tiwei Bie wrote:

On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:

On 2018年05月16日 21:45, Tiwei Bie wrote:

On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:

On 2018年05月16日 20:39, Tiwei Bie wrote:

On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:

On 2018年05月16日 16:37, Tiwei Bie wrote:

[...]

+static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
+ unsigned int id, void **ctx)
+{
+   struct vring_packed_desc *desc;
+   unsigned int i, j;
+
+   /* Clear data ptr. */
+   vq->desc_state[id].data = NULL;
+
+   i = head;
+
+   for (j = 0; j < vq->desc_state[id].num; j++) {
+   desc = &vq->vring_packed.desc[i];
+   vring_unmap_one_packed(vq, desc);

As mentioned in previous discussion, this probably won't work for the case
of out of order completion since it depends on the information in the
descriptor ring. We probably need to extend ctx to record such information.

Above code doesn't depend on the information in the descriptor
ring. The vq->desc_state[] is the extended ctx.

Best regards,
Tiwei Bie

Yes, but desc is a pointer to descriptor ring I think so
vring_unmap_one_packed() still depends on the content of descriptor ring?


I got your point now. I think it makes sense to reserve
the bits of the addr field. Driver shouldn't try to get
addrs from the descriptors when cleanup the descriptors
no matter whether we support out-of-order or not.

Maybe I was wrong, but I remember spec mentioned something like this.

You're right. Spec mentioned this. I was just repeating
the spec to emphasize that it does make sense. :)


But combining it with the out-of-order support, it will
mean that the driver still needs to maintain a desc/ctx
list that is very similar to the desc ring in the split
ring. I'm not quite sure whether it's something we want.
If it is true, I'll do it. So do you think we also want
to maintain such a desc/ctx list for packed ring?

To make it work for OOO backends I think we need something like this
(hardware NIC drivers usually have something like this).

Which hardware NIC drivers have this?

It's quite common I think, e.g driver track e.g dma addr and page frag
somewhere. e.g the ring->rx_info in mlx4 driver.

It seems that I had a misunderstanding on your
previous comments. I know it's quite common for
drivers to track e.g. DMA addrs somewhere (and
I think one reason behind this is that they want
to reuse the bits of addr field).

Yes, we may want this for virtio-net as well in the future.


   But tracking
addrs somewhere doesn't mean supporting OOO.
I thought you were saying it's quite common for
hardware NIC drivers to support OOO (i.e. NICs
will return the descriptors OOO):

I'm not familiar with mlx4, maybe I'm wrong.
I just had a quick glance. And I found below
comments in mlx4_en_process_rx_cq():

```
/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
   * descriptor offset can be deduced from the CQE index instead of
   * reading 'cqe->index' */
index = cq->mcq.cons_index & ring->size_mask;
cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
```

It seems that although they have a completion
queue, they are still using the ring in order.

I guess so (at least from the above bits). Git grep -i "out of order" in
drivers/net gives some hints. Looks like there are few devices that do this.


I guess maybe storage device may want OOO.

Right, some iSCSI did.

But tracking them elsewhere is not only for OOO.

Spec said:

for element address

"
In a used descriptor, Element Address is unused.
"

for Next flag:

"
For example, if descriptors are used in the same order in which they are
made available, this will result in
the used descriptor overwriting the first available descriptor in the list,
the used descriptor for the next list
overwriting the first available descriptor in the next list, etc.
"

for in order completion:

"
This will result in the used descriptor overwriting the first available
descriptor in the batch, the used descriptor
for the next batch overwriting the first available descriptor in the next
batch, etc.
"

So:

- It's an alignment to the spec
- device may (or should) overwrite the descriptor, which also makes the address
field useless.

You didn't get my point...


I don't hope so.


I agreed driver should track the DMA addrs or some
other necessary things from the very beginning. And
I also repeated the spec to emphasize that it does
make sense. And I'd like to do that.

What I was saying is that, to support OOO, we may
need to manage these context (which saves DMA addrs
etc) via a list which is similar to the desc list
maintained via `next` in split ring instead of an
array whose elements always

Re: [PATCH net] tuntap: raise EPOLLOUT on device up

2018-05-18 Thread Jason Wang




On 2018年05月18日 22:46, Michael S. Tsirkin wrote:

On Fri, May 18, 2018 at 10:11:54PM +0800, Jason Wang wrote:


On 2018年05月18日 22:06, Michael S. Tsirkin wrote:

On Fri, May 18, 2018 at 10:00:31PM +0800, Jason Wang wrote:

On 2018年05月18日 21:26, Jason Wang wrote:

On 2018年05月18日 21:13, Michael S. Tsirkin wrote:

On Fri, May 18, 2018 at 09:00:43PM +0800, Jason Wang wrote:

We return -EIO on device down but can not raise EPOLLOUT after it was
up. This may confuse user like vhost which expects tuntap to raise
EPOLLOUT to re-enable its TX routine after tuntap is down. This could
be easily reproduced by transmitting packets from VM while down and up
the tap device. Fix this by setting SOCKWQ_ASYNC_NOSPACE on -EIO.

Cc: Hannes Frederic Sowa 
Cc: Eric Dumazet 
Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
Signed-off-by: Jason Wang 
---
    drivers/net/tun.c | 4 +++-
    1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d45ac37..1b29761 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1734,8 +1734,10 @@ static ssize_t tun_get_user(struct
tun_struct *tun, struct tun_file *tfile,
    int skb_xdp = 1;
    bool frags = tun_napi_frags_enabled(tun);
    -    if (!(tun->dev->flags & IFF_UP))
+    if (!(tun->dev->flags & IFF_UP)) {

Isn't this racy?  What if flag is cleared at this point?

I think you mean "set at this point"? Then yes, so we probably need to
set the bit during tun_net_close().

Thanks

Looks like there's no need; vhost will poll the socket after it sees EIO. So we are ok here?

Thanks

In fact I don't even understand why does this help any longer.


We disable tx polling and only enable it on demand for a better rx
performance. You may want to have a look at :

commit feb8892cb441c742d4220cf7ced001e7fa070731
Author: Jason Wang 
Date:   Mon Nov 13 11:45:34 2017 +0800

     vhost_net: conditionally enable tx polling

Thanks


Question is, what looks at SOCKWQ_ASYNC_NOSPACE.
I think it's tested when packet is transmitted,
but there is no guarantee here any packet will
ever be transmitted.



Well, actually, I do plan to disable vq polling from the beginning. But 
looks like you do not want this:


See https://patchwork.kernel.org/patch/10034025/

Thanks

Re: [PATCH 1/2] bpf: sockmap, double free in __sock_map_ctx_update_elem()

2018-05-18 Thread Gustavo A. R. Silva



Hi Dan,

On 05/18/2018 09:39 AM, Dan Carpenter wrote:

On Fri, May 18, 2018 at 10:27:18AM +0200, Daniel Borkmann wrote:


Thanks for the two fixes, appreciate it! There were two similar ones that
fix the same issues which were already applied yesterday to bpf-next:

https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=0e4364560361d57e8cd873a8990327f3471d7d8a
https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=a78622932c27e8ec33e5ba180f3d2e87fb806b28


Hey Gustavo,

We're sort of duplicating each other's work.  Could you CC
kernel-janit...@vger.kernel.org for static checker fixes so that I can
see what you're working on?



Sure thing.

I've been doing this work for more than a year now and just recently we 
are having these issues. I'm a bit curious about it.



We'll probably still send the occasional duplicate which is fine...



Yep. Not a big deal for me.

Have a good one.
--
Gustavo

[PATCH bpf-next 3/7] bpf: btf: Check array->index_type

2018-05-18 Thread Martin KaFai Lau

Instead of ignoring the array->index_type field, enforce that
it must be an unsigned BTF_KIND_INT.

Signed-off-by: Martin KaFai Lau 
---
 kernel/bpf/btf.c | 83 
 1 file changed, 59 insertions(+), 24 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 536e5981ad8c..b4e48dae2240 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -444,6 +444,28 @@ static const struct btf_type *btf_type_by_id(const struct 
btf *btf, u32 type_id)
return btf->types[type_id];
 }
 
+/*
+ * Regular int is not a bit field and it must be either
+ * u8/u16/u32/u64.
+ */
+static bool btf_type_int_is_regular(const struct btf_type *t)
+{
+   u16 nr_bits, nr_bytes;
+   u32 int_data;
+
+   int_data = btf_type_int(t);
+   nr_bits = BTF_INT_BITS(int_data);
+   nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+   if (BITS_PER_BYTE_MASKED(nr_bits) ||
+   BTF_INT_OFFSET(int_data) ||
+   (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+   return false;
+   }
+
+   return true;
+}
+
 __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
  const char *fmt, ...)
 {
@@ -1309,14 +1331,16 @@ static s32 btf_array_check_meta(struct btf_verifier_env 
*env,
return -EINVAL;
}
 
-   /* We are a little forgiving on array->index_type since
-* the kernel is not using it.
-*/
-   /* Array elem cannot be in type void,
-* so !array->type is not allowed.
+   /* Array elem type and index type cannot be in type void,
+* so !array->type and !array->index_type are not allowed.
 */
if (!array->type || BTF_TYPE_PARENT(array->type)) {
-   btf_verifier_log_type(env, t, "Invalid type_id");
+   btf_verifier_log_type(env, t, "Invalid elem");
+   return -EINVAL;
+   }
+
+   if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) {
+   btf_verifier_log_type(env, t, "Invalid index");
return -EINVAL;
}
 
@@ -1329,11 +1353,35 @@ static int btf_array_resolve(struct btf_verifier_env 
*env,
 const struct resolve_vertex *v)
 {
const struct btf_array *array = btf_type_array(v->t);
-   const struct btf_type *elem_type;
-   u32 elem_type_id = array->type;
+   const struct btf_type *elem_type, *index_type;
+   u32 elem_type_id, index_type_id;
struct btf *btf = env->btf;
u32 elem_size;
 
+   /* Check array->index_type */
+   index_type_id = array->index_type;
+   index_type = btf_type_by_id(btf, index_type_id);
+   if (btf_type_is_void_or_null(index_type)) {
+   btf_verifier_log_type(env, v->t, "Invalid index");
+   return -EINVAL;
+   }
+
+   if (!env_type_is_resolve_sink(env, index_type) &&
+   !env_type_is_resolved(env, index_type_id))
+   return env_stack_push(env, index_type, index_type_id);
+
+   index_type = btf_type_id_size(btf, &index_type_id, NULL);
+   if (!index_type || !btf_type_is_int(index_type) ||
+   /* bit field int is not allowed */
+   !btf_type_int_is_regular(index_type) ||
+   /* unsigned only */
+   BTF_INT_ENCODING(btf_type_int(index_type))) {
+   btf_verifier_log_type(env, v->t, "Invalid index");
+   return -EINVAL;
+   }
+
+   /* Check array->type */
+   elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
if (btf_type_is_void_or_null(elem_type)) {
btf_verifier_log_type(env, v->t,
@@ -1351,22 +1399,9 @@ static int btf_array_resolve(struct btf_verifier_env 
*env,
return -EINVAL;
}
 
-   if (btf_type_is_int(elem_type)) {
-   int int_type_data = btf_type_int(elem_type);
-   u16 nr_bits = BTF_INT_BITS(int_type_data);
-   u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
-
-   /* Put more restriction on array of int.  The int cannot
-* be a bit field and it must be either u8/u16/u32/u64.
-*/
-   if (BITS_PER_BYTE_MASKED(nr_bits) ||
-   BTF_INT_OFFSET(int_type_data) ||
-   (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
-nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
-   btf_verifier_log_type(env, v->t,
- "Invalid array of int");
-   return -EINVAL;
-   }
+   if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) {
+   btf_verifier_log_type(env, v->t, "Invalid array of int");
+   return -EINVAL;
}
 
if (array->nelems && elem_size

[PATCH bpf-next 7/7] bpf: btf: Add tests for the btf uapi changes

2018-05-18 Thread Martin KaFai Lau

This patch does the following:
1. Modify libbpf and test_btf to reflect the uapi changes in btf
2. Add test for the btf_header changes
3. Add tests for array->index_type
4. Add err_str check to the tests
5. Fix a 4 bytes hole in "struct test #1" by swapping "m" and "n"

Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/bpf.c|   4 +-
 tools/lib/bpf/bpf.h|   4 +-
 tools/lib/bpf/btf.c|   5 +-
 tools/lib/bpf/libbpf.c |  34 +--
 tools/lib/bpf/libbpf.h |   4 +-
 tools/testing/selftests/bpf/test_btf.c | 528 ++---
 6 files changed, 448 insertions(+), 131 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 6a8a00097fd8..442b4cdfeb71 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -89,8 +89,8 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr 
*create_attr)
   min(name_len, BPF_OBJ_NAME_LEN - 1));
attr.numa_node = create_attr->numa_node;
attr.btf_fd = create_attr->btf_fd;
-   attr.btf_key_id = create_attr->btf_key_id;
-   attr.btf_value_id = create_attr->btf_value_id;
+   attr.btf_key_type_id = create_attr->btf_key_type_id;
+   attr.btf_value_type_id = create_attr->btf_value_type_id;
attr.map_ifindex = create_attr->map_ifindex;
 
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 15bff7728cf1..d12344f66d4e 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -36,8 +36,8 @@ struct bpf_create_map_attr {
__u32 max_entries;
__u32 numa_node;
__u32 btf_fd;
-   __u32 btf_key_id;
-   __u32 btf_value_id;
+   __u32 btf_key_type_id;
+   __u32 btf_value_type_id;
__u32 map_ifindex;
 };
 
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 2bac710e3194..8c54a4b6f187 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -35,9 +35,8 @@ struct btf {
 
 static const char *btf_name_by_offset(const struct btf *btf, uint32_t offset)
 {
-   if (!BTF_STR_TBL_ELF_ID(offset) &&
-   BTF_STR_OFFSET(offset) < btf->hdr->str_len)
-   return &btf->strings[BTF_STR_OFFSET(offset)];
+   if (offset < btf->hdr->str_len)
+   return &btf->strings[offset];
else
return NULL;
 }
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 3dbe217bf23e..8f1707dbfcfa 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -216,8 +216,8 @@ struct bpf_map {
size_t offset;
int map_ifindex;
struct bpf_map_def def;
-   uint32_t btf_key_id;
-   uint32_t btf_value_id;
+   uint32_t btf_key_type_id;
+   uint32_t btf_value_type_id;
void *priv;
bpf_map_clear_priv_t clear_priv;
 };
@@ -1074,8 +1074,8 @@ static int bpf_map_find_btf_info(struct bpf_map *map, 
const struct btf *btf)
return -EINVAL;
}
 
-   map->btf_key_id = key_id;
-   map->btf_value_id = value_id;
+   map->btf_key_type_id = key_id;
+   map->btf_value_type_id = value_id;
 
return 0;
 }
@@ -1100,24 +1100,24 @@ bpf_object__create_maps(struct bpf_object *obj)
create_attr.value_size = def->value_size;
create_attr.max_entries = def->max_entries;
create_attr.btf_fd = 0;
-   create_attr.btf_key_id = 0;
-   create_attr.btf_value_id = 0;
+   create_attr.btf_key_type_id = 0;
+   create_attr.btf_value_type_id = 0;
 
if (obj->btf && !bpf_map_find_btf_info(map, obj->btf)) {
create_attr.btf_fd = btf__fd(obj->btf);
-   create_attr.btf_key_id = map->btf_key_id;
-   create_attr.btf_value_id = map->btf_value_id;
+   create_attr.btf_key_type_id = map->btf_key_type_id;
+   create_attr.btf_value_type_id = map->btf_value_type_id;
}
 
*pfd = bpf_create_map_xattr(&create_attr);
-   if (*pfd < 0 && create_attr.btf_key_id) {
+   if (*pfd < 0 && create_attr.btf_key_type_id) {
pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). 
Retrying without BTF.\n",
   map->name, strerror(errno), errno);
create_attr.btf_fd = 0;
-   create_attr.btf_key_id = 0;
-   create_attr.btf_value_id = 0;
-   map->btf_key_id = 0;
-   map->btf_value_id = 0;
+   create_attr.btf_key_type_id = 0;
+   create_attr.btf_value_type_id = 0;
+   map->btf_key_type_id = 0;
+   map->btf_value_type_id = 0;
*pfd = bpf_create_map_xattr(&create_attr);
}
 
@@ -2085,14 +2085,14 @@ const char

[PATCH bpf-next 1/7] bpf: Expose check_uarg_tail_zero()

2018-05-18 Thread Martin KaFai Lau

This patch exposes check_uarg_tail_zero() which will
be reused by a later BTF patch.  Its name is changed to
bpf_check_uarg_tail_zero().

Signed-off-by: Martin KaFai Lau 
---
 include/linux/bpf.h  |  2 ++
 kernel/bpf/syscall.c | 14 +++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ed0122b45b63..f6fe3c719ca8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -463,6 +463,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct 
file *map_file,
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
 
 int bpf_get_file_flag(int flags);
+int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size,
+size_t actual_size);
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bfcde949c7f8..2b29ef84ded3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -65,9 +65,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
  * copy_from_user() call. However, this is not a concern since this function is
  * meant to be a future-proofing of bits.
  */
-static int check_uarg_tail_zero(void __user *uaddr,
-   size_t expected_size,
-   size_t actual_size)
+int bpf_check_uarg_tail_zero(void __user *uaddr,
+size_t expected_size,
+size_t actual_size)
 {
unsigned char __user *addr;
unsigned char __user *end;
@@ -1899,7 +1899,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
u32 ulen;
int err;
 
-   err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+   err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -1998,7 +1998,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
u32 info_len = attr->info.info_len;
int err;
 
-   err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+   err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -2038,7 +2038,7 @@ static int bpf_btf_get_info_by_fd(struct btf *btf,
u32 info_len = attr->info.info_len;
int err;
 
-   err = check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+   err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
if (err)
return err;
 
@@ -2110,7 +2110,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, 
uattr, unsigned int, siz
if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
return -EPERM;
 
-   err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+   err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
size = min_t(u32, size, sizeof(attr));
-- 
2.9.5

[PATCH bpf-next 5/7] bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info

2018-05-18 Thread Martin KaFai Lau

In "struct bpf_map_info", the name "btf_id", "btf_key_id" and "btf_value_id"
could cause confusion because the "id" of "btf_id" means the BPF obj id
given to the BTF object while
"btf_key_id" and "btf_value_id" means the BTF type id within
that BTF object.

To make it clear, btf_key_id and btf_value_id are
renamed to btf_key_type_id and btf_value_type_id.

Suggested-by: Daniel Borkmann 
Signed-off-by: Martin KaFai Lau 
---
 include/linux/bpf.h  |  4 ++--
 include/uapi/linux/bpf.h |  8 
 kernel/bpf/arraymap.c|  2 +-
 kernel/bpf/syscall.c | 18 +-
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f6fe3c719ca8..1795846c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -69,8 +69,8 @@ struct bpf_map {
u32 pages;
u32 id;
int numa_node;
-   u32 btf_key_id;
-   u32 btf_value_id;
+   u32 btf_key_type_id;
+   u32 btf_value_type_id;
struct btf *btf;
bool unpriv_array;
/* 55 bytes hole */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d94d333a8225..123ebe4b3662 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -284,8 +284,8 @@ union bpf_attr {
charmap_name[BPF_OBJ_NAME_LEN];
__u32   map_ifindex;/* ifindex of netdev to create on */
__u32   btf_fd; /* fd pointing to a BTF type data */
-   __u32   btf_key_id; /* BTF type_id of the key */
-   __u32   btf_value_id;   /* BTF type_id of the value */
+   __u32   btf_key_type_id;/* BTF type_id of the key */
+   __u32   btf_value_type_id;  /* BTF type_id of the value */
};
 
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -2211,8 +2211,8 @@ struct bpf_map_info {
__u64 netns_dev;
__u64 netns_ino;
__u32 btf_id;
-   __u32 btf_key_id;
-   __u32 btf_value_id;
+   __u32 btf_key_type_id;
+   __u32 btf_value_type_id;
 } __attribute__((aligned(8)));
 
 struct bpf_btf_info {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0fd8d8f1a398..544e58f5f642 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -352,7 +352,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, 
void *key,
}
 
seq_printf(m, "%u: ", *(u32 *)key);
-   btf_type_seq_show(map->btf, map->btf_value_id, value, m);
+   btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
seq_puts(m, "\n");
 
rcu_read_unlock();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2b29ef84ded3..0b4c94551001 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -422,7 +422,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD btf_value_id
+#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -457,10 +457,10 @@ static int map_create(union bpf_attr *attr)
atomic_set(&map->usercnt, 1);
 
if (bpf_map_support_seq_show(map) &&
-   (attr->btf_key_id || attr->btf_value_id)) {
+   (attr->btf_key_type_id || attr->btf_value_type_id)) {
struct btf *btf;
 
-   if (!attr->btf_key_id || !attr->btf_value_id) {
+   if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
err = -EINVAL;
goto free_map_nouncharge;
}
@@ -471,16 +471,16 @@ static int map_create(union bpf_attr *attr)
goto free_map_nouncharge;
}
 
-   err = map->ops->map_check_btf(map, btf, attr->btf_key_id,
- attr->btf_value_id);
+   err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id,
+ attr->btf_value_type_id);
if (err) {
btf_put(btf);
goto free_map_nouncharge;
}
 
map->btf = btf;
-   map->btf_key_id = attr->btf_key_id;
-   map->btf_value_id = attr->btf_value_id;
+   map->btf_key_type_id = attr->btf_key_type_id;
+   map->btf_value_type_id = attr->btf_value_type_id;
}
 
err = security_bpf_map_alloc(map);
@@ -2013,8 +2013,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 
if (map->btf) {
info.btf_id = btf_id(map->btf);
-   info.btf_key_id = map->btf_key_id;
-   info.btf_value_id = map->btf_value_id;
+   info.btf_key_type_id = map->btf_key_type_id;
+   info.btf_value_type_id = map->btf_value_type_id;
}
 
if (bpf_map_is_dev_bound(map)) {
-- 
2.9.5

[PATCH bpf-next 4/7] bpf: btf: Remove unused bits from uapi/linux/btf.h

2018-05-18 Thread Martin KaFai Lau

This patch does the following:
1. Limit BTF_MAX_TYPES and BTF_MAX_NAME_OFFSET to 64k.  We can
   raise it later.

2. Remove the BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID.  They are
   currently encoded at the highest bit of a u32.
   It is because the current use case does not require supporting
   parent type (i.e type_id referring to a type in another BTF file).
   It also does not support referring to a string in ELF.

   The BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID checks are replaced
   by BTF_TYPE_ID_CHECK and BTF_STR_OFFSET_CHECK which are
   defined in btf.c instead of uapi/linux/btf.h.

3. Limit the BTF_INFO_KIND from 5 bits to 4 bits which is enough.
   There is unused bits headroom if we ever needed it later.

4. The root bit in BTF_INFO is also removed because it is not
   used in the current use case.

The above can be added back later because the verifier
ensures the unused bits are zeros.

Signed-off-by: Martin KaFai Lau 
---
 include/uapi/linux/btf.h | 20 +---
 kernel/bpf/btf.c | 34 +-
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 4fa479741a02..b89b56f2b099 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -22,28 +22,19 @@ struct btf_header {
 };
 
 /* Max # of type identifier */
-#define BTF_MAX_TYPE   0x7fff
+#define BTF_MAX_TYPE   0x
 /* Max offset into the string section */
-#define BTF_MAX_NAME_OFFSET0x7fff
+#define BTF_MAX_NAME_OFFSET0x
 /* Max # of struct/union/enum members or func args */
 #define BTF_MAX_VLEN   0x
 
-/* The type id is referring to a parent BTF */
-#define BTF_TYPE_PARENT(id)(((id) >> 31) & 0x1)
-#define BTF_TYPE_ID(id)((id) & BTF_MAX_TYPE)
-
-/* String is in the ELF string section */
-#define BTF_STR_TBL_ELF_ID(ref)(((ref) >> 31) & 0x1)
-#define BTF_STR_OFFSET(ref)((ref) & BTF_MAX_NAME_OFFSET)
-
 struct btf_type {
__u32 name_off;
/* "info" bits arrangement
 * bits  0-15: vlen (e.g. # of struct's members)
 * bits 16-23: unused
-* bits 24-28: kind (e.g. int, ptr, array...etc)
-* bits 29-30: unused
-* bits31: root
+* bits 24-27: kind (e.g. int, ptr, array...etc)
+* bits 28-31: unused
 */
__u32 info;
/* "size" is used by INT, ENUM, STRUCT and UNION.
@@ -58,8 +49,7 @@ struct btf_type {
};
 };
 
-#define BTF_INFO_KIND(info)(((info) >> 24) & 0x1f)
-#define BTF_INFO_ISROOT(info)  (!!(((info) >> 24) & 0x80))
+#define BTF_INFO_KIND(info)(((info) >> 24) & 0x0f)
 #define BTF_INFO_VLEN(info)((info) & 0xffff)
 
 #define BTF_KIND_UNKN  0   /* Unknown  */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b4e48dae2240..5d1967d4fb62 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -163,13 +163,15 @@
 #define BITS_ROUNDUP_BYTES(bits) \
(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
 
+#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_TYPE_ID_CHECK(type_id) ((type_id) <= BTF_MAX_TYPE)
+#define BTF_STR_OFFSET_CHECK(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
+
 /* 16MB for 64k structs and each has 16 members and
  * a few MB spaces for the string section.
  * The hard limit is S32_MAX.
  */
 #define BTF_MAX_SIZE (16 * 1024 * 1024)
-/* 64k. We can raise it later. The hard limit is S32_MAX. */
-#define BTF_MAX_NR_TYPES 65535
 
 #define for_each_member(i, struct_type, member)\
for (i = 0, member = btf_type_member(struct_type);  \
@@ -422,16 +424,16 @@ static const struct btf_kind_operations 
*btf_type_ops(const struct btf_type *t)
 
 static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
-   return !BTF_STR_TBL_ELF_ID(offset) &&
-   BTF_STR_OFFSET(offset) < btf->hdr.str_len;
+   return BTF_STR_OFFSET_CHECK(offset) &&
+   offset < btf->hdr.str_len;
 }
 
 static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
-   if (!BTF_STR_OFFSET(offset))
+   if (!offset)
return "(anon)";
-   else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len)
-   return &btf->strings[BTF_STR_OFFSET(offset)];
+   else if (offset < btf->hdr.str_len)
+   return &btf->strings[offset];
else
return "(invalid-name-offset)";
 }
@@ -599,13 +601,13 @@ static int btf_add_type(struct btf_verifier_env *env, 
struct btf_type *t)
struct btf_type **new_types;
u32 expand_by, new_size;
 
-   if (btf->types_size == BTF_MAX_NR_TYPES) {
+   if (btf->types_size == BTF_MAX_TYPE) {
btf_verifier_log(env, "Exceeded max num of types");
return -E2BIG;
}
 
expand_by = max_t(u32, btf->types_size >> 2, 16);
-   new_size = min_t(u32,

[PATCH bpf-next 0/7] BTF uapi cleanup

2018-05-18 Thread Martin KaFai Lau

This patch set makes some changes to cleanup the unused
bits in BTF uapi.  It also makes the btf_header extensible.

Please see individual patches for details.

Martin KaFai Lau (7):
  bpf: Expose check_uarg_tail_zero()
  bpf: btf: Change how section is supported in btf_header
  bpf: btf: Check array->index_type
  bpf: btf: Remove unused bits from uapi/linux/btf.h
  bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info
  bpf: btf: Sync bpf.h and btf.h to tools/include/uapi/linux/
  bpf: btf: Add tests for the btf uapi changes

 include/linux/bpf.h|   6 +-
 include/uapi/linux/bpf.h   |   8 +-
 include/uapi/linux/btf.h   |  28 +-
 kernel/bpf/arraymap.c  |   2 +-
 kernel/bpf/btf.c   | 318 ++--
 kernel/bpf/syscall.c   |  32 +-
 tools/include/uapi/linux/bpf.h |   8 +-
 tools/include/uapi/linux/btf.h |  28 +-
 tools/lib/bpf/bpf.c|   4 +-
 tools/lib/bpf/bpf.h|   4 +-
 tools/lib/bpf/btf.c|   5 +-
 tools/lib/bpf/libbpf.c |  34 +--
 tools/lib/bpf/libbpf.h |   4 +-
 tools/testing/selftests/bpf/test_btf.c | 528 ++---
 14 files changed, 724 insertions(+), 285 deletions(-)

-- 
2.9.5

[PATCH bpf-next 2/7] bpf: btf: Change how section is supported in btf_header

2018-05-18 Thread Martin KaFai Lau

There are currently unused section descriptions in the btf_header.  Those
sections are here to support future BTF use cases.  For example, the
func section (func_off) is to support function signature (e.g. the BPF
prog function signature).

Instead of spelling out all potential sections up-front in the btf_header,
this patch makes changes to btf_header such that extending it (e.g. adding
a section) is possible later.  The unused ones can be removed for now and
they can be added back later.

This patch:
1. adds a hdr_len to the btf_header.  It will allow adding
sections (and other info like parent_label and parent_name)
later.  The check is similar to the existing bpf_attr.
If a user passes in a longer hdr_len, the kernel
ensures the extra trailing bytes are 0.

2. allows the section order in the BTF object to be
different from its sec_off order in btf_header.

3. each sec_off is followed by a sec_len.  There must be no gaps or
overlaps among sections.

The string section is ensured to be at the end due to the 4 bytes
alignment requirement of the type section.

The above changes will allow enough flexibility to
add new sections (and other info) to the btf_header later.

This patch also removes an unnecessary !err check
at the end of btf_parse().

Signed-off-by: Martin KaFai Lau 
---
 include/uapi/linux/btf.h |   8 +-
 kernel/bpf/btf.c | 207 +++
 2 files changed, 158 insertions(+), 57 deletions(-)

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index bcb56ee47014..4fa479741a02 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -12,15 +12,11 @@ struct btf_header {
__u16   magic;
__u8version;
__u8flags;
-
-   __u32   parent_label;
-   __u32   parent_name;
+   __u32   hdr_len;
 
/* All offsets are in bytes relative to the end of this header */
-   __u32   label_off;  /* offset of label section  */
-   __u32   object_off; /* offset of data object section*/
-   __u32   func_off;   /* offset of function section   */
__u32   type_off;   /* offset of type section   */
+   __u32   type_len;   /* length of type section   */
__u32   str_off;/* offset of string section */
__u32   str_len;/* length of string section */
 };
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ded10ab47b8a..536e5981ad8c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -184,15 +185,13 @@ static DEFINE_IDR(btf_idr);
 static DEFINE_SPINLOCK(btf_idr_lock);
 
 struct btf {
-   union {
-   struct btf_header *hdr;
-   void *data;
-   };
+   void *data;
struct btf_type **types;
u32 *resolved_ids;
u32 *resolved_sizes;
const char *strings;
void *nohdr_data;
+   struct btf_header hdr;
u32 nr_types;
u32 types_size;
u32 data_size;
@@ -227,6 +226,12 @@ enum resolve_mode {
 };
 
 #define MAX_RESOLVE_DEPTH 32
+#define NR_SECS 2
+
+struct btf_sec_info {
+   u32 off;
+   u32 len;
+};
 
 struct btf_verifier_env {
struct btf *btf;
@@ -418,14 +423,14 @@ static const struct btf_kind_operations 
*btf_type_ops(const struct btf_type *t)
 static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
return !BTF_STR_TBL_ELF_ID(offset) &&
-   BTF_STR_OFFSET(offset) < btf->hdr->str_len;
+   BTF_STR_OFFSET(offset) < btf->hdr.str_len;
 }
 
 static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
if (!BTF_STR_OFFSET(offset))
return "(anon)";
-   else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len)
+   else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len)
return &btf->strings[BTF_STR_OFFSET(offset)];
else
return "(invalid-name-offset)";
@@ -536,7 +541,8 @@ static void btf_verifier_log_member(struct btf_verifier_env 
*env,
__btf_verifier_log(log, "\n");
 }
 
-static void btf_verifier_log_hdr(struct btf_verifier_env *env)
+static void btf_verifier_log_hdr(struct btf_verifier_env *env,
+u32 btf_data_size)
 {
struct bpf_verifier_log *log = &env->log;
const struct btf *btf = env->btf;
@@ -545,19 +551,16 @@ static void btf_verifier_log_hdr(struct btf_verifier_env 
*env)
if (!bpf_verifier_log_needed(log))
return;
 
-   hdr = btf->hdr;
+   hdr = &btf->hdr;
__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
__btf_verifier_log(log, "version: %u\n", hdr->version);
__btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
-   __btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label);
-   __btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name);
-   __btf_verifier_log(log,

Re: [PATCH iproute2] ip link: Do not call ll_name_to_index when creating a new link

2018-05-18 Thread David Ahern

On 5/18/18 4:08 PM, Stephen Hemminger wrote:
> 
> What about just pushing the lookup down to the leaf functions that need it?
> 

That should work as well. You want to re-send a formal patch?

general protection fault in smc_ioctl

2018-05-18 Thread syzbot


Hello,

syzbot found the following crash on:

HEAD commit:1f7455c3912d tcp: tcp_rack_reo_wnd() can be static
git tree:   net-next
console output: https://syzkaller.appspot.com/x/log.txt?x=171a133780
kernel config:  https://syzkaller.appspot.com/x/.config?x=b632d8e2c2ab2c1
dashboard link: https://syzkaller.appspot.com/bug?extid=e6714328fda813fc670f
compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=15782d5780
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=108711a780

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+e6714328fda813fc6...@syzkaller.appspotmail.com

random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault:  [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 1 PID: 4559 Comm: syz-executor292 Not tainted 4.17.0-rc4+ #50
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011

RIP: 0010:smc_ioctl+0x3dc/0x9f0 net/smc/af_smc.c:1499
RSP: 0018:8801ad22f770 EFLAGS: 00010202
RAX: dc00 RBX: 8801ad0df7c0 RCX: 8741188f
RDX: 0004 RSI: 8741189e RDI: 0020
RBP: 8801ad22f9d0 R08: 8801ae87e6c0 R09: ed00363e1818
R10: ed00363e1818 R11: 8801b1f0c0c3 R12: 110035a45ef1
R13: 2080 R14:  R15: 
FS:  017b7880() GS:8801daf0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 7ffd1f18f038 CR3: 0001ad044000 CR4: 001406e0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
Call Trace:
 sock_do_ioctl+0xe4/0x3e0 net/socket.c:957
 sock_ioctl+0x30d/0x680 net/socket.c:1081
 vfs_ioctl fs/ioctl.c:46 [inline]
 file_ioctl fs/ioctl.c:500 [inline]
 do_vfs_ioctl+0x1cf/0x16a0 fs/ioctl.c:684
 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701
 __do_sys_ioctl fs/ioctl.c:708 [inline]
 __se_sys_ioctl fs/ioctl.c:706 [inline]
 __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:706
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x43fca9
RSP: 002b:7ffd1f073588 EFLAGS: 0213 ORIG_RAX: 0010
RAX: ffda RBX: 004002c8 RCX: 0043fca9
RDX: 2080 RSI: 5411 RDI: 0003
RBP: 006ca018 R08: 004002c8 R09: 004002c8
R10: 004002c8 R11: 0213 R12: 004015d0
R13: 00401660 R14:  R15: 
Code: fa 48 c1 ea 03 80 3c 02 00 0f 85 7d 05 00 00 4c 8b b3 90 04 00 00 48  
b8 00 00 00 00 00 fc ff df 49 8d 7e 20 48 89 fa 48 c1 ea 03 <0f> b6 04 02  
84 c0 74 08 3c 03 0f 8e 47 05 00 00 45 8b 7e 20 4c

RIP: smc_ioctl+0x3dc/0x9f0 net/smc/af_smc.c:1499 RSP: 8801ad22f770
---[ end trace b586e1eb098f7714 ]---


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkal...@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with  
syzbot.

syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

Re: [PATCH iproute2] Allow to configure /var/run/netns directory

2018-05-18 Thread Pavel Maltsev

Thanks, Stephen,

I've uploaded new patch as you suggested by putting these
variables in the makefile rather than configure script.

On Fri, May 18, 2018 at 2:53 PM Stephen Hemminger
 wrote:
>
> On Tue, 15 May 2018 14:49:46 -0700
> Pavel Maltsev  wrote:
>
> > Currently NETNS_RUN_DIR is hardcoded and refers to /var/run/netns.
> > However, some systems (e.g. Android) doesn't have /var
> > which results in error attempts to create network namespaces on these
> > systems.  This change makes NETNS_RUN_DIR configurable at build time
> > by allowing to pass environment variable to configre script.
> >
> > For example: NETNS_RUN_DIR=/mnt/vendor/netns ./configure && make
> >
> > Tested: verified that iproute2 with configuration mentioned above
> > creates namespaces in /mnt/vendor/netns
> >
> > Signed-off-by: Pavel Maltsev 
>
> The directory path should definitely be overrideable on the build.
> The configure script is already messy enough, lets do it instead like
> the other runtime directories are already done ARPDDIR and CONFDIR.
>
> Something like?
>
> diff --git a/Makefile b/Makefile
> index b526d3b5b5c4..ab828669e711 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -16,6 +16,7 @@ PREFIX?=/usr
>  LIBDIR?=$(PREFIX)/lib
>  SBINDIR?=/sbin
>  CONFDIR?=/etc/iproute2
> +NETNS_RUN_DIR?=/var/run/netns
>  DATADIR?=$(PREFIX)/share
>  HDRDIR?=$(PREFIX)/include/iproute2
>  DOCDIR?=$(DATADIR)/doc/iproute2
> @@ -34,7 +35,7 @@ ifneq ($(SHARED_LIBS),y)
>  DEFINES+= -DNO_SHARED_LIBS
>  endif
>
> -DEFINES+=-DCONFDIR=\"$(CONFDIR)\"
> +DEFINES+=-DCONFDIR=\"$(CONFDIR)\" -DNETNS_RUN_DIR=\"$(NETNS_RUN_DIR)\"
>
>  #options for decnet
>  ADDLIB+=dnet_ntop.o dnet_pton.o
> diff --git a/include/namespace.h b/include/namespace.h
> index aed7ce08507f..e47f9b5d49d1 100644
> --- a/include/namespace.h
> +++ b/include/namespace.h
> @@ -8,8 +8,13 @@
>  #include 
>  #include 
>
> +#ifndef NETNS_RUN_DIR
>  #define NETNS_RUN_DIR "/var/run/netns"
> +#endif
> +
> +#ifndef NETNS_ETC_DIR
>  #define NETNS_ETC_DIR "/etc/netns"
> +#endif
>
>  #ifndef CLONE_NEWNET
>  #define CLONE_NEWNET 0x4000/* New network namespace (lo, device, 
> names sockets, etc) */
>

[PATCH iproute2] Allow to configure /var/run/netns directory

2018-05-18 Thread Pavel Maltsev

Currently NETNS_RUN_DIR is hardcoded and refers to /var/run/netns.
However, some systems (e.g. Android) don't have /var,
which results in errors when attempting to create network namespaces on
these systems.  This change makes NETNS_RUN_DIR configurable at build time
by allowing an environment variable to be passed to the make command.
Also, this change makes /etc/netns directory configurable through
NETNS_ETC_DIR environment variable.

For example: ./configure && NETNS_RUN_DIR=/mnt/vendor/netns make

Tested: verified that iproute2 with configuration mentioned above
creates namespaces in /mnt/vendor/netns

Signed-off-by: Pavel Maltsev 
---
 Makefile| 6 +-
 include/namespace.h | 5 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index b526d3b5..651d2a50 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,8 @@ PREFIX?=/usr
 LIBDIR?=$(PREFIX)/lib
 SBINDIR?=/sbin
 CONFDIR?=/etc/iproute2
+NETNS_RUN_DIR?=/var/run/netns
+NETNS_ETC_DIR?=/etc/netns
 DATADIR?=$(PREFIX)/share
 HDRDIR?=$(PREFIX)/include/iproute2
 DOCDIR?=$(DATADIR)/doc/iproute2
@@ -34,7 +36,9 @@ ifneq ($(SHARED_LIBS),y)
 DEFINES+= -DNO_SHARED_LIBS
 endif
 
-DEFINES+=-DCONFDIR=\"$(CONFDIR)\"
+DEFINES+=-DCONFDIR=\"$(CONFDIR)\" \
+ -DNETNS_RUN_DIR=\"$(NETNS_RUN_DIR)\" \
+ -DNETNS_ETC_DIR=\"$(NETNS_ETC_DIR)\"
 
 #options for decnet
 ADDLIB+=dnet_ntop.o dnet_pton.o
diff --git a/include/namespace.h b/include/namespace.h
index aed7ce08..e47f9b5d 100644
--- a/include/namespace.h
+++ b/include/namespace.h
@@ -8,8 +8,13 @@
 #include 
 #include 
 
+#ifndef NETNS_RUN_DIR
 #define NETNS_RUN_DIR "/var/run/netns"
+#endif
+
+#ifndef NETNS_ETC_DIR
 #define NETNS_ETC_DIR "/etc/netns"
+#endif
 
 #ifndef CLONE_NEWNET
 #define CLONE_NEWNET 0x4000/* New network namespace (lo, device, 
names sockets, etc) */
-- 
2.17.0.441.gb46fe60e1d-goog

Re: [PATCH net-next v2 1/3] net: ethernet: ti: Allow most drivers with COMPILE_TEST

2018-05-18 Thread kbuild test robot

Hi Florian,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Florian-Fainelli/net-ethernet-ti-Allow-most-drivers-with-COMPILE_TEST/20180519-043005
config: sparc64-allyesconfig (attached as .config)
compiler: sparc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=sparc64 

All errors (new ones prefixed by >>):

   `.exit.data' referenced in section `.exit.text' of drivers/tty/n_hdlc.o: 
defined in discarded section `.exit.data' of drivers/tty/n_hdlc.o
   `.exit.data' referenced in section `.exit.text' of drivers/tty/n_hdlc.o: 
defined in discarded section `.exit.data' of drivers/tty/n_hdlc.o
   `.exit.data' referenced in section `.exit.text' of drivers/tty/n_hdlc.o: 
defined in discarded section `.exit.data' of drivers/tty/n_hdlc.o
   `.exit.data' referenced in section `.exit.text' of drivers/tty/n_hdlc.o: 
defined in discarded section `.exit.data' of drivers/tty/n_hdlc.o
   drivers/net/ethernet/ti/netcp_core.o: In function `netcp_txpipe_open':
>> netcp_core.c:(.text+0xc84): undefined reference to `knav_queue_open'

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip

[PATCH v2] selftests: net: reuseport_bpf_numa: don't fail if no numa support

2018-05-18 Thread Anders Roxell

The reuseport_bpf_numa test case fails when there's no numa support.  The
test shouldn't fail if there's no support; it should be skipped.

Fixes: 3c2c3c16aaf6 ("reuseport, bpf: add test case for bpf_get_numa_node_id")
Signed-off-by: Anders Roxell 
---
 tools/testing/selftests/net/reuseport_bpf_numa.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/reuseport_bpf_numa.c 
b/tools/testing/selftests/net/reuseport_bpf_numa.c
index 365c32e84189..c9f478b40996 100644
--- a/tools/testing/selftests/net/reuseport_bpf_numa.c
+++ b/tools/testing/selftests/net/reuseport_bpf_numa.c
@@ -23,6 +23,8 @@
 #include 
 #include 
 
+#include "../kselftest.h"
+
 static const int PORT = ;
 
 static void build_rcv_group(int *rcv_fd, size_t len, int family, int proto)
@@ -229,7 +231,7 @@ int main(void)
int *rcv_fd, nodes;
 
if (numa_available() < 0)
-   error(1, errno, "no numa api support");
+   ksft_exit_skip("no numa api support\n");
 
nodes = numa_max_node() + 1;
 
-- 
2.17.0

Re: [PATCH bpf-next v2 7/7] tools/bpftool: add perf subcommand

2018-05-18 Thread Y Song

On Fri, May 18, 2018 at 1:51 PM, Jakub Kicinski
 wrote:
> On Thu, 17 May 2018 22:03:10 -0700, Yonghong Song wrote:
>> The new command "bpftool perf [show | list]" will traverse
>> all processes under /proc, and if any fd is associated
>> with a perf event, it will print out related perf event
>> information. Documentation is also added.
>
> Thanks for the changes, it looks good with some minor nits which can be
> addressed as follow up if there is no other need to respin.  Please
> consider it:
>
> Reviewed-by: Jakub Kicinski 

Most likely will need respin. Will make suggested changes then.

>
>> Below is an example to show the results using bcc commands.
>> Running the following 4 bcc commands:
>>   kprobe: trace.py '__x64_sys_nanosleep'
>>   kretprobe:  trace.py 'r::__x64_sys_nanosleep'
>>   tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
>>   uprobe: trace.py 'p:/home/yhs/a.out:main'
>>
>> The bpftool command line and result:
>>
>>   $ bpftool perf
>>   pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
>>   pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
>>   pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
>>   pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159
>>
>>   $ bpftool -j perf
>>   
>> {"pid":21711,"fd":5,"prog_id":5,"attach_info":"kprobe","func":"__x64_sys_write","offset":0},
>>  \
>>   
>> {"pid":21765,"fd":5,"prog_id":7,"attach_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0},
>>  \
>>   
>> {"pid":21767,"fd":5,"prog_id":8,"attach_info":"tracepoint","tracepoint":"sys_enter_nanosleep"},
>>  \
>>   
>> {"pid":21800,"fd":5,"prog_id":9,"attach_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}
>
> nit: this is now an array

Sorry, this is probably updated in middle of work. Will make the change in
the next revision.

>
>>   $ bpftool prog
>>   5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
>> loaded_at 2018-05-15T04:46:37-0700  uid 0
>> xlated 200B  not jited  memlock 4096B  map_ids 4
>>   7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
>> loaded_at 2018-05-15T04:48:32-0700  uid 0
>> xlated 200B  not jited  memlock 4096B  map_ids 7
>>   8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
>> loaded_at 2018-05-15T04:48:48-0700  uid 0
>> xlated 200B  not jited  memlock 4096B  map_ids 8
>>   9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
>> loaded_at 2018-05-15T04:49:52-0700  uid 0
>> xlated 200B  not jited  memlock 4096B  map_ids 9
>>
>>   $ ps ax | grep "python ./trace.py"
>>   21711 pts/0T  0:03 python ./trace.py __x64_sys_write
>>   21765 pts/0S+ 0:00 python ./trace.py r::__x64_sys_nanosleep
>>   21767 pts/2S+ 0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
>>   21800 pts/3S+ 0:00 python ./trace.py p:/home/yhs/a.out:main
>>   22374 pts/1S+ 0:00 grep --color=auto python ./trace.py
>>
>> Signed-off-by: Yonghong Song 
>
>> diff --git a/tools/bpf/bpftool/bash-completion/bpftool 
>> b/tools/bpf/bpftool/bash-completion/bpftool
>> index b301c9b..3680ad4 100644
>> --- a/tools/bpf/bpftool/bash-completion/bpftool
>> +++ b/tools/bpf/bpftool/bash-completion/bpftool
>> @@ -448,6 +448,15 @@ _bpftool()
>>  ;;
>>  esac
>>  ;;
>> +cgroup)
>
> s/cgroup/perf/ :)

A mistake in my side to consolidate different version of code.
I did have "perf" in one of my versions and tested it properly.

>
>> +case $command in
>> +*)
>> +[[ $prev == $object ]] && \
>> +COMPREPLY=( $( compgen -W 'help \
>> +show list' -- "$cur" ) )
>> +;;
>> +esac
>> +;;
>>  esac
>>  } &&
>>  complete -F _bpftool bpftool
>
>> +static int show_proc(const char *fpath, const struct stat *sb,
>> +  int tflag, struct FTW *ftwbuf)
>> +{
>> + __u64 probe_offset, probe_addr;
>> + __u32 prog_id, attach_info;
>> + int err, pid = 0, fd = 0;
>> + const char *pch;
>> + char buf[4096];
>> +
>> + /* prefix always /proc */
>> + pch = fpath + 5;
>> + if (*pch == '\0')
>> + return 0;
>> +
>> + /* pid should be all numbers */
>> + pch++;
>> + while (isdigit(*pch)) {
>> + pid = pid * 10 + *pch - '0';
>> + pch++;
>> + }
>> + if (*pch == '\0')
>> + return 0;
>> + if (*pch != '/')
>> + return FTW_SKIP_SUBTREE;
>> +
>> + /* check /proc//fd directory */
>> + pch++;
>> + if (strncmp(pch, "fd", 2))
>> + return FTW_SKIP_SUBTREE;
>> + pch += 2;
>> + if (*pch == '\0')
>> + return 0;
>> + if (*pch != '/')
>> + return FTW_SKIP_SUBTREE;
>> +
>> + /* check

Re: [PATCH iproute2] ip link: Do not call ll_name_to_index when creating a new link

2018-05-18 Thread Stephen Hemminger

On Thu, 17 May 2018 18:17:12 -0600
David Ahern  wrote:

> On 5/17/18 4:36 PM, Stephen Hemminger wrote:
> > On Thu, 17 May 2018 16:22:37 -0600
> > dsah...@kernel.org wrote:
> >   
> >> From: David Ahern 
> >>
> >> Using iproute2 to create a bridge and add 4094 vlans to it can take from
> >> 2 to 3 *minutes*. The reason is the extraneous call to ll_name_to_index.
> >> ll_name_to_index results in an ioctl(SIOCGIFINDEX) call which in turn
> >> invokes dev_load. If the index does not exist, which it won't when
> >> creating a new link, dev_load calls modprobe twice -- once for
> >> netdev-NAME and again for NAME. This is unnecessary overhead for each
> >> link create.
> >>
> >> When ip link is invoked for a new device, there is no reason to
> >> call ll_name_to_index for the new device. With this patch, creating
> >> a bridge and adding 4094 vlans takes less than 3 *seconds*.
> >>
> >> Signed-off-by: David Ahern   
> > 
> > Yes this looks like a real problem.
> > Isn't the cache supposed to reduce this?
> > 
> > Don't like to make lots of special case flags.
> >   
> 
> The device does not exist, so it won't be in any cache. ll_name_to_index
> already checks it though before calling if_nametoindex.

Good point, I just don't like adding more conditional paths in a function;
it is a common source of errors.

What about just pushing the lookup down to the leaf functions that need it?

diff --git a/ip/ip_common.h b/ip/ip_common.h
index 1b89795caa58..49eb7d7bed40 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -36,7 +36,7 @@ int print_addrlabel(const struct sockaddr_nl *who,
 int print_neigh(const struct sockaddr_nl *who,
struct nlmsghdr *n, void *arg);
 int ipaddr_list_link(int argc, char **argv);
-void ipaddr_get_vf_rate(int, int *, int *, int);
+void ipaddr_get_vf_rate(int, int *, int *, const char *);
 void iplink_usage(void) __attribute__((noreturn));
 
 void iproute_reset_filter(int ifindex);
@@ -145,7 +145,7 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int 
*argcp, char ***argvp);
 void lwt_print_encap(FILE *fp, struct rtattr *encap_type, struct rtattr 
*encap);
 
 /* iplink_xdp.c */
-int xdp_parse(int *argc, char ***argv, struct iplink_req *req, __u32 ifindex,
+int xdp_parse(int *argc, char ***argv, struct iplink_req *req, const char 
*ifname,
  bool generic, bool drv, bool offload);
 void xdp_dump(FILE *fp, struct rtattr *tb, bool link, bool details);
 
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 75539e057f6a..00da14c6f97c 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -1967,14 +1967,20 @@ ipaddr_loop_each_vf(struct rtattr *tb[], int vfnum, int 
*min, int *max)
exit(1);
 }
 
-void ipaddr_get_vf_rate(int vfnum, int *min, int *max, int idx)
+void ipaddr_get_vf_rate(int vfnum, int *min, int *max, const char *dev)
 {
struct nlmsg_chain linfo = { NULL, NULL};
struct rtattr *tb[IFLA_MAX+1];
struct ifinfomsg *ifi;
struct nlmsg_list *l;
struct nlmsghdr *n;
-   int len;
+   int idx, len;
+
+   idx = ll_name_to_index(dev);
+   if (idx == 0) {
+   fprintf(stderr, "Device %s does not exist\n", dev);
+   exit(1);
+   }
 
if (rtnl_wilddump_request(&rth, AF_UNSPEC, RTM_GETLINK) < 0) {
perror("Cannot send dump request");
diff --git a/ip/iplink.c b/ip/iplink.c
index 22afe0221f3c..9ff5f692a1d4 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -242,9 +242,10 @@ static int iplink_have_newlink(void)
 }
 #endif /* ! IPLINK_IOCTL_COMPAT */
 
-static int nl_get_ll_addr_len(unsigned int dev_index)
+static int nl_get_ll_addr_len(const char *ifname)
 {
int len;
+   int dev_index = ll_name_to_index(ifname);
struct iplink_req req = {
.n = {
.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
@@ -259,6 +260,9 @@ static int nl_get_ll_addr_len(unsigned int dev_index)
struct nlmsghdr *answer;
struct rtattr *tb[IFLA_MAX+1];
 
+   if (dev_index == 0)
+   return -1;
+
if (rtnl_talk(&rth, &req.n, &answer) < 0)
return -1;
 
@@ -337,7 +341,7 @@ static void iplink_parse_vf_vlan_info(int vf, int *argcp, 
char ***argvp,
 }
 
 static int iplink_parse_vf(int vf, int *argcp, char ***argvp,
-  struct iplink_req *req, int dev_index)
+  struct iplink_req *req, const char *dev)
 {
char new_rate_api = 0, count = 0, override_legacy_rate = 0;
struct ifla_vf_rate tivt;
@@ -373,7 +377,7 @@ static int iplink_parse_vf(int vf, int *argcp, char 
***argvp,
NEXT_ARG();
if (matches(*argv, "mac") == 0) {
struct ifla_vf_mac ivm = { 0 };
-   int halen = nl_get_ll_addr_len(dev_index);
+   int halen = nl_get_ll_addr_len(dev);
 
NEXT_ARG();
ivm.vf = vf;
@@

Re: [PATCH net-next v2 1/3] net: ethernet: ti: Allow most drivers with COMPILE_TEST

2018-05-18 Thread kbuild test robot

Hi Florian,

I love your patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Florian-Fainelli/net-ethernet-ti-Allow-most-drivers-with-COMPILE_TEST/20180519-043005
config: ia64-allmodconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=ia64 

All warnings (new ones prefixed by >>):

   drivers/net/ethernet/ti/netcp_core.c: In function 'netcp_free_rx_desc_chain':
>> drivers/net/ethernet/ti/netcp_core.c:613:13: warning: cast to pointer from 
>> integer of different size [-Wint-to-pointer-cast]
  buf_ptr = (void *)GET_SW_DATA0(ndesc);
^
   drivers/net/ethernet/ti/netcp_core.c:622:12: warning: cast to pointer from 
integer of different size [-Wint-to-pointer-cast]
 buf_ptr = (void *)GET_SW_DATA0(desc);
   ^
   drivers/net/ethernet/ti/netcp_core.c: In function 
'netcp_process_one_rx_packet':
   drivers/net/ethernet/ti/netcp_core.c:681:16: warning: cast to pointer from 
integer of different size [-Wint-to-pointer-cast]
 org_buf_ptr = (void *)GET_SW_DATA0(desc);
   ^
   drivers/net/ethernet/ti/netcp_core.c:718:10: warning: cast to pointer from 
integer of different size [-Wint-to-pointer-cast]
  page = (struct page *)GET_SW_DATA0(ndesc);
 ^
   drivers/net/ethernet/ti/netcp_core.c: In function 'netcp_free_rx_buf':
   drivers/net/ethernet/ti/netcp_core.c:822:13: warning: cast to pointer from 
integer of different size [-Wint-to-pointer-cast]
  buf_ptr = (void *)GET_SW_DATA0(desc);
^
   drivers/net/ethernet/ti/netcp_core.c: In function 'netcp_allocate_rx_buf':
>> drivers/net/ethernet/ti/netcp_core.c:906:16: warning: cast from pointer to 
>> integer of different size [-Wpointer-to-int-cast]
  sw_data[0] = (u32)bufptr;
   ^
   drivers/net/ethernet/ti/netcp_core.c:919:16: warning: cast from pointer to 
integer of different size [-Wpointer-to-int-cast]
  sw_data[0] = (u32)page;
   ^
   drivers/net/ethernet/ti/netcp_core.c: In function 
'netcp_process_tx_compl_packets':
   drivers/net/ethernet/ti/netcp_core.c:1041:9: warning: cast to pointer from 
integer of different size [-Wint-to-pointer-cast]
  skb = (struct sk_buff *)GET_SW_DATA0(desc);
^
   drivers/net/ethernet/ti/netcp_core.c: In function 'netcp_tx_submit_skb':
   drivers/net/ethernet/ti/netcp_core.c:1256:15: warning: cast from pointer to 
integer of different size [-Wpointer-to-int-cast]
 SET_SW_DATA0((u32)skb, desc);
  ^
   drivers/net/ethernet/ti/netcp_core.c:181:49: note: in definition of macro 
'SET_SW_DATA0'
#define SET_SW_DATA0(data, desc) set_sw_data(0, data, desc)
^~~~

vim +613 drivers/net/ethernet/ti/netcp_core.c

84640e27 Karicheri, Muralidharan 2015-01-15  591  
84640e27 Karicheri, Muralidharan 2015-01-15  592  static void 
netcp_free_rx_desc_chain(struct netcp_intf *netcp,
84640e27 Karicheri, Muralidharan 2015-01-15  593
 struct knav_dma_desc *desc)
84640e27 Karicheri, Muralidharan 2015-01-15  594  {
84640e27 Karicheri, Muralidharan 2015-01-15  595struct knav_dma_desc 
*ndesc;
84640e27 Karicheri, Muralidharan 2015-01-15  596dma_addr_t dma_desc, 
dma_buf;
84640e27 Karicheri, Muralidharan 2015-01-15  597unsigned int buf_len, 
dma_sz = sizeof(*ndesc);
84640e27 Karicheri, Muralidharan 2015-01-15  598void *buf_ptr;
958d104e Arnd Bergmann   2015-12-18  599u32 tmp;
84640e27 Karicheri, Muralidharan 2015-01-15  600  
84640e27 Karicheri, Muralidharan 2015-01-15  601get_words(_desc, 1, 
>next_desc);
84640e27 Karicheri, Muralidharan 2015-01-15  602  
84640e27 Karicheri, Muralidharan 2015-01-15  603while (dma_desc) {
84640e27 Karicheri, Muralidharan 2015-01-15  604ndesc = 
knav_pool_desc_unmap(netcp->rx_pool, dma_desc, dma_sz);
84640e27 Karicheri, Muralidharan 2015-01-15  605if 
(unlikely(!ndesc)) {
84640e27 Karicheri, Muralidharan 2015-01-15  606
dev_err(netcp->ndev_dev, "failed to unmap Rx desc\n");
84640e27 Karicheri, Muralidharan 2015-01-15  607break;
84640e27 Karicheri, Muralidharan 2015-01-15  608}
958d104e Arnd Bergmann   2015-12-18  609
get_pkt_info(_buf, , _desc, ndesc);
06324481 Karicheri, Muralidharan 2016-02-19  610/* warning 
We are retrieving the virtual ptr in the sw_data
06324481 Karicheri, Muralidharan 2016-02-19  611 * field as a 
32bit value. Will not work on 64bit machines
06324481 Karicheri, Muralidharan 2016-02-19  612 */
06324481

Re: [PATCH iproute2] Allow to configure /var/run/netns directory

2018-05-18 Thread Stephen Hemminger

On Tue, 15 May 2018 14:49:46 -0700
Pavel Maltsev  wrote:

> Currently NETNS_RUN_DIR is hardcoded and refers to /var/run/netns.
> However, some systems (e.g. Android) doesn't have /var
> which results in error attempts to create network namespaces on these
> systems.  This change makes NETNS_RUN_DIR configurable at build time
> by allowing to pass environment variable to configre script.
> 
> For example: NETNS_RUN_DIR=/mnt/vendor/netns ./configure && make
> 
> Tested: verified that iproute2 with configuration mentioned above
> creates namespaces in /mnt/vendor/netns
> 
> Signed-off-by: Pavel Maltsev 

The directory path should definitely be overrideable on the build.
The configure script is already messy enough, lets do it instead like
the other runtime directories are already done ARPDDIR and CONFDIR.

Something like?

diff --git a/Makefile b/Makefile
index b526d3b5b5c4..ab828669e711 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ PREFIX?=/usr
 LIBDIR?=$(PREFIX)/lib
 SBINDIR?=/sbin
 CONFDIR?=/etc/iproute2
+NETNS_RUN_DIR?=/var/run/netns
 DATADIR?=$(PREFIX)/share
 HDRDIR?=$(PREFIX)/include/iproute2
 DOCDIR?=$(DATADIR)/doc/iproute2
@@ -34,7 +35,7 @@ ifneq ($(SHARED_LIBS),y)
 DEFINES+= -DNO_SHARED_LIBS
 endif
 
-DEFINES+=-DCONFDIR=\"$(CONFDIR)\"
+DEFINES+=-DCONFDIR=\"$(CONFDIR)\" -DNETNS_RUN_DIR=\"$(NETNS_RUN_DIR)\"
 
 #options for decnet
 ADDLIB+=dnet_ntop.o dnet_pton.o
diff --git a/include/namespace.h b/include/namespace.h
index aed7ce08507f..e47f9b5d49d1 100644
--- a/include/namespace.h
+++ b/include/namespace.h
@@ -8,8 +8,13 @@
 #include 
 #include 
 
+#ifndef NETNS_RUN_DIR
 #define NETNS_RUN_DIR "/var/run/netns"
+#endif
+
+#ifndef NETNS_ETC_DIR
 #define NETNS_ETC_DIR "/etc/netns"
+#endif
 
 #ifndef CLONE_NEWNET
 #define CLONE_NEWNET 0x4000/* New network namespace (lo, device, 
names sockets, etc) */

Re: [PATCH 05/15] mtd: nand: pxa3xx: remove the dmaengine compat need

2018-05-18 Thread Daniel Mack


Hi Robert,

Thanks for this series.

On Monday, April 02, 2018 04:26 PM, Robert Jarzmik wrote:

From: Robert Jarzmik 

As the pxa architecture switched towards the dmaengine slave map, the
old compatibility mechanism to acquire the dma requestor line number and
priority is not needed anymore.

This patch simplifies the dma resource acquisition, using the more
generic function dma_request_slave_channel().

Signed-off-by: Robert Jarzmik 
---
  drivers/mtd/nand/pxa3xx_nand.c | 10 +-


This driver was replaced by drivers/mtd/nand/raw/marvell_nand.c 
recently, so this patch can be dropped. I attached a version for the new 
driver which you can pick instead.



Thanks,
Daniel
>From c63bc40bdfe2d596e42919235840109a2f1b2776 Mon Sep 17 00:00:00 2001
From: Daniel Mack 
Date: Sat, 12 May 2018 21:50:13 +0200
Subject: [PATCH] mtd: rawnand: marvell: remove dmaengine compat code

As the pxa architecture switched towards the dmaengine slave map, the
old compatibility mechanism to acquire the dma requestor line number and
priority is not needed anymore.

This patch simplifies the dma resource acquisition, using the more
generic function dma_request_slave_channel().

Signed-off-by: Daniel Mack 
---
 drivers/mtd/nand/raw/marvell_nand.c | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/mtd/nand/raw/marvell_nand.c b/drivers/mtd/nand/raw/marvell_nand.c
index ebb1d141b900..30017cd7d91c 100644
--- a/drivers/mtd/nand/raw/marvell_nand.c
+++ b/drivers/mtd/nand/raw/marvell_nand.c
@@ -2612,8 +2612,6 @@ static int marvell_nfc_init_dma(struct marvell_nfc *nfc)
 		dev);
 	struct dma_slave_config config = {};
 	struct resource *r;
-	dma_cap_mask_t mask;
-	struct pxad_param param;
 	int ret;
 
 	if (!IS_ENABLED(CONFIG_PXA_DMA)) {
@@ -2632,14 +2630,7 @@ static int marvell_nfc_init_dma(struct marvell_nfc *nfc)
 		return -ENXIO;
 	}
 
-	param.drcmr = r->start;
-	param.prio = PXAD_PRIO_LOWEST;
-	dma_cap_zero(mask);
-	dma_cap_set(DMA_SLAVE, mask);
-	nfc->dma_chan =
-		dma_request_slave_channel_compat(mask, pxad_filter_fn,
-		 , nfc->dev,
-		 "data");
+	nfc->dma_chan = dma_request_slave_channel(nfc->dev, "data");
 	if (!nfc->dma_chan) {
 		dev_err(nfc->dev,
 			"Unable to request data DMA channel\n");
-- 
2.14.3

[PATCH v2] isdn: eicon: fix a missing-check bug

2018-05-18 Thread Wenwen Wang

In divasmain.c, the function divas_write() firstly invokes the function
diva_xdi_open_adapter() to open the adapter that matches with the adapter
number provided by the user, and then invokes the function diva_xdi_write()
to perform the write operation using the matched adapter. The two functions
diva_xdi_open_adapter() and diva_xdi_write() are located in diva.c.

In diva_xdi_open_adapter(), the user command is copied to the object 'msg'
from the userspace pointer 'src' through the function pointer 'cp_fn',
which eventually calls copy_from_user() to do the copy. Then, the adapter
number 'msg.adapter' is used to find out a matched adapter from the
'adapter_queue'. A matched adapter will be returned if it is found.
Otherwise, NULL is returned to indicate the failure of the verification on
the adapter number.

As mentioned above, if a matched adapter is returned, the function
diva_xdi_write() is invoked to perform the write operation. In this
function, the user command is copied once again from the userspace pointer
'src', which is the same as the 'src' pointer in diva_xdi_open_adapter() as
both of them are from the 'buf' pointer in divas_write(). Similarly, the
copy is achieved through the function pointer 'cp_fn', which finally calls
copy_from_user(). After the successful copy, the corresponding command
processing handler of the matched adapter is invoked to perform the write
operation.

It is obvious that there are two copies here from userspace, one is in
diva_xdi_open_adapter(), and one is in diva_xdi_write(). Plus, both of
these two copies share the same source userspace pointer, i.e., the 'buf'
pointer in divas_write(). Given that a malicious userspace process can race
to change the content pointed by the 'buf' pointer, this can pose potential
security issues. For example, in the first copy, the user provides a valid
adapter number to pass the verification process and a valid adapter can be
found. Then the user can modify the adapter number to an invalid number.
This way, the user can bypass the verification process of the adapter
number and inject inconsistent data.

This patch reuses the data copied in
diva_xdi_open_adapter() and passes it to diva_xdi_write(). This way, the
above issues can be avoided.

Signed-off-by: Wenwen Wang 
---
 drivers/isdn/hardware/eicon/diva.c  | 20 +---
 drivers/isdn/hardware/eicon/diva.h  |  5 +++--
 drivers/isdn/hardware/eicon/divasmain.c | 18 +++---
 3 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/drivers/isdn/hardware/eicon/diva.c 
b/drivers/isdn/hardware/eicon/diva.c
index 944a7f3..fa239d8 100644
--- a/drivers/isdn/hardware/eicon/diva.c
+++ b/drivers/isdn/hardware/eicon/diva.c
@@ -388,10 +388,9 @@ void divasa_xdi_driver_unload(void)
 **  Receive and process command from user mode utility
 */
 void *diva_xdi_open_adapter(void *os_handle, const void __user *src,
-   int length,
+   int length, diva_xdi_um_cfg_cmd_t *msg,
divas_xdi_copy_from_user_fn_t cp_fn)
 {
-   diva_xdi_um_cfg_cmd_t msg;
diva_os_xdi_adapter_t *a = NULL;
diva_os_spin_lock_magic_t old_irql;
struct list_head *tmp;
@@ -401,21 +400,21 @@ void *diva_xdi_open_adapter(void *os_handle, const void 
__user *src,
 length, sizeof(diva_xdi_um_cfg_cmd_t)))
return NULL;
}
-   if ((*cp_fn) (os_handle, , src, sizeof(msg)) <= 0) {
+   if ((*cp_fn) (os_handle, msg, src, sizeof(*msg)) <= 0) {
DBG_ERR(("A: A(?) open, write error"))
return NULL;
}
diva_os_enter_spin_lock(_lock, _irql, "open_adapter");
list_for_each(tmp, _queue) {
a = list_entry(tmp, diva_os_xdi_adapter_t, link);
-   if (a->controller == (int)msg.adapter)
+   if (a->controller == (int)msg->adapter)
break;
a = NULL;
}
diva_os_leave_spin_lock(_lock, _irql, "open_adapter");
 
if (!a) {
-   DBG_ERR(("A: A(%d) open, adapter not found", msg.adapter))
+   DBG_ERR(("A: A(%d) open, adapter not found", msg->adapter))
}
 
return (a);
@@ -437,7 +436,8 @@ void diva_xdi_close_adapter(void *adapter, void *os_handle)
 
 int
 diva_xdi_write(void *adapter, void *os_handle, const void __user *src,
-  int length, divas_xdi_copy_from_user_fn_t cp_fn)
+  int length, diva_xdi_um_cfg_cmd_t *msg,
+  divas_xdi_copy_from_user_fn_t cp_fn)
 {
diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) adapter;
void *data;
@@ -459,7 +459,13 @@ diva_xdi_write(void *adapter, void *os_handle, const void 
__user *src,
return (-2);
}
 
-   length = (*cp_fn) (os_handle, data, src, length);
+   if (msg) {
+   *(diva_xdi_um_cfg_cmd_t *)data =

[RFC PATCH 1/6] net: ethernet: ti: cpsw: use cpdma channels in backward order for txq

2018-05-18 Thread Ivan Khoronzhuk

The cpdma channel priority decreases from the highest to the lowest channel
number. The driver has a limited number of descriptors that are shared
between the cpdma channels. The number of queues can be tuned with ethtool,
which allows descriptors not to be spent on unneeded cpdma channels.
In AVB, usually only 2 rate limited tx queues are enough.
Rate limitation can be used only for high priority queues. Thus, to
use only 2 such queues, all 8 have to be created, which is wasteful.

So, in order to allow using only the needed number of rate limited
tx queues, save resources, and be able to set rate limitation for
them, assign tx cpdma channels to queues in backward order.

Signed-off-by: Ivan Khoronzhuk 
---
 drivers/net/ethernet/ti/cpsw.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index a7285dddfd29..9bd615da04d3 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -967,8 +967,8 @@ static int cpsw_tx_mq_poll(struct napi_struct *napi_tx, int 
budget)
 
/* process every unprocessed channel */
ch_map = cpdma_ctrl_txchs_state(cpsw->dma);
-   for (ch = 0, num_tx = 0; ch_map; ch_map >>= 1, ch++) {
-   if (!(ch_map & 0x01))
+   for (ch = 0, num_tx = 0; ch_map & 0xff; ch_map <<= 1, ch++) {
+   if (!(ch_map & 0x80))
continue;
 
txv = >txv[ch];
@@ -2431,7 +2431,7 @@ static int cpsw_update_channels_res(struct cpsw_priv 
*priv, int ch_num, int rx)
void (*handler)(void *, int, int);
struct netdev_queue *queue;
struct cpsw_vector *vec;
-   int ret, *ch;
+   int ret, *ch, vch;
 
if (rx) {
ch = >rx_ch_num;
@@ -2444,7 +2444,8 @@ static int cpsw_update_channels_res(struct cpsw_priv 
*priv, int ch_num, int rx)
}
 
while (*ch < ch_num) {
-   vec[*ch].ch = cpdma_chan_create(cpsw->dma, *ch, handler, rx);
+   vch = rx ? *ch : 7 - *ch;
+   vec[*ch].ch = cpdma_chan_create(cpsw->dma, vch, handler, rx);
queue = netdev_get_tx_queue(priv->ndev, *ch);
queue->tx_maxrate = 0;
 
@@ -2980,7 +2981,7 @@ static int cpsw_probe(struct platform_device *pdev)
u32 slave_offset, sliver_offset, slave_size;
const struct soc_device_attribute *soc;
struct cpsw_common  *cpsw;
-   int ret = 0, i;
+   int ret = 0, i, ch;
int irq;
 
cpsw = devm_kzalloc(>dev, sizeof(struct cpsw_common), GFP_KERNEL);
@@ -3155,7 +3156,8 @@ static int cpsw_probe(struct platform_device *pdev)
if (soc)
cpsw->quirk_irq = 1;
 
-   cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_tx_handler, 0);
+   ch = cpsw->quirk_irq ? 0 : 7;
+   cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, ch, cpsw_tx_handler, 0);
if (IS_ERR(cpsw->txv[0].ch)) {
dev_err(priv->dev, "error initializing tx dma channel\n");
ret = PTR_ERR(cpsw->txv[0].ch);
-- 
2.17.0

[RFC PATCH 4/6] net: ethernet: ti: cpsw: add CBS Qdisc offload

2018-05-18 Thread Ivan Khoronzhuk

The cpsw has up to 4 FIFOs per port and upper 3 FIFOs can feed rate
limited queue with shaping. In order to set and enable shaping for
those 3 FIFOs queues the network device with CBS qdisc attached is
needed. The CBS configuration is added for dual-emac/single port mode
only, but potentially can be used in switch mode also, based on
switchdev for instance.

Although the FIFO shapers can work without cpdma level shapers, the basic
usage must combine them with cpdma level shapers, as described in the TRM;
those are set as maximum rates for interface queues via sysfs.

One of the possible configuration with txq shapers and CBS shapers:

  Configured with echo RATE >
  /sys/class/net/eth0/queues/tx-0/tx_maxrate
 /---
/
   /cpdma level shapers
++ ++ ++ ++ ++ ++ ++ ++
| c7 | | c6 | | c5 | | c4 | | c3 | | c2 | | c1 | | c0 |
\/ \/ \/ \/ \/ \/ \/ \/
 \  /   \  /   \  /   \  /   \  /   \  /   \  /   \  /
  \/ \/ \/ \/ \/ \/ \/ \/
+-|--|--|--|-+
|++  |  |  +---+ |
||  ++  |  | |
|v  v   v  v |
| ++ ++ ++ ++ pp++ ++ ++ ++  |
| || || || || oo|| || || ||  |
| | f3 | | f2 | | f1 | | f0 | r  CPSW  r| f3 | | f2 | | f1 | | f0 |  |
| || || || || tt|| || || ||  |
| \/ \/ \/ \/ 01\/ \/ \/ \/  |
|  \  X   \  /   \  /   \  / \  /   \  /   \  /   \  /   |
|   \/ \   \/ \/ \/   \/ \/ \/ \/|
+---\+
 \
  \ FIFO shaper, set with CBS offload added in this patch,
   \ FIFO0 cannot be rate limited
--

CBS shaper configuration is supposed to be used with root MQPRIO Qdisc
offload allowing to add sk_prio->tc->txq maps that direct traffic to
appropriate tx queue and maps L2 priority to FIFO shaper.

The CBS shaper is intended to be used for AVB where L2 priority
(pcp field) is used to differentiate class of traffic. So additionally
vlan needs to be created with appropriate egress sk_prio->l2 prio map.

If CBS has several tx queues assigned to it, the sum of their
bandwidths must not exceed the bandwidth set for CBS. It's recommended
that the CBS bandwidth be a little bit more.

The CBS shaper is configured with the CBS qdisc offload interface using the
tc tool from the iproute2 package.

For instance:

$ tc qdisc replace dev eth0 handle 100: parent root mqprio num_tc 3 \
map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 1

$ tc -g class show dev eth0
+---(100:ffe2) mqprio
|    +---(100:3) mqprio
|    +---(100:4) mqprio
|    
+---(100:ffe1) mqprio
|    +---(100:2) mqprio
|    
+---(100:ffe0) mqprio
 +---(100:1) mqprio

$ tc qdisc add dev eth0 parent 100:1 cbs locredit -1440 \
hicredit 60 sendslope -96 idleslope 4 offload 1

$ tc qdisc add dev eth0 parent 100:2 cbs locredit -1470 \
hicredit 62 sendslope -98 idleslope 2 offload 1

The above commands set CBS shapers for tc0 and tc1, for which txq0 and
txq1 are used. Note that the bandwidth actually set can differ a bit
due to the discreteness of configuration parameters.

Here parameters like locredit, hicredit and sendslope are ignored
internally and are supposed to be set with the assumption that the
maximum frame size is 1500.

It's assumed that the interface speed does not change on reconnection,
which is not always true, so inform the user in case the interface speed
has changed, as it can impact the dependent shapers configuration.

For more examples see Documentation.

Signed-off-by: Ivan Khoronzhuk 
---
 drivers/net/ethernet/ti/cpsw.c | 221 +
 1 file changed, 221 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 4b232cda5436..c7710b0e1c17 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -46,6 +46,8 @@
 #include "cpts.h"
 #include "davinci_cpdma.h"
 
+#include 
+
 #define CPSW_DEBUG (NETIF_MSG_HW   | NETIF_MSG_WOL | \
 NETIF_MSG_DRV  | NETIF_MSG_LINK| \
 NETIF_MSG_IFUP | NETIF_MSG_INTR| \
@@ -154,8 +156,12 @@ do {   
\
 #define IRQ_NUM2
 #define CPSW_MAX_QUEUES8
 #define CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT 256
+#define CPSW_FIFO_QUEUE_TYPE_SHIFT

[RFC PATCH 0/6] net: ethernet: ti: cpsw: add MQPRIO and CBS Qdisc offload

2018-05-18 Thread Ivan Khoronzhuk

This series adds MQPRIO and CBS Qdisc offload for TI cpsw driver.
It potentially can be used in audio video bridging (AVB) and time
sensitive networking (TSN).

Patchset was tested on AM572x EVM and BBB boards. Last patch from this
series adds detailed description of configuration with examples. For
consistency reasons, in role of talker and listener, tools from
patchset "TSN: Add qdisc based config interface for CBS" were used and
can be seen here: https://www.spinics.net/lists/netdev/msg460869.html

Based on net-next/master

Ivan Khoronzhuk (6):
  net: ethernet: ti: cpsw: use cpdma channels in backward order for txq
  net: ethernet: ti: cpdma: fit rated channels in backward order
  net: ethernet: ti: cpsw: add MQPRIO Qdisc offload
  net: ethernet: ti: cpsw: add CBS Qdisc offload
  net: ethernet: ti: cpsw: restore shaper configuration while down/up
  Documentation: networking: cpsw: add MQPRIO & CBS offload examples

 Documentation/networking/cpsw.txt   | 540 
 drivers/net/ethernet/ti/cpsw.c  | 364 +++-
 drivers/net/ethernet/ti/davinci_cpdma.c |  31 +-
 3 files changed, 913 insertions(+), 22 deletions(-)
 create mode 100644 Documentation/networking/cpsw.txt

-- 
2.17.0

[RFC PATCH 3/6] net: ethernet: ti: cpsw: add MQPRIO Qdisc offload

2018-05-18 Thread Ivan Khoronzhuk

It's possible to offload vlan to tc priority mapping with the
assumption that sk_prio == L2 prio.

Example:
$ ethtool -L eth0 rx 1 tx 4

$ tc qdisc replace dev eth0 handle 100: parent root mqprio num_tc 3 \
map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 1

$ tc -g class show dev eth0
+---(100:ffe2) mqprio
|    +---(100:3) mqprio
|    +---(100:4) mqprio
|    
+---(100:ffe1) mqprio
|    +---(100:2) mqprio
|    
+---(100:ffe0) mqprio
 +---(100:1) mqprio

Here, 100:1 is txq0, 100:2 is txq1, 100:3 is txq2, 100:4 is txq3
txq0 belongs to tc0, txq1 to tc1, txq2 and txq3 to tc2
The offload part only maps L2 prio to classes of traffic, but not
to transmit queues, so to direct traffic to a traffic class, a vlan has
to be created with an appropriate egress map.

Signed-off-by: Ivan Khoronzhuk 
---
 drivers/net/ethernet/ti/cpsw.c | 82 ++
 1 file changed, 82 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 9bd615da04d3..4b232cda5436 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -39,6 +39,7 @@
 #include 
 
 #include 
+#include 
 
 #include "cpsw.h"
 #include "cpsw_ale.h"
@@ -153,6 +154,8 @@ do {
\
 #define IRQ_NUM2
 #define CPSW_MAX_QUEUES8
 #define CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT 256
+#define CPSW_TC_NUM4
+#define CPSW_FIFO_SHAPERS_NUM  (CPSW_TC_NUM - 1)
 
 #define CPSW_RX_VLAN_ENCAP_HDR_PRIO_SHIFT  29
 #define CPSW_RX_VLAN_ENCAP_HDR_PRIO_MSKGENMASK(2, 0)
@@ -453,6 +456,7 @@ struct cpsw_priv {
u8  mac_addr[ETH_ALEN];
boolrx_pause;
booltx_pause;
+   boolmqprio_hw;
u32 emac_port;
struct cpsw_common *cpsw;
 };
@@ -1577,6 +1581,14 @@ static void cpsw_slave_stop(struct cpsw_slave *slave, 
struct cpsw_common *cpsw)
soft_reset_slave(slave);
 }
 
+static int cpsw_tc_to_fifo(int tc, int num_tc)
+{
+   if (tc == num_tc - 1)
+   return 0;
+
+   return CPSW_FIFO_SHAPERS_NUM - tc;
+}
+
 static int cpsw_ndo_open(struct net_device *ndev)
 {
struct cpsw_priv *priv = netdev_priv(ndev);
@@ -2190,6 +2202,75 @@ static int cpsw_ndo_set_tx_maxrate(struct net_device 
*ndev, int queue, u32 rate)
return ret;
 }
 
+static int cpsw_set_tc(struct net_device *ndev, void *type_data)
+{
+   struct tc_mqprio_qopt_offload *mqprio = type_data;
+   struct cpsw_priv *priv = netdev_priv(ndev);
+   struct cpsw_common *cpsw = priv->cpsw;
+   int fifo, num_tc, count, offset;
+   struct cpsw_slave *slave;
+   u32 tx_prio_map = 0;
+   int i, tc, ret;
+
+   num_tc = mqprio->qopt.num_tc;
+   if (num_tc > CPSW_TC_NUM)
+   return -EINVAL;
+
+   if (mqprio->mode != TC_MQPRIO_MODE_DCB)
+   return -EINVAL;
+
+   ret = pm_runtime_get_sync(cpsw->dev);
+   if (ret < 0) {
+   pm_runtime_put_noidle(cpsw->dev);
+   return ret;
+   }
+
+   if (num_tc) {
+   for (i = 0; i < 8; i++) {
+   tc = mqprio->qopt.prio_tc_map[i];
+   fifo = cpsw_tc_to_fifo(tc, num_tc);
+   tx_prio_map |= fifo << (4 * i);
+   }
+
+   netdev_set_num_tc(ndev, num_tc);
+   for (i = 0; i < num_tc; i++) {
+   count = mqprio->qopt.count[i];
+   offset = mqprio->qopt.offset[i];
+   netdev_set_tc_queue(ndev, i, count, offset);
+   }
+   }
+
+   if (!mqprio->qopt.hw) {
+   /* restore default configuration */
+   netdev_reset_tc(ndev);
+   tx_prio_map = TX_PRIORITY_MAPPING;
+   }
+
+   priv->mqprio_hw = mqprio->qopt.hw;
+
+   offset = cpsw->version == CPSW_VERSION_1 ?
+CPSW1_TX_PRI_MAP : CPSW2_TX_PRI_MAP;
+
+   slave = >slaves[cpsw_slave_index(cpsw, priv)];
+   slave_write(slave, tx_prio_map, offset);
+
+   pm_runtime_put_sync(cpsw->dev);
+
+   return 0;
+}
+
+static int cpsw_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type,
+void *type_data)
+{
+   switch (type) {
+   case TC_SETUP_QDISC_MQPRIO:
+   return cpsw_set_tc(ndev, type_data);
+
+   default:
+   return -EOPNOTSUPP;
+   }
+}
+
 static const struct net_device_ops cpsw_netdev_ops = {
.ndo_open   = cpsw_ndo_open,
.ndo_stop   = cpsw_ndo_stop,
@@ -2205,6 +2286,7 @@ static const struct net_device_ops cpsw_netdev_ops = {
 #endif
.ndo_vlan_rx_add_vid= cpsw_ndo_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid   = cpsw_ndo_vlan_rx_kill_vid,
+   .ndo_setup_tc   =

Re: [patch net-next 0/5] devlink: introduce port flavours and common phys_port_name generation

2018-05-18 Thread Jakub Kicinski

On Fri, 18 May 2018 09:28:59 +0200, Jiri Pirko wrote:
> From: Jiri Pirko 
> 
> This patchset resolves 2 issues we have right now:
> 1) There are many netdevices / ports in the system, for port, pf, vf
>representation but the user has no way to see which is which
> 2) The ndo_get_phys_port_name is implemented in each driver separately,
>which may lead to inconsistent names between drivers.
> 
> This patchset introduces port flavours which should address the first
> problem. In this initial patchset, I focus on DSA and their port
> flavours. As a follow-up, I plan to add PF and VF representor flavours.
> However, that needs additional dependencies in drivers (nfp, mlx5).
> 
> The common phys_port_name generation is used by mlxsw. An example output
> for mlxsw looks like this:

FWIW this series LGTM!

[RFC PATCH 6/6] Documentation: networking: cpsw: add MQPRIO & CBS offload examples

2018-05-18 Thread Ivan Khoronzhuk

This document describes MQPRIO and CBS Qdisc offload configuration
for cpsw driver based on examples. It potentially can be used in
audio video bridging (AVB) and time sensitive networking (TSN).

Signed-off-by: Ivan Khoronzhuk 
---
 Documentation/networking/cpsw.txt | 540 ++
 1 file changed, 540 insertions(+)
 create mode 100644 Documentation/networking/cpsw.txt

diff --git a/Documentation/networking/cpsw.txt 
b/Documentation/networking/cpsw.txt
new file mode 100644
index ..28c64896d59d
--- /dev/null
+++ b/Documentation/networking/cpsw.txt
@@ -0,0 +1,540 @@
+* Texas Instruments CPSW ethernet driver
+
+Multiqueue & CBS & MQPRIO
+=
+=
+
+The cpsw has 3 CBS shapers for each external ports. This document
+describes MQPRIO and CBS Qdisc offload configuration for cpsw driver
+based on examples. It potentially can be used in audio video bridging
+(AVB) and time sensitive networking (TSN).
+
+The following examples were tested on AM572x EVM and BBB boards.
+
+Test setup
+==
+
+Under consideration two examples with AM52xx EVM running cpsw driver
+in dual_emac mode.
+
+Several prerequisites:
+- TX queues must be rated starting from txq0 that has highest priority
+- Traffic classes are used starting from 0, that has highest priority
+- CBS shapers should be used with rated queues
+- The bandwidth for CBS shapers has to be set a little bit more than
+  potential incoming rate, thus, rate of all incoming tx queues has
+  to be a little less
+- Real rates can differ, due to discreteness
+- Map skb-priority to txq is not enough, also skb-priority to l2 prio
+  map has to be created with ip or vconfig tool
+- Any l2/socket prio (0 - 7) for classes can be used, but for
+  simplicity default values are used: 3 and 2
+- only 2 classes tested: A and B, but checked and can work with more,
+  maximum allowed 4, but only for 3 rate can be set.
+
+Test setup for examples
+===
++---+
+|--+|
+|  |  Workstation0  |
+|E |  MAC 18:03:73:66:87:42 |
++-+  +--|t ||
+|| 1  | E |  |  |h |./tsn_listener -d \ |
+|  Target board: | 0  | t |--+  |0 | 18:03:73:66:87:42 -i eth0 \|
+|  AM572x EVM| 0  | h | |  | -s 1500|
+|| 0  | 0 | |--+|
+|  Only 2 classes:   |Mb  +---| +---+
+|  class A, class B  ||
+||+---| +---+
+|| 1  | E | |--+|
+|| 0  | t | |  |  Workstation1  |
+|| 0  | h |--+  |E |  MAC 20:cf:30:85:7d:fd |
+||Mb  | 1 |  +--|t ||
++-+ |h |./tsn_listener -d \ |
+|0 | 20:cf:30:85:7d:fd -i eth0 \|
+|  | -s 1500|
+|--+|
++---+
+
+*
+*
+*
+Example 1: One port tx AVB configuration scheme for target board
+--
+(prints and scheme for AM52xx evm, applicable for single port boards)
+
+tc - traffic class
+txq - transmit queue
+p - priority
+f - fifo (cpsw fifo)
+S - shaper configured
+
++--+ u
+| +---+  +---+  +--+ +--+  | s
+| |   |  |   |  |  | |  |  | e
+| | App 1 |  | App 2 |  | Apps | | Apps |  | r
+| | Class A   |  | Class B   |  | Rest | | Rest |  |
+| | Eth0  |  | Eth0  |  | Eth0 | | Eth1 |  | s
+| | VLAN100   |  | VLAN100   |  |   |  | |   |  |  | p
+| | 40 Mb/s   |  | 20 Mb/s   |  |   |  | |   |  |  | a
+| | SO_PRIORITY=3 |  | SO_PRIORITY=2 |  |   |  | |   |  |  | c
+| |   |   |  |   |   |  |   |  | |   |  |  | e
+| +---|---+  +---|---+  +---|--+ +---|--+  |
++-|--|--||-+
++-+ ++

[RFC PATCH 5/6] net: ethernet: ti: cpsw: restore shaper configuration while down/up

2018-05-18 Thread Ivan Khoronzhuk

The shapers configuration needs to be restored after the interface goes
down/up. This is needed as the corresponding configuration is still kept in
kernel settings. This restores only the shapers context, so vlan
configuration should be restored by the user if needed, especially for
devices with one port where vlan frames are sent via ALE.

Signed-off-by: Ivan Khoronzhuk 
---
 drivers/net/ethernet/ti/cpsw.c | 47 ++
 1 file changed, 47 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index c7710b0e1c17..c3e88be36c1b 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1807,6 +1807,51 @@ static int cpsw_set_cbs(struct net_device *ndev,
return ret;
 }
 
+static void cpsw_cbs_resume(struct cpsw_slave *slave, struct cpsw_priv *priv)
+{
+   int fifo, bw;
+
+   for (fifo = CPSW_FIFO_SHAPERS_NUM; fifo > 0; fifo--) {
+   bw = priv->fifo_bw[fifo];
+   if (!bw)
+   continue;
+
+   cpsw_set_fifo_rlimit(priv, fifo, bw);
+   }
+}
+
+static void cpsw_mqprio_resume(struct cpsw_slave *slave, struct cpsw_priv 
*priv)
+{
+   struct cpsw_common *cpsw = priv->cpsw;
+   u32 tx_prio_map = 0;
+   int i, tc, fifo;
+   u32 tx_prio_rg;
+
+   if (!priv->mqprio_hw)
+   return;
+
+   for (i = 0; i < 8; i++) {
+   tc = netdev_get_prio_tc_map(priv->ndev, i);
+   fifo = CPSW_FIFO_SHAPERS_NUM - tc;
+   tx_prio_map |= fifo << (4 * i);
+   }
+
+   tx_prio_rg = cpsw->version == CPSW_VERSION_1 ?
+CPSW1_TX_PRI_MAP : CPSW2_TX_PRI_MAP;
+
+   slave_write(slave, tx_prio_map, tx_prio_rg);
+}
+
+/* restore resources after port reset */
+static void cpsw_restore(struct cpsw_priv *priv)
+{
+   /* restore MQPRIO offload */
+   for_each_slave(priv, cpsw_mqprio_resume, priv);
+
+   /* restore CBS offload */
+   for_each_slave(priv, cpsw_cbs_resume, priv);
+}
+
 static int cpsw_ndo_open(struct net_device *ndev)
 {
struct cpsw_priv *priv = netdev_priv(ndev);
@@ -1886,6 +1931,8 @@ static int cpsw_ndo_open(struct net_device *ndev)
 
}
 
+   cpsw_restore(priv);
+
/* Enable Interrupt pacing if configured */
if (cpsw->coal_intvl != 0) {
struct ethtool_coalesce coal;
-- 
2.17.0

[RFC PATCH 2/6] net: ethernet: ti: cpdma: fit rated channels in backward order

2018-05-18 Thread Ivan Khoronzhuk

According to TRM tx rated channels should be in 7..0 order,
so correct it.

Signed-off-by: Ivan Khoronzhuk 
---
 drivers/net/ethernet/ti/davinci_cpdma.c | 31 -
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c 
b/drivers/net/ethernet/ti/davinci_cpdma.c
index 31ae04117f0a..37fbdc668cc7 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -406,37 +406,36 @@ static int cpdma_chan_fit_rate(struct cpdma_chan *ch, u32 
rate,
struct cpdma_chan *chan;
u32 old_rate = ch->rate;
u32 new_rmask = 0;
-   int rlim = 1;
+   int rlim = 0;
int i;
 
-   *prio_mode = 0;
for (i = tx_chan_num(0); i < tx_chan_num(CPDMA_MAX_CHANNELS); i++) {
chan = ctlr->channels[i];
-   if (!chan) {
-   rlim = 0;
+   if (!chan)
continue;
-   }
 
if (chan == ch)
chan->rate = rate;
 
if (chan->rate) {
-   if (rlim) {
-   new_rmask |= chan->mask;
-   } else {
-   ch->rate = old_rate;
-   dev_err(ctlr->dev, "Prev channel of %dch is not 
rate limited\n",
-   chan->chan_num);
-   return -EINVAL;
-   }
-   } else {
-   *prio_mode = 1;
-   rlim = 0;
+   rlim = 1;
+   new_rmask |= chan->mask;
+   continue;
}
+
+   if (rlim)
+   goto err;
}
 
*rmask = new_rmask;
+   *prio_mode = rlim;
return 0;
+
+err:
+   ch->rate = old_rate;
+   dev_err(ctlr->dev, "Upper cpdma ch%d is not rate limited\n",
+   chan->chan_num);
+   return -EINVAL;
 }
 
 static u32 cpdma_chan_set_factors(struct cpdma_ctlr *ctlr,
-- 
2.17.0

Re: [bpf-next PATCH v2 0/2] SK_MSG programs: read sock fields

2018-05-18 Thread Daniel Borkmann

On 05/17/2018 11:16 PM, John Fastabend wrote:
> In this series we add the ability for sk msg programs to read basic
> sock information about the sock they are attached to. The second
> patch adds the tests to the selftest test_verifier.
> 
> One observation that I had from writing this series is lots of the
> ./net/core/filter.c code is almost duplicated across program types.
> I thought about building a template/macro that we could use as a
> single block of code to read sock data out for multiple programs,
> but I wasn't convinced it was worth it yet. The result was using a
> macro saved a couple lines of code per block but made the code
> a bit harder to read IMO. We can probably revisit the idea later
> if we get more duplication.
> 
> v2: add errstr field to negative test_verifier test cases to ensure
> we get the expected err string back from the verifier.
> 
> ---
> 
> John Fastabend (2):
>   bpf: allow sk_msg programs to read sock fields
>   bpf: add sk_msg prog sk access tests to test_verifier
> 
> 
>  include/linux/filter.h  |1 
>  include/uapi/linux/bpf.h|8 ++
>  kernel/bpf/sockmap.c|1 
>  net/core/filter.c   |  114 
> ++-
>  tools/include/uapi/linux/bpf.h  |8 ++
>  tools/testing/selftests/bpf/test_verifier.c |  115 
> +++
>  6 files changed, 244 insertions(+), 3 deletions(-)
> 
> --
> Signature
> 

Applied to bpf-next, thanks John!

Re: [PATCH v4 3/3] bpf: add selftest for lirc_mode2 type program

2018-05-18 Thread Y Song

On Fri, May 18, 2018 at 1:17 PM, Y Song  wrote:
> On Fri, May 18, 2018 at 7:07 AM, Sean Young  wrote:
>> This is simple test over rc-loopback.
>>
>> Signed-off-by: Sean Young 
>
> Acked-by: Yonghong Song 

Just one minor thing. You need to add "test_lirc_mode2_user"
in tools/testing/selftests/bpf/.gitignore
so it will not show up when you do "git status".

If the patch needs respin, you can add this in the new revision.
Otherwise, I think a followup patch to fix this should be fine.

>
>> ---
>>  tools/bpf/bpftool/prog.c  |   1 +
>>  tools/include/uapi/linux/bpf.h|  53 -
>>  tools/include/uapi/linux/lirc.h   | 217 ++
>>  tools/lib/bpf/libbpf.c|   1 +
>>  tools/testing/selftests/bpf/Makefile  |   8 +-
>>  tools/testing/selftests/bpf/bpf_helpers.h |   6 +
>>  .../testing/selftests/bpf/test_lirc_mode2.sh  |  28 +++
>>  .../selftests/bpf/test_lirc_mode2_kern.c  |  23 ++
>>  .../selftests/bpf/test_lirc_mode2_user.c  | 154 +
>>  9 files changed, 487 insertions(+), 4 deletions(-)
>>  create mode 100644 tools/include/uapi/linux/lirc.h
>>  create mode 100755 tools/testing/selftests/bpf/test_lirc_mode2.sh
>>  create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_kern.c
>>  create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_user.c
>>
>> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
>> index 9bdfdf2d3fbe..07f1ace39a46 100644
>> --- a/tools/bpf/bpftool/prog.c
>> +++ b/tools/bpf/bpftool/prog.c
>> @@ -71,6 +71,7 @@ static const char * const prog_type_name[] = {
>> [BPF_PROG_TYPE_SK_MSG]  = "sk_msg",
>> [BPF_PROG_TYPE_RAW_TRACEPOINT]  = "raw_tracepoint",
>> [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
>> +   [BPF_PROG_TYPE_LIRC_MODE2]  = "lirc_mode2",
>>  };
>>
>>  static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index d94d333a8225..8227832b713e 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -141,6 +141,7 @@ enum bpf_prog_type {
>> BPF_PROG_TYPE_SK_MSG,
>> BPF_PROG_TYPE_RAW_TRACEPOINT,
>> BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
>> +   BPF_PROG_TYPE_LIRC_MODE2,
>>  };
>>
>>  enum bpf_attach_type {
>> @@ -158,6 +159,7 @@ enum bpf_attach_type {
>> BPF_CGROUP_INET6_CONNECT,
>> BPF_CGROUP_INET4_POST_BIND,
>> BPF_CGROUP_INET6_POST_BIND,
>> +   BPF_LIRC_MODE2,
>> __MAX_BPF_ATTACH_TYPE
>>  };
>>
>> @@ -1902,6 +1904,53 @@ union bpf_attr {
>>   * egress otherwise). This is the only flag supported for now.
>>   * Return
>>   * **SK_PASS** on success, or **SK_DROP** on error.
>> + *
>> + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
>> + * Description
>> + * This helper is used in programs implementing IR decoding, to
>> + * report a successfully decoded key press with *scancode*,
>> + * *toggle* value in the given *protocol*. The scancode will be
>> + * translated to a keycode using the rc keymap, and reported as
>> + * an input key down event. After a period a key up event is
>> + * generated. This period can be extended by calling either
>> + * **bpf_rc_keydown** () with the same values, or calling
>> + * **bpf_rc_repeat** ().
>> + *
>> + * Some protocols include a toggle bit, in case the button
>> + * was released and pressed again between consecutive scancodes
>> + *
>> + * The *ctx* should point to the lirc sample as passed into
>> + * the program.
>> + *
>> + * The *protocol* is the decoded protocol number (see
>> + * **enum rc_proto** for some predefined values).
>> + *
>> + * This helper is only available if the kernel was compiled with
>> + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
>> + * "**y**".
>> + *
>> + * Return
>> + * 0
>> + *
>> + * int bpf_rc_repeat(void *ctx)
>> + * Description
>> + * This helper is used in programs implementing IR decoding, to
>> + * report a successfully decoded repeat key message. This delays
>> + * the generation of a key up event for previously generated
>> + * key down event.
>> + *
>> + * Some IR protocols like NEC have a special IR message for
>> + * repeating last button, for when a button is held down.
>> + *
>> + * The *ctx* should point to the lirc sample as passed into
>> + * the program.
>> + *
>> + * This helper is only available if the kernel was compiled with
>> + * the

Re: [PATCH bpf-next v2 7/7] tools/bpftool: add perf subcommand

2018-05-18 Thread Jakub Kicinski

On Thu, 17 May 2018 22:03:10 -0700, Yonghong Song wrote:
> The new command "bpftool perf [show | list]" will traverse
> all processes under /proc, and if any fd is associated
> with a perf event, it will print out related perf event
> information. Documentation is also added.

Thanks for the changes, it looks good with some minor nits which can be
addressed as follow up if there is no other need to respin.  Please
consider it:

Reviewed-by: Jakub Kicinski 

> Below is an example to show the results using bcc commands.
> Running the following 4 bcc commands:
>   kprobe: trace.py '__x64_sys_nanosleep'
>   kretprobe:  trace.py 'r::__x64_sys_nanosleep'
>   tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
>   uprobe: trace.py 'p:/home/yhs/a.out:main'
> 
> The bpftool command line and result:
> 
>   $ bpftool perf
>   pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
>   pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
>   pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
>   pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159
> 
>   $ bpftool -j perf
>   
> {"pid":21711,"fd":5,"prog_id":5,"attach_info":"kprobe","func":"__x64_sys_write","offset":0},
>  \
>   
> {"pid":21765,"fd":5,"prog_id":7,"attach_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0},
>  \
>   
> {"pid":21767,"fd":5,"prog_id":8,"attach_info":"tracepoint","tracepoint":"sys_enter_nanosleep"},
>  \
>   
> {"pid":21800,"fd":5,"prog_id":9,"attach_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}

nit: this is now an array

>   $ bpftool prog
>   5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
> loaded_at 2018-05-15T04:46:37-0700  uid 0
> xlated 200B  not jited  memlock 4096B  map_ids 4
>   7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
> loaded_at 2018-05-15T04:48:32-0700  uid 0
> xlated 200B  not jited  memlock 4096B  map_ids 7
>   8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
> loaded_at 2018-05-15T04:48:48-0700  uid 0
> xlated 200B  not jited  memlock 4096B  map_ids 8
>   9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
> loaded_at 2018-05-15T04:49:52-0700  uid 0
> xlated 200B  not jited  memlock 4096B  map_ids 9
> 
>   $ ps ax | grep "python ./trace.py"
>   21711 pts/0T  0:03 python ./trace.py __x64_sys_write
>   21765 pts/0S+ 0:00 python ./trace.py r::__x64_sys_nanosleep
>   21767 pts/2S+ 0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
>   21800 pts/3S+ 0:00 python ./trace.py p:/home/yhs/a.out:main
>   22374 pts/1S+ 0:00 grep --color=auto python ./trace.py
> 
> Signed-off-by: Yonghong Song 

> diff --git a/tools/bpf/bpftool/bash-completion/bpftool 
> b/tools/bpf/bpftool/bash-completion/bpftool
> index b301c9b..3680ad4 100644
> --- a/tools/bpf/bpftool/bash-completion/bpftool
> +++ b/tools/bpf/bpftool/bash-completion/bpftool
> @@ -448,6 +448,15 @@ _bpftool()
>  ;;
>  esac
>  ;;
> +cgroup)

s/cgroup/perf/ :)

> +case $command in
> +*)
> +[[ $prev == $object ]] && \
> +COMPREPLY=( $( compgen -W 'help \
> +show list' -- "$cur" ) )
> +;;
> +esac
> +;;
>  esac
>  } &&
>  complete -F _bpftool bpftool

> +static int show_proc(const char *fpath, const struct stat *sb,
> +  int tflag, struct FTW *ftwbuf)
> +{
> + __u64 probe_offset, probe_addr;
> + __u32 prog_id, attach_info;
> + int err, pid = 0, fd = 0;
> + const char *pch;
> + char buf[4096];
> +
> + /* prefix always /proc */
> + pch = fpath + 5;
> + if (*pch == '\0')
> + return 0;
> +
> + /* pid should be all numbers */
> + pch++;
> + while (isdigit(*pch)) {
> + pid = pid * 10 + *pch - '0';
> + pch++;
> + }
> + if (*pch == '\0')
> + return 0;
> + if (*pch != '/')
> + return FTW_SKIP_SUBTREE;
> +
> + /* check /proc//fd directory */
> + pch++;
> + if (strncmp(pch, "fd", 2))
> + return FTW_SKIP_SUBTREE;
> + pch += 2;
> + if (*pch == '\0')
> + return 0;
> + if (*pch != '/')
> + return FTW_SKIP_SUBTREE;
> +
> + /* check /proc//fd/ */
> + pch++;
> + while (isdigit(*pch)) {
> + fd = fd * 10 + *pch - '0';
> + pch++;
> + }
> + if (*pch != '\0')
> + return FTW_SKIP_SUBTREE;
> +
> + /* query (pid, fd) for potential perf events */
> + err = bpf_task_fd_query(pid, fd, 0, buf, sizeof(buf), &prog_id,
> + &attach_info, &probe_offset, &probe_addr);
> + if (err < 0)
> + return 0;

nit: it could be nice from user perspective to detect whether kernel

Re: cascaded switch

2018-05-18 Thread Andrew Lunn

> So, it is used so that the 2 switch will behave as if it is one big switch.

Yes. This is particularly important with offloading. When you offload a
bridge, you don't need to care which switch the ports are on. If
traffic needs to go from one switch to the other, it will. If you
modelled it as two switches, you would need to manually set up that
cross-switch connection.

  Andrew

Re: [bpf-next V4 PATCH 7/8] xdp/trace: extend tracepoint in devmap with an err

2018-05-18 Thread Jesper Dangaard Brouer

On Fri, 18 May 2018 15:35:07 +0200
Jesper Dangaard Brouer  wrote:

> Extending tracepoint xdp:xdp_devmap_xmit in devmap with an err code
> allow people to easier identify the reason behind the ndo_xdp_xmit
> call to a given driver is failing.

Signed-off-by: Jesper Dangaard Brouer 

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

Re: [bpf-next V4 PATCH 8/8] samples/bpf: xdp_monitor use err code from tracepoint xdp:xdp_devmap_xmit

2018-05-18 Thread Jesper Dangaard Brouer

On Fri, 18 May 2018 15:35:12 +0200
Jesper Dangaard Brouer  wrote:

> Update xdp_monitor to use the recently added err code introduced
> in tracepoint xdp:xdp_devmap_xmit, to show if the drop count is
> caused by some driver general delivery problem.  Other kind of drops
> will likely just be more normal TX space issues.

Signed-off-by: Jesper Dangaard Brouer 

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

Re: [bpf-next V4 PATCH 5/8] xdp: introduce xdp_return_frame_rx_napi

2018-05-18 Thread Jesper Dangaard Brouer

On Fri, 18 May 2018 15:34:57 +0200
Jesper Dangaard Brouer  wrote:

> When sending an xdp_frame through xdp_do_redirect call, then error
> cases can happen where the xdp_frame needs to be dropped, and
> returning an -errno code isn't sufficient/possible any-longer
> (e.g. for cpumap case). This is already fully supported, by simply
> calling xdp_return_frame.
> 
> This patch is an optimization, which provides xdp_return_frame_rx_napi,
> which is a faster variant for these error cases.  It take advantage of
> the protection provided by XDP RX running under NAPI protection.
> 
> This change is mostly relevant for drivers using the page_pool
> allocator as it can take advantage of this. (Tested with mlx5).

Signed-off-by: Jesper Dangaard Brouer 

Ups, forgot my SoB... hope it's sufficient to add it this way.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

Re: cascaded switch

2018-05-18 Thread Ran Shalit

On Fri, May 18, 2018 at 11:29 PM, Andrew Lunn  wrote:
>> Hi,
>>
>> I mean the same terminology used in marvell's switch.(I don't think
>> there is more than one terminology for this, please correct me if
>> wrong).
>> Anyway, I can see examples how it is done, but I don't understand the
>> benefit of this constellation, and why device tree needs to be
>> familiar with it.
>>
>> <   switch 1  >---port10port10- <  switch 2 >
>>  | | | | ||
>> port 1-9 |  port 1-9 |
>>  ||
>>  ||
>> --mdio--
>
> Your ASCII art is all messed up, but i get what you mean.
>
> This is the D in DSA. You would use this when a single switch does not
> have enough ports for your use case. So you use two switches.
>
> You need to tell each switch what links are used to get to other
> switches. There is an internal routing table. So you need to describe
> these links in device tree.
>

I understand, thanks,
So, it is used so that the 2 switches will behave as if they were one big switch.

Yet, how does it change the way the ports appear in "ifconfig"?
Is it that if they were separate switches I wouldn't see incremental
numbers in "lanX" in ifconfig (as is probably the result with cascaded
switches)?

Regards,
ranran

>   Andrew

Re: [pull request][for-next 00/15] Mellanox, mlx5 core and netdev updates 2018-05-17

2018-05-18 Thread Saeed Mahameed

On Fri, 2018-05-18 at 11:21 -0600, Jason Gunthorpe wrote:
> On Fri, May 18, 2018 at 01:03:51PM -0400, David Miller wrote:
> > From: Saeed Mahameed 
> > Date: Thu, 17 May 2018 18:22:43 -0700
> > 
> > > Below you can find two pull requests,
> > > 
> > > 1. mlx5 core updates to be shared for both netdev and RDMA,
> > > (patches 1..9)
> > >  which is based on the last mlx5-next pull request
> > >  
> > > The following changes since commit
> > > a8408f4e6db775e245f20edf12b13fd58cc03a1c:
> > > 
> > >   net/mlx5: fix spelling mistake: "modfiy" -> "modify" (2018-05-
> > > 04 12:11:51 -0700)
> > > 
> > > are available in the Git repository at:
> > > 
> > >  
> > > git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git
> > > tags/mlx5-updates-2018-05-17
> > > 
> > > for you to fetch changes up to
> > > 10ff5359f883412728ba816046ee3a696625ca02:
> > > 
> > >   net/mlx5e: Explicitly set source e-switch in offloaded TC rules
> > > (2018-05-17 14:17:35 -0700)
> > > 
> > > 2. mlx5e netdev updates only for net-next branch (patches 10..15)
> > > based on net-next
> > > and the above pull request.
> > > 
> > > The following changes since commit
> > > 538e2de104cfb4ef1acb35af42427bff42adbe4d:
> > > 
> > >   Merge branch 'net-Allow-more-drivers-with-COMPILE_TEST' (2018-
> > > 05-17 17:11:07 -0400)
> > > 
> > > are available in the Git repository at:
> > > 
> > >   git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git
> > > tags/mlx5e-updates-2018-05-17
> > > 
> > > for you to fetch changes up to
> > > a228060a7c9ab88597eeac131e4578595d5d46ae:
> > > 
> > >   net/mlx5e: Add HW vport counters to representor ethtool stats
> > > (2018-05-17 17:48:54 -0700)
> > > 
> > > Dave, for your convenience you can either pull 1. and then 2. or
> > > pull 2.
> > > directly.
> > 
> > Looks good.
> > 
> > I pulled 1 then I pulled 2.  That seemed to work
> > well.  Particularly
> > it allowed me to capture the two different merge commit messages
> > one
> > by one.
> 
> Does this double up the merge commit though? I see this in Saeed's
> tags/mlx5e-updates-2018-05-17 ?
> 
> commit 260ab7042e24ccd4407985c6e775e39d064fab2b
> Merge: 538e2de104cfb4 10ff5359f88341
> Author: Saeed Mahameed 
> Date:   Thu May 17 17:47:09 2018 -0700
> 
> Merge tag 'mlx5-updates-2018-05-17' of
> git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux
> 
> mlx5-updates-2018-05-17
> 
> mlx5 core dirver updates for both net-next and rdma-next
> branches.
> 
> From Christophe JAILLET, first three patches to use kvfree where
> needed.
> 
> From: Or Gerlitz 
> 
> Next six patches from Roi and Co adds support for merged
> sriov e-switch which comes to serve cases where both PFs, VFs set
> on them and both uplinks are to be used in single v-switch SW
> model.
> When merged e-switch is supported, the per-port e-switch is
> logically
> merged into one e-switch that spans both physical ports and all
> the VFs.
> 
> This model allows to offload TC eswitch rules between VFs
> belonging
> to different PFs (and hence have different eswitch affinity), it
> also
> sets the some of the foundations needed for uplink LAG support.
> 
> Signed-off-by: Saeed Mahameed 
> 
> And this in your tree:
> 
> commit 3888ea4e2f1fb2f61e5418adf4b8332107ac0c8f
> Merge: 2c47a65b7009eb 10ff5359f88341
> Author: David S. Miller 
> Date:   Fri May 18 13:00:08 2018 -0400
> 
> Merge tag 'mlx5-updates-2018-05-17' of
> git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux
> 
> Saeed Mahameed says:
> 
> 
> mlx5-updates-2018-05-17
> 
> mlx5 core dirver updates for both net-next and rdma-next
> branches.
> 
> From Christophe JAILLET, first three patche to use kvfree where
> needed.
> 
> From: Or Gerlitz 
> 
> Next six patches from Roi and Co adds support for merged
> sriov e-switch which comes to serve cases where both PFs, VFs set
> on them and both uplinks are to be used in single v-switch SW
> model.
> When merged e-switch is supported, the per-port e-switch is
> logically
> merged into one e-switch that spans both physical ports and all
> the VFs.
> 
> This model allows to offload TC eswitch rules between VFs
> belonging
> to different PFs (and hence have different eswitch affinity), it
> also
> sets the some of the foundations needed for uplink LAG support.
> 
> 
> Signed-off-by: David S. Miller 
> 
> I think the trouble is the Saeed needs to merge the 'core' stuff to
> create the non-core patches for netdev (just like we want to do for
> rdma)
> 
> So maybe netdev should take the #2 pull request and rdma should
> take number #1?
> 

If the concern is the log message split, then yes pulling #2 is
sufficient since #1

Re: [pull request][for-next 00/15] Mellanox, mlx5 core and netdev updates 2018-05-17

2018-05-18 Thread Saeed Mahameed

On Fri, 2018-05-18 at 13:03 -0400, David Miller wrote:
> From: Saeed Mahameed 
> Date: Thu, 17 May 2018 18:22:43 -0700
> 
> > Below you can find two pull requests,
> > 
> > 1. mlx5 core updates to be shared for both netdev and RDMA,
> > (patches 1..9)
> >  which is based on the last mlx5-next pull request
> >  
> > The following changes since commit
> > a8408f4e6db775e245f20edf12b13fd58cc03a1c:
> > 
> >   net/mlx5: fix spelling mistake: "modfiy" -> "modify" (2018-05-04
> > 12:11:51 -0700)
> > 
> > are available in the Git repository at:
> > 
> >   git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git
> > tags/mlx5-updates-2018-05-17
> > 
> > for you to fetch changes up to
> > 10ff5359f883412728ba816046ee3a696625ca02:
> > 
> >   net/mlx5e: Explicitly set source e-switch in offloaded TC rules
> > (2018-05-17 14:17:35 -0700)
> > 
> > 2. mlx5e netdev updates only for net-next branch (patches 10..15)
> > based on net-next
> > and the above pull request.
> > 
> > The following changes since commit
> > 538e2de104cfb4ef1acb35af42427bff42adbe4d:
> > 
> >   Merge branch 'net-Allow-more-drivers-with-COMPILE_TEST' (2018-05-
> > 17 17:11:07 -0400)
> > 
> > are available in the Git repository at:
> > 
> >   git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git
> > tags/mlx5e-updates-2018-05-17
> > 
> > for you to fetch changes up to
> > a228060a7c9ab88597eeac131e4578595d5d46ae:
> > 
> >   net/mlx5e: Add HW vport counters to representor ethtool stats
> > (2018-05-17 17:48:54 -0700)
> > 
> > Dave, for your convenience you can either pull 1. and then 2. or
> > pull 2.
> > directly.
> 
> Looks good.
> 
> I pulled 1 then I pulled 2.  That seemed to work well.  Particularly
> it allowed me to capture the two different merge commit messages one
> by one.
> 
> Is this basically how you want to handle things moving forward?
> 

Thanks Dave !
Basically yes, we want to avoid sending netdev related patches to rdma,
and vice versa.
Unlike the previous "shared code" procedure, this is a more natural way
to do things, since the mlx5 core is a shared arena, we want to
maintain it separate from netdev and rdma.

Before, Leon and I needed to sync before each release and create a
"shared code" pull request that included everything (core/rdma/netdev)
that was conflicting in the core arena.


> Thanks.

Re: [PATCH] selftests: bpf: config: enable NET_SCH_INGRESS for xdp_meta.sh

2018-05-18 Thread Daniel Borkmann

On 05/18/2018 08:23 PM, Anders Roxell wrote:
> When running bpf's selftest test_xdp_meta.sh it fails:
> ./test_xdp_meta.sh
> Error: Specified qdisc not found.
> selftests: test_xdp_meta [FAILED]
> 
> Need to enable CONFIG_NET_SCH_INGRESS and CONFIG_NET_CLS_ACT to get the
> test to pass.
> 
> Fixes: 22c8852624fc ("bpf: improve selftests and add tests for meta pointer")
> Signed-off-by: Anders Roxell 

Applied to bpf tree, thanks Anders!

Re: [PATCH bpf-next 0/3] nfp: bpf: complete shift supports on NFP JIT

2018-05-18 Thread Daniel Borkmann

On 05/18/2018 09:12 PM, Jakub Kicinski wrote:
> Jiong says:
> 
> NFP eBPF JIT is missing logic indirect shifts (both left and right) and
> arithmetic right shift (both indirect shift and shift by constant).
> 
> This patch adds support for them.
> 
> For indirect shifts, shift amount is not specified as constant, NFP needs
> to get the shift amount through the low 5 bits of source A operand in
> PREV_ALU, therefore extra instructions are needed compared with shifts by
> constants.
> 
> Because NFP is 32-bit, so we are using register pair for 64-bit shifts and
> therefore would need different instruction sequences depending on whether
> shift amount is less than 32 or not.
> 
> NFP branch-on-bit-test instruction emitter is added by this patch set and
> is used for efficient runtime check on shift amount. We'd think the shift
> amount is less than 32 if bit 5 is clear and greater than or equal to 32
> otherwise. A shift amount greater than or equal to 64 will result in
> undefined behavior.
> 
> This patch also use range info to avoid generating unnecessary runtime code
> if we are certain shift amount is less than 32 or not.
> 
> Jiong Wang (3):
>   nfp: bpf: support logic indirect shifts (BPF_[L|R]SH | BPF_X)
>   nfp: bpf: support arithmetic right shift by constant (BPF_ARSH |
> BPF_K)
>   nfp: bpf: support arithmetic indirect right shift (BPF_ARSH | BPF_X)
> 
>  drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 410 --
>  drivers/net/ethernet/netronome/nfp/bpf/main.h |  28 ++
>  .../net/ethernet/netronome/nfp/bpf/offload.c  |   2 +
>  .../net/ethernet/netronome/nfp/bpf/verifier.c |   8 +
>  drivers/net/ethernet/netronome/nfp/nfp_asm.h  |  18 +-
>  5 files changed, 435 insertions(+), 31 deletions(-)

Applied to bpf-next, thanks guys!

Re: WARNING in __static_key_slow_dec

2018-05-18 Thread Willem de Bruijn

On Fri, May 18, 2018 at 4:03 AM, DaeRyong Jeong  wrote:
> We report the crash: WARNING in __static_key_slow_dec
>
> This crash has been found in v4.8 using RaceFuzzer (a modified
> version of Syzkaller), which we describe more at the end of this
> report.
> Even though v4.8 is the relatively old version, we did manual verification
> and we think the bug still exists.
> Our analysis shows that the race occurs when invoking two syscalls
> concurrently, setsockopt() with optname SO_TIMESTAMPING and ioctl() with
> cmd SIOCGSTAMPNS.
>
>
> Diagnosis:
> We think if timestamp was previously enabled with
> SOCK_TIMESTAMPING_RX_SOFTWARE flag, the concurrent execution of
> sock_disable_timestamp() and sock_enable_timestamp() causes the crash.
>
>
> Thread interleaving:
> (Assume sk->flag has the SOCK_TIMESTAMPING_RX_SOFTWARE flag by the
> previous setsockopt() call with SO_TIMESTAMPING)
>
> CPU0 (sock_disable_timestamp())                 CPU1 (sock_enable_timestamp())
> =====                                           =====
> (flag == 1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)  (flag == SOCK_TIMESTAMP)
>
>                                                 if (!sock_flag(sk, flag)) {
>                                                     unsigned long previous_flags = sk->sk_flags;
>
> if (sk->sk_flags & flags) {
>     sk->sk_flags &= ~flags;
>     if (sock_needs_netstamp(sk) &&
>         !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
>         net_disable_timestamp();
>                                                     sock_set_flag(sk, flag);
>
>                                                     if (sock_needs_netstamp(sk) &&
>                                                         !(previous_flags & SK_FLAGS_TIMESTAMP))
>                                                         net_enable_timestamp();
>                                                     /* Here, net_enable_timestamp() is not called because
>                                                      * previous_flags has the SOCK_TIMESTAMPING_RX_SOFTWARE
>                                                      * flag
>                                                      */
> /* After the race, sk->sk has the flag SOCK_TIMESTAMP, but
>  * net_enable_timestamp() is not called one more time.
>  * Consequently, when the socket is closed, __sk_destruct()
>  * calls net_disable_timestamp() that leads WARNING.
>  */

Thanks for the detailed analysis.

Indeed the updates to sk->sk_flags and calls to net_(dis|en)able_timestamp
should happen atomically, but this is not the case. The setsockopt
path holds the socket lock, but not all ioctl paths.

Perhaps we can take lock_sock_fast in sock_get_timestamp and
variants.

Re: cascaded switch

2018-05-18 Thread Andrew Lunn

> Hi,
> 
> I mean the same terminology used in marvell's switch.(I don't think
> there is more than one terminology for this, please correct me if
> wrong).
> Anyway, I can see examples how it is done, but I don't understand the
> benefit of this constellation, and why device tree needs to be
> familiar with it.
> 
> <   switch 1  >---port10port10- <  switch 2 >
>  | | | | ||
> port 1-9 |  port 1-9 |
>  ||
>  ||
> --mdio--

Your ASCII art is all messed up, but i get what you mean.

This is the D in DSA. You would use this when a single switch does not
have enough ports for your use case. So you use two switches.

You need to tell each switch what links are used to get to other
switches. There is an internal routing table. So you need to describe
these links in device tree.

  Andrew

Re: [PATCH bpf-next v6 5/6] ipv6: sr: Add seg6local action End.BPF

2018-05-18 Thread Daniel Borkmann

On 05/17/2018 04:28 PM, Mathieu Xhonneux wrote:
> This patch adds the End.BPF action to the LWT seg6local infrastructure.
> This action works like any other seg6local End action, meaning that an IPv6
> header with SRH is needed, whose DA has to be equal to the SID of the
> action. It will also advance the SRH to the next segment, the BPF program
> does not have to take care of this.
> 
> Since the BPF program may not be a source of instability in the kernel, it
> is important to ensure that the integrity of the packet is maintained
> before yielding it back to the IPv6 layer. The hook hence keeps track if
> the SRH has been altered through the helpers, and re-validates its
> content if needed with seg6_validate_srh. The state kept for validation is
> stored in a per-CPU buffer. The BPF program is not allowed to directly
> write into the packet, and only some fields of the SRH can be altered
> through the helper bpf_lwt_seg6_store_bytes.
> 
> Performances profiling has shown that the SRH re-validation does not induce
> a significant overhead. If the altered SRH is deemed as invalid, the packet
> is dropped.
> 
> This validation is also done before executing any action through
> bpf_lwt_seg6_action, and will not be performed again if the SRH is not
> modified after calling the action.
> 
> The BPF program may return 3 types of return codes:
> - BPF_OK: the End.BPF action will look up the next destination through
>  seg6_lookup_nexthop.
> - BPF_REDIRECT: if an action has been executed through the
>   bpf_lwt_seg6_action helper, the BPF program should return this
>   value, as the skb's destination is already set and the default
>   lookup should not be performed.
> - BPF_DROP : the packet will be dropped.
> 
> Signed-off-by: Mathieu Xhonneux 
> Acked-by: David Lebrun 
[...]
>  static struct seg6_action_desc seg6_action_table[] = {
>   {
>   .action = SEG6_LOCAL_ACTION_END,
> @@ -497,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = {
>   .attrs  = (1 << SEG6_LOCAL_SRH),
>   .input  = input_action_end_b6_encap,
>   .static_headroom= sizeof(struct ipv6hdr),
> - }
> + },
> + {
> + .action = SEG6_LOCAL_ACTION_END_BPF,
> + .attrs  = (1 << SEG6_LOCAL_BPF),
> + .input  = input_action_end_bpf,
> + },
> +
>  };
>  
>  static struct seg6_action_desc *__get_action_desc(int action)
> @@ -542,6 +619,7 @@ static const struct nla_policy 
> seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
>   .len = sizeof(struct in6_addr) },
>   [SEG6_LOCAL_IIF]= { .type = NLA_U32 },
>   [SEG6_LOCAL_OIF]= { .type = NLA_U32 },
> + [SEG6_LOCAL_BPF]= { .type = NLA_NESTED },
>  };
>  
>  static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
> @@ -719,6 +797,71 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct 
> seg6_local_lwt *b)
>   return 0;
>  }
>  
> +#define MAX_PROG_NAME 256
> +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
> + [LWT_BPF_PROG_FD]   = { .type = NLA_U32, },

From a UAPI point of view, I wouldn't name it LWT_BPF_PROG_FD but rather
something like LWT_BPF_PROG, for example. That way, the setup can contain
the fd number, but on the dump you can put the prog->aux->id in there so
that the prog lookup can be done again.

> + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
> + .len = MAX_PROG_NAME },
> +};
> +
> +static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt)
> +{
> + struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
> + struct bpf_prog *p;
> + int ret;
> + u32 fd;
> +
> + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attrs[SEG6_LOCAL_BPF],
> +bpf_prog_policy, NULL);
> + if (ret < 0)
> + return ret;
> +
> + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
> + return -EINVAL;
> +
> + slwt->bpf.name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
> + if (!slwt->bpf.name)
> + return -ENOMEM;
> +
> + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
> + p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL);
> + if (IS_ERR(p))
> + return PTR_ERR(p);

There is definitely a bug in the above error path: you don't free the
previously allocated slwt->bpf.name from nla_memdup().

Also, when you destroy the struct seg6_local_lwt object, what I'm not getting
is where you drop the prog reference again and free slwt->bpf.name.

> +
> + slwt->bpf.prog = p;
> +
> + return 0;
> +}
> +
> +static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt)
> +{
> + struct nlattr *nest;
> +
> + if (!slwt->bpf.prog)
> + return 0;
> +
> + nest =

Re: [PATCH v4 3/3] bpf: add selftest for lirc_mode2 type program

2018-05-18 Thread Y Song

On Fri, May 18, 2018 at 7:07 AM, Sean Young  wrote:
> This is simple test over rc-loopback.
>
> Signed-off-by: Sean Young 

Acked-by: Yonghong Song 

> ---
>  tools/bpf/bpftool/prog.c  |   1 +
>  tools/include/uapi/linux/bpf.h|  53 -
>  tools/include/uapi/linux/lirc.h   | 217 ++
>  tools/lib/bpf/libbpf.c|   1 +
>  tools/testing/selftests/bpf/Makefile  |   8 +-
>  tools/testing/selftests/bpf/bpf_helpers.h |   6 +
>  .../testing/selftests/bpf/test_lirc_mode2.sh  |  28 +++
>  .../selftests/bpf/test_lirc_mode2_kern.c  |  23 ++
>  .../selftests/bpf/test_lirc_mode2_user.c  | 154 +
>  9 files changed, 487 insertions(+), 4 deletions(-)
>  create mode 100644 tools/include/uapi/linux/lirc.h
>  create mode 100755 tools/testing/selftests/bpf/test_lirc_mode2.sh
>  create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_kern.c
>  create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_user.c
>
> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index 9bdfdf2d3fbe..07f1ace39a46 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -71,6 +71,7 @@ static const char * const prog_type_name[] = {
> [BPF_PROG_TYPE_SK_MSG]  = "sk_msg",
> [BPF_PROG_TYPE_RAW_TRACEPOINT]  = "raw_tracepoint",
> [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
> +   [BPF_PROG_TYPE_LIRC_MODE2]  = "lirc_mode2",
>  };
>
>  static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index d94d333a8225..8227832b713e 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -141,6 +141,7 @@ enum bpf_prog_type {
> BPF_PROG_TYPE_SK_MSG,
> BPF_PROG_TYPE_RAW_TRACEPOINT,
> BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
> +   BPF_PROG_TYPE_LIRC_MODE2,
>  };
>
>  enum bpf_attach_type {
> @@ -158,6 +159,7 @@ enum bpf_attach_type {
> BPF_CGROUP_INET6_CONNECT,
> BPF_CGROUP_INET4_POST_BIND,
> BPF_CGROUP_INET6_POST_BIND,
> +   BPF_LIRC_MODE2,
> __MAX_BPF_ATTACH_TYPE
>  };
>
> @@ -1902,6 +1904,53 @@ union bpf_attr {
>   * egress otherwise). This is the only flag supported for now.
>   * Return
>   * **SK_PASS** on success, or **SK_DROP** on error.
> + *
> + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
> + * Description
> + * This helper is used in programs implementing IR decoding, to
> + * report a successfully decoded key press with *scancode*,
> + * *toggle* value in the given *protocol*. The scancode will be
> + * translated to a keycode using the rc keymap, and reported as
> + * an input key down event. After a period a key up event is
> + * generated. This period can be extended by calling either
> + * **bpf_rc_keydown** () with the same values, or calling
> + * **bpf_rc_repeat** ().
> + *
> + * Some protocols include a toggle bit, in case the button
> + * was released and pressed again between consecutive scancodes
> + *
> + * The *ctx* should point to the lirc sample as passed into
> + * the program.
> + *
> + * The *protocol* is the decoded protocol number (see
> + * **enum rc_proto** for some predefined values).
> + *
> + * This helper is only available if the kernel was compiled with
> + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
> + * "**y**".
> + *
> + * Return
> + * 0
> + *
> + * int bpf_rc_repeat(void *ctx)
> + * Description
> + * This helper is used in programs implementing IR decoding, to
> + * report a successfully decoded repeat key message. This delays
> + * the generation of a key up event for previously generated
> + * key down event.
> + *
> + * Some IR protocols like NEC have a special IR message for
> + * repeating last button, for when a button is held down.
> + *
> + * The *ctx* should point to the lirc sample as passed into
> + * the program.
> + *
> + * This helper is only available if the kernel was compiled with
> + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
> + * "**y**".
> + *
> + * Return
> + * 0
>   */
>  #define __BPF_FUNC_MAPPER(FN)  \
> FN(unspec), \
> @@ -1976,7 +2025,9 @@ union bpf_attr {
> FN(fib_lookup), \
> FN(sock_hash_update),   \
> FN(msg_redirect_hash),  \
> -   FN(sk_redirect_hash),
> +   FN(sk_redirect_hash),   \
> +

Re: [PATCH v4 2/3] media: rc: introduce BPF_PROG_LIRC_MODE2

2018-05-18 Thread Y Song

On Fri, May 18, 2018 at 7:07 AM, Sean Young  wrote:
> Add support for BPF_PROG_LIRC_MODE2. This type of BPF program can call
> rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report
> that the last key should be repeated.
>
> The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall;
> the target_fd must be the /dev/lircN device.
>
> Signed-off-by: Sean Young 

Acked-by: Yonghong Song 

> ---
>  drivers/media/rc/Kconfig|  13 ++
>  drivers/media/rc/Makefile   |   1 +
>  drivers/media/rc/bpf-lirc.c | 308 
>  drivers/media/rc/lirc_dev.c |  30 
>  drivers/media/rc/rc-core-priv.h |  22 +++
>  drivers/media/rc/rc-ir-raw.c|  12 +-
>  include/linux/bpf_rcdev.h   |  30 
>  include/linux/bpf_types.h   |   3 +
>  include/uapi/linux/bpf.h|  53 +-
>  kernel/bpf/syscall.c|   7 +
>  10 files changed, 476 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/media/rc/bpf-lirc.c
>  create mode 100644 include/linux/bpf_rcdev.h
>
> diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
> index eb2c3b6eca7f..d5b35a6ba899 100644
> --- a/drivers/media/rc/Kconfig
> +++ b/drivers/media/rc/Kconfig
> @@ -25,6 +25,19 @@ config LIRC
>passes raw IR to and from userspace, which is needed for
>IR transmitting (aka "blasting") and for the lirc daemon.
>
> +config BPF_LIRC_MODE2
> +   bool "Support for eBPF programs attached to lirc devices"
> +   depends on BPF_SYSCALL
> +   depends on RC_CORE=y
> +   depends on LIRC
> +   help
> +  Allow attaching eBPF programs to a lirc device using the bpf(2)
> +  syscall command BPF_PROG_ATTACH. This is supported for raw IR
> +  receivers.
> +
> +  These eBPF programs can be used to decode IR into scancodes, for
> +  IR protocols not supported by the kernel decoders.
> +
>  menuconfig RC_DECODERS
> bool "Remote controller decoders"
> depends on RC_CORE
> diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile
> index 2e1c87066f6c..e0340d043fe8 100644
> --- a/drivers/media/rc/Makefile
> +++ b/drivers/media/rc/Makefile
> @@ -5,6 +5,7 @@ obj-y += keymaps/
>  obj-$(CONFIG_RC_CORE) += rc-core.o
>  rc-core-y := rc-main.o rc-ir-raw.o
>  rc-core-$(CONFIG_LIRC) += lirc_dev.o
> +rc-core-$(CONFIG_BPF_LIRC_MODE2) += bpf-lirc.o
>  obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o
>  obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o
>  obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o
> diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
> new file mode 100644
> index ..c9673df2d9cd
> --- /dev/null
> +++ b/drivers/media/rc/bpf-lirc.c
> @@ -0,0 +1,308 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// bpf-lirc.c - handles bpf
> +//
> +// Copyright (C) 2018 Sean Young 
> +
> +#include 
> +#include 
> +#include 
> +#include "rc-core-priv.h"
> +
> +/*
> + * BPF interface for raw IR
> + */
> +const struct bpf_prog_ops lirc_mode2_prog_ops = {
> +};
> +
> +BPF_CALL_1(bpf_rc_repeat, u32*, sample)
> +{
> +   struct ir_raw_event_ctrl *ctrl;
> +
> +   ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
> +
> +   rc_repeat(ctrl->dev);
> +
> +   return 0;
> +}
> +
> +static const struct bpf_func_proto rc_repeat_proto = {
> +   .func  = bpf_rc_repeat,
> +   .gpl_only  = true, /* rc_repeat is EXPORT_SYMBOL_GPL */
> +   .ret_type  = RET_INTEGER,
> +   .arg1_type = ARG_PTR_TO_CTX,
> +};
> +
> +/*
> + * Currently rc-core does not support 64-bit scancodes, but there are many
> + * known protocols with more than 32 bits. So, define the interface as u64
> + * as a future-proof.
> + */
> +BPF_CALL_4(bpf_rc_keydown, u32*, sample, u32, protocol, u64, scancode,
> +  u32, toggle)
> +{
> +   struct ir_raw_event_ctrl *ctrl;
> +
> +   ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
> +
> +   rc_keydown(ctrl->dev, protocol, scancode, toggle != 0);
> +
> +   return 0;
> +}
> +
> +static const struct bpf_func_proto rc_keydown_proto = {
> +   .func  = bpf_rc_keydown,
> +   .gpl_only  = true, /* rc_keydown is EXPORT_SYMBOL_GPL */
> +   .ret_type  = RET_INTEGER,
> +   .arg1_type = ARG_PTR_TO_CTX,
> +   .arg2_type = ARG_ANYTHING,
> +   .arg3_type = ARG_ANYTHING,
> +   .arg4_type = ARG_ANYTHING,
> +};
> +
> +static const struct bpf_func_proto *
> +lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> +   switch (func_id) {
> +   case BPF_FUNC_rc_repeat:
> +   return &rc_repeat_proto;
> +   case BPF_FUNC_rc_keydown:
> +   return &rc_keydown_proto;
> +   case BPF_FUNC_map_lookup_elem:
> +   return &bpf_map_lookup_elem_proto;
> +   case BPF_FUNC_map_update_elem:
> +   return &bpf_map_update_elem_proto;
> +

Re: cascaded switch

2018-05-18 Thread Ran Shalit

On Fri, May 18, 2018 at 10:13 PM, Andrew Lunn  wrote:
> On Fri, May 18, 2018 at 09:35:38PM +0300, Ran Shalit wrote:
>> Hello,
>>
>> I am trying to understand the concept of cascaded switch.
>> I haven't found much information on this topic.
>>
>> Can anyone please explain the general concept, when is it used, and
>> why does the device tree need to know about cascaded switch ?
>
> Hi Ran
>
> I think you first need to define what you mean by cascaded switches.
>
Hi,

I mean the same terminology used in Marvell's switches. (I don't think
there is more than one terminology for this; please correct me if I am
wrong.)
Anyway, I can see examples how it is done, but I don't understand the
benefit of this constellation, and why device tree needs to be
familiar with it.

<   switch 1  >---port10port10- <  switch 2 >
 | | | | ||
port 1-9 |  port 1-9 |
 ||
 ||
--mdio--

The term "cascaded switches" is also used in dsa documentation in device tree:
https://www.kernel.org/doc/Documentation/networking/dsa/dsa.txt


Regards,
Ranran

>   Andrew

Re: [PATCH v4 1/3] bpf: bpf_prog_array_copy() should return -ENOENT if exclude_prog not found

2018-05-18 Thread Y Song

On Fri, May 18, 2018 at 7:07 AM, Sean Young  wrote:
> This makes it possible for bpf prog detach to return -ENOENT.
>
> Signed-off-by: Sean Young 

Acked-by: Yonghong Song

Re: [PATCH bpf v2 5/6] tools: bpftool: resolve calls without using imm field

2018-05-18 Thread Jakub Kicinski

On Fri, 18 May 2018 18:20:38 +0530, Sandipan Das wrote:
> Currently, we resolve the callee's address for a JITed function
> call by using the imm field of the call instruction as an offset
> from __bpf_call_base. If bpf_jit_kallsyms is enabled, we further
> use this address to get the callee's kernel symbol's name.
> 
> For some architectures, such as powerpc64, the imm field is not
> large enough to hold this offset. So, instead of assigning this
> offset to the imm field, the verifier now assigns the subprog
> id. Also, a list of kernel symbol addresses for all the JITed
> functions is provided in the program info. We now use the imm
> field as an index for this list to lookup a callee's symbol's
> address and resolve its name.
> 
> Suggested-by: Daniel Borkmann 
> Signed-off-by: Sandipan Das 
> ---
> v2:
>  - Order variables from longest to shortest
>  - Make sure that ksyms_ptr and ksyms_len are always initialized
>  - Simplify code

Thanks for the improvements!  Since there will be a v3, two minor
nitpicks still :)

>  tools/bpf/bpftool/prog.c  | 29 +
>  tools/bpf/bpftool/xlated_dumper.c | 10 +-
>  tools/bpf/bpftool/xlated_dumper.h |  2 ++
>  3 files changed, 40 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index 9bdfdf2d3fbe..e2f8f8f259fc 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -421,19 +421,26 @@ static int do_show(int argc, char **argv)
>  static int do_dump(int argc, char **argv)
>  {
>   struct bpf_prog_info info = {};
> + unsigned long *addrs = NULL;
>   struct dump_data dd = {};
>   __u32 len = sizeof(info);
>   unsigned int buf_size;
> + unsigned int nr_addrs;
>   char *filepath = NULL;
>   bool opcodes = false;
>   bool visual = false;
>   unsigned char *buf;
>   __u32 *member_len;
>   __u64 *member_ptr;
> + __u32 *ksyms_len;
> + __u64 *ksyms_ptr;
>   ssize_t n;
>   int err;
>   int fd;
>  
> + ksyms_len = &info.nr_jited_ksyms;
> + ksyms_ptr = &info.jited_ksyms;

I'm not sure why you need these, why not just access
info.nr_jited_ksyms and info.jited_ksyms directly?  "member" variables
are there because jited and xlated images get returned in different
member of struct bpf_prog_info.

>   if (is_prefix(*argv, "jited")) {
>   member_len = _prog_len;
>   member_ptr = _prog_insns;
> @@ -496,10 +503,22 @@ static int do_dump(int argc, char **argv)
>   return -1;
>   }
>  
> + nr_addrs = *ksyms_len;
> + if (nr_addrs) {
> + addrs = malloc(nr_addrs * sizeof(__u64));
> + if (!addrs) {
> + p_err("mem alloc failed");
> + close(fd);
> + goto err_free;
> + }
> + }
> +
>   memset(&info, 0, sizeof(info));
>  
>   *member_ptr = ptr_to_u64(buf);
>   *member_len = buf_size;
> + *ksyms_ptr = ptr_to_u64(addrs);
> + *ksyms_len = nr_addrs;
>  
>   err = bpf_obj_get_info_by_fd(fd, &info, &len);
>   close(fd);
> @@ -513,6 +532,11 @@ static int do_dump(int argc, char **argv)
>   goto err_free;
>   }
>  
> + if (*ksyms_len > nr_addrs) {
> + p_err("too many addresses returned");
> + goto err_free;
> + }
> +
>   if ((member_len == _prog_len &&
>info.jited_prog_insns == 0) ||
>   (member_len == _prog_len &&
> @@ -558,6 +582,9 @@ static int do_dump(int argc, char **argv)
>   dump_xlated_cfg(buf, *member_len);
>   } else {
>   kernel_syms_load();
> + dd.nr_jited_ksyms = *ksyms_len;
> + dd.jited_ksyms = (__u64 *) *ksyms_ptr;
> +
>   if (json_output)
>   dump_xlated_json(&dd, buf, *member_len, opcodes);
>   else

> diff --git a/tools/bpf/bpftool/xlated_dumper.c 
> b/tools/bpf/bpftool/xlated_dumper.c
> index 7a3173b76c16..fb065b55db6d 100644
> --- a/tools/bpf/bpftool/xlated_dumper.c
> +++ b/tools/bpf/bpftool/xlated_dumper.c
> @@ -203,6 +207,10 @@ static const char *print_call(void *private_data,
>   unsigned long address = dd->address_call_base + insn->imm;
>   struct kernel_sym *sym;
>  
> + if (insn->src_reg == BPF_PSEUDO_CALL &&
> + (__u32) insn->imm < dd->nr_jited_ksyms)

Indentation seems off.

> + address = dd->jited_ksyms[insn->imm];
> +
>   sym = kernel_syms_search(dd, address);
>   if (insn->src_reg == BPF_PSEUDO_CALL)
>   return print_call_pcrel(dd, sym, address, insn);

pull request: bluetooth-next 2018-05-18

2018-05-18 Thread Johan Hedberg

Hi Dave,

Here's the first bluetooth-next pull request for the 4.18 kernel:

 - Refactoring of the btbcm driver
 - New USB IDs for QCA_ROME and LiteOn controllers
 - Buffer overflow fix if the controller sends invalid advertising data length
 - Various cleanups & fixes for Qualcomm controllers

Please let me know if there are any issues pulling. Thanks.

Johan

---
The following changes since commit 538e2de104cfb4ef1acb35af42427bff42adbe4d:

  Merge branch 'net-Allow-more-drivers-with-COMPILE_TEST' (2018-05-17 17:11:07 
-0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git 
for-upstream

for you to fetch changes up to df2445bf77833674ebf790d2e6fcfd1d389b8a7b:

  Bluetooth: Add a new 13d3:3496 QCA_ROME device (2018-05-18 06:37:52 +0200)


Amit Pundir (1):
  Bluetooth: hci_qca: Avoid missing rampatch failure with userspace fw 
loader

Chriz Chow (1):
  Bluetooth: Prevent buffer overflow for large advertisement data

Fabio Estevam (1):
  Bluetooth: hci_ldisc: Provide a 'default' switch case

Hans de Goede (8):
  Bluetooth: hci_bcm: Add broken-irq dmi blacklist and add Meegopad T08 to 
it
  Bluetooth: hci_bcm: Remove irq-active-low DMI quirk for the Thinkpad 8
  Bluetooth: btbcm: Stop using upper nibble of rev to chose between 
uart/USB paths
  Bluetooth: btbcm: Factor out common code to determine subversion
  Bluetooth: btbcm: Make btbcm_initialize() also work for USB connected 
devices
  Bluetooth: btbcm: Allow using btbcm_initialize() for reinit
  Bluetooth: btbcm: Remove duplicate code from btbcm_setup_patchram()
  Bluetooth: btbcm: btbcm_initialize(): Initialize hw_name to "BCM"

John Keeping (1):
  Bluetooth: use wait_event API instead of open-coding it

João Paulo Rechi Vita (1):
  Bluetooth: Add a new 13d3:3496 QCA_ROME device

Loic Poulain (3):
  Bluetooth: btqcomsmd: Fix rx/tx stats
  Bluetooth: Add __hci_cmd_send function
  Bluetooth: btqca: Add AR3002 rampatch support

Srinivas Kandagatla (1):
  arm64: dts: apq8096-db820c: Enable wlan and bt en pins

Thierry Escande (3):
  arm64: dts: apq8096-db820c: enable bluetooth node
  dt-bindings: net: bluetooth: Add qualcomm-bluetooth
  Bluetooth: hci_qca: Add serdev support

Vic Wei (1):
  Bluetooth: btusb: add ID for LiteOn 04ca:301a

 .../devicetree/bindings/net/qualcomm-bluetooth.txt |  30 +++
 arch/arm64/boot/dts/qcom/apq8096-db820c-pins.dtsi  |  26 +++
 .../boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi|  32 
 arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi   |  61 +++
 arch/arm64/boot/dts/qcom/msm8996.dtsi  |  10 +
 drivers/bluetooth/Kconfig  |   1 +
 drivers/bluetooth/btbcm.c  | 201 +++--
 drivers/bluetooth/btbcm.h  |   5 +-
 drivers/bluetooth/btqca.c  | 104 +--
 drivers/bluetooth/btqca.h  |  11 +-
 drivers/bluetooth/btqcomsmd.c  |  10 +
 drivers/bluetooth/btusb.c  |   2 +
 drivers/bluetooth/hci_bcm.c|  35 ++--
 drivers/bluetooth/hci_ldisc.c  |   2 +
 drivers/bluetooth/hci_qca.c| 116 +++-
 include/net/bluetooth/hci_core.h   |   2 +
 net/bluetooth/hci_core.c   |  31 
 net/bluetooth/hci_event.c  |  12 +-
 net/bluetooth/hci_request.c|  30 +--
 19 files changed, 480 insertions(+), 241 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt


signature.asc
Description: PGP signature

Re: [PATCH] isdn: eicon: fix a missing-check bug

2018-05-18 Thread Wenwen Wang

Thanks for your suggestion, David! I will revise the patch and resubmit it.

Wenwen

On Fri, May 11, 2018 at 2:50 PM, David Miller  wrote:
> From: Wenwen Wang 
> Date: Sat,  5 May 2018 14:32:46 -0500
>
>> To avoid such issues, this patch adds a check after the second copy in the
>> function diva_xdi_write(). If the adapter number is not equal to the one
>> obtained in the first copy, (-4) will be returned to divas_write(), which
>> will then return an error code -EINVAL.
>
> Better fix is to copy the msg header once into an on-stack buffer supplied
> by diva_write() to diva_xdi_open_adapter(), which is then passed on to
> diva_xdi_write() with an adjusted src pointer and length.

Re: cascaded switch

2018-05-18 Thread Andrew Lunn

On Fri, May 18, 2018 at 09:35:38PM +0300, Ran Shalit wrote:
> Hello,
> 
> I am trying to understand the concept of cascaded switch.
> I haven't found much information on this topic.
> 
> Can anyone please explain the general concept, when is it used, and
> why does the device tree need to know about cascaded switch ?

Hi Ran

I think you first need to define what you mean by cascaded switches.

  Andrew

[PATCH bpf-next 3/3] nfp: bpf: support arithmetic indirect right shift (BPF_ARSH | BPF_X)

2018-05-18 Thread Jakub Kicinski

From: Jiong Wang 

The code logic is similar to arithmetic right shift by constant, and NFP
gets the indirect shift amount through the source A operand of PREV_ALU.

It is possible to fall back to logical right shift if the MSB is known to be
zero from range info; however, there is no benefit in doing so, given that
logical indirect right shift uses the same number of instructions and cycles.

Suppose the MSB of regX is the bit we want to replicate to fill in all the
vacant positions, and regY contains the shift amount; then we could use
a single instruction to set up both.

  [alu, --, regY, OR, regX]

  --
  NOTE: the PREV_ALU result doesn't need to write to any destination
register.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 99 ++--
 1 file changed, 89 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index f73242c4da2f..8a92088df0d7 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1919,29 +1919,26 @@ static int shr_reg64(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
 /* Code logic is the same as __shr_imm64 except ashr requires signedness bit
  * told through PREV_ALU result.
  */
-static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
 {
-   const struct bpf_insn *insn = &meta->insn;
-   u8 dst = insn->dst_reg * 2;
-
-   if (insn->imm < 32) {
+   if (shift_amt < 32) {
emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
-reg_b(dst), SHF_SC_R_DSHF, insn->imm);
+reg_b(dst), SHF_SC_R_DSHF, shift_amt);
/* Set signedness bit. */
emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
 reg_imm(0));
emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
-reg_b(dst + 1), SHF_SC_R_SHF, insn->imm);
-   } else if (insn->imm == 32) {
+reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
+   } else if (shift_amt == 32) {
/* NOTE: this also helps setting signedness bit. */
wrp_reg_mov(nfp_prog, dst, dst + 1);
emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
 reg_b(dst + 1), SHF_SC_R_SHF, 31);
-   } else if (insn->imm > 32) {
+   } else if (shift_amt > 32) {
emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
 reg_imm(0));
emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
-reg_b(dst + 1), SHF_SC_R_SHF, insn->imm - 32);
+reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
 reg_b(dst + 1), SHF_SC_R_SHF, 31);
}
@@ -1949,6 +1946,87 @@ static int ashr_imm64(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
return 0;
 }
 
+static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   const struct bpf_insn *insn = &meta->insn;
+   u8 dst = insn->dst_reg * 2;
+
+   return __ashr_imm64(nfp_prog, dst, insn->imm);
+}
+
+static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+   /* NOTE: the first insn will set both indirect shift amount (source A)
+* and signedness bit (MSB of result).
+*/
+   emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
+   emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
+  reg_b(dst + 1), SHF_SC_R_SHF);
+}
+
+static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+   /* NOTE: it is the same as logic shift because we don't need to shift in
+* signedness bit when the shift amount is less than 32.
+*/
+   return shr_reg64_lt32_low(nfp_prog, dst, src);
+}
+
+static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+   ashr_reg64_lt32_low(nfp_prog, dst, src);
+   ashr_reg64_lt32_high(nfp_prog, dst, src);
+}
+
+static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+   emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
+   emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
+  reg_b(dst + 1), SHF_SC_R_SHF);
+   emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
+reg_b(dst + 1), SHF_SC_R_SHF, 31);
+}
+
+/* Like ashr_imm64, but need to use indirect shift. */
+static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   const

[PATCH bpf-next 1/3] nfp: bpf: support logic indirect shifts (BPF_[L|R]SH | BPF_X)

2018-05-18 Thread Jakub Kicinski

From: Jiong Wang 

For indirect shifts, shift amount is not specified as constant, NFP needs
to get the shift amount through the low 5 bits of source A operand in
PREV_ALU, therefore extra instructions are needed compared with shifts by
constants.

Because NFP is 32-bit, we use a register pair for 64-bit shifts and
therefore need different instruction sequences depending on whether the
shift amount is less than 32 or not.

An NFP branch-on-bit-test instruction emitter is added by this patch and is
used for an efficient runtime check on the shift amount. We treat the shift
amount as less than 32 if bit 5 is clear, and as greater than or equal to 32
otherwise. A shift amount greater than or equal to 64 results in
undefined behavior.

This patch also uses range info to avoid generating unnecessary runtime code
if we are certain whether the shift amount is less than 32 or not.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 299 --
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  28 ++
 .../net/ethernet/netronome/nfp/bpf/offload.c  |   2 +
 .../net/ethernet/netronome/nfp/bpf/verifier.c |   8 +
 drivers/net/ethernet/netronome/nfp/nfp_asm.h  |  17 +-
 5 files changed, 322 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index a4d3da215863..4cff08771951 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -211,6 +211,60 @@ emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 
addr, u8 defer)
emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
 }
 
+static void
+__emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 
defer,
+ bool set, bool src_lmextn)
+{
+   u16 addr_lo, addr_hi;
+   u64 insn;
+
+   addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
+   addr_hi = addr != addr_lo;
+
+   insn = OP_BR_BIT_BASE |
+   FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
+   FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
+   FIELD_PREP(OP_BR_BIT_BV, set) |
+   FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
+   FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
+   FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
+   FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);
+
+   nfp_prog_push(nfp_prog, insn);
+}
+
+static void
+emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
+u8 defer, bool set, enum nfp_relo_type relo)
+{
+   struct nfp_insn_re_regs reg;
+   int err;
+
+   /* NOTE: The bit to test is specified as an rotation amount, such that
+*   the bit to test will be placed on the MSB of the result when
+*   doing a rotate right. For bit X, we need right rotate X + 1.
+*/
+   bit += 1;
+
+   err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
+   if (err) {
+   nfp_prog->error = err;
+   return;
+   }
+
+   __emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
+ reg.src_lmextn);
+
+   nfp_prog->prog[nfp_prog->prog_len - 1] |=
+   FIELD_PREP(OP_RELO_TYPE, relo);
+}
+
+static void
+emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
+{
+   emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
+}
+
 static void
 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
 enum immed_width width, bool invert,
@@ -309,6 +363,19 @@ emit_shf(struct nfp_prog *nfp_prog, swreg dst,
   reg.dst_lmextn, reg.src_lmextn);
 }
 
+static void
+emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
+  swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
+{
+   if (sc == SHF_SC_R_ROT) {
+   pr_err("indirect shift is not allowed on rotation\n");
+   nfp_prog->error = -EFAULT;
+   return;
+   }
+
+   emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
+}
+
 static void
 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
@@ -1629,56 +1696,226 @@ static int neg_reg64(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
return 0;
 }
 
-static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-   const struct bpf_insn *insn = >insn;
-   u8 dst = insn->dst_reg * 2;
-
-   if (insn->imm < 32) {
-   emit_shf(nfp_prog, reg_both(dst + 1),
-reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
-SHF_SC_R_DSHF, 32 - insn->imm);
-   emit_shf(nfp_prog, reg_both(dst),
-reg_none(), SHF_OP_NONE, reg_b(dst),
-

Re: [PATCH v2 1/1] sh_eth: add RGMII support

2018-05-18 Thread Andrew Lunn

On Fri, May 18, 2018 at 09:30:18PM +0300, Sergei Shtylyov wrote:
> The R-Car V3H (AKA R8A77980) GEther controller  adds support for the RGMII
> PHY interface mode as a new  value  for the RMII_MII register.
> 
> Based on the original (and large) patch by Vladimir Barinov.
> 
> Signed-off-by: Vladimir Barinov 
> Signed-off-by: Sergei Shtylyov 
> 
> ---
> Changes in version 2:
> - included PHY_INTERFACE_MODE_RGMII_{|RX|TX}ID in the RGMII *case*.
> 
>  drivers/net/ethernet/renesas/sh_eth.c |3 +++
>  1 file changed, 3 insertions(+)
> 
> Index: net-next/drivers/net/ethernet/renesas/sh_eth.c
> ===
> --- net-next.orig/drivers/net/ethernet/renesas/sh_eth.c
> +++ net-next/drivers/net/ethernet/renesas/sh_eth.c
> @@ -466,6 +466,9 @@ static void sh_eth_select_mii(struct net
>   u32 value;
>  
>   switch (mdp->phy_interface) {
> + case PHY_INTERFACE_MODE_RGMII ... PHY_INTERFACE_MODE_RGMII_TXID:
> + value = 0x3;
> + break;

Ah, you don't see that form used very often.
I just checked, clang/llvm should also support it.

Reviewed-by: Andrew Lunn 

Andrew

[PATCH bpf-next 2/3] nfp: bpf: support arithmetic right shift by constant (BPF_ARSH | BPF_K)

2018-05-18 Thread Jakub Kicinski

From: Jiong Wang 

The code logic is similar to logical right shift, except that we also need to
set the PREV_ALU result properly, the MSB of which is the bit that will be
replicated to fill in all the vacant positions.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 34 
 drivers/net/ethernet/netronome/nfp/nfp_asm.h |  1 +
 2 files changed, 35 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 4cff08771951..f73242c4da2f 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1916,6 +1916,39 @@ static int shr_reg64(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
return 0;
 }
 
+/* Code logic is the same as __shr_imm64 except ashr requires signedness bit
+ * told through PREV_ALU result.
+ */
+static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   const struct bpf_insn *insn = &meta->insn;
+   u8 dst = insn->dst_reg * 2;
+
+   if (insn->imm < 32) {
+   emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
+reg_b(dst), SHF_SC_R_DSHF, insn->imm);
+   /* Set signedness bit. */
+   emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
+reg_imm(0));
+   emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
+reg_b(dst + 1), SHF_SC_R_SHF, insn->imm);
+   } else if (insn->imm == 32) {
+   /* NOTE: this also helps setting signedness bit. */
+   wrp_reg_mov(nfp_prog, dst, dst + 1);
+   emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
+reg_b(dst + 1), SHF_SC_R_SHF, 31);
+   } else if (insn->imm > 32) {
+   emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
+reg_imm(0));
+   emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
+reg_b(dst + 1), SHF_SC_R_SHF, insn->imm - 32);
+   emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
+reg_b(dst + 1), SHF_SC_R_SHF, 31);
+   }
+
+   return 0;
+}
+
 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
const struct bpf_insn *insn = >insn;
@@ -2742,6 +2775,7 @@ static const instr_cb_t instr_cb[256] = {
[BPF_ALU64 | BPF_LSH | BPF_K] = shl_imm64,
[BPF_ALU64 | BPF_RSH | BPF_X] = shr_reg64,
[BPF_ALU64 | BPF_RSH | BPF_K] = shr_imm64,
+   [BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
[BPF_ALU | BPF_MOV | BPF_X] =   mov_reg,
[BPF_ALU | BPF_MOV | BPF_K] =   mov_imm,
[BPF_ALU | BPF_XOR | BPF_X] =   xor_reg,
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h 
b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index fa826bd9c668..f6677bc9875a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -174,6 +174,7 @@ enum shf_op {
SHF_OP_NONE = 0,
SHF_OP_AND = 2,
SHF_OP_OR = 5,
+   SHF_OP_ASHR = 6,
 };
 
 enum shf_sc {
-- 
2.17.0

[PATCH bpf-next 0/3] nfp: bpf: complete shift supports on NFP JIT

2018-05-18 Thread Jakub Kicinski

Jiong says:

NFP eBPF JIT is missing logic indirect shifts (both left and right) and
arithmetic right shift (both indirect shift and shift by constant).

This patch adds support for them.

For indirect shifts, shift amount is not specified as constant, NFP needs
to get the shift amount through the low 5 bits of source A operand in
PREV_ALU, therefore extra instructions are needed compared with shifts by
constants.

Because NFP is 32-bit, so we are using register pair for 64-bit shifts and
therefore would need different instruction sequences depending on whether
shift amount is less than 32 or not.

An NFP branch-on-bit-test instruction emitter is added by this patch set and
is used for an efficient runtime check on the shift amount. We treat the shift
amount as less than 32 if bit 5 is clear, and as greater than or equal to 32
otherwise. A shift amount greater than or equal to 64 results in
undefined behavior.

This patch set also uses range info to avoid generating unnecessary runtime
code if we are certain whether the shift amount is less than 32 or not.


Jiong Wang (3):
  nfp: bpf: support logic indirect shifts (BPF_[L|R]SH | BPF_X)
  nfp: bpf: support arithmetic right shift by constant (BPF_ARSH |
BPF_K)
  nfp: bpf: support arithmetic indirect right shift (BPF_ARSH | BPF_X)

 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 410 --
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  28 ++
 .../net/ethernet/netronome/nfp/bpf/offload.c  |   2 +
 .../net/ethernet/netronome/nfp/bpf/verifier.c |   8 +
 drivers/net/ethernet/netronome/nfp/nfp_asm.h  |  18 +-
 5 files changed, 435 insertions(+), 31 deletions(-)

-- 
2.17.0

[PATCH RFC net-next 1/1] tcp: close socket without reset on incoming data

2018-05-18 Thread Debabrata Banerjee

When TCP_CLOSE_NORST is set before a close(), offload sinking of
unwanted data to the kernel with low resource usage, with a timeout of
TCP_LINGER2. The socket will transition to FIN_WAIT1 and then FIN_WAIT2
where it will ack data until either the timeout is hit, or a RST or FIN
is received.

Signed-off-by: Debabrata Banerjee 
---
 include/linux/tcp.h  |  4 +++-
 include/uapi/linux/tcp.h |  2 +-
 net/ipv4/tcp.c   | 23 +--
 net/ipv4/tcp_input.c | 16 
 net/ipv4/tcp_minisocks.c | 15 +++
 5 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 72705eaf4b84..bd44bc99b480 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -226,7 +226,8 @@ struct tcp_sock {
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a 
cookie */
is_sack_reneg:1,/* in recovery from loss with SACK reneg? */
-   unused:2;
+   norst:1,/* Don't send RST on shutdown() socket */
+   unused:1;
u8  nonagle : 4,/* Disable Nagle algorithm? */
thin_lto: 1,/* Use linear timeouts for thin streams */
recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
@@ -429,6 +430,7 @@ struct tcp_timewait_sock {
 #ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *tw_md5_key;
 #endif
+   int   tw_norst;
 };
 
 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 29eb659aa77a..369f3402b669 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -124,8 +124,8 @@ enum {
 #define TCP_FASTOPEN_NO_COOKIE 34  /* Enable TFO without a TFO cookie */
 #define TCP_ZEROCOPY_RECEIVE   35
 #define TCP_INQ36  /* Notify bytes available to 
read as a cmsg on read */
-
 #define TCP_CM_INQ TCP_INQ
+#define TCP_CLOSE_NORST37  /* Don't send RST on close()'d 
socket */
 
 struct tcp_repair_opt {
__u32   opt_code;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0a2ea0bbf867..29fe763002e5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2318,8 +2318,10 @@ void tcp_close(struct sock *sk, long timeout)
struct sk_buff *skb;
int data_was_unread = 0;
int state;
+   struct tcp_sock *tp;
 
lock_sock(sk);
+   tp = tcp_sk(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
 
if (sk->sk_state == TCP_LISTEN) {
@@ -2362,8 +2364,19 @@ void tcp_close(struct sock *sk, long timeout)
} else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
-   tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+
+   if (unlikely(tp->norst)) {
+   if (tcp_close_state(sk)) {
+   /* We will discard all new incoming data
+* set window to max of current or init.
+*/
+   tp->rcv_wnd = max(tp->rcv_wnd, MAX_TCP_WINDOW);
+   tcp_send_fin(sk);
+   }
+   } else {
+   tcp_set_state(sk, TCP_CLOSE);
+   tcp_send_active_reset(sk, sk->sk_allocation);
+   }
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -3040,6 +3053,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->recvmsg_inq = val;
break;
+   case TCP_CLOSE_NORST:
+   tp->norst = !!val;
+   break;
default:
err = -ENOPROTOOPT;
break;
@@ -3523,6 +3539,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return err;
}
 #endif
+   case TCP_CLOSE_NORST:
+   val = tp->norst;
+   break;
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aebb29ab2fdf..e0aa6e126700 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6054,7 +6054,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff 
*skb)
break;
}
 
-   if (tp->linger2 < 0) {
+   if (likely(!tp->norst) && tp->linger2 < 0) {
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
@@ -6064,9

[PATCH RFC net-next 0/1] tcp: close socket without reset on incoming data

2018-05-18 Thread Debabrata Banerjee


There is a basic problem with TCP sockets, where sending and closing of
data is unreliable. One good example of this is a web server that wants
to send an error back on a HTTP POST and close the socket, however
assuming the POST was of any significant size what really happens is
that the browser gets a broken socket while it is trying to post, and
never reads the error, possibly retrying the whole POST a number of
times. This has been well documented by other people, for example this
blog post:

https://blog.netherlabs.nl/articles/2009/01/18/the-ultimate-so_linger-page-or-why-is-my-tcp-not-reliable

Without this patch, our server application has to hang on to a socket and
sink all of the POST data, eating up memory and cpu. With this patch
the task is offloaded to the kernel, which uses only a timewait socket
to efficiently ack and discard any incoming data. We've been using a
similar patch internally for years, I think it has applications for
everyone.

Debabrata Banerjee (1):
  tcp: close socket without reset on incoming data

 include/linux/tcp.h  |  4 +++-
 include/uapi/linux/tcp.h |  2 +-
 net/ipv4/tcp.c   | 23 +--
 net/ipv4/tcp_input.c | 16 
 net/ipv4/tcp_minisocks.c | 15 +++
 5 files changed, 52 insertions(+), 8 deletions(-)

-- 
2.17.0

Re: WARNING in ip_recv_error

2018-05-18 Thread Willem de Bruijn

On Fri, May 18, 2018 at 2:46 PM, Willem de Bruijn
 wrote:
> On Fri, May 18, 2018 at 2:44 PM, Willem de Bruijn
>  wrote:
>> On Fri, May 18, 2018 at 1:09 PM, Willem de Bruijn
>>  wrote:
>>> On Fri, May 18, 2018 at 11:44 AM, David Miller  wrote:
>>>> From: Eric Dumazet 
>>>> Date: Fri, 18 May 2018 08:30:43 -0700
>>>>
>>>>> We probably need to revert Willem patch 
>>>>> (7ce875e5ecb8562fd44040f69bda96c999e38bbc)
>>>>
>>>> Is it really valid to reach ip_recv_err with an ipv6 socket?
>>>
>>> I guess the issue is that setsockopt IPV6_ADDRFORM is not an
>>> atomic operation, so that the socket is neither fully ipv4 nor fully
>>> ipv6 by the time it reaches ip_recv_error.
>>>
>>>   sk->sk_socket->ops = &inet_dgram_ops;
>>>   < HERE >
>>>   sk->sk_family = PF_INET;
>>>
>>> Even calling inet_recv_error to demux would not necessarily help.
>>>
>>> Safest would be to look up by skb->protocol, similar to what
>>> ipv6_recv_error does to handle v4-mapped-v6.
>>>
>>> Or to make that function safe with PF_INET and swap the order
>>> of the above two operations.
>>>
>>> All sound needlessly complicated for this rare socket option, but
>>> I don't have a better idea yet. Dropping on the floor is not nice,
>>> either.
>>
>> Ensuring that ip_recv_error correctly handles packets from either
>> socket and removing the warning should indeed be good.
>>
>> It is robust against v4-mapped packets from an AF_INET6 socket,
>> but see caveat on reconnect below.
>>
>> The code between ipv6_recv_error for v4-mapped addresses and
>> ip_recv_error is essentially the same, the main difference being
>> whether to return network headers as sockaddr_in with SOL_IP
>> or sockaddr_in6 with SOL_IPV6.
>>
>> There are very few other locations in the stack that explicitly test
>> sk_family in this way and thus would be vulnerable to races with
>> IPV6_ADDRFORM.
>>
>> I'm not sure whether it is possible for a udpv6 socket to queue a
>> real ipv6 packet on the error queue, disconnect, connect to an
>> ipv4 address, call IPV6_ADDRFORM and then call ip_recv_error
>> on a true ipv6 packet. That would return buggy data, e.g., in
>> msg_name.
>
> In do_ipv6_setsockopt IPV6_ADDRFORM we can test that the
> error queue is empty, and then take its lock for the duration of the
> operation.

Actually, no reason to hold the lock. This setsockopt holds the socket
lock, which connect would need, too. So testing that the queue
is empty after testing that it is connected to a v4 address is
sufficient to ensure that no ipv6 packets are queued for reception.

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 4d780c7f0130..a975d6311341 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -199,6 +199,11 @@ static int do_ipv6_setsockopt(struct sock *sk,
int level, int optname,

if (ipv6_only_sock(sk) ||
!ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
retv = -EADDRNOTAVAIL;
break;
}

+   if (!skb_queue_empty(&sk->sk_error_queue)) {
+   retv = -EBUSY;
+   break;
+   }
+
fl6_free_socklist(sk);
__ipv6_sock_mc_close(sk);

After this it should be safe to remove the warning in ip_recv_error.

[net-next] Revert "ixgbe: release lock for the duration of ixgbe_suspend_close()"

2018-05-18 Thread Jeff Kirsher

This reverts commit 6710f970d9979d8f03f6e292bb729b2ee1526d0e.

Gotta love when developers have offline discussions, thinking everyone
is reading their responses/dialog.

The change had the potential for a number of race conditions on
shutdown, which is why we are reverting the change.

Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 5ddfb93ed491..a52d92e182ee 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -6698,15 +6698,8 @@ static int __ixgbe_shutdown(struct pci_dev *pdev, bool 
*enable_wake)
rtnl_lock();
netif_device_detach(netdev);
 
-   if (netif_running(netdev)) {
-   /* Suspend takes a long time, device_shutdown may be
-* parallelized this function, so drop lock for the
-* duration of this call.
-*/
-   rtnl_unlock();
+   if (netif_running(netdev))
ixgbe_close_suspend(adapter);
-   rtnl_lock();
-   }
 
ixgbe_clear_interrupt_scheme(adapter);
rtnl_unlock();
-- 
2.17.0

Re: WARNING in ip_recv_error

2018-05-18 Thread Willem de Bruijn

On Fri, May 18, 2018 at 2:44 PM, Willem de Bruijn
 wrote:
> On Fri, May 18, 2018 at 1:09 PM, Willem de Bruijn
>  wrote:
>> On Fri, May 18, 2018 at 11:44 AM, David Miller  wrote:
>>> From: Eric Dumazet 
>>> Date: Fri, 18 May 2018 08:30:43 -0700
>>>
>>>> We probably need to revert Willem patch 
>>>> (7ce875e5ecb8562fd44040f69bda96c999e38bbc)
>>>
>>> Is it really valid to reach ip_recv_err with an ipv6 socket?
>>
>> I guess the issue is that setsockopt IPV6_ADDRFORM is not an
>> atomic operation, so that the socket is neither fully ipv4 nor fully
>> ipv6 by the time it reaches ip_recv_error.
>>
>>   sk->sk_socket->ops = &inet_dgram_ops;
>>   < HERE >
>>   sk->sk_family = PF_INET;
>>
>> Even calling inet_recv_error to demux would not necessarily help.
>>
>> Safest would be to look up by skb->protocol, similar to what
>> ipv6_recv_error does to handle v4-mapped-v6.
>>
>> Or to make that function safe with PF_INET and swap the order
>> of the above two operations.
>>
>> All sound needlessly complicated for this rare socket option, but
>> I don't have a better idea yet. Dropping on the floor is not nice,
>> either.
>
> Ensuring that ip_recv_error correctly handles packets from either
> socket and removing the warning should indeed be good.
>
> It is robust against v4-mapped packets from an AF_INET6 socket,
> but see caveat on reconnect below.
>
> The code between ipv6_recv_error for v4-mapped addresses and
> ip_recv_error is essentially the same, the main difference being
> whether to return network headers as sockaddr_in with SOL_IP
> or sockaddr_in6 with SOL_IPV6.
>
> There are very few other locations in the stack that explicitly test
> sk_family in this way and thus would be vulnerable to races with
> IPV6_ADDRFORM.
>
> I'm not sure whether it is possible for a udpv6 socket to queue a
> real ipv6 packet on the error queue, disconnect, connect to an
> ipv4 address, call IPV6_ADDRFORM and then call ip_recv_error
> on a true ipv6 packet. That would return buggy data, e.g., in
> msg_name.

In do_ipv6_setsockopt IPV6_ADDRFORM we can test that the
error queue is empty, and then take its lock for the duration of the
operation.

Re: WARNING in ip_recv_error

2018-05-18 Thread Willem de Bruijn

On Fri, May 18, 2018 at 1:09 PM, Willem de Bruijn
 wrote:
> On Fri, May 18, 2018 at 11:44 AM, David Miller  wrote:
>> From: Eric Dumazet 
>> Date: Fri, 18 May 2018 08:30:43 -0700
>>
>>> We probably need to revert Willem patch 
>>> (7ce875e5ecb8562fd44040f69bda96c999e38bbc)
>>
>> Is it really valid to reach ip_recv_err with an ipv6 socket?
>
> I guess the issue is that setsockopt IPV6_ADDRFORM is not an
> atomic operation, so that the socket is neither fully ipv4 nor fully
> ipv6 by the time it reaches ip_recv_error.
>
>   sk->sk_socket->ops = &inet_dgram_ops;
>   < HERE >
>   sk->sk_family = PF_INET;
>
> Even calling inet_recv_error to demux would not necessarily help.
>
> Safest would be to look up by skb->protocol, similar to what
> ipv6_recv_error does to handle v4-mapped-v6.
>
> Or to make that function safe with PF_INET and swap the order
> of the above two operations.
>
> All sound needlessly complicated for this rare socket option, but
> I don't have a better idea yet. Dropping on the floor is not nice,
> either.

Ensuring that ip_recv_error correctly handles packets from either
socket and removing the warning should indeed be good.

It is robust against v4-mapped packets from an AF_INET6 socket,
but see caveat on reconnect below.

The code between ipv6_recv_error for v4-mapped addresses and
ip_recv_error is essentially the same, the main difference being
whether to return network headers as sockaddr_in with SOL_IP
or sockaddr_in6 with SOL_IPV6.

There are very few other locations in the stack that explicitly test
sk_family in this way and thus would be vulnerable to races with
IPV6_ADDRFORM.

I'm not sure whether it is possible for a udpv6 socket to queue a
real ipv6 packet on the error queue, disconnect, connect to an
ipv4 address, call IPV6_ADDRFORM and then call ip_recv_error
on a true ipv6 packet. That would return buggy data, e.g., in
msg_name.

cascaded switch

2018-05-18 Thread Ran Shalit

Hello,

I am trying to understand the concept of cascaded switch.
I haven't found much information on this topic.

Can anyone please explain the general concept, when is it used, and
why does the device tree need to know about cascaded switch ?

Thank you,
ranran

[PATCH v2 3/3] sh_eth: add R8A77980 support

2018-05-18 Thread Sergei Shtylyov

Finally, add support for the DT probing of the R-Car V3H (AKA R8A77980) --
it's the only R-Car gen3 SoC having the GEther controller -- others have
only EtherAVB...

Based on the original (and large) patch by Vladimir Barinov.

Signed-off-by: Vladimir Barinov 
Signed-off-by: Sergei Shtylyov 
Reviewed-by: Simon Horman 

---
Changes in version 2:
- added Simon's tag.

 Documentation/devicetree/bindings/net/sh_eth.txt |1 
 drivers/net/ethernet/renesas/sh_eth.c|   44 +++
 2 files changed, 45 insertions(+)

Index: net-next/Documentation/devicetree/bindings/net/sh_eth.txt
===
--- net-next.orig/Documentation/devicetree/bindings/net/sh_eth.txt
+++ net-next/Documentation/devicetree/bindings/net/sh_eth.txt
@@ -14,6 +14,7 @@ Required properties:
  "renesas,ether-r8a7791"  if the device is a part of R8A7791 SoC.
  "renesas,ether-r8a7793"  if the device is a part of R8A7793 SoC.
  "renesas,ether-r8a7794"  if the device is a part of R8A7794 SoC.
+ "renesas,gether-r8a77980" if the device is a part of R8A77980 SoC.
  "renesas,ether-r7s72100" if the device is a part of R7S72100 SoC.
  "renesas,rcar-gen1-ether" for a generic R-Car Gen1 device.
  "renesas,rcar-gen2-ether" for a generic R-Car Gen2 or RZ/G1
Index: net-next/drivers/net/ethernet/renesas/sh_eth.c
===
--- net-next.orig/drivers/net/ethernet/renesas/sh_eth.c
+++ net-next/drivers/net/ethernet/renesas/sh_eth.c
@@ -753,6 +753,49 @@ static struct sh_eth_cpu_data rcar_gen2_
.rmiimode   = 1,
.magic  = 1,
 };
+
+/* R8A77980 */
+static struct sh_eth_cpu_data r8a77980_data = {
+   .soft_reset = sh_eth_soft_reset_gether,
+
+   .set_duplex = sh_eth_set_duplex,
+   .set_rate   = sh_eth_set_rate_gether,
+
+   .register_type  = SH_ETH_REG_GIGABIT,
+
+   .edtrr_trns = EDTRR_TRNS_GETHER,
+   .ecsr_value = ECSR_PSRTO | ECSR_LCHNG | ECSR_ICD | ECSR_MPD,
+   .ecsipr_value   = ECSIPR_PSRTOIP | ECSIPR_LCHNGIP | ECSIPR_ICDIP |
+ ECSIPR_MPDIP,
+   .eesipr_value   = EESIPR_RFCOFIP | EESIPR_ECIIP |
+ EESIPR_FTCIP | EESIPR_TDEIP | EESIPR_TFUFIP |
+ EESIPR_FRIP | EESIPR_RDEIP | EESIPR_RFOFIP |
+ EESIPR_RMAFIP | EESIPR_RRFIP |
+ EESIPR_RTLFIP | EESIPR_RTSFIP |
+ EESIPR_PREIP | EESIPR_CERFIP,
+
+   .tx_check   = EESR_FTC | EESR_CD | EESR_RTO,
+   .eesr_err_check = EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT |
+ EESR_RFE | EESR_RDE | EESR_RFRMER |
+ EESR_TFE | EESR_TDE | EESR_ECI,
+   .fdr_value  = 0x070f,
+
+   .apr= 1,
+   .mpr= 1,
+   .tpauser= 1,
+   .bculr  = 1,
+   .hw_swap= 1,
+   .nbst   = 1,
+   .rpadir = 1,
+   .rpadir_value   = 2 << 16,
+   .no_trimd   = 1,
+   .no_ade = 1,
+   .xdfar_rw   = 1,
+   .hw_checksum= 1,
+   .select_mii = 1,
+   .magic  = 1,
+   .cexcr  = 1,
+};
 #endif /* CONFIG_OF */
 
 static void sh_eth_set_rate_sh7724(struct net_device *ndev)
@@ -3134,6 +3177,7 @@ static const struct of_device_id sh_eth_
{ .compatible = "renesas,ether-r8a7791", .data = &rcar_gen2_data },
{ .compatible = "renesas,ether-r8a7793", .data = &rcar_gen2_data },
{ .compatible = "renesas,ether-r8a7794", .data = &rcar_gen2_data },
+   { .compatible = "renesas,gether-r8a77980", .data = &r8a77980_data },
{ .compatible = "renesas,ether-r7s72100", .data = &r7s72100_data },
{ .compatible = "renesas,rcar-gen1-ether", .data = &rcar_gen1_data },
{ .compatible = "renesas,rcar-gen2-ether", .data = &rcar_gen2_data },

[PATCH v2 2/3] sh_eth: add EDMR.NBST support

2018-05-18 Thread Sergei Shtylyov

The R-Car V3H (AKA R8A77980) GEther controller adds the DMA burst mode bit
(NBST) in EDMR and the manual tells to always set it before doing any DMA.

Based on the original (and large) patch by Vladimir Barinov.

Signed-off-by: Vladimir Barinov 
Signed-off-by: Sergei Shtylyov 
Reviewed-by: Simon Horman 

---
Changes in version 2:
- added Simon's tag.

 drivers/net/ethernet/renesas/sh_eth.c |4 
 drivers/net/ethernet/renesas/sh_eth.h |2 ++
 2 files changed, 6 insertions(+)

Index: net-next/drivers/net/ethernet/renesas/sh_eth.c
===
--- net-next.orig/drivers/net/ethernet/renesas/sh_eth.c
+++ net-next/drivers/net/ethernet/renesas/sh_eth.c
@@ -1434,6 +1434,10 @@ static int sh_eth_dev_init(struct net_de
 
sh_eth_write(ndev, mdp->cd->trscer_err_mask, TRSCER);
 
+   /* DMA transfer burst mode */
+   if (mdp->cd->nbst)
+   sh_eth_modify(ndev, EDMR, EDMR_NBST, EDMR_NBST);
+
if (mdp->cd->bculr)
sh_eth_write(ndev, 0x800, BCULR);   /* Burst sycle set */
 
Index: net-next/drivers/net/ethernet/renesas/sh_eth.h
===
--- net-next.orig/drivers/net/ethernet/renesas/sh_eth.h
+++ net-next/drivers/net/ethernet/renesas/sh_eth.h
@@ -184,6 +184,7 @@ enum GECMR_BIT {
 
 /* EDMR */
 enum DMAC_M_BIT {
+   EDMR_NBST = 0x80,
EDMR_EL = 0x40, /* Litte endian */
EDMR_DL1 = 0x20, EDMR_DL0 = 0x10,
EDMR_SRST_GETHER = 0x03,
@@ -505,6 +506,7 @@ struct sh_eth_cpu_data {
unsigned bculr:1;   /* EtherC have BCULR */
unsigned tsu:1; /* EtherC have TSU */
unsigned hw_swap:1; /* E-DMAC have DE bit in EDMR */
+   unsigned nbst:1;/* E-DMAC has NBST bit in EDMR */
unsigned rpadir:1;  /* E-DMAC have RPADIR */
unsigned no_trimd:1;/* E-DMAC DO NOT have TRIMD */
unsigned no_ade:1;  /* E-DMAC DO NOT have ADE bit in EESR */

[PATCH v2 1/3] sh_eth: add RGMII support

2018-05-18 Thread Sergei Shtylyov

The R-Car V3H (AKA R8A77980) GEther controller  adds support for the RGMII
PHY interface mode as a new  value  for the RMII_MII register.

Based on the original (and large) patch by Vladimir Barinov.

Signed-off-by: Vladimir Barinov 
Signed-off-by: Sergei Shtylyov 

---
Changes in version 2:
- included PHY_INTERFACE_MODE_RGMII_{|RX|TX}ID in the RGMII *case*.

 drivers/net/ethernet/renesas/sh_eth.c |3 +++
 1 file changed, 3 insertions(+)

Index: net-next/drivers/net/ethernet/renesas/sh_eth.c
===
--- net-next.orig/drivers/net/ethernet/renesas/sh_eth.c
+++ net-next/drivers/net/ethernet/renesas/sh_eth.c
@@ -466,6 +466,9 @@ static void sh_eth_select_mii(struct net
u32 value;
 
switch (mdp->phy_interface) {
+   case PHY_INTERFACE_MODE_RGMII ... PHY_INTERFACE_MODE_RGMII_TXID:
+   value = 0x3;
+   break;
case PHY_INTERFACE_MODE_GMII:
value = 0x2;
break;

[PATCH v2 0/3] Add Renesas R8A77980 GEther support

2018-05-18 Thread Sergei Shtylyov

Hello!

Here's a set of 3 patches against DaveM's 'net-next.git' repo. They (gradually)
add R8A77980 GEther support to the 'sh_eth' driver, starting with couple new
register bits/values introduced with this chip, and ending with adding a new
'struct sh_eth_cpu_data' instance connected to the new DT "compatible" prop
value...

[1/3] sh_eth: add RGMII support
[2/3] sh_eth: add EDMR.NBST support
[3/3] sh_eth: add R8A77980 support

MBR, Sergei

[PATCH] selftests: bpf: config: enable NET_SCH_INGRESS for xdp_meta.sh

2018-05-18 Thread Anders Roxell

When running bpf's selftest test_xdp_meta.sh it fails:
./test_xdp_meta.sh
Error: Specified qdisc not found.
selftests: test_xdp_meta [FAILED]

Need to enable CONFIG_NET_SCH_INGRESS and CONFIG_NET_CLS_ACT to get the
test to pass.

Fixes: 22c8852624fc ("bpf: improve selftests and add tests for meta pointer")
Signed-off-by: Anders Roxell 
---
 tools/testing/selftests/bpf/config | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/selftests/bpf/config 
b/tools/testing/selftests/bpf/config
index 983dd25d49f4..1eefe211a4a8 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -5,3 +5,5 @@ CONFIG_BPF_EVENTS=y
 CONFIG_TEST_BPF=m
 CONFIG_CGROUP_BPF=y
 CONFIG_NETDEVSIM=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_SCH_INGRESS=y
-- 
2.17.0

Re: [PATCH v2 net-next] net: stmmac: Populate missing callbacks in HWIF initialization

2018-05-18 Thread David Miller

From: Jose Abreu 
Date: Fri, 18 May 2018 16:54:38 +0100

> Some HW specific setups, like sun8i, do not populate all the necessary
> callbacks, which is what HWIF helpers were expecting.
> 
> Fix this by always trying to get the generic helpers and populate them
> if they were not previously populated by HW specific setup.
> 
> Signed-off-by: Jose Abreu 
> Fixes: 5f0456b43140 ("net: stmmac: Implement logic to automatically
> select HW Interface")

Please don't split up Fixes: tag lines like this in the future.  No matter
how long it is, keep it a single line.

> Reported-by: Corentin Labbe 
> Tested-by: Corentin Labbe 

Applied, thank you.

Re: [PATCH net-next] cxgb4: collect SGE PF/VF queue map

2018-05-18 Thread David Miller

From: Rahul Lakkireddy 
Date: Fri, 18 May 2018 19:12:53 +0530

> For T6, collect info on queue mapping to corresponding PF/VF in SGE.
> 
> Signed-off-by: Rahul Lakkireddy 
> Signed-off-by: Ganesh Goudar 

Applied.

Re: [PATCH net] cxgb4: fix offset in collecting TX rate limit info

2018-05-18 Thread David Miller

From: Rahul Lakkireddy 
Date: Fri, 18 May 2018 19:13:37 +0530

> Correct the indirect register offsets in collecting TX rate limit info
> in UP CIM logs.
> 
> Also, T5 doesn't support these indirect register offsets, so remove
> them from collection logic.
> 
> Fixes: be6e36d916b1 ("cxgb4: collect TX rate limit info in UP CIM logs")
> Signed-off-by: Rahul Lakkireddy 
> Signed-off-by: Ganesh Goudar 

Applied and queued up for -stable, thanks.

Re: [PATCH net] net: sched: red: avoid hashing NULL child

2018-05-18 Thread David Miller

From: Paolo Abeni 
Date: Fri, 18 May 2018 14:51:44 +0200

> Hangbin reported an Oops triggered by the syzkaller qdisc rules:
 ...
> When a red qdisc is updated with a 0 limit, the child qdisc is left
> unmodified, no additional scheduler is created in red_change(),
> the 'child' local variable is rightfully NULL and must not add it
> to the hash table.
> 
> This change addresses the above issue moving qdisc_hash_add() right
> after the child qdisc creation. It additionally removes unneeded checks
> for noop_qdisc.
> 
> Reported-by: Hangbin Liu 
> Fixes: 49b499718fa1 ("net: sched: make default fifo qdiscs appear in the 
> dump")
> Signed-off-by: Paolo Abeni 

Applied and queued up for -stable, thanks Paolo.

Re: [PATCH 1/4] arcnet: com20020: Add com20020 io mapped version

2018-05-18 Thread David Miller

From: Andrea Greco 
Date: Fri, 18 May 2018 14:18:41 +0200

> In com20020.c found this:
> /* FIXME: do this some other way! */
> if (!dev->dev_addr[0])
> dev->dev_addr[0] = arcnet_inb(ioaddr, 8);
> 
> NODE-ID must be unique across the whole arcnet network.
> My previous idea was to take a random value, but this could create a
> collision over network.
> 
> A possible solution is:
> In case of collision com20020 set a bit in status register.
> Then pick a new NODE-ID and repeat this until a correct NODE-ID is found.
> 
> Other ideas is pass it via DTS.
> But suppose have 2 same product in same network, same address same problem.
> For this reason i prefer left standard driver behavior.
> 
> Other ideas for solve this ?

Is there no way to obtain a unique value from the device?

If having a unique ID to talk on the ARCNET is so critical, there must
be some way to properly allocate and use a unique ID.

I guess this must be a general problem with this driver already.

You still need to address the issue of 'dev' being leaked on probe
error paths.

Thank you.

Re: [PATCH] net: mvpp2: typo and cosmetic fixes

2018-05-18 Thread David Miller

From: Antoine Tenart 
Date: Fri, 18 May 2018 14:34:51 +0200

> This patch on the Marvell PPv2 driver is only cosmetic. Two typos are
> removed as well as other cosmetic fixes, such as extra new lines or tabs
> vs spaces.
> 
> Suggested-by: Stefan Chulski 
> Signed-off-by: Antoine Tenart 

Applied, thanks.

Re: [PATCH net] sock_diag: fix use-after-free read in __sk_free

2018-05-18 Thread David Miller

From: Eric Dumazet 
Date: Fri, 18 May 2018 04:47:55 -0700

> We must not call sock_diag_has_destroy_listeners(sk) on a socket
> that has no reference on net structure.
 ...
> Fixes: b922622ec6ef ("sock_diag: don't broadcast kernel sockets")
> Signed-off-by: Eric Dumazet 
> Cc: Craig Gallek 
> Reported-by: syzbot 

Applied and queued up for -stable, thanks Eric.

Re: [PATCH] hippi: fix spelling mistake: "Framming" -> "Framing"

2018-05-18 Thread David Miller

From: Colin King 
Date: Fri, 18 May 2018 11:09:22 +0100

> From: Colin Ian King 
> 
> Trivial fix to spelling mistake in printk message text
> 
> Signed-off-by: Colin Ian King 

Applied.

Re: [PATCH v2] sh_eth: Change platform check to CONFIG_ARCH_RENESAS

2018-05-18 Thread David Miller

From: Geert Uytterhoeven 
Date: Fri, 18 May 2018 12:52:51 +0200

> Since commit 9b5ba0df4ea4f940 ("ARM: shmobile: Introduce ARCH_RENESAS")
> is CONFIG_ARCH_RENESAS a more appropriate platform check than the legacy
> CONFIG_ARCH_SHMOBILE, hence use the former.
> 
> Renesas SuperH SH-Mobile SoCs are still covered by the CONFIG_CPU_SH4
> check.
> 
> This will allow to drop ARCH_SHMOBILE on ARM and ARM64 in the near
> future.
> 
> Signed-off-by: Geert Uytterhoeven 
> Acked-by: Arnd Bergmann 
> Acked-by: Sergei Shtylyov 
> Reviewed-by: Simon Horman 

Applied.

Re: [PATCH] net/atheros: fix spelling mistake: "Ddescription" -> "Description"

2018-05-18 Thread David Miller

From: Colin King 
Date: Fri, 18 May 2018 10:22:06 +0100

> From: Colin Ian King 
> 
> Trivial fix to spelling mistakes in name field text
> 
> Signed-off-by: Colin Ian King 
> ---
>  drivers/net/ethernet/atheros/atl1e/atl1e_param.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_param.c 
> b/drivers/net/ethernet/atheros/atl1e/atl1e_param.c
> index fa314282c9ad..6a375f4bd054 100644
> --- a/drivers/net/ethernet/atheros/atl1e/atl1e_param.c
> +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_param.c
> @@ -192,7 +192,7 @@ void atl1e_check_options(struct atl1e_adapter *adapter)
>   {   /* Transmit Ring Size */
>   struct atl1e_option opt = {
>   .type = range_option,
> - .name = "Transmit Ddescription Count",
> + .name = "Transmit Description Count",

In this context, "Descriptor" would be the most appropriate word.

Thank you.

Re: [net-next 3/6] ixgbe: release lock for the duration of ixgbe_suspend_close()

2018-05-18 Thread Pavel Tatashin

On 05/18/2018 01:28 PM, Sergei Shtylyov wrote:
> On 05/18/2018 02:37 PM, Pavel Tatashin wrote:
> 
>>   * parallelized this function, so drop lock for the
>>>
>>> Parallelizing? Else the sentence doesn't parse for me. :-)
> 
>My comment hardly makes sense when you removed all the context...

Hi Sergei,

Ah gotcha:

+/* Suspend takes a long time, device_shutdown may be
+ * parallelized this function, so drop lock for the 

The comment should have been:

ixgbe_close_suspend() takes a long time, and device_shutdown may parallelize 
ixgbe_shutdown(), therefore drop lock to allow concurrent execution of 
ixgbe_close_suspend().

Anyway, as I said, this patch should be dropped.

Thank you,
Pavel

Re: [RFC PATCH net-next] tcp: tcp_rack_reo_wnd() can be static

2018-05-18 Thread David Miller

From: kbuild test robot 
Date: Fri, 18 May 2018 13:14:23 +0800

> Fixes: 20b654dfe1be ("tcp: support DUPACK threshold in RACK")
> Signed-off-by: kbuild test robot 

Looks good, applied, thanks!

Re: [net-next 3/6] ixgbe: release lock for the duration of ixgbe_suspend_close()

2018-05-18 Thread Sergei Shtylyov

On 05/18/2018 02:37 PM, Pavel Tatashin wrote:

>   * parallelized this function, so drop lock for the
>>
>> Parallelizing? Else the sentence doesn't parse for me. :-)

   My comment hardly makes sense when you removed all the context...

> Hi Sergei,
> 
> In a separate series I parallelized device_shutdown(), see:
> http://lkml.kernel.org/r/20180516024004.28977-1-pasha.tatas...@oracle.com
> 
> But, this particular patch should be dropped, as discussed in this thread:
> http://lkml.kernel.org/r/20180503035931.22439-2-pasha.tatas...@oracle.com
> 
> 
> Alexander Duyck, made a point that a generic RTNL scalability fix should be 
> done. This particular patch might introduce a race, since it relies on 
> assumption that RTNL is not needed in this place because  ixgbe_close() does 
> not have it, but Alexander Duyck, says that the callers of ixgbe_close() are 
> assumed to own this lock.

   My comment was about the English grammar only. :-)

> Thank you,
> Pavel

MBR, Sergei

Re: [PATCH bpf-next 0/5] fix test_sockmap

2018-05-18 Thread Daniel Borkmann

On 05/18/2018 06:54 PM, Shuah Khan wrote:
> On 05/18/2018 01:17 AM, Prashant Bhole wrote:
>> This series fixes bugs in test_sockmap code. They weren't caught
>> previously because failure in RX/TX thread was not notified to the
>> main thread.
>>
>> Also fixed data verification logic and slightly improved test output
>> such that parameters values (cork, apply, start, end) of failed test
>> can be easily seen.
>>
>> Note: Even after fixing above problems there are issues with tests
>> which set cork parameter. Tests fail (RX thread timeout) when cork
>> value is non-zero and overall data sent by TX thread isn't multiples
>> of cork value.
>>
>> Prashant Bhole (5):
>>   selftests/bpf: test_sockmap, check test failure
>>   selftests/bpf: test_sockmap, join cgroup in selftest mode
>>   selftests/bpf: test_sockmap, fix test timeout
>>   selftests/bpf: test_sockmap, fix data verification
>>   selftests/bpf: test_sockmap, print additional test options
>>
>>  tools/testing/selftests/bpf/test_sockmap.c | 76 
>> +++---
>>  1 file changed, 58 insertions(+), 18 deletions(-)
> 
> Please remember to cc linux-kselftest mailing list as well. I would like to 
> see
> all the test patches cc'ed to it. Linaro and other test users watch the 
> kselftest
> mailing list. I also have patchwork project now to manage the patch volume.
> 
> I am okay with patches going through net/bpf trees - there are always test
> dependencies on net/bpf trees.

Yep, routing all the BPF selftest patches via bpf/bpf-next tree is the only
viable model that works for us in BPF case, in fact also looks like BPF
selftests are the busiest subdir in #commits so avoiding merge conflicts is
crucial. Whenever appropriate, most fixes or new features are very often
accompanied in a patch set with extensive selftests for BPF, so it has a
deep dependency on the two trees.

Best,
Daniel

Re: [pull request][for-next 00/15] Mellanox, mlx5 core and netdev updates 2018-05-17

2018-05-18 Thread Jason Gunthorpe

On Fri, May 18, 2018 at 01:03:51PM -0400, David Miller wrote:
> From: Saeed Mahameed 
> Date: Thu, 17 May 2018 18:22:43 -0700
> 
> > Below you can find two pull requests,
> > 
> > 1. mlx5 core updates to be shared for both netdev and RDMA, (patches 1..9)
> >  which is based on the last mlx5-next pull request
> >  
> > The following changes since commit a8408f4e6db775e245f20edf12b13fd58cc03a1c:
> > 
> >   net/mlx5: fix spelling mistake: "modfiy" -> "modify" (2018-05-04 12:11:51 
> > -0700)
> > 
> > are available in the Git repository at:
> > 
> >   git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git 
> > tags/mlx5-updates-2018-05-17
> > 
> > for you to fetch changes up to 10ff5359f883412728ba816046ee3a696625ca02:
> > 
> >   net/mlx5e: Explicitly set source e-switch in offloaded TC rules 
> > (2018-05-17 14:17:35 -0700)
> > 
> > 2. mlx5e netdev updates only for net-next branch (patches 10..15) based on 
> > net-next
> > and the above pull request.
> > 
> > The following changes since commit 538e2de104cfb4ef1acb35af42427bff42adbe4d:
> > 
> >   Merge branch 'net-Allow-more-drivers-with-COMPILE_TEST' (2018-05-17 
> > 17:11:07 -0400)
> > 
> > are available in the Git repository at:
> > 
> >   git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
> > tags/mlx5e-updates-2018-05-17
> > 
> > for you to fetch changes up to a228060a7c9ab88597eeac131e4578595d5d46ae:
> > 
> >   net/mlx5e: Add HW vport counters to representor ethtool stats (2018-05-17 
> > 17:48:54 -0700)
> > 
> > Dave, for your convenience you can either pull 1. and then 2. or pull 2.
> > directly.
> 
> Looks good.
> 
> I pulled 1 then I pulled 2.  That seemed to work well.  Particularly
> it allowed me to capture the two different merge commit messages one
> by one.

Does this double up the merge commit though? I see this in Saeed's
tags/mlx5e-updates-2018-05-17 ?

commit 260ab7042e24ccd4407985c6e775e39d064fab2b
Merge: 538e2de104cfb4 10ff5359f88341
Author: Saeed Mahameed 
Date:   Thu May 17 17:47:09 2018 -0700

Merge tag 'mlx5-updates-2018-05-17' of 
git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux

mlx5-updates-2018-05-17

mlx5 core dirver updates for both net-next and rdma-next branches.

From Christophe JAILLET, first three patches to use kvfree where needed.

From: Or Gerlitz 

Next six patches from Roi and Co adds support for merged
sriov e-switch which comes to serve cases where both PFs, VFs set
on them and both uplinks are to be used in single v-switch SW model.
When merged e-switch is supported, the per-port e-switch is logically
merged into one e-switch that spans both physical ports and all the VFs.

This model allows to offload TC eswitch rules between VFs belonging
to different PFs (and hence have different eswitch affinity), it also
sets the some of the foundations needed for uplink LAG support.

Signed-off-by: Saeed Mahameed 

And this in your tree:

commit 3888ea4e2f1fb2f61e5418adf4b8332107ac0c8f
Merge: 2c47a65b7009eb 10ff5359f88341
Author: David S. Miller 
Date:   Fri May 18 13:00:08 2018 -0400

Merge tag 'mlx5-updates-2018-05-17' of 
git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux

Saeed Mahameed says:

mlx5-updates-2018-05-17

mlx5 core dirver updates for both net-next and rdma-next branches.

From Christophe JAILLET, first three patche to use kvfree where needed.

From: Or Gerlitz 

Next six patches from Roi and Co adds support for merged
sriov e-switch which comes to serve cases where both PFs, VFs set
on them and both uplinks are to be used in single v-switch SW model.
When merged e-switch is supported, the per-port e-switch is logically
merged into one e-switch that spans both physical ports and all the VFs.

This model allows to offload TC eswitch rules between VFs belonging
to different PFs (and hence have different eswitch affinity), it also
sets the some of the foundations needed for uplink LAG support.

Signed-off-by: David S. Miller 

I think the trouble is that Saeed needs to merge the 'core' stuff to
create the non-core patches for netdev (just like we want to do for
rdma)

So maybe netdev should take the #2 pull request and rdma should
take #1?

This seems to be working OK from RDMA's side, we have much less netdev
stuff in our tree now which seems good!

Thanks,
Jason

Re: [PATCH net-next 0/9] net/smc: cleanups 2018-05-18

2018-05-18 Thread David Miller

From: Ursula Braun 
Date: Fri, 18 May 2018 09:34:09 +0200

> here are SMC patches for net-next providing restructuring and cleanup
> in different areas.

Series applied, thanks Ursula.

Re: WARNING in ip_recv_error

2018-05-18 Thread Willem de Bruijn

On Fri, May 18, 2018 at 11:44 AM, David Miller  wrote:
> From: Eric Dumazet 
> Date: Fri, 18 May 2018 08:30:43 -0700
>
>> We probably need to revert Willem patch 
>> (7ce875e5ecb8562fd44040f69bda96c999e38bbc)
>
> Is it really valid to reach ip_recv_err with an ipv6 socket?

I guess the issue is that setsockopt IPV6_ADDRFORM is not an
atomic operation, so that the socket is neither fully ipv4 nor fully
ipv6 by the time it reaches ip_recv_error.

  sk->sk_socket->ops = &inet_dgram_ops;
  < HERE >
  sk->sk_family = PF_INET;

Even calling inet_recv_error to demux would not necessarily help.

Safest would be to look up by skb->protocol, similar to what
ipv6_recv_error does to handle v4-mapped-v6.

Or to make that function safe with PF_INET and swap the order
of the above two operations.

All sound needlessly complicated for this rare socket option, but
I don't have a better idea yet. Dropping on the floor is not nice,
either.

Re: [pull request][for-next 00/15] Mellanox, mlx5 core and netdev updates 2018-05-17

2018-05-18 Thread David Miller

From: Saeed Mahameed 
Date: Thu, 17 May 2018 18:22:43 -0700

> Below you can find two pull requests,
> 
> 1. mlx5 core updates to be shared for both netdev and RDMA, (patches 1..9)
>  which is based on the last mlx5-next pull request
>  
> The following changes since commit a8408f4e6db775e245f20edf12b13fd58cc03a1c:
> 
>   net/mlx5: fix spelling mistake: "modfiy" -> "modify" (2018-05-04 12:11:51 
> -0700)
> 
> are available in the Git repository at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git 
> tags/mlx5-updates-2018-05-17
> 
> for you to fetch changes up to 10ff5359f883412728ba816046ee3a696625ca02:
> 
>   net/mlx5e: Explicitly set source e-switch in offloaded TC rules (2018-05-17 
> 14:17:35 -0700)
> 
> 2. mlx5e netdev updates only for net-next branch (patches 10..15) based on 
> net-next
> and the above pull request.
> 
> The following changes since commit 538e2de104cfb4ef1acb35af42427bff42adbe4d:
> 
>   Merge branch 'net-Allow-more-drivers-with-COMPILE_TEST' (2018-05-17 
> 17:11:07 -0400)
> 
> are available in the Git repository at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
> tags/mlx5e-updates-2018-05-17
> 
> for you to fetch changes up to a228060a7c9ab88597eeac131e4578595d5d46ae:
> 
>   net/mlx5e: Add HW vport counters to representor ethtool stats (2018-05-17 
> 17:48:54 -0700)
> 
> Dave, for your convenience you can either pull 1. and then 2. or pull 2.
> directly.

Looks good.

I pulled 1 then I pulled 2.  That seemed to work well.  Particularly
it allowed me to capture the two different merge commit messages one
by one.

Is this basically how you want to handle things moving forward?

Thanks.

1 2 3 >

1 - 100 of 284 matches

Mail list logo