[PATCH 1/1] rds: ib: add error handle

2017-03-06 Thread Zhu Yanjun
In the function rds_ib_setup_qp, error handling is missing. When an
error occurs, memory can be leaked. As such, error handling is
added.

Cc: Joe Jin 
Reviewed-by: Junxiao Bi 
Reviewed-by: Guanglei Li 
Signed-off-by: Zhu Yanjun 
---
 net/rds/ib_cm.c | 47 ---
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index ce3775a..d47ae0f 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -442,7 +442,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_send_cq = NULL;
ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
rdsdebug("ib_create_cq send failed: %d\n", ret);
-   goto out;
+   goto rds_ibdev_out;
}
 
ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
@@ -456,19 +456,19 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_recv_cq = NULL;
ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
rdsdebug("ib_create_cq recv failed: %d\n", ret);
-   goto out;
+   goto send_cq_out;
}
 
ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
if (ret) {
rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
-   goto out;
+   goto recv_cq_out;
}
 
ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
if (ret) {
rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
-   goto out;
+   goto recv_cq_out;
}
 
/* XXX negotiate max send/recv with remote? */
@@ -494,7 +494,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, );
if (ret) {
rdsdebug("rdma_create_qp failed: %d\n", ret);
-   goto out;
+   goto recv_cq_out;
}
 
ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
@@ -504,7 +504,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!ic->i_send_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
-   goto out;
+   goto qp_out;
}
 
ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
@@ -514,7 +514,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
-   goto out;
+   goto send_hdrs_dma_out;
}
 
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
@@ -522,7 +522,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!ic->i_ack) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
-   goto out;
+   goto recv_hdrs_dma_out;
}
 
ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct 
rds_ib_send_work),
@@ -530,7 +530,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!ic->i_sends) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
-   goto out;
+   goto ack_dma_out;
}
 
ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct 
rds_ib_recv_work),
@@ -538,7 +538,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!ic->i_recvs) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
-   goto out;
+   goto sends_out;
}
 
rds_ib_recv_init_ack(ic);
@@ -546,8 +546,33 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
 ic->i_send_cq, ic->i_recv_cq);
 
-out:
+   return ret;
+
+sends_out:
+   vfree(ic->i_sends);
+ack_dma_out:
+   ib_dma_free_coherent(dev, sizeof(struct rds_header),
+ic->i_ack, ic->i_ack_dma);
+recv_hdrs_dma_out:
+   ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr *
+   sizeof(struct rds_header),
+   ic->i_recv_hdrs, ic->i_recv_hdrs_dma);
+send_hdrs_dma_out:
+   ib_dma_free_coherent(dev, ic->i_send_ring.w_nr *
+   sizeof(struct rds_header),
+   ic->i_send_hdrs, ic->i_send_hdrs_dma);
+qp_out:
+   rdma_destroy_qp(ic->i_cm_id);
+recv_cq_out:
+   if (!ib_destroy_cq(ic->i_recv_cq))
+   ic->i_recv_cq = NULL;
+send_cq_out:
+   if (!ib_destroy_cq(ic->i_send_cq))
+   ic->i_send_cq = NULL;
+rds_ibdev_out:
+   rds_ib_remove_conn(rds_ibdev, conn);
rds_ib_dev_put(rds_ibdev);
+
return 

Re: [PATCH] udp: avoid ufo handling on IP payload compression packets

2017-03-06 Thread Herbert Xu
On Mon, Mar 06, 2017 at 07:16:57AM +0100, Steffen Klassert wrote:
>
> > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> > index b67719f..18383ef 100644
> > --- a/net/ipv4/ip_output.c
> > +++ b/net/ipv4/ip_output.c
> > @@ -960,7 +960,10 @@ static int __ip_append_data(struct sock *sk,
> > cork->length += length;
> > if length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
> > (sk->sk_protocol == IPPROTO_UDP) &&
> > -   (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
> > +   (rt->dst.dev->features & NETIF_F_UFO) &&
> > +#ifdef CONFIG_XFRM
> > +   !rt->dst.xfrm &&
> > +#endif
> 
> Please fix IPcomp to use rt->dst.header_len instead of adding
> this ifdef to the generic networking code.

It's not that simple though.  IPComp's header_len is set to zero
because we opportunistically drop the IPComp header when the total
compressed length exceeds the original packet length.  That is,
we only ever do IPComp when it does not cause the packet to expand.

So it seems that we need another way of indicating the presence of
XFRM.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: netlink: GPF in netlink_unicast

2017-03-06 Thread Richard Guy Briggs
On 2017-03-06 10:10, Cong Wang wrote:
> On Mon, Mar 6, 2017 at 2:54 AM, Dmitry Vyukov  wrote:
> > Hello,
> >
> > I've got the following crash while running syzkaller fuzzer on
> > net-next/8d70eeb84ab277377c017af6a21d0a337025dede:
> >
> > kasan: GPF could be caused by NULL-ptr deref or user memory access
> > general protection fault:  [#1] SMP KASAN
> > Dumping ftrace buffer:
> >(ftrace buffer empty)
> > Modules linked in:
> > CPU: 0 PID: 883 Comm: kauditd Not tainted 4.10.0+ #6
> > Hardware name: Google Google Compute Engine/Google Compute Engine,
> > BIOS Google 01/01/2011
> > task: 8801d79f0240 task.stack: 8801d7a2
> > RIP: 0010:sock_sndtimeo include/net/sock.h:2162 [inline]
> > RIP: 0010:netlink_unicast+0xdd/0x730 net/netlink/af_netlink.c:1249
> > RSP: 0018:8801d7a27c38 EFLAGS: 00010206
> > RAX: 0056 RBX: 8801d7a27cd0 RCX: 
> > RDX:  RSI:  RDI: 02b0
> > RBP: 8801d7a27cf8 R08: ed00385cf286 R09: ed00385cf286
> > R10: 0006 R11: ed00385cf285 R12: 
> > R13: dc00 R14: 8801c2fc3c80 R15: 014000c0
> > FS:  () GS:8801dbe0() knlGS:
> > CS:  0010 DS:  ES:  CR0: 80050033
> > CR2: 20cfd000 CR3: 0001c758f000 CR4: 001406f0
> > Call Trace:
> >  kauditd_send_unicast_skb+0x3c/0x70 kernel/audit.c:482
> >  kauditd_thread+0x174/0xb00 kernel/audit.c:599
> >  kthread+0x326/0x3f0 kernel/kthread.c:229
> >  ret_from_fork+0x31/0x40 arch/x86/entry/entry_64.S:430
> > Code: 44 89 fe e8 56 15 ff ff 8b 8d 70 ff ff ff 49 89 c6 31 c0 85 c9
> > 75 27 e8 b2 b2 f4 fd 49 8d bc 24 b0 02 00 00 48 89 f8 48 c1 e8 03 <42>
> > 80 3c 28 00 0f 85 37 06 00 00 49 8b 84 24 b0 02 00 00 4c 8d
> > RIP: sock_sndtimeo include/net/sock.h:2162 [inline] RSP: 8801d7a27c38
> > RIP: netlink_unicast+0xdd/0x730 net/netlink/af_netlink.c:1249 RSP:
> > 8801d7a27c38
> > ---[ end trace ad1bba9d457430b6 ]---
> > Kernel panic - not syncing: Fatal exception
> >
> >
> > This is not reproducible and seems to be caused by an elusive race.
> > However, looking at the code I don't see any proper protection of
> > audit_sock (other than the if (!audit_pid) which is obviously not
> > enough to protect against races).
> 
> audit_cmd_mutex is supposed to protect it, I think.
> But kauditd_send_unicast_skb() seems not holding this mutex.

Hmm, I wonder if it makes sense to wrap most of the contents of the
outer while loop in kauditd_thread in the audit_cmd_mutex, or around the
first two inner while loops and the "if (auditd)" condition after the
"quick_loop:" label.  The condition on auditd is supposed to catch that
case.  We don't want it locked while playing with the scheduler at the
bottom of that function.

> Richard?

- RGB

--
Richard Guy Briggs 
Kernel Security Engineering, Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635


Re: [PATCH net-next 1/5] ldmvsw: better use of link up and down on ldom vswitch

2017-03-06 Thread Shannon Nelson



On 3/6/2017 3:53 PM, Florian Fainelli wrote:

On 03/06/2017 03:15 PM, Shannon Nelson wrote:

When an ldom VM is bound, the network vswitch infrastructure is set up for
it, but was being forced 'UP' by the userland switch configuration script.
When 'UP' but not actually connected to a running VM, the ipv6 neighbor
probes fail (not a horrible thing) and start cluttering up the kernel logs.
Funny thing: these are debug messages that never actually show up, but
we do see the net_ratelimited messages that say N callbacks were
suppressed.

This patch defers the netif_carrier_on() until an actual link has been
established with the VM, as indicated by receiving an LDC_EVENT_UP from
the underlying LDC protocol.  Similarly, we take the link down when we
see the LDC_EVENT_RESET.

Orabug: 25525312

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/sun/ldmvsw.c |   10 +++---
 drivers/net/ethernet/sun/sunvnet_common.c |   14 ++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sun/ldmvsw.c 
b/drivers/net/ethernet/sun/ldmvsw.c
index 89952de..c6f6d59 100644
--- a/drivers/net/ethernet/sun/ldmvsw.c
+++ b/drivers/net/ethernet/sun/ldmvsw.c
@@ -41,8 +41,8 @@
 static u8 vsw_port_hwaddr[ETH_ALEN] = {0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};

 #define DRV_MODULE_NAME"ldmvsw"
-#define DRV_MODULE_VERSION "1.1"
-#define DRV_MODULE_RELDATE "February 3, 2017"
+#define DRV_MODULE_VERSION "1.2"
+#define DRV_MODULE_RELDATE "March 4, 2017"

 static char version[] =
DRV_MODULE_NAME " " DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")";
@@ -133,7 +133,6 @@ static void vsw_poll_controller(struct net_device *dev)
 #endif

 static const struct net_device_ops vsw_ops = {
-   .ndo_open   = sunvnet_open_common,


Is this change intentional? It was not entirely obvious where you would
be setting ::ndo_open in other places.


Yes, it is correct.  It does look a bit odd, but nearly all the work is 
done in the _probe(), and now the carrier_on happens a little later when 
the LDC_EVENT_UP is received, so there's no longer a need for the 
_open() call.


sln






Re: [PATCH net 0/3] rds: tcp: fix various rds-tcp issues during netns create/delete sequences

2017-03-06 Thread santosh.shilim...@oracle.com

On 3/4/17 8:57 AM, Sowmini Varadhan wrote:

Dmitry Vyukov reported some syzkaller panics during netns deletion.

While I have not been able to reproduce those exact panics, my attempts
to do so uncovered a few other problems, which are fixed by patch 2 and
patch 3 of this patch series. In addition, as mentioned in,
 https://www.spinics.net/lists/netdev/msg422997.html
code-inspection points that the rds_connection needs to take an explicit
refcnt on the struct net so that it is held down until all cleanup is
completed for netns removal, and this is fixed by patch1.


Hopefully Dmitry can try the series and see if it fixes the issue(s).
The fixes look good to me.

FWIW, Acked-by: Santosh Shilimkar 




Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-06 Thread David Ahern
On 3/6/17 11:51 AM, Dmitry Vyukov wrote:
> We hit it several thousand times, but we get only several dozens of
> crashes per day on ~80 VMs. So if you try to reproduce it on a single
> machine it can take days for a single crash.
> If you are ready to go that route, here are some instructions on
> setting up syzkaller:
> https://github.com/google/syzkaller
> You also need kernel built with CONFIG_KASAN.

ack and I have it setup on ubuntu 16.10 which has a fairly new compiler.

> I am ready to help with resolving any issues.
> 
> Another possible route is if you give me a patch with some additional
> WARNINGs. Then I can deploy it to bots and collect stacks.

try the attached.
diff --git a/include/net/dst.h b/include/net/dst.h
index 049af33da3b6..d164eb8ceab8 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -58,6 +58,7 @@ struct dst_entry {
 #define DST_XFRM_TUNNEL0x0080
 #define DST_XFRM_QUEUE 0x0100
 #define DST_METADATA   0x0200
+#define DST_IN_FIB 0x0400
 
short   error;
 
diff --git a/net/core/dst.c b/net/core/dst.c
index 960e503b5a52..c98447fe8510 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -232,6 +232,9 @@ void __dst_free(struct dst_entry *dst)
 {
spin_lock_bh(_garbage.lock);
___dst_free(dst);
+if (dst->flags & DST_IN_FIB)
+   pr_warn("dst %p is marked as in fib\n", dst);
+//WARN_ON(dst->flags & DST_IN_FIB);
dst->next = dst_garbage.list;
dst_garbage.list = dst;
if (dst_garbage.timer_inc > DST_GC_INC) {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e4266746e4a2..a4d55ba00a43 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -155,6 +155,7 @@ static void node_free(struct fib6_node *fn)
 
 static void rt6_rcu_free(struct rt6_info *rt)
 {
+WARN_ON(rt->dst.flags & DST_IN_FIB);
call_rcu(>dst.rcu_head, dst_rcu_free);
 }
 
@@ -878,6 +879,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct 
rt6_info *rt,
return err;
 
rt->dst.rt6_next = iter;
+   rt->dst.flags |= DST_IN_FIB;
*ins = rt;
rt->rt6i_node = fn;
atomic_inc(>rt6i_ref);
@@ -907,6 +909,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct 
rt6_info *rt,
*ins = rt;
rt->rt6i_node = fn;
rt->dst.rt6_next = iter->dst.rt6_next;
+   rt->dst.flags |= DST_IN_FIB;
atomic_inc(>rt6i_ref);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
@@ -974,6 +977,20 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 !atomic_read(>dst.__refcnt)))
return -EINVAL;
 
+if (rt->dst.ops->family != AF_INET6) {
+   pr_warn("fib6_add: adding rt with family is %d dst flags %x\n",
+   rt->dst.ops->family, rt->dst.flags);
+
+   WARN_ON(1);
+}
+/* dst.next really should not be set at this point */
+if (rt->dst.next && rt->dst.next->ops->family != AF_INET6) {
+   pr_warn("fib6_add: adding rt with bad next -- family %d dst flags %x\n",
+   rt->dst.next->ops->family, rt->dst.next->flags);
+
+   WARN_ON(1);
+}
+
if (info->nlh) {
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
allow_create = 0;
@@ -1444,6 +1461,7 @@ static void fib6_del_route(struct fib6_node *fn, struct 
rt6_info **rtp,
read_unlock(>ipv6.fib6_walker_lock);
 
rt->dst.rt6_next = NULL;
+   rt->dst.flags &= ~DST_IN_FIB;
 
/* If it was last route, expunge its radix tree node */
if (!fn->leaf) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 229bfcc451ef..e91d7871ccfc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1135,6 +1135,8 @@ struct rt6_info *ip6_pol_route(struct net *net, struct 
fib6_table *table,
 
dst_hold(_rt->dst);
 
+   uncached_rt->dst.flags &= ~DST_IN_FIB;
+
trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
return uncached_rt;
 
@@ -1160,6 +1162,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct 
fib6_table *table,
dst_release(>dst);
}
 
+   pcpu_rt->dst.flags &= ~DST_IN_FIB;
trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
return pcpu_rt;
 


[PATCH net-next 0/5] sunvnet: better connection management

2017-03-06 Thread Shannon Nelson
These patches remove some problems in handling of carrier state
with the ldmvsw vswitch, remove  an xoff misuse in sunvnet, and
add stats for debug and tracking of point-to-point connections
between the ldom VMs.

Shannon Nelson (5):
  ldmvsw: better use of link up and down on ldom vswitch
  sunvnet: add stats to track ldom to ldom packets and bytes
  sunvnet: track port queues correctly
  sunvnet: count multicast packets
  sunvnet: xoff not needed when removing port link

 drivers/net/ethernet/sun/ldmvsw.c |   10 ++-
 drivers/net/ethernet/sun/sunvnet.c|  114 +
 drivers/net/ethernet/sun/sunvnet_common.c |   50 +++--
 drivers/net/ethernet/sun/sunvnet_common.h |   26 --
 4 files changed, 180 insertions(+), 20 deletions(-)



[PATCH v3 net-next] liquidio: improve UDP TX performance

2017-03-06 Thread Felix Manlunas
From: VSR Burru 

Improve UDP TX performance by:
* reducing the ring size from 2K to 512
* replacing the numerous streaming DMA allocations for info buffers and
  gather lists with one large consistent DMA allocation per ring

BQL is not effective here.  We reduced the ring size because there is heavy
overhead with dma_map_single every so often.  With iommu=on, dma_map_single
in PF Tx data path was taking longer time (~700usec) for every ~250
packets.  Debugged intel_iommu code, and found that PF driver is utilizing
too many static IO virtual address mapping entries (for gather list entries
and info buffers): about 100K entries for two PF's each using 8 rings.
Also, finding an empty entry (in rbtree of device domain's iova mapping in
kernel) during Tx path becomes a bottleneck every so often; the loop to
find the empty entry goes through over 40K iterations; this is too costly
and was the major overhead.  Overhead is low when this loop quits quickly.

Netperf benchmark numbers before and after patch:

PF UDP TX
+++++-+
|||  Before|  After | |
| Number ||  Patch |  Patch | |
|  of| Packet | Throughput | Throughput | Percent |
| Flows  |  Size  |  (Gbps)|  (Gbps)| Change  |
+++++-+
||   360  |   0.52 |   0.93 |  +78.9  |
|   1|  1024  |   1.62 |   2.84 |  +75.3  |
||  1518  |   2.44 |   4.21 |  +72.5  |
+++++-+
||   360  |   0.45 |   1.59 | +253.3  |
|   4|  1024  |   1.34 |   5.48 | +308.9  |
||  1518  |   2.27 |   8.31 | +266.1  |
+++++-+
||   360  |   0.40 |   1.61 | +302.5  |
|   8|  1024  |   1.64 |   4.24 | +158.5  |
||  1518  |   2.87 |   6.52 | +127.2  |
+++++-+


VF UDP TX
+++++-+
|||  Before|  After | |
| Number ||  Patch |  Patch | |
|  of| Packet | Throughput | Throughput | Percent |
| Flows  |  Size  |  (Gbps)|  (Gbps)| Change  |
+++++-+
||   360  |   1.28 |   1.49 |  +16.4  |
|   1|  1024  |   4.44 |   4.39 |   -1.1  |
||  1518  |   6.08 |   6.51 |   +7.1  |
+++++-+
||   360  |   2.35 |   2.35 |0.0  |
|   4|  1024  |   6.41 |   8.07 |  +25.9  |
||  1518  |   9.56 |   9.54 |   -0.2  |
+++++-+
||   360  |   3.41 |   3.65 |   +7.0  |
|   8|  1024  |   9.35 |   9.34 |   -0.1  |
||  1518  |   9.56 |   9.57 |   +0.1  |
+++++-+

Signed-off-by: VSR Burru 
Signed-off-by: Felix Manlunas 
Signed-off-by: Derek Chickles 
Signed-off-by: Raghu Vatsavayi 
---
Patch Changelog:
 v3: Add to patch comments description of bottleneck found then mitigated.
 v2: Add before and after benchmark numbers to the patch explanation.

 drivers/net/ethernet/cavium/liquidio/lio_main.c| 110 ++---
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 104 ++-
 .../net/ethernet/cavium/liquidio/octeon_config.h   |   6 +-
 drivers/net/ethernet/cavium/liquidio/octeon_droq.c |  17 +---
 drivers/net/ethernet/cavium/liquidio/octeon_droq.h |   4 +-
 drivers/net/ethernet/cavium/liquidio/octeon_main.h |  42 
 .../net/ethernet/cavium/liquidio/octeon_network.h  |  43 +---
 7 files changed, 144 insertions(+), 182 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index be9c0e3..92f46b1 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -152,7 +152,7 @@ struct octnic_gather {
 */
struct octeon_sg_entry *sg;
 
-   u64 sg_dma_ptr;
+   dma_addr_t sg_dma_ptr;
 };
 
 struct handshake {
@@ -734,6 +734,9 @@ static void delete_glists(struct lio *lio)
struct octnic_gather *g;
int i;
 
+   kfree(lio->glist_lock);
+   lio->glist_lock = NULL;
+
if (!lio->glist)
return;
 
@@ -741,23 +744,26 @@ static void delete_glists(struct lio *lio)
do {
g = (struct octnic_gather *)
list_delete_head(>glist[i]);
-   if (g) {
-   if (g->sg) {
-   

Re: [PATCH] net: smsc: epic100: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread David Miller
From: Philippe Reynes 
Date: Mon, 27 Feb 2017 23:43:14 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> As I don't have the hardware, I'd be very pleased if
> someone may test this patch.
> 
> Signed-off-by: Philippe Reynes 

Applied.


Re: [PATCH net-next] tools: hv: Add clean up function for Ubuntu config

2017-03-06 Thread David Miller
From: Haiyang Zhang 
Date: Fri, 24 Feb 2017 17:30:32 +

> This patch adds a function to clean up duplicate config info
> on Ubuntu.
> 
> Signed-off-by: Haiyang Zhang 

Applied, thanks.


Re: [PATCH net 0/5] NAPI support for Hyper-V

2017-03-06 Thread Stephen Hemminger
On Mon, 06 Mar 2017 17:13:33 -0800 (PST)
David Miller  wrote:

> From: Stephen Hemminger 
> Date: Mon, 27 Feb 2017 10:26:46 -0800
> 
> > These patches enable NAPI, GRO and napi_alloc_skb for Hyper-V netvsc
> > driver.  
> 
> Series applied to net-next.

Great, but how do we coordinate allowing use of the vmbus iterator functions
for storage?  The way things are going that would mean waiting until 4.13 for
that.


Re: [PATCH net-next v3 0/3] net: ethernet: bgmac: PM support and clean-ups

2017-03-06 Thread David Miller
From: Jon Mason 
Date: Tue, 28 Feb 2017 13:50:58 -0500

> Changes in v3:
> * Corrected a bug Florian found and added his Reviewed-by
> 
> Changes in v2:
> * Reworked the PM patch with Florian's suggestions
> 
> 
> Add code to support Power Management (only tested on NS2), and add some
> code clean-ups

Series applied to net-next, thanks.


Re: [PATCH] net: axienet: use eth_hw_addr_random()

2017-03-06 Thread David Miller
From: Tobias Klauser 
Date: Tue, 28 Feb 2017 12:21:12 +0100

> Use eth_hw_addr_random() to set a random MAC address in order to make
> sure ndev->addr_assign_type will be properly set to NET_ADDR_RANDOM.
> 
> Signed-off-by: Tobias Klauser 

Applied to net-next, thanks.


Re: [PATCH] net: silan: sc92031: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread David Miller
From: Philippe Reynes 
Date: Mon, 27 Feb 2017 22:50:25 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> As I don't have the hardware, I'd be very pleased if
> someone may test this patch.
> 
> Signed-off-by: Philippe Reynes 

Applied.


Re: [PATCH] net: smsc: smc91c92_cs: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread David Miller
From: Philippe Reynes 
Date: Fri,  3 Mar 2017 23:39:35 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> As I don't have the hardware, I'd be very pleased if
> someone may test this patch.
> 
> Signed-off-by: Philippe Reynes 

Applied.


Re: [PATCH] net: sis: sis190: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread David Miller
From: Philippe Reynes 
Date: Mon, 27 Feb 2017 23:06:41 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> As I don't have the hardware, I'd be very pleased if
> someone may test this patch.
> 
> Signed-off-by: Philippe Reynes 

Applied.


Re: [PATCH] net: sis: sis900: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread David Miller
From: Philippe Reynes 
Date: Mon, 27 Feb 2017 23:17:37 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> As I don't have the hardware, I'd be very pleased if
> someone may test this patch.
> 
> Signed-off-by: Philippe Reynes 

Applied.


Re: [PATCH net 0/5] NAPI support for Hyper-V

2017-03-06 Thread David Miller
From: Stephen Hemminger 
Date: Mon, 27 Feb 2017 10:26:46 -0800

> These patches enable NAPI, GRO and napi_alloc_skb for Hyper-V netvsc
> driver.

Series applied to net-next.


Re: [PATCH net]ipv6: Provide ipv6 version of "disable_policy" sysctl

2017-03-06 Thread David Miller
From: David Forster 
Date: Thu, 23 Feb 2017 16:27:18 +

> This provides equivalent functionality to the existing ipv4
> "disable_policy" sysctl. ie. Allows IPsec processing to be skipped
> on terminating packets on a per-interface basis.
> 
> Signed-off-by: David Forster 

Applied to net-next, thanks David.


RE: [PATCH v2] can: m_can: enable transmission of FD frame on latest version

2017-03-06 Thread Wenyou.Yang
Hi Oliver,

> -Original Message-
> From: Oliver Hartkopp [mailto:socket...@hartkopp.net]
> Sent: 2017年3月7日 5:26
> To: Marc Kleine-Budde ; Wenyou Yang - A41535
> ; Wolfgang Grandegger 
> Cc: Alexandre Belloni ; Florian Fainelli
> ; Quentin Schulz ;
> Wenyou Yang - A41535 ; Nicolas Ferre
> ; linux-...@vger.kernel.org; netdev@vger.kernel.org;
> linux-ker...@vger.kernel.org
> Subject: Re: [PATCH v2] can: m_can: enable transmission of FD frame on latest
> version
> 
> @Wenyou Yang: Can you please test the two patches posted here:

Tested on the SAMA5D2 SoC; it works.

> 
> [PATCH 1/2] can: m_can: handle bitrate setup on IP core >= 3.1.x
> http://marc.info/?l=linux-can=148883529927720=2
> 
> [PATCH 2/2] can: m_can: handle frame transmission on IP core >= 3.1.x
> http://marc.info/?l=linux-can=148883529927718=2
> 
> Tnx & regards,
> Oliver

Thank you.

Best Regards,
Wenyou Yang


Re: [PATCH] net: realtek: r8169: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread David Miller
From: Philippe Reynes 
Date: Thu, 23 Feb 2017 22:34:43 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> As I don't have the hardware, I'd be very pleased if
> someone may test this patch.
> 
> Signed-off-by: Philippe Reynes 

Applied.


Re: [PATCH] net: sgi: ioc3-eth: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread David Miller
From: Philippe Reynes 
Date: Sun, 26 Feb 2017 22:48:59 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> As I don't have the hardware, I'd be very pleased if
> someone may test this patch.
> 
> Signed-off-by: Philippe Reynes 

Applied.


Re: [PATCH] net: realtek: 8139cp: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread David Miller
From: Philippe Reynes 
Date: Wed, 22 Feb 2017 08:50:27 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> As I don't have the hardware, I'd be very pleased if
> someone may test this patch.
> 
> Signed-off-by: Philippe Reynes 

Applied.


Re: [PATCH] net: rocker: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread David Miller
From: Philippe Reynes 
Date: Sun, 26 Feb 2017 19:00:29 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> As I don't have the hardware, I'd be very pleased if
> someone may test this patch.
> 
> Signed-off-by: Philippe Reynes 

Applied.


Re: [PATCH] net: realtek: 8139too: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread David Miller
From: Philippe Reynes 
Date: Thu, 23 Feb 2017 00:14:08 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> As I don't have the hardware, I'd be very pleased if
> someone may test this patch.
> 
> Signed-off-by: Philippe Reynes 

Applied.


Re: [PATCH 21/29] drivers, s390: convert fc_fcp_pkt.ref_cnt from atomic_t to refcount_t

2017-03-06 Thread Benjamin Block
On Mon, Mar 06, 2017 at 04:27:11PM +0100, Johannes Thumshirn wrote:
> On 03/06/2017 03:21 PM, Elena Reshetova wrote:
> > refcount_t type and corresponding API should be
> > used instead of atomic_t when the variable is used as
> > a reference counter. This allows to avoid accidental
> > refcounter overflows that might lead to use-after-free
> > situations.
> 
> The subject is wrong, should be something like "scsi: libfc convert
> fc_fcp_pkt.ref_cnt from atomic_t to refcount_t" but not s390.
> 

Yes please, I was extremely confused for a moment here.



Beste Grüße / Best regards,
  - Benjamin Block
-- 
Linux on z Systems Development / IBM Systems & Technology Group
  IBM Deutschland Research & Development GmbH 
Vorsitz. AufsR.: Martina Koederitz /Geschäftsführung: Dirk Wittkopp
Sitz der Gesellschaft: Böblingen / Registergericht: AmtsG Stuttgart, HRB 243294



Re: [PATCH net-next 1/5] ldmvsw: better use of link up and down on ldom vswitch

2017-03-06 Thread Florian Fainelli
On 03/06/2017 03:15 PM, Shannon Nelson wrote:
> When an ldom VM is bound, the network vswitch infrastructure is set up for
> it, but was being forced 'UP' by the userland switch configuration script.
> When 'UP' but not actually connected to a running VM, the ipv6 neighbor
> probes fail (not a horrible thing) and start cluttering up the kernel logs.
> Funny thing: these are debug messages that never actually show up, but
> we do see the net_ratelimited messages that say N callbacks were
> suppressed.
> 
> This patch defers the netif_carrier_on() until an actual link has been
> established with the VM, as indicated by receiving an LDC_EVENT_UP from
> the underlying LDC protocol.  Similarly, we take the link down when we
> see the LDC_EVENT_RESET.
> 
> Orabug: 25525312
> 
> Signed-off-by: Shannon Nelson 
> ---
>  drivers/net/ethernet/sun/ldmvsw.c |   10 +++---
>  drivers/net/ethernet/sun/sunvnet_common.c |   14 ++
>  2 files changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/sun/ldmvsw.c 
> b/drivers/net/ethernet/sun/ldmvsw.c
> index 89952de..c6f6d59 100644
> --- a/drivers/net/ethernet/sun/ldmvsw.c
> +++ b/drivers/net/ethernet/sun/ldmvsw.c
> @@ -41,8 +41,8 @@
>  static u8 vsw_port_hwaddr[ETH_ALEN] = {0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
>  
>  #define DRV_MODULE_NAME  "ldmvsw"
> -#define DRV_MODULE_VERSION   "1.1"
> -#define DRV_MODULE_RELDATE   "February 3, 2017"
> +#define DRV_MODULE_VERSION   "1.2"
> +#define DRV_MODULE_RELDATE   "March 4, 2017"
>  
>  static char version[] =
>   DRV_MODULE_NAME " " DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")";
> @@ -133,7 +133,6 @@ static void vsw_poll_controller(struct net_device *dev)
>  #endif
>  
>  static const struct net_device_ops vsw_ops = {
> - .ndo_open   = sunvnet_open_common,

Is this change intentional? It was not entirely obvious where you would
be setting ::ndo_open in other places.
-- 
Florian


Re: net: BUG in unix_notinflight

2017-03-06 Thread Cong Wang
On Mon, Mar 6, 2017 at 2:40 AM, Dmitry Vyukov  wrote:
> Now with a nice single-threaded C reproducer!

Excellent...

>
> // autogenerated by syzkaller (http://github.com/google/syzkaller)
> #define _GNU_SOURCE
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
>
> void test()
> {
>   long r[54];
>   memset(r, -1, sizeof(r));
>   syscall(__NR_mmap, 0x2000ul, 0xfff000ul, 0x3ul, 0x32ul, -1, 0);
>   r[1] = syscall(__NR_socketpair, 0x1ul, 0x5ul, 0x0ul, 0x20521ff8ul);
>   r[2] = *(uint32_t*)0x20521ff8;
>   r[3] = *(uint32_t*)0x20521ffc;
>   r[5] = syscall(__NR_open, "/dev/net/tun", 0x20ul);
>   r[6] = syscall(__NR_socketpair, 0x1ul, 0x5ul, 0x0ul,
>  0x20d85000ul, 0, 0, 0, 0, 0);
>   r[7] = *(uint32_t*)0x20d85000;
>   (*(uint64_t*)0x2fc8 = (uint64_t)0x2000);
>   (*(uint32_t*)0x2fd0 = (uint32_t)0xa);
>   (*(uint64_t*)0x2fd8 = (uint64_t)0x2005d000);
>   (*(uint64_t*)0x2fe0 = (uint64_t)0x8);
>   (*(uint64_t*)0x2fe8 = (uint64_t)0x2ff0);
>   (*(uint64_t*)0x2ff0 = (uint64_t)0x1);
>   (*(uint32_t*)0x2ff8 = (uint32_t)0x0);
>   (*(uint16_t*)0x2000 = (uint16_t)0x1);
>   memcpy((void*)0x2002, "\x2e\x2f\x66\x69\x6c\x65\x30\x00", 8);
>   (*(uint64_t*)0x2005d000 = (uint64_t)0x20784f06);
>   (*(uint64_t*)0x2005d008 = (uint64_t)0x0);
>   (*(uint64_t*)0x2005d010 = (uint64_t)0x209a5f78);
>   (*(uint64_t*)0x2005d018 = (uint64_t)0x0);
>   (*(uint64_t*)0x2005d020 = (uint64_t)0x20ec3ffc);
>   (*(uint64_t*)0x2005d028 = (uint64_t)0x0);
>   (*(uint64_t*)0x2005d030 = (uint64_t)0x2057e000);
>   (*(uint64_t*)0x2005d038 = (uint64_t)0x0);
>   (*(uint64_t*)0x2005d040 = (uint64_t)0x200c9f9d);
>   (*(uint64_t*)0x2005d048 = (uint64_t)0x0);
>   (*(uint64_t*)0x2005d050 = (uint64_t)0x20331000);
>   (*(uint64_t*)0x2005d058 = (uint64_t)0x0);
>   (*(uint64_t*)0x2005d060 = (uint64_t)0x206a1f7b);
>   (*(uint64_t*)0x2005d068 = (uint64_t)0x0);
>   (*(uint64_t*)0x2005d070 = (uint64_t)0x20e7f000);
>   (*(uint64_t*)0x2005d078 = (uint64_t)0x0);
>   (*(uint64_t*)0x2ff0 = (uint64_t)0x18);
>   (*(uint32_t*)0x2ff8 = (uint32_t)0x1);
>   (*(uint32_t*)0x2ffc = (uint32_t)0x1);
>   (*(uint32_t*)0x20001000 = r[5]);
>   (*(uint32_t*)0x20001004 = r[7]);
>   syscall(__NR_sendmsg, r[7], 0x2fc8ul, 0x0ul);
>   (*(uint64_t*)0x2fc8 = (uint64_t)0x2000);
>   (*(uint32_t*)0x2fd0 = (uint32_t)0x8);
>   (*(uint64_t*)0x2fd8 = (uint64_t)0x20026000);
>   (*(uint64_t*)0x2fe0 = (uint64_t)0x0);
>   (*(uint64_t*)0x2fe8 = (uint64_t)0x2ff0);
>   (*(uint64_t*)0x2ff0 = (uint64_t)0x1);
>   (*(uint32_t*)0x2ff8 = (uint32_t)0x0);
>   (*(uint16_t*)0x2000 = (uint16_t)0x0);
>   (*(uint8_t*)0x2002 = (uint8_t)0x0);
>   (*(uint32_t*)0x2004 = (uint32_t)0x4e20);
>   (*(uint64_t*)0x2ff0 = (uint64_t)0x18);
>   (*(uint32_t*)0x2ff8 = (uint32_t)0x1);
>   (*(uint32_t*)0x2ffc = (uint32_t)0x1);
>   (*(uint32_t*)0x20001000 = r[2]);
>   syscall(__NR_sendmsg, r[3], 0x2fc8ul, 0x0ul);
> }
>
> int main()
> {
>   int i, pid, status;
>   for (i = 0; i < 4; i++) {
> if (fork() == 0) {
>   for (;;) {
> pid = fork();
> if (pid == 0) {
>   test();
>   exit(0);
> }
> while (waitpid(pid, , __WALL) != pid) {}
>   }
> }
>   }
>   sleep(100);
>   return 0;
> }
>
>
>
> New report from linux-next/c0b7b2b33bd17f7155956d0338ce92615da686c9
>
> [ cut here ]
> kernel BUG at net/unix/garbage.c:149!
> invalid opcode:  [#1] SMP KASAN
> Dumping ftrace buffer:
>(ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 1806 Comm: syz-executor7 Not tainted 4.10.0-next-20170303+ #6
> Hardware name: Google Google Compute Engine/Google Compute Engine,
> BIOS Google 01/01/2011
> task: 880121c64740 task.stack: 88012c9e8000
> RIP: 0010:unix_notinflight+0x417/0x5d0 net/unix/garbage.c:149
> RSP: 0018:88012c9ef0f8 EFLAGS: 00010297
> RAX: 880121c64740 RBX: 11002593de23 RCX: 8801c490c628
> RDX:  RSI: 11002593de27 RDI: 8557e504
> RBP: 88012c9ef220 R08: 0001 R09: 
> R10: dc00 R11: ed002593de55 R12: 8801c490c0c0
> R13: 88012c9ef1f8 R14: 85101620 R15: dc00
> FS:  013d3940() GS:8801dbe0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 01fd8cd8 CR3: 0001cce69000 CR4: 001426f0
> Call Trace:
>  unix_detach_fds.isra.23+0xfa/0x170 net/unix/af_unix.c:1490
>  unix_destruct_scm+0xf4/0x200 net/unix/af_unix.c:1499

The problem here is that there is no lock protecting concurrent
unix_detach_fds() calls, even though unix_notinflight() is already
serialized. If we call unix_notinflight() twice on the same file
pointer, we trigger this bug...

I don't know what the right lock is here to serialize it.


[PATCH] net: ipv6: Remove redundant RTA_OIF in multipath routes

2017-03-06 Thread David Ahern
Dinesh reported that RTA_MULTIPATH nexthops are 8-bytes larger with IPv6
than IPv4. The recent refactoring for multipath support in netlink
messages does not discriminate between non-multipath which needs the OIF
and multipath which adds a rtnexthop struct for each hop making the
RTA_OIF attribute redundant. Resolve by adding a flag to the info
function to skip the oif for multipath.

Fixes: beb1afac518d ("net: ipv6: Add support to dump multipath routes
   via RTA_MULTIPATH attribute")
Reported-by: Dinesh Dutt 
Signed-off-by: David Ahern 
---
 net/ipv6/route.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 229bfcc451ef..35c58b669ebd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3299,7 +3299,6 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
+ NLA_ALIGN(sizeof(struct rtnexthop))
+ nla_total_size(16) /* RTA_GATEWAY */
-   + nla_total_size(4)  /* RTA_OIF */
+ lwtunnel_get_encap_size(rt->dst.lwtstate);
 
nexthop_len *= rt->rt6i_nsiblings;
@@ -3323,7 +3322,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
 }
 
 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
-   unsigned int *flags)
+   unsigned int *flags, bool skip_oif)
 {
if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
*flags |= RTNH_F_LINKDOWN;
@@ -3336,7 +3335,8 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct 
rt6_info *rt,
goto nla_put_failure;
}
 
-   if (rt->dst.dev &&
+   /* not needed for multipath encoding b/c it has a rtnexthop struct */
+   if (!skip_oif && rt->dst.dev &&
nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
goto nla_put_failure;
 
@@ -3350,6 +3350,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct 
rt6_info *rt,
return -EMSGSIZE;
 }
 
+/* add multipath next hop */
 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
 {
struct rtnexthop *rtnh;
@@ -3362,7 +3363,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct 
rt6_info *rt)
rtnh->rtnh_hops = 0;
rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
 
-   if (rt6_nexthop_info(skb, rt, ) < 0)
+   if (rt6_nexthop_info(skb, rt, , true) < 0)
goto nla_put_failure;
 
rtnh->rtnh_flags = flags;
@@ -3515,7 +3516,7 @@ static int rt6_fill_node(struct net *net,
 
nla_nest_end(skb, mp);
} else {
-   if (rt6_nexthop_info(skb, rt, >rtm_flags) < 0)
+   if (rt6_nexthop_info(skb, rt, >rtm_flags, false) < 0)
goto nla_put_failure;
}
 
-- 
2.1.4



[PATCH] net: tundra: tsi108: use new api ethtool_{get|set}_link_ksettings

2017-03-06 Thread Philippe Reynes
The ethtool api {get|set}_settings is deprecated.
We move this driver to the new api {get|set}_link_ksettings.

As I don't have the hardware, I'd be very grateful if
someone could test this patch.

Signed-off-by: Philippe Reynes 
---
 drivers/net/ethernet/tundra/tsi108_eth.c |   14 --
 1 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/tundra/tsi108_eth.c 
b/drivers/net/ethernet/tundra/tsi108_eth.c
index c558399..5ac6eaa 100644
--- a/drivers/net/ethernet/tundra/tsi108_eth.c
+++ b/drivers/net/ethernet/tundra/tsi108_eth.c
@@ -1499,27 +1499,29 @@ static void tsi108_init_mac(struct net_device *dev)
TSI_WRITE(TSI108_EC_INTMASK, ~0);
 }
 
-static int tsi108_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+static int tsi108_get_link_ksettings(struct net_device *dev,
+struct ethtool_link_ksettings *cmd)
 {
struct tsi108_prv_data *data = netdev_priv(dev);
unsigned long flags;
int rc;
 
spin_lock_irqsave(>txlock, flags);
-   rc = mii_ethtool_gset(>mii_if, cmd);
+   rc = mii_ethtool_get_link_ksettings(>mii_if, cmd);
spin_unlock_irqrestore(>txlock, flags);
 
return rc;
 }
 
-static int tsi108_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+static int tsi108_set_link_ksettings(struct net_device *dev,
+const struct ethtool_link_ksettings *cmd)
 {
struct tsi108_prv_data *data = netdev_priv(dev);
unsigned long flags;
int rc;
 
spin_lock_irqsave(>txlock, flags);
-   rc = mii_ethtool_sset(>mii_if, cmd);
+   rc = mii_ethtool_set_link_ksettings(>mii_if, cmd);
spin_unlock_irqrestore(>txlock, flags);
 
return rc;
@@ -1535,8 +1537,8 @@ static int tsi108_do_ioctl(struct net_device *dev, struct 
ifreq *rq, int cmd)
 
 static const struct ethtool_ops tsi108_ethtool_ops = {
.get_link   = ethtool_op_get_link,
-   .get_settings   = tsi108_get_settings,
-   .set_settings   = tsi108_set_settings,
+   .get_link_ksettings = tsi108_get_link_ksettings,
+   .set_link_ksettings = tsi108_set_link_ksettings,
 };
 
 static const struct net_device_ops tsi108_netdev_ops = {
-- 
1.7.4.4



[PATCH net-next 4/5] sunvnet: count multicast packets

2017-03-06 Thread Shannon Nelson
Make sure multicast packets get counted in the device.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/sun/sunvnet_common.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunvnet_common.c 
b/drivers/net/ethernet/sun/sunvnet_common.c
index 8afc659..7eb53af 100644
--- a/drivers/net/ethernet/sun/sunvnet_common.c
+++ b/drivers/net/ethernet/sun/sunvnet_common.c
@@ -410,6 +410,8 @@ static int vnet_rx_one(struct vnet_port *port, struct 
vio_net_desc *desc)
 
skb->ip_summed = port->switch_port ? CHECKSUM_NONE : CHECKSUM_PARTIAL;
 
+   if (unlikely(is_multicast_ether_addr(eth_hdr(skb)->h_dest)))
+   dev->stats.multicast++;
dev->stats.rx_packets++;
dev->stats.rx_bytes += len;
port->stats.rx_packets++;
-- 
1.7.1



[PATCH net-next 2/5] sunvnet: add stats to track ldom to ldom packets and bytes

2017-03-06 Thread Shannon Nelson
In this driver, there is a "port" created for the connection to each of
the other ldoms; a netdev queue is mapped to each port, and they are
collected under a single netdev.  The generic netdev statistics show
us all the traffic in and out of our network device, but don't show
individual queue/port stats.  This patch breaks out the traffic counts
for the individual ports and gives us a little view into the state of
those connections.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/sun/sunvnet.c|  114 +
 drivers/net/ethernet/sun/sunvnet_common.c |6 ++
 drivers/net/ethernet/sun/sunvnet_common.h |   15 
 3 files changed, 135 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunvnet.c 
b/drivers/net/ethernet/sun/sunvnet.c
index 4cc2571..80de4fa 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -77,11 +77,125 @@ static void vnet_set_msglevel(struct net_device *dev, u32 
value)
vp->msg_enable = value;
 }
 
+static const struct {
+   const char string[ETH_GSTRING_LEN];
+} ethtool_stats_keys[] = {
+   { "rx_packets" },
+   { "tx_packets" },
+   { "rx_bytes" },
+   { "tx_bytes" },
+   { "rx_errors" },
+   { "tx_errors" },
+   { "rx_dropped" },
+   { "tx_dropped" },
+   { "multicast" },
+   { "rx_length_errors" },
+   { "rx_frame_errors" },
+   { "rx_missed_errors" },
+   { "tx_carrier_errors" },
+   { "nports" },
+};
+
+static int vnet_get_sset_count(struct net_device *dev, int sset)
+{
+   struct vnet *vp = (struct vnet *)netdev_priv(dev);
+
+   switch (sset) {
+   case ETH_SS_STATS:
+   return ARRAY_SIZE(ethtool_stats_keys)
+   + (NUM_VNET_PORT_STATS * vp->nports);
+   default:
+   return -EOPNOTSUPP;
+   }
+}
+
+static void vnet_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
+{
+   struct vnet *vp = (struct vnet *)netdev_priv(dev);
+   struct vnet_port *port;
+   char *p = (char *)buf;
+
+   switch (stringset) {
+   case ETH_SS_STATS:
+   memcpy(buf, _stats_keys, sizeof(ethtool_stats_keys));
+   p += sizeof(ethtool_stats_keys);
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(port, >port_list, list) {
+   snprintf(p, ETH_GSTRING_LEN, "p%u.%s-%pM",
+port->q_index, port->switch_port ? "s" : "q",
+port->raddr);
+   p += ETH_GSTRING_LEN;
+   snprintf(p, ETH_GSTRING_LEN, "p%u.rx_packets",
+port->q_index);
+   p += ETH_GSTRING_LEN;
+   snprintf(p, ETH_GSTRING_LEN, "p%u.tx_packets",
+port->q_index);
+   p += ETH_GSTRING_LEN;
+   snprintf(p, ETH_GSTRING_LEN, "p%u.rx_bytes",
+port->q_index);
+   p += ETH_GSTRING_LEN;
+   snprintf(p, ETH_GSTRING_LEN, "p%u.tx_bytes",
+port->q_index);
+   p += ETH_GSTRING_LEN;
+   snprintf(p, ETH_GSTRING_LEN, "p%u.event_up",
+port->q_index);
+   p += ETH_GSTRING_LEN;
+   snprintf(p, ETH_GSTRING_LEN, "p%u.event_reset",
+port->q_index);
+   p += ETH_GSTRING_LEN;
+   }
+   rcu_read_unlock();
+   break;
+   default:
+   WARN_ON(1);
+   break;
+   }
+}
+
+static void vnet_get_ethtool_stats(struct net_device *dev,
+  struct ethtool_stats *estats, u64 *data)
+{
+   struct vnet *vp = (struct vnet *)netdev_priv(dev);
+   struct vnet_port *port;
+   int i = 0;
+
+   data[i++] = dev->stats.rx_packets;
+   data[i++] = dev->stats.tx_packets;
+   data[i++] = dev->stats.rx_bytes;
+   data[i++] = dev->stats.tx_bytes;
+   data[i++] = dev->stats.rx_errors;
+   data[i++] = dev->stats.tx_errors;
+   data[i++] = dev->stats.rx_dropped;
+   data[i++] = dev->stats.tx_dropped;
+   data[i++] = dev->stats.multicast;
+   data[i++] = dev->stats.rx_length_errors;
+   data[i++] = dev->stats.rx_frame_errors;
+   data[i++] = dev->stats.rx_missed_errors;
+   data[i++] = dev->stats.tx_carrier_errors;
+   data[i++] = vp->nports;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(port, >port_list, list) {
+   data[i++] = port->q_index;
+   data[i++] = port->stats.rx_packets;
+   data[i++] = port->stats.tx_packets;
+   data[i++] = port->stats.rx_bytes;
+   data[i++] = port->stats.tx_bytes;
+   data[i++] = 

[PATCH net-next 5/5] sunvnet: xoff not needed when removing port link

2017-03-06 Thread Shannon Nelson
The sunvnet netdev is connected to the controlling ldom's vswitch
for network bridging.  However, for higher performance between ldoms,
there also is a channel between each client ldom.  These connections are
represented in the sunvnet driver by a queue for each ldom.  The driver
uses select_queue to tell the stack which queue to use by tracking the mac
addresses on the other end of each port.  When a connected ldom shuts down,
the driver receives an LDC_EVENT_RESET and the port is removed from the
driver, thus a queue with no ldom on the other end will never be selected
for Tx.

The driver was trying to reinforce the "don't use this queue" notion with
netif_tx_stop_queue() and netif_tx_wake_queue(), which really should only
be used to signal a Tx queue is full (aka XOFF).  This misuse of queue
state resulted in NETDEV WATCHDOG messages and lots of unnecessary calls
into the driver's tx_timeout handler.  Simply removing these takes care
of the problem.

Orabug: 25190537

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/sun/sunvnet_common.c |4 
 1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunvnet_common.c 
b/drivers/net/ethernet/sun/sunvnet_common.c
index 7eb53af..1788d6f 100644
--- a/drivers/net/ethernet/sun/sunvnet_common.c
+++ b/drivers/net/ethernet/sun/sunvnet_common.c
@@ -1749,16 +1749,12 @@ void sunvnet_port_add_txq_common(struct vnet_port *port)
vp->nports++;
vp->q_used[smallest]++;
port->q_index = smallest;
-   netif_tx_wake_queue(netdev_get_tx_queue(VNET_PORT_TO_NET_DEVICE(port),
-   port->q_index));
 }
 EXPORT_SYMBOL_GPL(sunvnet_port_add_txq_common);
 
 void sunvnet_port_rm_txq_common(struct vnet_port *port)
 {
port->vp->nports--;
-   netif_tx_stop_queue(netdev_get_tx_queue(VNET_PORT_TO_NET_DEVICE(port),
-   port->q_index));
port->vp->q_used[port->q_index]--;
port->q_index = 0;
 }
-- 
1.7.1



[PATCH net-next 1/5] ldmvsw: better use of link up and down on ldom vswitch

2017-03-06 Thread Shannon Nelson
When an ldom VM is bound, the network vswitch infrastructure is set up for
it, but was being forced 'UP' by the userland switch configuration script.
When 'UP' but not actually connected to a running VM, the ipv6 neighbor
probes fail (not a horrible thing) and start cluttering up the kernel logs.
Funny thing: these are debug messages that never actually show up, but
we do see the net_ratelimited messages that say N callbacks were
suppressed.

This patch defers the netif_carrier_on() until an actual link has been
established with the VM, as indicated by receiving an LDC_EVENT_UP from
the underlying LDC protocol.  Similarly, we take the link down when we
see the LDC_EVENT_RESET.

Orabug: 25525312

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/sun/ldmvsw.c |   10 +++---
 drivers/net/ethernet/sun/sunvnet_common.c |   14 ++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sun/ldmvsw.c 
b/drivers/net/ethernet/sun/ldmvsw.c
index 89952de..c6f6d59 100644
--- a/drivers/net/ethernet/sun/ldmvsw.c
+++ b/drivers/net/ethernet/sun/ldmvsw.c
@@ -41,8 +41,8 @@
 static u8 vsw_port_hwaddr[ETH_ALEN] = {0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
 
 #define DRV_MODULE_NAME"ldmvsw"
-#define DRV_MODULE_VERSION "1.1"
-#define DRV_MODULE_RELDATE "February 3, 2017"
+#define DRV_MODULE_VERSION "1.2"
+#define DRV_MODULE_RELDATE "March 4, 2017"
 
 static char version[] =
DRV_MODULE_NAME " " DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")";
@@ -133,7 +133,6 @@ static void vsw_poll_controller(struct net_device *dev)
 #endif
 
 static const struct net_device_ops vsw_ops = {
-   .ndo_open   = sunvnet_open_common,
.ndo_stop   = sunvnet_close_common,
.ndo_set_rx_mode= vsw_set_rx_mode,
.ndo_set_mac_address= sunvnet_set_mac_addr_common,
@@ -365,6 +364,11 @@ static int vsw_port_probe(struct vio_dev *vdev, const 
struct vio_device_id *id)
napi_enable(>napi);
vio_port_up(>vio);
 
+   /* assure no carrier until we receive an LDC_EVENT_UP,
+* even if the vsw config script tries to force us up
+*/
+   netif_carrier_off(dev);
+
netdev_info(dev, "LDOM vsw-port %pM\n", dev->dev_addr);
 
pr_info("%s: PORT ( remote-mac %pM%s )\n", dev->name,
diff --git a/drivers/net/ethernet/sun/sunvnet_common.c 
b/drivers/net/ethernet/sun/sunvnet_common.c
index fa2d11c..bba0691 100644
--- a/drivers/net/ethernet/sun/sunvnet_common.c
+++ b/drivers/net/ethernet/sun/sunvnet_common.c
@@ -747,6 +747,13 @@ static int vnet_event_napi(struct vnet_port *port, int 
budget)
 
/* RESET takes precedent over any other event */
if (port->rx_event & LDC_EVENT_RESET) {
+   /* a link went down */
+
+   if (port->vsw == 1) {
+   netif_tx_stop_all_queues(dev);
+   netif_carrier_off(dev);
+   }
+
vio_link_state_change(vio, LDC_EVENT_RESET);
vnet_port_reset(port);
vio_port_up(vio);
@@ -766,6 +773,13 @@ static int vnet_event_napi(struct vnet_port *port, int 
budget)
}
 
if (port->rx_event & LDC_EVENT_UP) {
+   /* a link came up */
+
+   if (port->vsw == 1) {
+   netif_carrier_on(port->dev);
+   netif_tx_start_all_queues(port->dev);
+   }
+
vio_link_state_change(vio, LDC_EVENT_UP);
port->rx_event = 0;
return 0;
-- 
1.7.1



[PATCH net-next 3/5] sunvnet: track port queues correctly

2017-03-06 Thread Shannon Nelson
Track our used and unused queue indices correctly.  Otherwise, as ports
dropped out and returned, they all eventually ended up with the same
queue index.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/sun/sunvnet_common.c |   24 
 drivers/net/ethernet/sun/sunvnet_common.h |   11 ++-
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunvnet_common.c 
b/drivers/net/ethernet/sun/sunvnet_common.c
index a8e7506..8afc659 100644
--- a/drivers/net/ethernet/sun/sunvnet_common.c
+++ b/drivers/net/ethernet/sun/sunvnet_common.c
@@ -1728,11 +1728,25 @@ void sunvnet_poll_controller_common(struct net_device 
*dev, struct vnet *vp)
 void sunvnet_port_add_txq_common(struct vnet_port *port)
 {
struct vnet *vp = port->vp;
-   int n;
+   int smallest = 0;
+   int i;
+
+   /* find the first least-used q
+* When there are more ldoms than q's, we start to
+* double up on ports per queue.
+*/
+   for (i = 0; i < VNET_MAX_TXQS; i++) {
+   if (vp->q_used[i] == 0) {
+   smallest = i;
+   break;
+   }
+   if (vp->q_used[i] < vp->q_used[smallest])
+   smallest = i;
+   }
 
-   n = vp->nports++;
-   n = n & (VNET_MAX_TXQS - 1);
-   port->q_index = n;
+   vp->nports++;
+   vp->q_used[smallest]++;
+   port->q_index = smallest;
netif_tx_wake_queue(netdev_get_tx_queue(VNET_PORT_TO_NET_DEVICE(port),
port->q_index));
 }
@@ -1743,5 +1757,7 @@ void sunvnet_port_rm_txq_common(struct vnet_port *port)
port->vp->nports--;
netif_tx_stop_queue(netdev_get_tx_queue(VNET_PORT_TO_NET_DEVICE(port),
port->q_index));
+   port->vp->q_used[port->q_index]--;
+   port->q_index = 0;
 }
 EXPORT_SYMBOL_GPL(sunvnet_port_rm_txq_common);
diff --git a/drivers/net/ethernet/sun/sunvnet_common.h 
b/drivers/net/ethernet/sun/sunvnet_common.h
index 006eaf8..d4b4a4e 100644
--- a/drivers/net/ethernet/sun/sunvnet_common.h
+++ b/drivers/net/ethernet/sun/sunvnet_common.h
@@ -112,22 +112,15 @@ struct vnet_mcast_entry {
 };
 
 struct vnet {
-   /* Protects port_list and port_hash.  */
-   spinlock_t  lock;
-
+   spinlock_t  lock; /* Protects port_list and port_hash.  */
struct net_device   *dev;
-
u32 msg_enable;
-
+   u16 q_used[VNET_MAX_TXQS];
struct list_headport_list;
-
struct hlist_head   port_hash[VNET_PORT_HASH_SIZE];
-
struct vnet_mcast_entry *mcast_list;
-
struct list_headlist;
u64 local_mac;
-
int nports;
 };
 
-- 
1.7.1



[PATCH v2] {net,IB}/{rxe,usnic}: Utilize generic mac to eui32 function

2017-03-06 Thread Yuval Shaia
This logic seems to be duplicated in (at least) three separate files.
Move it to one place so the code can be reused.

Signed-off-by: Yuval Shaia 
---
v0 -> v1:
* Add missing #include
* Rename to genaddrconf_ifid_eui48
v1 -> v2:
* Reset eui[0] to default if dev_id is used
---
 drivers/infiniband/hw/usnic/usnic_common_util.h | 11 +++
 drivers/infiniband/sw/rxe/rxe_net.c | 11 ++-
 include/net/addrconf.h  | 19 +--
 3 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/drivers/infiniband/hw/usnic/usnic_common_util.h 
b/drivers/infiniband/hw/usnic/usnic_common_util.h
index b54986d..09871da 100644
--- a/drivers/infiniband/hw/usnic/usnic_common_util.h
+++ b/drivers/infiniband/hw/usnic/usnic_common_util.h
@@ -34,6 +34,8 @@
 #ifndef USNIC_CMN_UTIL_H
 #define USNIC_CMN_UTIL_H
 
+#include 
+
 static inline void
 usnic_mac_to_gid(const char *const mac, char *raw_gid)
 {
@@ -57,14 +59,7 @@ usnic_mac_ip_to_gid(const char *const mac, const __be32 
inaddr, char *raw_gid)
raw_gid[1] = 0x80;
memset(_gid[2], 0, 2);
memcpy(_gid[4], , 4);
-   raw_gid[8] = mac[0]^2;
-   raw_gid[9] = mac[1];
-   raw_gid[10] = mac[2];
-   raw_gid[11] = 0xff;
-   raw_gid[12] = 0xfe;
-   raw_gid[13] = mac[3];
-   raw_gid[14] = mac[4];
-   raw_gid[15] = mac[5];
+   genaddrconf_ifid_eui48(_gid[8], mac);
 }
 
 static inline void
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c 
b/drivers/infiniband/sw/rxe/rxe_net.c
index d8610960..90285c8 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -38,6 +38,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -86,18 +87,10 @@ struct rxe_recv_sockets recv_sockets;
 
 static __be64 rxe_mac_to_eui64(struct net_device *ndev)
 {
-   unsigned char *mac_addr = ndev->dev_addr;
__be64 eui64;
unsigned char *dst = (unsigned char *)
 
-   dst[0] = mac_addr[0] ^ 2;
-   dst[1] = mac_addr[1];
-   dst[2] = mac_addr[2];
-   dst[3] = 0xff;
-   dst[4] = 0xfe;
-   dst[5] = mac_addr[3];
-   dst[6] = mac_addr[4];
-   dst[7] = mac_addr[5];
+   genaddrconf_ifid_eui48(dst, ndev->dev_addr);
 
return eui64;
 }
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 17c6fd8..cdfa73f 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -103,12 +103,21 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct 
net_device *dev,
 u32 addr_flags, bool sllao, bool tokenized,
 __u32 valid_lft, u32 prefered_lft);
 
+static inline void genaddrconf_ifid_eui48(u8 *eui, const char *const addr)
+{
+   memcpy(eui, addr, 3);
+   eui[0] ^= 2;
+   eui[3] = 0xFF;
+   eui[4] = 0xFE;
+   memcpy(eui + 5, addr + 3, 3);
+}
+
 static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
 {
if (dev->addr_len != ETH_ALEN)
return -1;
-   memcpy(eui, dev->dev_addr, 3);
-   memcpy(eui + 5, dev->dev_addr + 3, 3);
+
+   genaddrconf_ifid_eui48(eui, dev->dev_addr);
 
/*
 * The zSeries OSA network cards can be shared among various
@@ -124,13 +133,11 @@ static inline int addrconf_ifid_eui48(u8 *eui, struct 
net_device *dev)
 * scope according to RFC2373.
 */
if (dev->dev_id) {
+   eui[0] = dev->dev_addr[0];
eui[3] = (dev->dev_id >> 8) & 0xFF;
eui[4] = dev->dev_id & 0xFF;
-   } else {
-   eui[3] = 0xFF;
-   eui[4] = 0xFE;
-   eui[0] ^= 2;
}
+
return 0;
 }
 
-- 
2.7.4



Re: [PATCH net] bpf: disable broken write protection on i386

2017-03-06 Thread Daniel Borkmann

On 03/06/2017 07:52 PM, David Miller wrote:

From: Daniel Borkmann 
Date: Mon, 06 Mar 2017 19:35:47 +0100


I can do a few more tests with the kernel I have. I'm also totally
fine if we drop this patch; it's just rc1, so there's plenty of time
till a final release.


I would really prefer we get to the bottom of this rather than apply
the quick band-aid on this one.


Ok, sounds reasonable.


Re: [PATCH net-next] packet: fix panic in __packet_set_timestamp on tpacket_v3 in tx mode

2017-03-06 Thread chetan loke
On Mon, Mar 6, 2017 at 9:45 AM, Willem de Bruijn
 wrote:
> On Mon, Mar 6, 2017 at 12:13 PM, chetan loke  wrote:

 Gosh. Can we also replace this BUG() into something less aggressive ?
>>>
>>>
>>> There are currently 5 of these WARN() + BUG() constructs and 1 BUG()-only
>>> for the 'default' TPACKET version spread all over af_packet, so probably
>>> makes sense to rather make all of them less aggressive.
>>>
>>>
>>
>> Very few consumers actually go looking in the kernel logs to see the
>> error-warnings and report them back here.
>>
>> This severity will get them to report the incident which in this case
>> got fixed??
>
> But BUG_ONs in the datapath can cause outages in real production
> environments. This should not happen for recoverable failures. For
> users who cannot be bothered to check their logs, there is sysctl
> kernel.panic_on_warn.


Completely understand(and you should have failover to handle these
outages). But then are you ok giving incorrect info to the
application?

For this specific bug: it is so basic that you should hit it the first
time, every time, when you are adding support or porting a new header.
Correct?

And so from that point of view, this BUG_ON has served its purpose.


Re: [PATCH net-next] packet: fix panic in __packet_set_timestamp on tpacket_v3 in tx mode

2017-03-06 Thread Willem de Bruijn
> Gosh. Can we also replace this BUG() into something less aggressive ?


 There are currently 5 of these WARN() + BUG() constructs and 1 BUG()-only
 for the 'default' TPACKET version spread all over af_packet, so probably
 makes sense to rather make all of them less aggressive.


>>>
>>> Very few consumers actually go looking in the kernel logs to see the
>>> error-warnings and report them back here.
>>>
>>> This severity will get them to report the incident which in this case
>>> got fixed??
>>
>> But BUG_ONs in the datapath can cause outages in real production
>> environments. This should not happen for recoverable failures. For
>> users who cannot be bothered to check their logs, there is sysctl
>> kernel.panic_on_warn.
>
>
> Completely understand(and you should have failover to handle these
> outages).

Not for correlated failures where all systems can hit the same path.
This is especially dangerous when remote packets or untrusted
local users can trigger a BUG-enabled path.

> But then are you ok giving incorrect info to the
> application?

No, we should certainly signal an error. For instance, returning
TP_STATUS_WRONG_FORMAT instead of TP_STATUS_AVAILABLE.

> For this specific bug: it is so basic that you should hit this bug 1st
> time everytime when you are adding support or porting a new header.
> Correct?

Agreed, but that is small consolation if an unprivileged user (say, in
a namespace) finds out that it can trigger the codepath.

But I agree that this particular BUG_ON is one of the easier to
reason about.


Re: [PATCH RFC net-next v2 1/4] skbuff: add stub to help computing crc32c on SCTP packets

2017-03-06 Thread Davide Caratti
On Tue, 2017-02-28 at 14:46 -0800, Alexander Duyck wrote:
> On Tue, Feb 28, 2017 at 2:32 AM, Davide Caratti  wrote:
> > 
> > sctp_compute_checksum requires crc32c symbol (provided by libcrc32c), so
> > it can't be used in net core. Like it has been done previously with other
> > symbols (e.g. ipv6_dst_lookup), introduce a stub struct skb_checksum_ops
> > to allow computation of SCTP checksum in net core after sctp.ko (and thus
> > libcrc32c) has been loaded.
> 
> At a minimum the name really needs to change.  SCTP does not do
> checksums.  It does a CRC, and a CRC is a very different thing.  The
> fact that somebody decided that offloading a CRC could use the same
> framework is very unfortunate, and your patch descriptions in this
> whole set are calling out a CRC as checksums which it is not.

hello Alexander,

thank you for contributing to this topic. I see there has been a similar
discussion some months ago
(https://www.mail-archive.com/netdev@vger.kernel.org/msg94955.html).

> I don't want to see anything "checksum" or "csum" related in the
> naming when it comes to dealing with SCTP unless we absolutely have
> to have it.  So any function names or structures with sctp in the name
> should call out "crc32" or "crc", please don't use checksum.

On Wed, 2017-03-01 at 10:53 +, David Laight wrote:
> Then also change all the places that refer to the IP 1's complement
> checksum to ipchecksum.

(but crc32 uses a different polynomial than crc32c! :-) ) I understand 
your concerns, nevertheless we are writing to a member of struct sctphdr
whose name is 'checksum' since the earliest introduction of SCTP; moreover,
similar terminology ('crc32c checksum') is used throughout all RFC4960.
That's why I don't think anybody will be confused by usage of 'csum' or
'checksum' words.

On Tue, 2017-02-28 at 19:17 -0800, Tom Herbert wrote:
> I agree that internal functions to sctp should not refer to checksum,
> but I think we need to take care to be consistent with any external
> API (even if somebody made a mistake defining it this way :-) ). As
> you know the checksum interface must be very precisely defined, there
> is no leeway for ambiguity.

We can make the new symbols more generic removing 'sctp' from the
symbol name, and writing explicitly that skb needs crc32c (rather than
skb does not need internet checksum).

Proposal:
we use crc32c, possibly combined with 'csum' or 'checksum', just like
it has been done in RFC4960.  So, symbol names can be replaced as follows:

RFC v2 name  | RFC v3 name
-+-
warn_sctp_csum_update| warn_crc32c_csum_update
warn_sctp_csum_combine   | warn_crc32c_csum_combine
sctp_csum_stub   | crc32c_csum_stub
sctp_csum_ops| crc32c_csum_ops
skb_sctp_csum_help   | skb_crc32c_csum_help
skb->csum_not_inet   | skb->crc32c_csum

please let me know if the proposal can be acceptable from your point of view.

On Tue, 2017-02-28 at 11:50 -0800, Tom Herbert wrote:
> Unfortunately this potentially pushes the skbuf flags over 32 bits if
> I count correctly. I suggest that you rename csum_bad to
> csum_not_inet. Looks like csum_bad is only set by a grand total of one
> driver and I don't believe that is enough to justify its existence.
> It's probably a good time to remove it.

you are right: find below the current layout obtained with 'allyesconfig':

short unsigned int queue_mapping;   /*   140 2 */
unsigned char  __cloned_offset[0];  /*   142 0 */
unsigned char  cloned:1;/*   142: 7  1 */
unsigned char  nohdr:1; /*   142: 6  1 */
unsigned char  fclone:2;/*   142: 4  1 */
unsigned char  peeked:1;/*   142: 3  1 */
unsigned char  head_frag:1; /*   142: 2  1 */
unsigned char  xmit_more:1; /*   142: 1  1 */
unsigned char  __unused:1;  /*   142: 0  1 */

/* XXX 1 byte hole, try to pack */
unsigned int   headers_start[0];/*   144 0 */
unsigned char  __pkt_type_offset[0];/*   144 0 */
unsigned char  pkt_type:3;  /*   144: 5  1 */

<...>

unsigned char  ipvs_property:1; /*   147: 7  1 */
unsigned char  inner_protocol_type:1;   /*   147: 6  1 */
unsigned char  remcsum_offload:1;   /*   147: 5  1 */
unsigned char  offload_fwd_mark:1;  /*   147: 4  1 */
unsigned char  tc_skip_classify:1;  /*   147: 3  1 */
unsigned char  tc_at_ingress:1; /*   147: 2  1 */
unsigned char  tc_redirected:1; /*   147: 1  1 */
unsigned char  

Re: [RFC PATCH 1/2] af_packet: direct dma for packet ineterface

2017-03-06 Thread chetan loke
On Tue, Jan 31, 2017 at 9:09 PM, John Fastabend
 wrote:

>> If I understand correctly, the difficulty lies in v3 requiring that the
>> timer "close" the block when the timer expires. That may not be worth
>> implementing, indeed.
>>
>
> Yep that is where I just gave up and decided it wasn't worth it.
>

Without support for a timeout, when a user-space app has to do its own
book-keeping or, let's say, shut down for maintenance/upgrade, how
can it (the app) unblock from this operation? If the link is idle,
the DMA may never happen because there are no frames on the wire.
So is there a way to handle this?

Chetan


Re: [PATCH 08/26] brcmsmac: make some local variables 'static const' to reduce stack size

2017-03-06 Thread Arnd Bergmann
On Mon, Mar 6, 2017 at 5:19 PM, Kalle Valo  wrote:
> Arend Van Spriel  writes:
>
>> On 2-3-2017 17:38, Arnd Bergmann wrote:
>>> With KASAN and a couple of other patches applied, this driver is one
>>> of the few remaining ones that actually use more than 2048 bytes of
>>> kernel stack:
>>>
>>> broadcom/brcm80211/brcmsmac/phy/phy_n.c: In function 
>>> 'wlc_phy_workarounds_nphy_gainctrl':
>>> broadcom/brcm80211/brcmsmac/phy/phy_n.c:16065:1: warning: the frame size of 
>>> 3264 bytes is larger than 2048 bytes [-Wframe-larger-than=]
>>> broadcom/brcm80211/brcmsmac/phy/phy_n.c: In function 
>>> 'wlc_phy_workarounds_nphy':
>>> broadcom/brcm80211/brcmsmac/phy/phy_n.c:17138:1: warning: the frame size of 
>>> 2864 bytes is larger than 2048 bytes [-Wframe-larger-than=]
>>>
>>> Here, I'm reducing the stack size by marking as many local variables as
>>> 'static const' as I can without changing the actual code.
>>
>> Acked-by: Arend van Spriel 
>
> Arnd, via which tree are you planning to submit these? I'm not sure
> what I should do with the wireless drivers patches from this series.

I'm not quite sure myself yet. I'd probably want the first few patches that
do most of the work get merged through Andrew's linux-mm tree once
we have come to agreement on them. The driver specific patches like
the brcmsmac ones depend on the introduction of noinline_for_kasan
or noinline_if_stackbloat and could either go in along with the first
set, or as a follow-up through the normal maintainer trees.

   Arnd


Re: [PATCH v2] can: m_can: enable transmission of FD frame on latest version

2017-03-06 Thread Oliver Hartkopp

@Wenyou Yang: Can you please test the two patches posted here:

[PATCH 1/2] can: m_can: handle bitrate setup on IP core >= 3.1.x
http://marc.info/?l=linux-can=148883529927720=2

[PATCH 2/2] can: m_can: handle frame transmission on IP core >= 3.1.x
http://marc.info/?l=linux-can=148883529927718=2

Tnx & regards,
Oliver



[PATCH net-next] tg3: Add the ability to conditionally build w/ HWMON

2017-03-06 Thread Florian Fainelli
Introduce a Kconfig option, CONFIG_TIGON3_HWMON, which allows support for
the thermal sensors reported by Tigon3 NICs to be built in or left out.

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/Kconfig | 8 +++-
 drivers/net/ethernet/broadcom/tg3.c   | 7 +++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/Kconfig 
b/drivers/net/ethernet/broadcom/Kconfig
index 940fb24bba21..96413808c726 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -109,7 +109,6 @@ config TIGON3
tristate "Broadcom Tigon3 support"
depends on PCI
select PHYLIB
-   select HWMON
imply PTP_1588_CLOCK
---help---
  This driver supports Broadcom Tigon3 based gigabit Ethernet cards.
@@ -117,6 +116,13 @@ config TIGON3
  To compile this driver as a module, choose M here: the module
  will be called tg3.  This is recommended.
 
+config TIGON3_HWMON
+   bool "Broadcom Tigon3 HWMON support"
+   default y
+   depends on TIGON3 && HWMON && !(TIGON3=y && HWMON=m)
+   ---help---
+ Say Y if you want to expose the thermal sensor on Tigon3 devices.
+
 config BNX2X
tristate "Broadcom NetXtremeII 10Gb support"
depends on PCI
diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index 30d1eb9ebec9..f395b951f5e7 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -825,6 +825,7 @@ static int tg3_ape_event_lock(struct tg3 *tp, u32 
timeout_us)
return timeout_us ? 0 : -EBUSY;
 }
 
+#ifdef CONFIG_TIGON3_HWMON
 static int tg3_ape_wait_for_event(struct tg3 *tp, u32 timeout_us)
 {
u32 i, apedata;
@@ -904,6 +905,7 @@ static int tg3_ape_scratchpad_read(struct tg3 *tp, u32 
*data, u32 base_off,
 
return 0;
 }
+#endif
 
 static int tg3_ape_send_event(struct tg3 *tp, u32 event)
 {
@@ -10744,6 +10746,7 @@ static int tg3_init_hw(struct tg3 *tp, bool reset_phy)
return tg3_reset_hw(tp, reset_phy);
 }
 
+#ifdef CONFIG_TIGON3_HWMON
 static void tg3_sd_scan_scratchpad(struct tg3 *tp, struct tg3_ocir *ocir)
 {
int i;
@@ -10826,6 +10829,10 @@ static void tg3_hwmon_open(struct tg3 *tp)
dev_err(&tp->pdev->dev, "Cannot register hwmon device, aborting\n");
}
 }
+#else
+static inline void tg3_hwmon_close(struct tg3 *tp) { }
+static inline void tg3_hwmon_open(struct tg3 *tp) { }
+#endif /* CONFIG_TIGON3_HWMON */
 
 
 #define TG3_STAT_ADD32(PSTAT, REG) \
-- 
2.9.3



[patch net-next 0/2] mlxsw: cosmetics

2017-03-06 Thread Jiri Pirko
From: Jiri Pirko 

Couple of cosmetic mlxsw patches

Ido Schimmel (1):
  mlxsw: pci: Remove unused bit

Jiri Pirko (1):
  mlxsw: spectrum: Fix helper function and port variable names

 drivers/net/ethernet/mellanox/mlxsw/cmd.h  | 12 
 drivers/net/ethernet/mellanox/mlxsw/pci.c  |  2 --
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 23 ---
 3 files changed, 12 insertions(+), 25 deletions(-)

-- 
2.7.4



[patch net-next 2/2] mlxsw: pci: Remove unused bit

2017-03-06 Thread Jiri Pirko
From: Ido Schimmel 

The overrun ignore bit isn't supported by the device's firmware and was
recently removed from the programmer's reference manual (PRM).

Remove it from the driver as well.

Signed-off-by: Ido Schimmel 
Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/cmd.h | 12 
 drivers/net/ethernet/mellanox/mlxsw/pci.c |  2 --
 2 files changed, 14 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/cmd.h 
b/drivers/net/ethernet/mellanox/mlxsw/cmd.h
index a1b4842..479511c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/cmd.h
@@ -1043,13 +1043,6 @@ MLXSW_ITEM32(cmd_mbox, sw2hw_cq, cv, 0x00, 28, 4);
  */
 MLXSW_ITEM32(cmd_mbox, sw2hw_cq, c_eqn, 0x00, 24, 1);
 
-/* cmd_mbox_sw2hw_cq_oi
- * When set, overrun ignore is enabled. When set, updates of
- * CQ consumer counter (poll for completion) or Request completion
- * notifications (Arm CQ) DoorBells should not be rung on that CQ.
- */
-MLXSW_ITEM32(cmd_mbox, sw2hw_cq, oi, 0x00, 12, 1);
-
 /* cmd_mbox_sw2hw_cq_st
  * Event delivery state machine
  * 0x0 - FIRED
@@ -1132,11 +1125,6 @@ static inline int mlxsw_cmd_sw2hw_eq(struct mlxsw_core 
*mlxsw_core,
  */
 MLXSW_ITEM32(cmd_mbox, sw2hw_eq, int_msix, 0x00, 24, 1);
 
-/* cmd_mbox_sw2hw_eq_oi
- * When set, overrun ignore is enabled.
- */
-MLXSW_ITEM32(cmd_mbox, sw2hw_eq, oi, 0x00, 12, 1);
-
 /* cmd_mbox_sw2hw_eq_st
  * Event delivery state machine
  * 0x0 - FIRED
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c 
b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index a223c85..ffeb746 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -580,7 +580,6 @@ static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, 
char *mbox,
 
mlxsw_cmd_mbox_sw2hw_cq_cv_set(mbox, 0); /* CQE ver 0 */
mlxsw_cmd_mbox_sw2hw_cq_c_eqn_set(mbox, MLXSW_PCI_EQ_COMP_NUM);
-   mlxsw_cmd_mbox_sw2hw_cq_oi_set(mbox, 0);
mlxsw_cmd_mbox_sw2hw_cq_st_set(mbox, 0);
mlxsw_cmd_mbox_sw2hw_cq_log_cq_size_set(mbox, ilog2(q->count));
for (i = 0; i < MLXSW_PCI_AQ_PAGES; i++) {
@@ -755,7 +754,6 @@ static int mlxsw_pci_eq_init(struct mlxsw_pci *mlxsw_pci, 
char *mbox,
}
 
mlxsw_cmd_mbox_sw2hw_eq_int_msix_set(mbox, 1); /* MSI-X used */
-   mlxsw_cmd_mbox_sw2hw_eq_oi_set(mbox, 0);
mlxsw_cmd_mbox_sw2hw_eq_st_set(mbox, 1); /* armed */
mlxsw_cmd_mbox_sw2hw_eq_log_eq_size_set(mbox, ilog2(q->count));
for (i = 0; i < MLXSW_PCI_AQ_PAGES; i++) {
-- 
2.7.4



[patch net-next 1/2] mlxsw: spectrum: Fix helper function and port variable names

2017-03-06 Thread Jiri Pirko
From: Jiri Pirko 

Commit dd82364c3ab9 ("mlxsw: Flip to the new dev walk API") did some
small changes in mlxsw code, but it did not respect the naming
conventions. So fix this now.

Signed-off-by: Jiri Pirko 
Reviewed-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 16484f2..ae18067 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3326,13 +3326,13 @@ bool mlxsw_sp_port_dev_check(const struct net_device 
*dev)
return dev->netdev_ops == &mlxsw_sp_port_netdev_ops;
 }
 
-static int mlxsw_lower_dev_walk(struct net_device *lower_dev, void *data)
+static int mlxsw_sp_lower_dev_walk(struct net_device *lower_dev, void *data)
 {
-   struct mlxsw_sp_port **port = data;
+   struct mlxsw_sp_port **p_mlxsw_sp_port = data;
int ret = 0;
 
if (mlxsw_sp_port_dev_check(lower_dev)) {
-   *port = netdev_priv(lower_dev);
+   *p_mlxsw_sp_port = netdev_priv(lower_dev);
ret = 1;
}
 
@@ -3341,15 +3341,15 @@ static int mlxsw_lower_dev_walk(struct net_device 
*lower_dev, void *data)
 
 static struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find(struct net_device 
*dev)
 {
-   struct mlxsw_sp_port *port;
+   struct mlxsw_sp_port *mlxsw_sp_port;
 
if (mlxsw_sp_port_dev_check(dev))
return netdev_priv(dev);
 
-   port = NULL;
-   netdev_walk_all_lower_dev(dev, mlxsw_lower_dev_walk, &port);
+   mlxsw_sp_port = NULL;
+   netdev_walk_all_lower_dev(dev, mlxsw_sp_lower_dev_walk, &mlxsw_sp_port);
 
-   return port;
+   return mlxsw_sp_port;
 }
 
 static struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev)
@@ -3362,15 +3362,16 @@ static struct mlxsw_sp *mlxsw_sp_lower_get(struct 
net_device *dev)
 
 static struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find_rcu(struct 
net_device *dev)
 {
-   struct mlxsw_sp_port *port;
+   struct mlxsw_sp_port *mlxsw_sp_port;
 
if (mlxsw_sp_port_dev_check(dev))
return netdev_priv(dev);
 
-   port = NULL;
-   netdev_walk_all_lower_dev_rcu(dev, mlxsw_lower_dev_walk, &port);
+   mlxsw_sp_port = NULL;
+   netdev_walk_all_lower_dev_rcu(dev, mlxsw_sp_lower_dev_walk,
+ &mlxsw_sp_port);
 
-   return port;
+   return mlxsw_sp_port;
 }
 
 struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev)
-- 
2.7.4



[patch net] mlxsw: spectrum_flower: Remove bogus warns in mlxsw_sp_flower_destroy

2017-03-06 Thread Jiri Pirko
From: Jiri Pirko 

These warnings may be hit even when they should not be - namely when the
user adds a TC-flower rule which failed to be offloaded. So just remove them.

Reported-by: Petr Machata 
Reported-by: Ido Schimmel 
Fixes: commit 7aa0f5aa9030 ("mlxsw: spectrum: Implement TC flower offload")
Signed-off-by: Jiri Pirko 
Acked-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index 22ab429..ae6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -303,11 +303,11 @@ void mlxsw_sp_flower_destroy(struct mlxsw_sp_port 
*mlxsw_sp_port, bool ingress,
ruleset = mlxsw_sp_acl_ruleset_get(mlxsw_sp, mlxsw_sp_port->dev,
   ingress,
   MLXSW_SP_ACL_PROFILE_FLOWER);
-   if (WARN_ON(IS_ERR(ruleset)))
+   if (IS_ERR(ruleset))
return;
 
rule = mlxsw_sp_acl_rule_lookup(mlxsw_sp, ruleset, f->cookie);
-   if (!WARN_ON(!rule)) {
+   if (rule) {
mlxsw_sp_acl_rule_del(mlxsw_sp, rule);
mlxsw_sp_acl_rule_destroy(mlxsw_sp, rule);
}
-- 
2.7.4



Re: [PATCH v2] selinux: check for address length in selinux_socket_bind()

2017-03-06 Thread Eric Dumazet
On Mon, 2017-03-06 at 19:46 +0100, Alexander Potapenko wrote:
> KMSAN (KernelMemorySanitizer, a new error detection tool) reports use of
> uninitialized memory in selinux_socket_bind():
> 
...
> Signed-off-by: Alexander Potapenko 
> ---
> Changes since v1:
>  - fixed patch description
>  - fixed addrlen tests to match those in inet_bind() and inet6_bind()
>(per comment from Eric Dumazet)
> ---
>  security/selinux/hooks.c | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 0a4b4b040e0a..ddc4aca6c840 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -4351,10 +4351,19 @@ static int selinux_socket_bind(struct socket *sock, 
> struct sockaddr *address, in
>   u32 sid, node_perm;
>  
>   if (family == PF_INET) {
> + if (addrlen < sizeof(struct sockaddr_in)) {
> + err = -EINVAL;
> + goto out;
> + }
>   addr4 = (struct sockaddr_in *)address;
>   snum = ntohs(addr4->sin_port);
>   addrp = (char *)&addr4->sin_addr.s_addr;
> +
>   } else {
> + if (addrlen < SIN6_LEN_RFC2133) {
> + err = -EINVAL;
> + goto out;
> + }
>   addr6 = (struct sockaddr_in6 *)address;
>   snum = ntohs(addr6->sin6_port);
>   addrp = (char *)&addr6->sin6_addr.s6_addr;

Acked-by: Eric Dumazet 




Re: netlink: GPF in netlink_unicast

2017-03-06 Thread Cong Wang
On Mon, Mar 6, 2017 at 2:54 AM, Dmitry Vyukov  wrote:
> Hello,
>
> I've got the following crash while running syzkaller fuzzer on
> net-next/8d70eeb84ab277377c017af6a21d0a337025dede:
>
> kasan: GPF could be caused by NULL-ptr deref or user memory access
> general protection fault:  [#1] SMP KASAN
> Dumping ftrace buffer:
>(ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 883 Comm: kauditd Not tainted 4.10.0+ #6
> Hardware name: Google Google Compute Engine/Google Compute Engine,
> BIOS Google 01/01/2011
> task: 8801d79f0240 task.stack: 8801d7a2
> RIP: 0010:sock_sndtimeo include/net/sock.h:2162 [inline]
> RIP: 0010:netlink_unicast+0xdd/0x730 net/netlink/af_netlink.c:1249
> RSP: 0018:8801d7a27c38 EFLAGS: 00010206
> RAX: 0056 RBX: 8801d7a27cd0 RCX: 
> RDX:  RSI:  RDI: 02b0
> RBP: 8801d7a27cf8 R08: ed00385cf286 R09: ed00385cf286
> R10: 0006 R11: ed00385cf285 R12: 
> R13: dc00 R14: 8801c2fc3c80 R15: 014000c0
> FS:  () GS:8801dbe0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 20cfd000 CR3: 0001c758f000 CR4: 001406f0
> Call Trace:
>  kauditd_send_unicast_skb+0x3c/0x70 kernel/audit.c:482
>  kauditd_thread+0x174/0xb00 kernel/audit.c:599
>  kthread+0x326/0x3f0 kernel/kthread.c:229
>  ret_from_fork+0x31/0x40 arch/x86/entry/entry_64.S:430
> Code: 44 89 fe e8 56 15 ff ff 8b 8d 70 ff ff ff 49 89 c6 31 c0 85 c9
> 75 27 e8 b2 b2 f4 fd 49 8d bc 24 b0 02 00 00 48 89 f8 48 c1 e8 03 <42>
> 80 3c 28 00 0f 85 37 06 00 00 49 8b 84 24 b0 02 00 00 4c 8d
> RIP: sock_sndtimeo include/net/sock.h:2162 [inline] RSP: 8801d7a27c38
> RIP: netlink_unicast+0xdd/0x730 net/netlink/af_netlink.c:1249 RSP:
> 8801d7a27c38
> ---[ end trace ad1bba9d457430b6 ]---
> Kernel panic - not syncing: Fatal exception
>
>
> This is not reproducible and seems to be caused by an elusive race.
> However, looking at the code I don't see any proper protection of
> audit_sock (other than the if (!audit_pid) which is obviously not
> enough to protect against races).

audit_cmd_mutex is supposed to protect it, I think.
But kauditd_send_unicast_skb() seems not holding this mutex.

Richard?


Re: [PATCH net-next RFC 2/4] virtio-net: transmit napi

2017-03-06 Thread Michael S. Tsirkin
On Mon, Mar 06, 2017 at 10:55:22AM -0800, David Miller wrote:
> From: Willem de Bruijn 
> Date: Mon, 6 Mar 2017 12:50:19 -0500
> 
> >>>   drivers/net/virtio_net.c | 73
> >>> 
> >>>   1 file changed, 61 insertions(+), 12 deletions(-)
> >>>
> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >>> index 8c21e9a4adc7..9a9031640179 100644
> >>> --- a/drivers/net/virtio_net.c
> >>> +++ b/drivers/net/virtio_net.c
> >>> @@ -33,6 +33,8 @@
> >>>   static int napi_weight = NAPI_POLL_WEIGHT;
> >>>   module_param(napi_weight, int, 0444);
> >>>   +static int napi_tx_weight = NAPI_POLL_WEIGHT;
> >>> +
> >>
> >>
> >> Maybe we should use module_param for this? Or in the future, use
> >> tx-frames-irq for a per-device configuration.
> > 
> > This option should eventually just go away, and napi tx become the
> > standard mode.
> > 
> > In the short term, while we evaluate it on varied workloads, a
> > module_param sounds good to me. In general that is frowned
> > upon, as it leads to different configuration interfaces for each
> > device driver. But that should not be a concern in this limited
> > case.
> 
> In any event, do we really need a TX weight at all?
> 
> I guess you tried this, but why doesn't it work to just do
> all TX work unconditionally in a NAPI poll pass?  This is how
> we encourage all NIC drivers to handle this.

This seems to be more or less what this driver does already.
So I suspect it can just ignore the weight.

-- 
MST


Re: [PATCH v6 6/6] 6lowpan: Fix IID format for Bluetooth

2017-03-06 Thread Alexander Aring

Hi,

sorry, I decided now to take a look into the patch... Currently I have
no time to do anything... normally. That's why I simply said "everything
okay" in irc.

On 03/02/2017 02:23 PM, Luiz Augusto von Dentz wrote:
> From: Luiz Augusto von Dentz 
> 
> According to RFC 7668 U/L bit shall not be used:
> 
> https://wiki.tools.ietf.org/html/rfc7668#section-3.2.2 [Page 10]:
> 
>In the figure, letter 'b' represents a bit from the
>Bluetooth device address, copied as is without any changes on any
>bit.  This means that no bit in the IID indicates whether the
>underlying Bluetooth device address is public or random.
> 
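>|0              1|1              3|3              4|4              6|
>|0              5|6              1|2              7|8              3|
>+----------------+----------------+----------------+----------------+
>|bbbbbbbbbbbbbbbb|bbbbbbbb11111111|11111110bbbbbbbb|bbbbbbbbbbbbbbbb|
>+----------------+----------------+----------------+----------------+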
> 
> Because of this the code cannot figure out the address type from the IP
> address anymore thus it makes no sense to use peer_lookup_ba as it needs
> the peer address type.
> 
> Signed-off-by: Luiz Augusto von Dentz 
> Reviewed-by: Stefan Schmidt 
> Acked-by: Jukka Rissanen 
> ---
>  include/net/6lowpan.h   |  4 ---
>  include/net/addrconf.h  |  7 -
>  net/bluetooth/6lowpan.c | 79 
> -
>  3 files changed, 18 insertions(+), 72 deletions(-)
> 
> diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h
> index c5792cb..a713780 100644
> --- a/include/net/6lowpan.h
> +++ b/include/net/6lowpan.h
> @@ -211,10 +211,6 @@ static inline void 
> lowpan_iphc_uncompress_eui48_lladdr(struct in6_addr *ipaddr,
>   ipaddr->s6_addr[11] = 0xFF;
>   ipaddr->s6_addr[12] = 0xFE;
>   memcpy(&ipaddr->s6_addr[13], lladdr + 3, 3);
> - /* second bit-flip (Universe/Local)
> -  * is done according RFC2464
> -  */
> - ipaddr->s6_addr[8] ^= 0x02;
>  }
>  
>  #ifdef DEBUG
> diff --git a/include/net/addrconf.h b/include/net/addrconf.h
> index 17c6fd8..3931fd2 100644
> --- a/include/net/addrconf.h
> +++ b/include/net/addrconf.h
> @@ -129,7 +129,12 @@ static inline int addrconf_ifid_eui48(u8 *eui, struct 
> net_device *dev)
>   } else {
>   eui[3] = 0xFF;
>   eui[4] = 0xFE;
> - eui[0] ^= 2;
> +
> + /*
> +  * According to RFC 7668 U/L bit shall not be toggled.
> +  */
> + if (dev->type != ARPHRD_6LOWPAN)
> + eui[0] ^= 2;
>   }
>   return 0;
>  }

I don't know why you do this in a function that is called for a lot
of other dev->types which are not ARPHRD_6LOWPAN.

What addrconf does now is a switch case over dev->type and then calling
addrconf_ifid_eui48 for some cases and recheck again dev->type for a
special case of ARPHRD_6LOWPAN.

ARPHRD_6LOWPAN is also somehow wrong because you need check on ARPHRD_6LOWPAN 
and
LLTYPE_BTLE. As I mentioned the IPv6-over-foo adaptation describes how
to generate IID and not the dev->addr_len.

---

Second thing is we bitflip u/l and then we bitflip again because we
don't want it and do it again...

---

Anyway, I see it will work for you... but it's very confusing for me.
Don't use addrconf_ifid_eui48 here, grab 2-3 code lines from which does:

1. copy first mac addr bytes
2. set FF FE
3. copy last mac addr bytes

and this according LLTYPE_BTLE and not dev->addr_len

---

For me it's not acceptable because you are doing dev->type handling which is
already evaluated before, and other subsystems will do such a check as
well. We can avoid this check in other subsystems. If your IID is
generated differently than a normal EUI48, then simply don't use
addrconf_ifid_eui48.

---

We already talked about to place the 8 IID bytes into 6lowpan netdev private
space. If we do that later, we need to touch addrconf_ifid_eui48
again..

- Alex


[PATCH net-next] net: use proper lockdep annotation in __sk_dst_set()

2017-03-06 Thread Eric Dumazet
From: Eric Dumazet 

__sk_dst_set() must be called while we own the socket.

We can get proper lockdep coverage using lockdep_sock_is_held()
and rcu_dereference_protected()

Signed-off-by: Eric Dumazet 
---
 include/net/sock.h |7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 
5e5997654db6454f82179cc35c4bc22e89d0c06f..6db7693b9e61854abaa461706f2678c6d429b73f
 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1780,11 +1780,8 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst)
 
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
-   /*
-* This can be called while sk is owned by the caller only,
-* with no state that can be checked in a rcu_dereference_check() cond
-*/
-   old_dst = rcu_dereference_raw(sk->sk_dst_cache);
+   old_dst = rcu_dereference_protected(sk->sk_dst_cache,
+   lockdep_sock_is_held(sk));
rcu_assign_pointer(sk->sk_dst_cache, dst);
dst_release(old_dst);
 }




Re: [PATCH net-next RFC 0/4] virtio-net tx napi

2017-03-06 Thread Michael S. Tsirkin
On Fri, Mar 03, 2017 at 09:39:05AM -0500, Willem de Bruijn wrote:
> From: Willem de Bruijn 
> 
> Add napi for virtio-net transmit completion processing. Based on
> previous patchsets by Jason Wang:
> 
>   [RFC V7 PATCH 0/7] enable tx interrupts for virtio-net
>   http://lkml.iu.edu/hypermail/linux/kernel/1505.3/00245.html
> 
> This patchset is not ready for submission yet, but it is time for
> another checkpoint. Among others, it requires more testing with
> more diverse workloads.
> 
> 
> Before commit b0c39dbdc204 ("virtio_net: don't free buffers in xmit
> ring") the virtio-net driver would free transmitted packets on
> transmission of new packets in ndo_start_xmit and, to catch the edge
> case when no new packet is sent, also in a timer at 10HZ.
> 
> A timer can cause long stalls. VIRTIO_F_NOTIFY_ON_EMPTY avoids stalls
> due to low free descriptor count. It does not address stalls due to
> low socket SO_SNDBUF. Increasing timer frequency decreases that stall
> time, but increases interrupt rate and, thus, cycle count.
> 
> Currently, with no timer, packets are freed only at ndo_start_xmit.
> Latency of consume_skb is now unbounded. To avoid a deadlock if a sock
> reaches SO_SNDBUF, packets are orphaned on tx. This breaks TCP small
> queues.
> 
> Reenable TCP small queues by removing the orphan. Instead of using a
> timer, convert the driver to regular tx napi. This does not have the
> unresolved stall issue and does not have any frequency to tune.
> 
> By keeping interrupts enabled by default, napi increases tx
> interrupt rate. VIRTIO_F_EVENT_IDX avoids sending an interrupt if
> one is already unacknowledged, so makes this more feasible today.
> Combine that with two optimizations that bring interrupt rate
> back in line with the existing code:
> 
> Interrupt coalescing delays interrupts until a number of events
> accrue or a timer fires.
> 
> Tx completion cleaning on rx interrupts elides most explicit tx
> interrupts by relying on the fact that many rx interrupts fire.
> 
> Tested by running {1, 10, 100} TCP_STREAM and TCP_RR tests from a
> guest to a server on the host, on an x86_64 Haswell. The guest
> runs 4 vCPUs pinned to 4 cores. vhost and the test server are
> pinned to a core each.
> 
> All results are the median of 5 runs, with variance well < 10%.
> Used neper (github.com/google/neper) as test process. Tests used
> experimental_zcopy=0. This is likely no longer needed.
> 
> Napi increases single stream throughput, but increases cycle cost
> across the board. Interrupt moderation ("+vhost") reverts both, if
> not fully. For this workload with ACKs in the return path, the
> last optimization ("at-rx") is more effective. For UDP this is
> likely not true.

I am inclined to say coalescing is more problematic because under light
load it causes timers to fire on host, causing exits if any VM is
running on the same core. Some people might have spare cores, etc,
but generally I think at-rx might be less disruptive.

UDP testing would be required to determine how effective it is,
current numbers look nice.

>  upstream napi   +vhost   +at-rx +v+at-rx
> Stream:
>   1x:
>   Mbps  3018238782301063800232842
>   Gcycles 405  499  386  403  417
> 
>   10x:
>   Mbps  4044140575416384026041299
>   Gcycles 438  545  430  416  416
> 
>   100x:
>   Mbps  3404934697347633463734259
>   Gcycles 441  545  433  415  422
> 
> Latency (us):
>   1x:
>   p50  24   24   24   21   24
>   p99  27   27   27   26   27
>   Gcycles 299  430  432  312  297
> 
> 10x:
>   p50  30   31   31   42   31
>   p99  40   46   48   52   42
>   Gcycles 347  423  471  322  463
> 
> 100x:
>   p50 155  151  163  306  161
>   p99 337  329  352  361  349
>   Gcycles 340  421  463  306  441
> 
> 
> Lower throughput at 100x vs 10x can be (at least in part)
> explained by looking at bytes per packet sent (nstat). It likely
> also explains the lower throughput of 1x for some variants.
> 
> upstream:
> 
>  N=1   bytes/pkt=16581
>  N=10  bytes/pkt=61513
>  N=100 bytes/pkt=51558
> 
> at_rx:
> 
>  N=1   bytes/pkt=65204
>  N=10  bytes/pkt=65148
>  N=100 bytes/pkt=56840
> 
> For this experiment, vhost has 64 frames and usecs thresholds.
> Configuring this from the guest requires additional patches to qemu.
> Temporary patch:
> 
>   @@ -846,9 +845,6 @@ static int vhost_net_open(struct inode *inode, struct 
> file *f)
>   vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, 
> dev);
>   vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, 
> dev);
>  
>   -   

Re: [PATCH net-next RFC 2/4] virtio-net: transmit napi

2017-03-06 Thread Willem de Bruijn
On Mon, Mar 6, 2017 at 2:33 PM, Michael S. Tsirkin  wrote:
> On Mon, Mar 06, 2017 at 10:55:22AM -0800, David Miller wrote:
>> From: Willem de Bruijn 
>> Date: Mon, 6 Mar 2017 12:50:19 -0500
>>
>> >>>   drivers/net/virtio_net.c | 73
>> >>> 
>> >>>   1 file changed, 61 insertions(+), 12 deletions(-)
>> >>>
>> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> >>> index 8c21e9a4adc7..9a9031640179 100644
>> >>> --- a/drivers/net/virtio_net.c
>> >>> +++ b/drivers/net/virtio_net.c
>> >>> @@ -33,6 +33,8 @@
>> >>>   static int napi_weight = NAPI_POLL_WEIGHT;
>> >>>   module_param(napi_weight, int, 0444);
>> >>>   +static int napi_tx_weight = NAPI_POLL_WEIGHT;
>> >>> +
>> >>
>> >>
>> >> Maybe we should use module_param for this? Or in the future, use
>> >> tx-frames-irq for a per-device configuration.
>> >
>> > This option should eventually just go away, and napi tx become the
>> > standard mode.
>> >
>> > In the short term, while we evaluate it on varied workloads, a
>> > module_param sounds good to me. In general that is frowned
>> > upon, as it leads to different configuration interfaces for each
>> > device driver. But that should not be a concern in this limited
>> > case.
>>
>> In any event, do we really need a TX weight at all?
>>
>> I guess you tried this, but why doesn't it work to just do
>> all TX work unconditionally in a NAPI poll pass?  This is how
>> we encourage all NIC drivers to handle this.
>
> This seems to be more or less what this driver does already.
> So I suspect it can just ignore the weight.

Okay. Then we still need a boolean to toggle tx napi until we're
sure that the old path can be deprecated.


Re: [PATCH v2] can: m_can: enable transmission of FD frame on latest version

2017-03-06 Thread Oliver Hartkopp

Hi Marc,

On 03/06/2017 11:53 AM, Marc Kleine-Budde wrote:

On 03/06/2017 03:21 AM, Wenyou Yang wrote:

Enables the transmission of CAN FD frames on M_CAN IP core >= v3.1.x
and with the bit rate switching.

Tested on M_CAN IP 3.1.0 (CREL = 0x31040730) of SAMA5D2 SoC.


Does this patch work still with the old version of the silicon?


The bits that were added in the TX FIFO element are 'reserved' in the 
old silicon - so it should not harm.


This code enables

 if (priv->can.ctrlmode & CAN_CTRLMODE_FD)
-   cccr |= CCCR_CME_CANFD_BRS << CCCR_CME_SHIFT;
+   cccr |= (CCCR_CME_CANFD_BRS | CCCR_CME_CANFD) << CCCR_CME_SHIFT;

the CAN FD support in the new silicon.

This register is set for the old silicon EVERY time a CAN frame is sent.
So this change should not harm the old silicon either.

In fact I was told that the v3.0.x IP core is rather seldom in the wild.
Although I don't have a v3.0.x to test it should work from the 
documentation side of view.


Reviewed-by: Oliver Hartkopp 

If we would like to make it really better, the code in 
m_can_start_xmit() should only fiddle with the M_CAN_CCCR register when 
working with the v3.0.x silicon.


In fact I would suggest to use the

if (m_can_read_core_rev(priv) < M_CAN_COREREL_3_1_0)

method from

http://marc.info/?l=linux-can&m=148716783119090&w=2

to split the code in m_can_start_xmit() accordingly.

@Wenyou Yang: Can you please send a v3 which splits the tx function?

Regards,
Oliver


Re: [PATCH net-next RFC 2/4] virtio-net: transmit napi

2017-03-06 Thread David Miller
From: Willem de Bruijn 
Date: Mon, 6 Mar 2017 12:50:19 -0500

>>>   drivers/net/virtio_net.c | 73
>>> 
>>>   1 file changed, 61 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index 8c21e9a4adc7..9a9031640179 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -33,6 +33,8 @@
>>>   static int napi_weight = NAPI_POLL_WEIGHT;
>>>   module_param(napi_weight, int, 0444);
>>>   +static int napi_tx_weight = NAPI_POLL_WEIGHT;
>>> +
>>
>>
>> Maybe we should use module_param for this? Or in the future, use
>> tx-frames-irq for a per-device configuration.
> 
> This option should eventually just go away, and napi tx become the
> standard mode.
> 
> In the short term, while we evaluate it on varied workloads, a
> module_param sounds good to me. In general that is frowned
> upon, as it leads to different configuration interfaces for each
> device driver. But that should not be a concern in this limited
> case.

In any event, do we really need a TX weight at all?

I guess you tried this, but why doesn't it work to just do
all TX work unconditionally in a NAPI poll pass?  This is how
we encourage all NIC drivers to handle this.



Re: [PATCH net] bpf: disable broken write protection on i386

2017-03-06 Thread David Miller
From: Daniel Borkmann 
Date: Mon, 06 Mar 2017 19:35:47 +0100

> I can do a few more tests with the kernel I have. I'm also totally
> fine if we drop this patch; it's just rc1, so there's plenty of time
> till a final release.

I would really prefer we get to the bottom of this rather than apply
the quick band-aid on this one.

Thanks.


Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-06 Thread Dmitry Vyukov
On Mon, Mar 6, 2017 at 6:31 PM, David Ahern  wrote:
> On 3/4/17 1:15 PM, Eric Dumazet wrote:
>> On Sat, 2017-03-04 at 19:57 +0100, Dmitry Vyukov wrote:
>>> On Fri, Mar 3, 2017 at 8:12 PM, David Ahern  
>>> wrote:
 On 3/3/17 6:39 AM, Dmitry Vyukov wrote:
> I am getting heap out-of-bounds reports in
> fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone while running
> syzkaller fuzzer on 86292b33d4b79ee03e2f43ea0381ef85f077c760. They all
> follow the same pattern: an object of size 216 is allocated from
> ip_dst_cache slab, and then accessed at offset 272/276 within
> fib6_walk. Looks like type confusion. Unfortunately this is not
> reproducible.

 I'll take a look this weekend or Monday at the latest.
>>>
>>>
>>> I've got some additional useful info on this. I think this is
>>> use-after-free rather than out-of-bounds. I've collected stack where
>>> the route was disposed with call_rcu, see the last "Disposed" stack.
>>> The crash happens when cmpxchg in rt_cache_route replaces an existing
>>> route. And that route seems to have some existing pointers to it
>>> (rt->dst.rt6_next) which fib6_walk uses to get to it after its
>>> deletion.
>>
>> rt_cache_route() deals with IPv4 routes.
>>
>> We somehow mix IPv4 and IPv6 dsts in IPv6 tree.
>>
>> We need to add type safety at IPV6 route insertions to catch the
>> offender.
>>
>
> I've seen something like this before -- a rt was on the gc list but
> still linked in the tables because of some reference.
>
> Dmitry: you seem to have reproduced this a few times. Can you share how
> to run whatever tests you are using?


We hit it several thousand times, but we get only several dozens of
crashes per day on ~80 VMs. So if you try to reproduce it on a single
machine it can take days for a single crash.
If you are ready to go that route, here are some instructions on
setting up syzkaller:
https://github.com/google/syzkaller
You also need kernel built with CONFIG_KASAN.
I am ready to help with resolving any issues.

Another possible route is if you give me a patch with some additional
WARNINGs. Then I can deploy it to bots and collect stacks.


[PATCH v2] selinux: check for address length in selinux_socket_bind()

2017-03-06 Thread Alexander Potapenko
KMSAN (KernelMemorySanitizer, a new error detection tool) reports use of
uninitialized memory in selinux_socket_bind():

==
BUG: KMSAN: use of unitialized memory
inter: 0
CPU: 3 PID: 1074 Comm: packet2 Tainted: GB   4.8.0-rc6+ #1916
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
  8800882ffb08 825759c8 8800882ffa48
 818bf551 85bab870 0092 85bab550
  0092 bb0009bb 0002
Call Trace:
 [< inline >] __dump_stack lib/dump_stack.c:15
 [] dump_stack+0x238/0x290 lib/dump_stack.c:51
 [] kmsan_report+0x276/0x2e0 mm/kmsan/kmsan.c:1008
 [] __msan_warning+0x5b/0xb0 mm/kmsan/kmsan_instr.c:424
 [] selinux_socket_bind+0xf41/0x1080 
security/selinux/hooks.c:4288
 [] security_socket_bind+0x1ec/0x240 security/security.c:1240
 [] SYSC_bind+0x358/0x5f0 net/socket.c:1366
 [] SyS_bind+0x82/0xa0 net/socket.c:1356
 [] do_syscall_64+0x58/0x70 arch/x86/entry/common.c:292
 [] entry_SYSCALL64_slow_path+0x25/0x25 
arch/x86/entry/entry_64.o:?
chained origin: ba6009bb
 [] save_stack_trace+0x27/0x50 arch/x86/kernel/stacktrace.c:67
 [< inline >] kmsan_save_stack_with_flags mm/kmsan/kmsan.c:322
 [< inline >] kmsan_save_stack mm/kmsan/kmsan.c:337
 [] kmsan_internal_chain_origin+0x118/0x1e0 
mm/kmsan/kmsan.c:530
 [] __msan_set_alloca_origin4+0xc3/0x130 
mm/kmsan/kmsan_instr.c:380
 [] SYSC_bind+0x129/0x5f0 net/socket.c:1356
 [] SyS_bind+0x82/0xa0 net/socket.c:1356
 [] do_syscall_64+0x58/0x70 arch/x86/entry/common.c:292
 [] return_from_SYSCALL_64+0x0/0x6a 
arch/x86/entry/entry_64.o:?
origin description: address@SYSC_bind (origin=b8c00900)
==

(the line numbers are relative to 4.8-rc6, but the bug persists upstream)

, when I run the following program as root:

===
  #include <string.h>
  #include <sys/socket.h>
  #include <netinet/in.h>

  int main(int argc, char *argv[]) {
struct sockaddr addr;
int size = 0;
if (argc > 1) {
  size = atoi(argv[1]);
}
memset(&addr, 0, sizeof(addr));
int fd = socket(PF_INET6, SOCK_DGRAM, IPPROTO_IP);
bind(fd, &addr, size);
return 0;
  }
===

(for different values of |size| other error reports are printed).

This happens because bind() unconditionally copies |size| bytes of
|addr| to the kernel, leaving the rest uninitialized. Then
security_socket_bind() reads the IP address bytes, including the
uninitialized ones, to determine the port, or e.g. pass them further to
sel_netnode_find(), which uses them to calculate a hash.

Signed-off-by: Alexander Potapenko 
---
Changes since v1:
 - fixed patch description
 - fixed addrlen tests to match those in inet_bind() and inet6_bind()
   (per comment from Eric Dumazet)
---
 security/selinux/hooks.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 0a4b4b040e0a..ddc4aca6c840 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4351,10 +4351,19 @@ static int selinux_socket_bind(struct socket *sock, 
struct sockaddr *address, in
u32 sid, node_perm;
 
if (family == PF_INET) {
+   if (addrlen < sizeof(struct sockaddr_in)) {
+   err = -EINVAL;
+   goto out;
+   }
addr4 = (struct sockaddr_in *)address;
snum = ntohs(addr4->sin_port);
addrp = (char *)&addr4->sin_addr.s_addr;
+
} else {
+   if (addrlen < SIN6_LEN_RFC2133) {
+   err = -EINVAL;
+   goto out;
+   }
addr6 = (struct sockaddr_in6 *)address;
snum = ntohs(addr6->sin6_port);
addrp = (char *)&addr6->sin6_addr.s6_addr;
-- 
2.12.0.rc1.440.g5b76565f74-goog



Re: [Xen-devel] [PATCH 29/29] drivers, xen: convert grant_map.users from atomic_t to refcount_t

2017-03-06 Thread Boris Ostrovsky
On 03/06/2017 09:21 AM, Elena Reshetova wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
>
> Signed-off-by: Elena Reshetova 
> Signed-off-by: Hans Liljestrand 
> Signed-off-by: Kees Cook 
> Signed-off-by: David Windsor 
> ---
>  drivers/xen/gntdev.c | 11 ++-
>  1 file changed, 6 insertions(+), 5 deletions(-)

Reviewed-by: Boris Ostrovsky 





Re: [PATCH net] bpf: disable broken write protection on i386

2017-03-06 Thread Kees Cook
On Fri, Mar 3, 2017 at 7:23 PM, Daniel Borkmann  wrote:
> Since d2852a224050 ("arch: add ARCH_HAS_SET_MEMORY config") and
> 9d876e79df6a ("bpf: fix unlocking of jited image when module ronx
> not set") that uses the former, Fengguang reported random corruptions
> on his i386 test machine [1]. On i386 there is no JIT available,
> and since his kernel config doesn't have kernel modules enabled,
> there was also no DEBUG_SET_MODULE_RONX enabled before which would
> set interpreted bpf_prog image as read-only like we do in various
> other cases for quite some time now, e.g. x86_64, arm64, etc. Thus,
> the difference with above commits was that we now used set_memory_ro()
> and set_memory_rw() on i386, which resulted in these issues. When
> reproducing this with Fengguang's config and qemu image, I changed
> lib/test_bpf.c to be run during boot instead of relying on trinity
> to fiddle with cBPF.
>
> The issues I saw with the BPF test suite when set_memory_ro() and
> set_memory_rw() is used to write protect image on i386 is that after
> a number of tests I noticed a corruption happening in bpf_prog_realloc().
> Specifically, fp_old's content gets corrupted right *after* the
> (unrelated) __vmalloc() call and contains only zeroes right after
> the call instead of the original prog data. fp_old should have been
> freed later on via __bpf_prog_free() *after* we copied all the data
> over to the newly allocated fp. Result looks like:
>
>   [...]
>   [   13.107240] test_bpf: #249 JMP_JSET_X: if (0x3 & 0x2) return 1 jited:0 
> 17 PASS
>   [   13.108182] test_bpf: #250 JMP_JSET_X: if (0x3 & 0xffffffff) return 1 
> jited:0 17 PASS
>   [   13.109206] test_bpf: #251 JMP_JA: Jump, gap, jump, ... jited:0 16 PASS
>   [   13.110493] test_bpf: #252 BPF_MAXINSNS: Maximum possible literals 
> jited:0 12 PASS
>   [   13.111885] test_bpf: #253 BPF_MAXINSNS: Single literal jited:0 8 PASS
>   [   13.112804] test_bpf: #254 BPF_MAXINSNS: Run/add until end jited:0 6341 
> PASS
>   [   13.177195] test_bpf: #255 BPF_MAXINSNS: Too many instructions PASS
>   [   13.177689] test_bpf: #256 BPF_MAXINSNS: Very long jump jited:0 9 PASS
>   [   13.178611] test_bpf: #257 BPF_MAXINSNS: Ctx heavy transformations
>   [   13.178713] BUG: unable to handle kernel NULL pointer dereference at 
> 0034
>   [   13.179740] IP: bpf_prog_realloc+0x5b/0x90
>   [   13.180017] *pde = 
>   [   13.180017]
>   [   13.180017] Oops: 0002 [#1] DEBUG_PAGEALLOC
>   [   13.180017] CPU: 0 PID: 1 Comm: swapper Not tainted 
> 4.10.0-57268-gd627975-dirty #50
>   [   13.180017] task: 401ec000 task.stack: 401f2000
>   [   13.180017] EIP: bpf_prog_realloc+0x5b/0x90
>   [   13.180017] EFLAGS: 00210246 CPU: 0
>   [   13.180017] EAX:  EBX: 57ae1000 ECX:  EDX: 57ae1000
>   [   13.180017] ESI: 0019 EDI: 57b07000 EBP: 401f3e74 ESP: 401f3e68
>   [   13.180017]  DS: 007b ES: 007b FS:  GS:  SS: 0068
>   [   13.180017] CR0: 80050033 CR2: 0034 CR3: 12cb1000 CR4: 0610
>   [   13.180017] DR0:  DR1:  DR2:  DR3: 
>   [   13.180017] DR6: fffe0ff0 DR7: 0400
>   [   13.180017] Call Trace:
>   [   13.180017]  bpf_prepare_filter+0x317/0x3a0
>   [   13.180017]  bpf_prog_create+0x65/0xa0
>   [   13.180017]  test_bpf_init+0x1ca/0x628
>   [   13.180017]  ? test_hexdump_init+0xb5/0xb5
>   [   13.180017]  do_one_initcall+0x7c/0x11c
>   [...]
>
> When using trinity from Fengguang's reproducer, the corruptions were
> at inconsistent places, presumably from code dealing with allocations
> and seeing similar effects as mentioned above.
>
> Not using set_memory_ro() and set_memory_rw() lets the test suite
> run just fine as expected, thus it looks like using set_memory_*()
> on i386 seems broken and mentioned commits just uncovered it. Also,
> for checking, I enabled DEBUG_RODATA_TEST for that kernel.
>
> Latter shows that memory protecting the kernel seems not working either
> on i386 (!). Test suite output:
>
>   [...]
>   [   12.692836] Write protecting the kernel text: 13416k
>   [   12.693309] Write protecting the kernel read-only data: 5292k
>   [   12.693802] rodata_test: test data was not read only
>   [...]
>
> Work-around to not enable ARCH_HAS_SET_MEMORY for i386 is not optimal
> as it doesn't fix the issue in presumably broken set_memory_*(), but
> it at least avoids people avoid having to deal with random corruptions
> that are hard to track down for the time being until a real fix can
> be found.

Wow. Uhm, so, something must be _really_ broken. i386 should have no
problem with using the set_memory_*() functions. The fact that
DEBUG_RODATA_TEST failed is also pretty crazy, but may be unrelated
(that test was just refactored too).

Is it possible that it's just the enabling of set_memory_*() for the
non-modular case? The ARCH_HAS_SET_MEMORY commit is just a convenience
config; i386 has had those functions for a while now, and they're the
same between x86_64 and i386. O_o Perhaps they aren't 

Re: [PATCH net] bpf: disable broken write protection on i386

2017-03-06 Thread Daniel Borkmann

On 03/06/2017 07:11 PM, Kees Cook wrote:

On Fri, Mar 3, 2017 at 7:23 PM, Daniel Borkmann  wrote:

Since d2852a224050 ("arch: add ARCH_HAS_SET_MEMORY config") and
9d876e79df6a ("bpf: fix unlocking of jited image when module ronx
not set") that uses the former, Fengguang reported random corruptions
on his i386 test machine [1]. On i386 there is no JIT available,
and since his kernel config doesn't have kernel modules enabled,
there was also no DEBUG_SET_MODULE_RONX enabled before which would
set interpreted bpf_prog image as read-only like we do in various
other cases for quite some time now, e.g. x86_64, arm64, etc. Thus,
the difference with above commits was that we now used set_memory_ro()
and set_memory_rw() on i386, which resulted in these issues. When
reproducing this with Fengguang's config and qemu image, I changed
lib/test_bpf.c to be run during boot instead of relying on trinity
to fiddle with cBPF.

The issues I saw with the BPF test suite when set_memory_ro() and
set_memory_rw() is used to write protect image on i386 is that after
a number of tests I noticed a corruption happening in bpf_prog_realloc().
Specifically, fp_old's content gets corrupted right *after* the
(unrelated) __vmalloc() call and contains only zeroes right after
the call instead of the original prog data. fp_old should have been
freed later on via __bpf_prog_free() *after* we copied all the data
over to the newly allocated fp. Result looks like:

   [...]
   [   13.107240] test_bpf: #249 JMP_JSET_X: if (0x3 & 0x2) return 1 jited:0 17 
PASS
   [   13.108182] test_bpf: #250 JMP_JSET_X: if (0x3 & 0xffffffff) return 1 
jited:0 17 PASS
   [   13.109206] test_bpf: #251 JMP_JA: Jump, gap, jump, ... jited:0 16 PASS
   [   13.110493] test_bpf: #252 BPF_MAXINSNS: Maximum possible literals 
jited:0 12 PASS
   [   13.111885] test_bpf: #253 BPF_MAXINSNS: Single literal jited:0 8 PASS
   [   13.112804] test_bpf: #254 BPF_MAXINSNS: Run/add until end jited:0 6341 
PASS
   [   13.177195] test_bpf: #255 BPF_MAXINSNS: Too many instructions PASS
   [   13.177689] test_bpf: #256 BPF_MAXINSNS: Very long jump jited:0 9 PASS
   [   13.178611] test_bpf: #257 BPF_MAXINSNS: Ctx heavy transformations
   [   13.178713] BUG: unable to handle kernel NULL pointer dereference at 
0034
   [   13.179740] IP: bpf_prog_realloc+0x5b/0x90
   [   13.180017] *pde = 
   [   13.180017]
   [   13.180017] Oops: 0002 [#1] DEBUG_PAGEALLOC
   [   13.180017] CPU: 0 PID: 1 Comm: swapper Not tainted 
4.10.0-57268-gd627975-dirty #50
   [   13.180017] task: 401ec000 task.stack: 401f2000
   [   13.180017] EIP: bpf_prog_realloc+0x5b/0x90
   [   13.180017] EFLAGS: 00210246 CPU: 0
   [   13.180017] EAX:  EBX: 57ae1000 ECX:  EDX: 57ae1000
   [   13.180017] ESI: 0019 EDI: 57b07000 EBP: 401f3e74 ESP: 401f3e68
   [   13.180017]  DS: 007b ES: 007b FS:  GS:  SS: 0068
   [   13.180017] CR0: 80050033 CR2: 0034 CR3: 12cb1000 CR4: 0610
   [   13.180017] DR0:  DR1:  DR2:  DR3: 
   [   13.180017] DR6: fffe0ff0 DR7: 0400
   [   13.180017] Call Trace:
   [   13.180017]  bpf_prepare_filter+0x317/0x3a0
   [   13.180017]  bpf_prog_create+0x65/0xa0
   [   13.180017]  test_bpf_init+0x1ca/0x628
   [   13.180017]  ? test_hexdump_init+0xb5/0xb5
   [   13.180017]  do_one_initcall+0x7c/0x11c
   [...]

When using trinity from Fengguang's reproducer, the corruptions were
at inconsistent places, presumably from code dealing with allocations
and seeing similar effects as mentioned above.

Not using set_memory_ro() and set_memory_rw() lets the test suite
run just fine as expected, thus it looks like using set_memory_*()
on i386 seems broken and mentioned commits just uncovered it. Also,
for checking, I enabled DEBUG_RODATA_TEST for that kernel.

Latter shows that memory protecting the kernel seems not working either
on i386 (!). Test suite output:

   [...]
   [   12.692836] Write protecting the kernel text: 13416k
   [   12.693309] Write protecting the kernel read-only data: 5292k
   [   12.693802] rodata_test: test data was not read only
   [...]

Work-around to not enable ARCH_HAS_SET_MEMORY for i386 is not optimal
as it doesn't fix the issue in presumably broken set_memory_*(), but
it at least avoids people avoid having to deal with random corruptions
that are hard to track down for the time being until a real fix can
be found.


Wow. Uhm, so, something must be _really_ broken. i386 should have no
problem with using the set_memory_*() functions. The fact that


That was my understanding as well. ;)


DEBUG_RODATA_TEST failed is also pretty crazy, but may be unrelated
(that test was just refactored too).


I'll double check DEBUG_RODATA_TEST on x86_64 to make sure it succeeds
there; have only tested that one on i386.


Is it possible that it's just the enabling of set_memory_*() for the
non-modular case? The ARCH_HAS_SET_MEMORY commit is just a convenience
config; i386 has had those functions for a 

Re: [PATCH 1/2] net: sched: make default fifo qdiscs appear in the dump

2017-03-06 Thread David Miller
From: Jiri Kosina 
Date: Mon, 6 Mar 2017 12:03:38 +0100 (CET)

> Ah, right you are, thanks. The complete fix is not super trivial, as it 
> needs some more surgery to tc_dump_qdisc_root(), tc_dump_tclass_root() and 
> qdisc_match_from_root() (see 69012ae42 for some details).
> 
> There are two options:
> 
> - this gets fixed in two phases, in first everything *but* noop qdisc gets 
>   dumped (in the "give me everything" dump) and later we finalize it by
>   teaching the above functions about noop_qdisc as well
> 
> - I extend this patchset to handle noop qdisc from the very beginning; 
>   I am unlikely to find time for this during coming weeks though. But OTOH
>   this whole thing is very low priority anyway
> 
> What do you think?

I'm not too hot on this whole idea because the only way you can emit
the noop_qdisc is to "dup" it by allocating a new qdisc so that you
can link it in.  This has two downsides:

1) Extra overhead and memory usage

2) All of the simple checks against &noop_qdisc might not be
   so simple any more.


Re: [PATCH net] dccp: fix use-after-free in dccp_feat_activate_values

2017-03-06 Thread Cong Wang
On Sun, Mar 5, 2017 at 10:42 PM, Eric Dumazet  wrote:
> On Sun, 2017-03-05 at 21:38 -0800, Cong Wang wrote:
>
>> Do you really want to disable BH again here?
>>
>> dccp_check_req() should be always called on RX path where BH
>> is already disabled and BH can't be disabled twice?
>
> What makes you think BH can't be disabled twice ?
>
> Look, I prefer being cautious here, no need to get another DCCP bug
> report later.

Hmm, I thought BH should have a local_bh_save() to save its context
but looking into its implementation it uses the preempt count to determine
if BH is disabled or not, unlike hardirqs. Sorry for the noise.


Re: [PATCH net-next] packet: fix panic in __packet_set_timestamp on tpacket_v3 in tx mode

2017-03-06 Thread Willem de Bruijn
On Mon, Mar 6, 2017 at 12:13 PM, chetan loke  wrote:
>>>
>>> Gosh. Can we also replace this BUG() into something less aggressive ?
>>
>>
>> There are currently 5 of these WARN() + BUG() constructs and 1 BUG()-only
>> for the 'default' TPACKET version spread all over af_packet, so probably
>> makes sense to rather make all of them less aggressive.
>>
>>
>
> Very few consumers actually go looking in the kernel logs to see the
> error-warnings and report them back here.
>
> This severity will get them to report the incident which in this case
> got fixed??

But BUG_ONs in the datapath can cause outages in real production
environments. This should not happen for recoverable failures. For
users who cannot be bothered to check their logs, there is sysctl
kernel.panic_on_warn.


Re: [PATCH net-next RFC 4/4] virtio-net: clean tx descriptors from rx napi

2017-03-06 Thread Willem de Bruijn
>> +static void virtnet_poll_cleantx(struct receive_queue *rq)
>> +{
>> +   struct virtnet_info *vi = rq->vq->vdev->priv;
>> +   unsigned int index = vq2rxq(rq->vq);
>> +   struct send_queue *sq = &vi->sq[index];
>> +   struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);
>> +
>> +   __netif_tx_lock(txq, smp_processor_id());
>> +   free_old_xmit_skbs(sq, sq->napi.weight);
>> +   __netif_tx_unlock(txq);
>
>
> Should we check tx napi weight here? Or this was treated as an independent
> optimization?

Good point. This was not intended to run in no-napi mode as is.
With interrupts disabled most of the time in that mode, I don't
expect it to be worthwhile using in that case. I'll add the check
for sq->napi.weight != 0.

>> +
>> +   if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>> +   netif_wake_subqueue(vi->dev, vq2txq(sq->vq));
>> +}
>> +
>>   static int virtnet_poll(struct napi_struct *napi, int budget)
>>   {
>> struct receive_queue *rq =
>> @@ -1039,6 +1056,8 @@ static int virtnet_poll(struct napi_struct *napi,
>> int budget)
>> received = virtnet_receive(rq, budget);
>>   + virtnet_poll_cleantx(rq);
>> +
>
>
> Better to do the before virtnet_receive() consider refill may allocate
> memory for rx buffers.

Will do.

> Btw, if this is proved to be more efficient. In the future we may consider
> to:
>
> 1) use a single interrupt for both rx and tx
> 2) use a single napi to handle both rx and tx

Agreed, I think that's sensible.


Re: [PATCH net-next RFC 4/4] virtio-net: clean tx descriptors from rx napi

2017-03-06 Thread Willem de Bruijn
On Mon, Mar 6, 2017 at 12:43 PM, Willem de Bruijn
 wrote:
>>> +static void virtnet_poll_cleantx(struct receive_queue *rq)
>>> +{
>>> +   struct virtnet_info *vi = rq->vq->vdev->priv;
>>> +   unsigned int index = vq2rxq(rq->vq);
>>> +   struct send_queue *sq = &vi->sq[index];
>>> +   struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);
>>> +
>>> +   __netif_tx_lock(txq, smp_processor_id());
>>> +   free_old_xmit_skbs(sq, sq->napi.weight);
>>> +   __netif_tx_unlock(txq);
>>
>>
>> Should we check tx napi weight here? Or this was treated as an independent
>> optimization?
>
> Good point. This was not intended to run in no-napi mode as is.
> With interrupts disabled most of the time in that mode, I don't
> expect it to be worthwhile using in that case. I'll add the check
> for sq->napi.weight != 0.

I'm wrong here. Rx interrupts are not disabled, of course. It is
probably worth benchmarking, then.


Re: [PATCH net-next RFC 2/4] virtio-net: transmit napi

2017-03-06 Thread Willem de Bruijn
>>   drivers/net/virtio_net.c | 73
>> 
>>   1 file changed, 61 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 8c21e9a4adc7..9a9031640179 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -33,6 +33,8 @@
>>   static int napi_weight = NAPI_POLL_WEIGHT;
>>   module_param(napi_weight, int, 0444);
>>   +static int napi_tx_weight = NAPI_POLL_WEIGHT;
>> +
>
>
> Maybe we should use module_param for this? Or in the future, use
> tx-frames-irq for a per-device configuration.

This option should eventually just go away, and napi tx become the
standard mode.

In the short term, while we evaluate it on varied workloads, a
module_param sounds good to me. In general that is frowned
upon, as it leads to different configuration interfaces for each
device driver. But that should not be a concern in this limited
case.


[PATCH] vrf: Fix use-after-free in vrf_xmit

2017-03-06 Thread David Ahern
KASAN detected a use-after-free:

[  269.467067] BUG: KASAN: use-after-free in vrf_xmit+0x7f1/0x827 [vrf] at addr 
8800350a21c0
[  269.467067] Read of size 4 by task ssh/1879
[  269.467067] CPU: 1 PID: 1879 Comm: ssh Not tainted 4.10.0+ #249
[  269.467067] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.7.5-20140531_083030-gandalf 04/01/2014
[  269.467067] Call Trace:
[  269.467067]  dump_stack+0x81/0xb6
[  269.467067]  kasan_object_err+0x21/0x78
[  269.467067]  kasan_report+0x2f7/0x450
[  269.467067]  ? vrf_xmit+0x7f1/0x827 [vrf]
[  269.467067]  ? ip_output+0xa4/0xdb
[  269.467067]  __asan_load4+0x6b/0x6d
[  269.467067]  vrf_xmit+0x7f1/0x827 [vrf]
...

Which corresponds to the skb access after xmit handling. Fix by saving
skb->len and using the saved value to update stats.

Fixes: 193125dbd8eb2 ("net: Introduce VRF device driver")
Signed-off-by: David Ahern 
---
 drivers/net/vrf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 22379da63400..fea687f35b5a 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -340,6 +340,7 @@ static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, 
struct net_device *dev)
 
 static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+   int len = skb->len;
netdev_tx_t ret = is_ip_tx_frame(skb, dev);
 
if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
@@ -347,7 +348,7 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct 
net_device *dev)
 
u64_stats_update_begin(&dstats->syncp);
dstats->tx_pkts++;
-   dstats->tx_bytes += skb->len;
+   dstats->tx_bytes += len;
u64_stats_update_end(&dstats->syncp);
} else {
this_cpu_inc(dev->dstats->tx_drps);
-- 
2.1.4



Re: [PATCH net-next] packet: fix panic in __packet_set_timestamp on tpacket_v3 in tx mode

2017-03-06 Thread chetan loke
>>
>> Gosh. Can we also replace this BUG() into something less aggressive ?
>
>
> There are currently 5 of these WARN() + BUG() constructs and 1 BUG()-only
> for the 'default' TPACKET version spread all over af_packet, so probably
> makes sense to rather make all of them less aggressive.
>
>

Very few consumers actually go looking in the kernel logs to see the
error-warnings and report them back here.

This severity will get them to report the incident which in this case
got fixed??


Re: [PATCH net-next RFC 3/4] vhost: interrupt coalescing support

2017-03-06 Thread Willem de Bruijn
On Mon, Mar 6, 2017 at 4:28 AM, Jason Wang  wrote:
>
>
> On 2017年03月03日 22:39, Willem de Bruijn wrote:
>>
>> +void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq);
>> +static enum hrtimer_restart vhost_coalesce_timer(struct hrtimer *timer)
>> +{
>> +   struct vhost_virtqueue *vq =
>> +   container_of(timer, struct vhost_virtqueue, ctimer);
>> +
>> +   if (mutex_trylock(&vq->mutex)) {
>> +   vq->coalesce_frames = vq->max_coalesce_frames;
>> +   vhost_signal(vq->dev, vq);
>> +   mutex_unlock(&vq->mutex);
>> +   }
>> +
>> +   /* TODO: restart if lock failed and not held by handle_tx */
>> +   return HRTIMER_NORESTART;
>> +}
>> +
>
>
> Then we may lose an interrupt forever if no new tx request? I believe we
> need e.g vhost_poll_queue() here.

Absolutely, I need to fix this. The common case for failing to grab
the lock is competition with handle_tx. With careful coding we can
probably avoid scheduling another run with vhost_poll_queue in
the common case.

Your patch v7 cancels the pending hrtimer at the start of handle_tx.
I need to reintroduce that, and also only schedule a timer at the end
of handle_tx, not immediately when vq->coalesce_frames becomes
non-zero.


Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-06 Thread David Ahern
On 3/4/17 1:15 PM, Eric Dumazet wrote:
> On Sat, 2017-03-04 at 19:57 +0100, Dmitry Vyukov wrote:
>> On Fri, Mar 3, 2017 at 8:12 PM, David Ahern  wrote:
>>> On 3/3/17 6:39 AM, Dmitry Vyukov wrote:
 I am getting heap out-of-bounds reports in
 fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone while running
 syzkaller fuzzer on 86292b33d4b79ee03e2f43ea0381ef85f077c760. They all
 follow the same pattern: an object of size 216 is allocated from
 ip_dst_cache slab, and then accessed at offset 272/276 within
 fib6_walk. Looks like type confusion. Unfortunately this is not
 reproducible.
>>>
>>> I'll take a look this weekend or Monday at the latest.
>>
>>
>> I've got some additional useful info on this. I think this is
>> use-after-free rather than out-of-bounds. I've collected stack where
>> the route was disposed with call_rcu, see the last "Disposed" stack.
>> The crash happens when cmpxchg in rt_cache_route replaces an existing
>> route. And that route seems to have some existing pointers to it
>> (rt->dst.rt6_next) which fib6_walk uses to get to it after its
>> deletion.
> 
> rt_cache_route() deals with IPv4 routes.
> 
> We somehow mix IPv4 and IPv6 dsts in IPv6 tree.
> 
> We need to add type safety at IPV6 route insertions to catch the
> offender.
> 

I've seen something like this before -- a rt was on the gc list but
still linked in the tables because of some reference.

Dmitry: you seem to have reproduced this a few times. Can you share how
to run whatever tests you are using?


Re: [PATCH] netfilter: Use pr_cont where appropriate

2017-03-06 Thread Pablo Neira Ayuso
On Tue, Feb 28, 2017 at 02:09:24PM -0800, Joe Perches wrote:
> Logging output was changed when simple printks without KERN_CONT
> are now emitted on a new line and KERN_CONT is required to continue
> lines so use pr_cont.
> 
> Miscellanea:
> 
> o realign arguments
> o use print_hex_dump instead of a local variant

Applied, thanks Joe.


Re: [PATCH] netfilter: remove redundant check on ret being non-zero

2017-03-06 Thread Pablo Neira Ayuso
On Tue, Feb 28, 2017 at 11:31:15AM +, Colin King wrote:
> From: Colin Ian King 
> 
> ret is initialized to zero and if it is set to non-zero in the
> xt_entry_foreach loop then we exit via the out_free label. Hence
> the check for ret being non-zero is redundant and can be removed.
> 
> Detected by CoverityScan, CID#1357132 ("Logically Dead Code")

Applied, thanks.


Re: [PATCH] bridge: Add support for IEEE 802.11 Proxy ARP for IPv6

2017-03-06 Thread Jouni Malinen
On Fri, Feb 24, 2017 at 11:55:37AM -0800, Stephen Hemminger wrote:
> The concept is fine.

Thanks for taking a look.

> Please add some comments to the code about what is happening and why.
> The proposed patch is too sparse and has no comments.

Sure, will do that for the next version.

> > +   skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + sizeof(*msg) +
> > +   ndisc_opt_addr_space(dev,
> > +NDISC_NEIGHBOUR_ADVERTISEMENT) +
> > +   tlen, GFP_ATOMIC);
> > +   if (!skb)
> > +   return;
> 
> Why not netdev_alloc_skb which takes care of padding and setting skb->dev? 

This implementation in br_ndisc_send_na() was trying to follow
ndisc_send_na() design for the operations.. If this function remains
(see below), I can clean this up further.

> Rather than doing copy/paste of the code to generate a ND message, it would
> be better to have one function in IPv6 code that handles that. That would keep
> from having to fix code in two places in the future. Is there some way
> to extend ndisc_send_na?

That was the original plan and adding the target_lladdr part would be
straightforward. The part that gets complex is in figuring out how to
use a foreign link layer source address (the MAC address on behalf of
which the local device is replying) in the outgoing NA when using the
IEEE 802.11/Hotspot 2.0 design.

ndisc_send_na() uses the full IPv6 stack for building the frame when
calling ndisc_send_skb(). dst_output() ends up sending this through
ip6_output(), I'd assume, and after building the IPv6 header, the local
MAC address of the outgoing interface gets assigned to the Ethernet
header. I'm not sure how to override that functionality in any clean
way. The dev_hard_header() call in the mostly copy-pasted version in
br_ndisc_send_na() followed by use of the custom
br_ndisc_send_na_finish() to call dev_queue_xmit(skb) was done to allow
the link layer source address to be modified.

The normal path in the net stack seemed to use dev_hard_header() with
saddr = NULL which maps to eth_header() saddr = NULL case to use device
source address. Either those would need to be somehow modified for this
special skb containing the NA with different source address requirement
or something after these calls would need to modify the frame to change
the source address.

Would you happen to know any convenient means for modifying the IPv6
stack behavior for ndisc_send_skb() cases conditionally to allow the
link layer source address to be modified while still being able to use
the existing IPv6 header and the Ethernet header construction function?

-- 
Jouni MalinenPGP id EFC895FA


Re: [PATCH net-next] net: ipv4: add support for ECMP hash policy choice

2017-03-06 Thread Nikolay Aleksandrov
On 06/03/17 18:24, David Ahern wrote:
> On 3/6/17 7:59 AM, Nikolay Aleksandrov wrote:
>> diff --git a/include/net/route.h b/include/net/route.h
>> index c0874c87c173..77a5c613a290 100644
>> --- a/include/net/route.h
>> +++ b/include/net/route.h
>> @@ -113,13 +113,12 @@ struct in_device;
>>  int ip_rt_init(void);
>>  void rt_cache_flush(struct net *net);
>>  void rt_flush_dev(struct net_device *dev);
>> -struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
>> -  int mp_hash);
>> +struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 
>> *flp);
>>  
>>  static inline struct rtable *__ip_route_output_key(struct net *net,
>> struct flowi4 *flp)
>>  {
>> -return __ip_route_output_key_hash(net, flp, -1);
>> +return __ip_route_output_key_hash(net, flp);
>>  }
>>  
>>  struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
> 
> The "_hash" variant was added by 79a131592dbb8. If the mp_hash arg is
> removed, the "_hash" wrapper should be removed and go back to
> __ip_route_output_key.
> 

Ah yes, I've missed that. :-) Will remove the _hash variant when posting v2.

Thanks,
 Nik



Re: [PATCH 08/26] brcmsmac: make some local variables 'static const' to reduce stack size

2017-03-06 Thread Kalle Valo
Arend Van Spriel  writes:

> On 2-3-2017 17:38, Arnd Bergmann wrote:
>> With KASAN and a couple of other patches applied, this driver is one
>> of the few remaining ones that actually use more than 2048 bytes of
>> kernel stack:
>> 
>> broadcom/brcm80211/brcmsmac/phy/phy_n.c: In function 
>> 'wlc_phy_workarounds_nphy_gainctrl':
>> broadcom/brcm80211/brcmsmac/phy/phy_n.c:16065:1: warning: the frame size of 
>> 3264 bytes is larger than 2048 bytes [-Wframe-larger-than=]
>> broadcom/brcm80211/brcmsmac/phy/phy_n.c: In function 
>> 'wlc_phy_workarounds_nphy':
>> broadcom/brcm80211/brcmsmac/phy/phy_n.c:17138:1: warning: the frame size of 
>> 2864 bytes is larger than 2048 bytes [-Wframe-larger-than=]
>> 
>> Here, I'm reducing the stack size by marking as many local variables as
>> 'static const' as I can without changing the actual code.
>
> Acked-by: Arend van Spriel 

Arnd, via which tree are you planning to submit these? I'm not sure
what I should do with the wireless drivers patches from this series.

-- 
Kalle Valo


Re: [PATCH 10/26] brcmsmac: reindent split functions

2017-03-06 Thread Kalle Valo
Arend Van Spriel  writes:

> On 2-3-2017 17:38, Arnd Bergmann wrote:
>> In the previous commit I left the indentation alone to help reviewing
>> the patch, this one now runs the three new functions through 'indent -kr -8'
>> with some manual fixups to avoid silliness.
>> 
>> No changes other than whitespace are intended here.
>
> Acked-by: Arend van Spriel 
>> Signed-off-by: Arnd Bergmann 
>> ---
>>  .../broadcom/brcm80211/brcmsmac/phy/phy_n.c| 1507 
>> +---
>>  1 file changed, 697 insertions(+), 810 deletions(-)
>> 

Arend, please edit your quotes. Leaving 1000 lines of unnecessary quotes
in your reply makes my use of patchwork horrible:

https://patchwork.kernel.org/patch/9601155/

-- 
Kalle Valo


Re: [PATCH 1/4] net: thunderx: Fix IOMMU translation faults

2017-03-06 Thread Robin Murphy
On 06/03/17 12:57, Sunil Kovvuri wrote:
>>>
>>> We are seeing a 0.75Mpps drop with IP forwarding rate due to that.
>>> Hence I have restricted calling DMA interfaces to only when IOMMU is 
>>> enabled.
>>
>> What's 0.07Mpps as a percentage of baseline? On a correctly configured
>> coherent arm64 system, in the absence of an IOMMU, dma_map_*() is
>> essentially just virt_to_phys() behind a function call or two, so I'd be
>> interested to know where any non-trivial overhead might be coming from.
> 
> It's a 5% drop and yes device is configured as coherent.
> And the drop is due to additional function calls.

OK, interesting - sounds like there's potential for some optimisation
there as well. AFAICS the callchain goes:

dma_map_single_attrs (inline)
- ops->map_page (__swiotlb_map_page)
  - swiotlb_map_page
- phys_to_dma (inline)
- dma_capable (inline)

Do you happen to have a breakdown of where the time goes? If it's mostly
just in the indirect branch our options are limited (I'm guessing
ThunderX doesn't have a particularly fancy branch predictor, if it's not
even got a data prefetcher), but if it's in the SWIOTLB code then
there's certainly room for improvement (which will hopefully tie in with
some DMA ops work I'm planning to do soon anyway).

Thanks,
Robin.

> 
> Thanks,
> Sunil.
> 



Re: [PATCH 11/29] drivers, media: convert cx88_core.refcount from atomic_t to refcount_t

2017-03-06 Thread Sergei Shtylyov

Hello.

On 03/06/2017 05:20 PM, Elena Reshetova wrote:


refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 

[...]

diff --git a/drivers/media/pci/cx88/cx88.h b/drivers/media/pci/cx88/cx88.h
index 115414c..16c1313 100644
--- a/drivers/media/pci/cx88/cx88.h
+++ b/drivers/media/pci/cx88/cx88.h
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 

 #include 
 #include 
@@ -339,7 +340,7 @@ struct cx8802_dev;

 struct cx88_core {
struct list_head   devlist;
-   atomic_t   refcount;
+   refcount_t   refcount;


   Could you please keep the name aligned with above and below?



/* board name */
intnr;



MBR, Sergei



Re: [4.9.13] use after free in ipv4_mtu

2017-03-06 Thread Eric Dumazet
On Mon, 2017-03-06 at 05:45 -0800, Eric Dumazet wrote:
> On Mon, 2017-03-06 at 14:33 +0800, Daniel J Blueman wrote:

> > I do change the network queueing discipline and related at runtime [1]
> > which may be triggering this, though I did think I saw the KASAN
> > report only after resuming from suspend. rf(un)kill and other tweaking
> > may have been involved too.
> > 
> > Thanks,
> >   Dan
> > 
> > [1] /etc/sysctl.d/90-tcp.conf
> > 
> > net.core.default_qdisc = fq_codel
> > net.ipv4.tcp_congestion_control = bbr
> > net.ipv4.tcp_slow_start_after_idle = 0
> > net.ipv4.tcp_ecn = 1

BTW, fq_codel is not suitable for BBR.

Only fq contains the needed pacing for BBR.





Re: [PATCH net-next] net: ipv4: add support for ECMP hash policy choice

2017-03-06 Thread David Ahern
On 3/6/17 7:59 AM, Nikolay Aleksandrov wrote:
> diff --git a/include/net/route.h b/include/net/route.h
> index c0874c87c173..77a5c613a290 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -113,13 +113,12 @@ struct in_device;
>  int ip_rt_init(void);
>  void rt_cache_flush(struct net *net);
>  void rt_flush_dev(struct net_device *dev);
> -struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
> -   int mp_hash);
> +struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 
> *flp);
>  
>  static inline struct rtable *__ip_route_output_key(struct net *net,
>  struct flowi4 *flp)
>  {
> - return __ip_route_output_key_hash(net, flp, -1);
> + return __ip_route_output_key_hash(net, flp);
>  }
>  
>  struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,

The "_hash" variant was added by 79a131592dbb8. If the mp_hash arg is
removed, the "_hash" wrapper should be removed and go back to
__ip_route_output_key.


Please view the attached file.

2017-03-06 Thread Ayesha Gadhafi



Hello.docx
Description: MS-Word 2007 document


Re: [patch net-next RFC 1/2] flow_dissector: Move ARP dissection into a separate function

2017-03-06 Thread Jiri Pirko
Tue, Feb 21, 2017 at 07:50:53PM CET, t...@herbertland.com wrote:
>On Tue, Feb 21, 2017 at 6:33 AM, Jiri Pirko  wrote:
>> From: Jiri Pirko 
>>
>> Make the main flow_dissect function a bit smaller and move the ARP
>> dissection into a separate function. Along with that, do the ARP header
>> processing only in case the flow dissection user requires it.
>>
>
>Acked-by: Tom Herbert 
>
>GRE might also be a good candidate to get its own function.
>

Submitted with GRE bits. Note that I left your ack and Simon's revby out
since I made some cosmetic changes since the RFC.

I would be glad if you both can check it again.

Thanks!


>
>> Signed-off-by: Jiri Pirko 
>> ---
>>  net/core/flow_dissector.c | 111 
>> --
>>  1 file changed, 59 insertions(+), 52 deletions(-)
>>
>> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
>> index c35aae1..10dc5bb 100644
>> --- a/net/core/flow_dissector.c
>> +++ b/net/core/flow_dissector.c
>> @@ -113,6 +113,61 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, 
>> int thoff, u8 ip_proto,
>>  }
>>  EXPORT_SYMBOL(__skb_flow_get_ports);
>>
>> +static bool __skb_flow_dissect_arp(const struct sk_buff *skb,
>> +  struct flow_dissector *flow_dissector,
>> +  void *target_container, void *data,
>> +  int nhoff, int hlen)
>> +{
>> +   struct flow_dissector_key_arp *key_arp;
>> +   struct {
>> +   unsigned char ar_sha[ETH_ALEN];
>> +   unsigned char ar_sip[4];
>> +   unsigned char ar_tha[ETH_ALEN];
>> +   unsigned char ar_tip[4];
>> +   } *arp_eth, _arp_eth;
>> +   const struct arphdr *arp;
>> +   struct arphdr *_arp;
>> +
>> +   if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP))
>> +   return true;
>> +
>> +   arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
>> +  hlen, &_arp);
>> +   if (!arp)
>> +   return false;
>> +
>> +   if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
>> +   arp->ar_pro != htons(ETH_P_IP) ||
>> +   arp->ar_hln != ETH_ALEN ||
>> +   arp->ar_pln != 4 ||
>> +   (arp->ar_op != htons(ARPOP_REPLY) &&
>> +arp->ar_op != htons(ARPOP_REQUEST)))
>> +   return false;
>> +
>> +   arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
>> +  sizeof(_arp_eth), data,
>> +  hlen, &_arp_eth);
>> +   if (!arp_eth)
>> +   return false;
>> +
>> +   key_arp = skb_flow_dissector_target(flow_dissector,
>> +   FLOW_DISSECTOR_KEY_ARP,
>> +   target_container);
>> +
>> +   memcpy(_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip));
>> +   memcpy(_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip));
>> +
>> +   /* Only store the lower byte of the opcode;
>> +* this covers ARPOP_REPLY and ARPOP_REQUEST.
>> +*/
>> +   key_arp->op = ntohs(arp->ar_op) & 0xff;
>> +
>> +   ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
>> +   ether_addr_copy(key_arp->tha, arp_eth->ar_tha);
>> +
>> +   return true;
>> +}
>> +
>>  /**
>>   * __skb_flow_dissect - extract the flow_keys struct and return it
>>   * @skb: sk_buff to extract the flow from, can be NULL if the rest are 
>> specified
>> @@ -138,7 +193,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>> struct flow_dissector_key_control *key_control;
>> struct flow_dissector_key_basic *key_basic;
>> struct flow_dissector_key_addrs *key_addrs;
>> -   struct flow_dissector_key_arp *key_arp;
>> struct flow_dissector_key_ports *key_ports;
>> struct flow_dissector_key_icmp *key_icmp;
>> struct flow_dissector_key_tags *key_tags;
>> @@ -382,59 +436,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>> goto out_good;
>>
>> case htons(ETH_P_ARP):
>> -   case htons(ETH_P_RARP): {
>> -   struct {
>> -   unsigned char ar_sha[ETH_ALEN];
>> -   unsigned char ar_sip[4];
>> -   unsigned char ar_tha[ETH_ALEN];
>> -   unsigned char ar_tip[4];
>> -   } *arp_eth, _arp_eth;
>> -   const struct arphdr *arp;
>> -   struct arphdr *_arp;
>> -
>> -   arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
>> -  hlen, &_arp);
>> -   if (!arp)
>> -   goto out_bad;
>> -
>> -   if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
>> -   arp->ar_pro != htons(ETH_P_IP) ||
>> -   arp->ar_hln != ETH_ALEN ||

[patch net-next 3/5] flow_dissector: Fix GRE header error path

2017-03-06 Thread Jiri Pirko
From: Jiri Pirko 

Now, when an unexpected element in the GRE header appears, we break so
the l4 ports are processed. But since the ports are processed
unconditionally, random values will certainly be dissected. Fix
this by just bailing out in such situations.

Signed-off-by: Jiri Pirko 
---
 net/core/flow_dissector.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 8d01298..cefaf23 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -479,18 +479,18 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 
/* Only look inside GRE without routing */
if (hdr->flags & GRE_ROUTING)
-   break;
+   goto out_good;
 
/* Only look inside GRE for version 0 and 1 */
gre_ver = ntohs(hdr->flags & GRE_VERSION);
if (gre_ver > 1)
-   break;
+   goto out_good;
 
proto = hdr->protocol;
if (gre_ver) {
/* Version1 must be PPTP, and check the flags */
if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
-   break;
+   goto out_good;
}
 
offset += sizeof(struct gre_base_hdr);
-- 
2.7.4



[patch net-next 5/5] flow_dissector: Move GRE dissection into a separate function

2017-03-06 Thread Jiri Pirko
From: Jiri Pirko 

Make the main flow_dissect function a bit smaller and move the GRE
dissection into a separate function.

Signed-off-by: Jiri Pirko 
---
 net/core/flow_dissector.c | 244 +-
 1 file changed, 134 insertions(+), 110 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 9120835..5f3ae92 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -116,6 +116,7 @@ EXPORT_SYMBOL(__skb_flow_get_ports);
 enum flow_dissect_ret {
FLOW_DISSECT_RET_OUT_GOOD,
FLOW_DISSECT_RET_OUT_BAD,
+   FLOW_DISSECT_RET_OUT_PROTO_AGAIN,
 };
 
 static enum flow_dissect_ret
@@ -200,6 +201,128 @@ __skb_flow_dissect_arp(const struct sk_buff *skb,
return FLOW_DISSECT_RET_OUT_GOOD;
 }
 
+static enum flow_dissect_ret
+__skb_flow_dissect_gre(const struct sk_buff *skb,
+  struct flow_dissector_key_control *key_control,
+  struct flow_dissector *flow_dissector,
+  void *target_container, void *data,
+  __be16 *p_proto, int *p_nhoff, int *p_hlen,
+  unsigned int flags)
+{
+   struct flow_dissector_key_keyid *key_keyid;
+   struct gre_base_hdr *hdr, _hdr;
+   int offset = 0;
+   u16 gre_ver;
+
+   hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr),
+  data, *p_hlen, &_hdr);
+   if (!hdr)
+   return FLOW_DISSECT_RET_OUT_BAD;
+
+   /* Only look inside GRE without routing */
+   if (hdr->flags & GRE_ROUTING)
+   return FLOW_DISSECT_RET_OUT_GOOD;
+
+   /* Only look inside GRE for version 0 and 1 */
+   gre_ver = ntohs(hdr->flags & GRE_VERSION);
+   if (gre_ver > 1)
+   return FLOW_DISSECT_RET_OUT_GOOD;
+
+   *p_proto = hdr->protocol;
+   if (gre_ver) {
+   /* Version1 must be PPTP, and check the flags */
+   if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
+   return FLOW_DISSECT_RET_OUT_GOOD;
+   }
+
+   offset += sizeof(struct gre_base_hdr);
+
+   if (hdr->flags & GRE_CSUM)
+   offset += sizeof(((struct gre_full_hdr *) 0)->csum) +
+ sizeof(((struct gre_full_hdr *) 0)->reserved1);
+
+   if (hdr->flags & GRE_KEY) {
+   const __be32 *keyid;
+   __be32 _keyid;
+
+   keyid = __skb_header_pointer(skb, *p_nhoff + offset,
+sizeof(_keyid),
+data, *p_hlen, &_keyid);
+   if (!keyid)
+   return FLOW_DISSECT_RET_OUT_BAD;
+
+   if (dissector_uses_key(flow_dissector,
+  FLOW_DISSECTOR_KEY_GRE_KEYID)) {
+   key_keyid = skb_flow_dissector_target(flow_dissector,
+ 
FLOW_DISSECTOR_KEY_GRE_KEYID,
+ target_container);
+   if (gre_ver == 0)
+   key_keyid->keyid = *keyid;
+   else
+   key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
+   }
+   offset += sizeof(((struct gre_full_hdr *) 0)->key);
+   }
+
+   if (hdr->flags & GRE_SEQ)
+   offset += sizeof(((struct pptp_gre_header *) 0)->seq);
+
+   if (gre_ver == 0) {
+   if (*p_proto == htons(ETH_P_TEB)) {
+   const struct ethhdr *eth;
+   struct ethhdr _eth;
+
+   eth = __skb_header_pointer(skb, *p_nhoff + offset,
+  sizeof(_eth),
+  data, *p_hlen, &_eth);
+   if (!eth)
+   return FLOW_DISSECT_RET_OUT_BAD;
+   *p_proto = eth->h_proto;
+   offset += sizeof(*eth);
+
+   /* Cap headers that we access via pointers at the
+* end of the Ethernet header as our maximum alignment
+* at that point is only 2 bytes.
+*/
+   if (NET_IP_ALIGN)
+   *p_hlen = *p_nhoff + offset;
+   }
+   } else { /* version 1, must be PPTP */
+   u8 _ppp_hdr[PPP_HDRLEN];
+   u8 *ppp_hdr;
+
+   if (hdr->flags & GRE_ACK)
+   offset += sizeof(((struct pptp_gre_header *) 0)->ack);
+
+   ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset,
+  sizeof(_ppp_hdr),
+  data, *p_hlen, _ppp_hdr);
+   if (!ppp_hdr)
+ 

[patch net-next 2/5] flow_dissector: Move MPLS dissection into a separate function

2017-03-06 Thread Jiri Pirko
From: Jiri Pirko 

Make the main flow_dissect function a bit smaller and move the MPLS
dissection into a separate function. Along with that, do the MPLS header
processing only in case the flow dissection user requires it.

Signed-off-by: Jiri Pirko 
---
 net/core/flow_dissector.c | 56 ---
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d79fb8f..8d01298 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -119,6 +119,33 @@ enum flow_dissect_ret {
 };
 
 static enum flow_dissect_ret
+__skb_flow_dissect_mpls(const struct sk_buff *skb,
+   struct flow_dissector *flow_dissector,
+   void *target_container, void *data, int nhoff, int hlen)
+{
+   struct flow_dissector_key_keyid *key_keyid;
+   struct mpls_label *hdr, _hdr[2];
+
+   if (!dissector_uses_key(flow_dissector,
+   FLOW_DISSECTOR_KEY_MPLS_ENTROPY))
+   return FLOW_DISSECT_RET_OUT_GOOD;
+
+   hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
+  hlen, &_hdr);
+   if (!hdr)
+   return FLOW_DISSECT_RET_OUT_BAD;
+
+   if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >>
+   MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) {
+   key_keyid = skb_flow_dissector_target(flow_dissector,
+ 
FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
+ target_container);
+   key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK);
+   }
+   return FLOW_DISSECT_RET_OUT_GOOD;
+}
+
+static enum flow_dissect_ret
 __skb_flow_dissect_arp(const struct sk_buff *skb,
   struct flow_dissector *flow_dissector,
   void *target_container, void *data, int nhoff, int hlen)
@@ -408,31 +435,16 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
}
 
case htons(ETH_P_MPLS_UC):
-   case htons(ETH_P_MPLS_MC): {
-   struct mpls_label *hdr, _hdr[2];
+   case htons(ETH_P_MPLS_MC):
 mpls:
-   hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
-  hlen, &_hdr);
-   if (!hdr)
-   goto out_bad;
-
-   if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >>
-MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) {
-   if (dissector_uses_key(flow_dissector,
-  
FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
-   key_keyid = 
skb_flow_dissector_target(flow_dissector,
- 
FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
- 
target_container);
-   key_keyid->keyid = hdr[1].entry &
-   htonl(MPLS_LS_LABEL_MASK);
-   }
-
+   switch (__skb_flow_dissect_mpls(skb, flow_dissector,
+   target_container, data,
+   nhoff, hlen)) {
+   case FLOW_DISSECT_RET_OUT_GOOD:
goto out_good;
+   case FLOW_DISSECT_RET_OUT_BAD:
+   goto out_bad;
}
-
-   goto out_good;
-   }
-
case htons(ETH_P_FCOE):
if ((hlen - nhoff) < FCOE_HEADER_LEN)
goto out_bad;
-- 
2.7.4



[patch net-next 0/5] make flow dissector great again

2017-03-06 Thread Jiri Pirko
From: Jiri Pirko 

This patchset follows-up the discussion about future extensions of flow
dissector and tries to address the mentioned concerns. Some parts are
cut out into sub-functions. Also, the processing of the code (ARP, MPLS)
is made dependent on the user actually requiring the dissected values.
This prepares the code for future extensions to dissect IPv6 ND messages,
TCP flags, etc.

Jiri Pirko (5):
  flow_dissector: Move ARP dissection into a separate function
  flow_dissector: Move MPLS dissection into a separate function
  flow_dissector: Fix GRE header error path
  flow_dissector: rename "proto again" goto label
  flow_dissector: Move GRE dissection into a separate function

 net/core/flow_dissector.c | 426 ++
 1 file changed, 238 insertions(+), 188 deletions(-)

-- 
2.7.4



[patch net-next 4/5] flow_dissector: rename "proto again" goto label

2017-03-06 Thread Jiri Pirko
From: Jiri Pirko 

Align with "ip_proto_again" label used in the same function and rename
vague "again" to "proto_again".

Signed-off-by: Jiri Pirko 
---
 net/core/flow_dissector.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index cefaf23..9120835 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -267,7 +267,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
memcpy(key_eth_addrs, >h_dest, sizeof(*key_eth_addrs));
}
 
-again:
+proto_again:
switch (proto) {
case htons(ETH_P_IP): {
const struct iphdr *iph;
@@ -370,7 +370,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
proto = vlan->h_vlan_encapsulated_proto;
nhoff += sizeof(*vlan);
if (skip_vlan)
-   goto again;
+   goto proto_again;
}
 
skip_vlan = true;
@@ -393,7 +393,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
}
}
 
-   goto again;
+   goto proto_again;
}
case htons(ETH_P_PPP_SES): {
struct {
@@ -577,7 +577,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
goto out_good;
 
-   goto again;
+   goto proto_again;
}
case NEXTHDR_HOP:
case NEXTHDR_ROUTING:
-- 
2.7.4



[patch net-next 1/5] flow_dissector: Move ARP dissection into a separate function

2017-03-06 Thread Jiri Pirko
From: Jiri Pirko 

Make the main flow_dissect function a bit smaller and move the ARP
dissection into a separate function. Along with that, do the ARP header
processing only in case the flow dissection user requires it.

Signed-off-by: Jiri Pirko 
---
 net/core/flow_dissector.c | 120 ++
 1 file changed, 67 insertions(+), 53 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c35aae1..d79fb8f 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -113,6 +113,66 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int 
thoff, u8 ip_proto,
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
+enum flow_dissect_ret {
+   FLOW_DISSECT_RET_OUT_GOOD,
+   FLOW_DISSECT_RET_OUT_BAD,
+};
+
+static enum flow_dissect_ret
+__skb_flow_dissect_arp(const struct sk_buff *skb,
+  struct flow_dissector *flow_dissector,
+  void *target_container, void *data, int nhoff, int hlen)
+{
+   struct flow_dissector_key_arp *key_arp;
+   struct {
+   unsigned char ar_sha[ETH_ALEN];
+   unsigned char ar_sip[4];
+   unsigned char ar_tha[ETH_ALEN];
+   unsigned char ar_tip[4];
+   } *arp_eth, _arp_eth;
+   const struct arphdr *arp;
+   struct arphdr *_arp;
+
+   if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP))
+   return FLOW_DISSECT_RET_OUT_GOOD;
+
+   arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
+  hlen, &_arp);
+   if (!arp)
+   return FLOW_DISSECT_RET_OUT_BAD;
+
+   if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+   arp->ar_pro != htons(ETH_P_IP) ||
+   arp->ar_hln != ETH_ALEN ||
+   arp->ar_pln != 4 ||
+   (arp->ar_op != htons(ARPOP_REPLY) &&
+arp->ar_op != htons(ARPOP_REQUEST)))
+   return FLOW_DISSECT_RET_OUT_BAD;
+
+   arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
+  sizeof(_arp_eth), data,
+  hlen, &_arp_eth);
+   if (!arp_eth)
+   return FLOW_DISSECT_RET_OUT_BAD;
+
+   key_arp = skb_flow_dissector_target(flow_dissector,
+   FLOW_DISSECTOR_KEY_ARP,
+   target_container);
+
+   memcpy(_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip));
+   memcpy(_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip));
+
+   /* Only store the lower byte of the opcode;
+* this covers ARPOP_REPLY and ARPOP_REQUEST.
+*/
+   key_arp->op = ntohs(arp->ar_op) & 0xff;
+
+   ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
+   ether_addr_copy(key_arp->tha, arp_eth->ar_tha);
+
+   return FLOW_DISSECT_RET_OUT_GOOD;
+}
+
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are 
specified
@@ -138,7 +198,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
struct flow_dissector_key_addrs *key_addrs;
-   struct flow_dissector_key_arp *key_arp;
struct flow_dissector_key_ports *key_ports;
struct flow_dissector_key_icmp *key_icmp;
struct flow_dissector_key_tags *key_tags;
@@ -382,60 +441,15 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
goto out_good;
 
case htons(ETH_P_ARP):
-   case htons(ETH_P_RARP): {
-   struct {
-   unsigned char ar_sha[ETH_ALEN];
-   unsigned char ar_sip[4];
-   unsigned char ar_tha[ETH_ALEN];
-   unsigned char ar_tip[4];
-   } *arp_eth, _arp_eth;
-   const struct arphdr *arp;
-   struct arphdr *_arp;
-
-   arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
-  hlen, &_arp);
-   if (!arp)
-   goto out_bad;
-
-   if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
-   arp->ar_pro != htons(ETH_P_IP) ||
-   arp->ar_hln != ETH_ALEN ||
-   arp->ar_pln != 4 ||
-   (arp->ar_op != htons(ARPOP_REPLY) &&
-arp->ar_op != htons(ARPOP_REQUEST)))
-   goto out_bad;
-
-   arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
-  sizeof(_arp_eth), data,
-  hlen,
-  &_arp_eth);
-   if (!arp_eth)
+   case htons(ETH_P_RARP):
+   switch (__skb_flow_dissect_arp(skb, flow_dissector,
+  

Re: [PATCH 21/29] drivers, s390: convert fc_fcp_pkt.ref_cnt from atomic_t to refcount_t

2017-03-06 Thread Johannes Thumshirn
On 03/06/2017 03:21 PM, Elena Reshetova wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.

The subject is wrong, should be something like "scsi: libfc convert
fc_fcp_pkt.ref_cnt from atomic_t to refcount_t" but not s390.

Other than that
Acked-by: Johannes Thumshirn 

-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


[PATCH 03/29] drivers, char: convert vma_data.refcnt from atomic_t to refcount_t

2017-03-06 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 drivers/char/mspec.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index a9c2fa3..7b75669 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -89,7 +90,7 @@ static int is_sn2;
  * protect in fork case where multiple tasks share the vma_data.
  */
 struct vma_data {
-   atomic_t refcnt;/* Number of vmas sharing the data. */
+   refcount_t refcnt;  /* Number of vmas sharing the data. */
spinlock_t lock;/* Serialize access to this structure. */
int count;  /* Number of pages allocated. */
enum mspec_page_type type; /* Type of pages allocated. */
@@ -144,7 +145,7 @@ mspec_open(struct vm_area_struct *vma)
struct vma_data *vdata;
 
vdata = vma->vm_private_data;
-   atomic_inc(>refcnt);
+   refcount_inc(>refcnt);
 }
 
 /*
@@ -162,7 +163,7 @@ mspec_close(struct vm_area_struct *vma)
 
vdata = vma->vm_private_data;
 
-   if (!atomic_dec_and_test(>refcnt))
+   if (!refcount_dec_and_test(>refcnt))
return;
 
last_index = (vdata->vm_end - vdata->vm_start) >> PAGE_SHIFT;
@@ -274,7 +275,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
vdata->vm_end = vma->vm_end;
vdata->type = type;
spin_lock_init(>lock);
-   atomic_set(>refcnt, 1);
+   refcount_set(>refcnt, 1);
vma->vm_private_data = vdata;
 
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
-- 
2.7.4



[RFC PATCH net] net: Work around lockdep limitation in sockets that use sockets

2017-03-06 Thread David Howells
Lockdep issues a circular dependency warning when AFS issues an operation
through AF_RXRPC from a context in which the VFS/VM holds the mmap_sem.

The theory lockdep comes up with is as follows:

 (1) If the pagefault handler decides it needs to read pages from AFS, it
 calls AFS with mmap_sem held and AFS begins an AF_RXRPC call, but
 creating a call requires the socket lock:

mmap_sem must be taken before sk_lock-AF_RXRPC

 (2) afs_open_socket() opens an AF_RXRPC socket and binds it.  rxrpc_bind()
 binds the underlying UDP socket whilst holding its socket lock.
 inet_bind() takes its own socket lock:

sk_lock-AF_RXRPC must be taken before sk_lock-AF_INET

 (3) Reading from a TCP socket into a userspace buffer might cause a fault
 and thus cause the kernel to take the mmap_sem, but the TCP socket is
 locked whilst doing this:

sk_lock-AF_INET must be taken before mmap_sem

However, lockdep's theory is wrong in this instance because it deals only
with lock classes and not individual locks.  The AF_INET lock in (2) isn't
really equivalent to the AF_INET lock in (3) as the former deals with a
socket entirely internal to the kernel that never sees userspace.  This is
a limitation in the design of lockdep.

Fix the general case by:

 (1) Double up all the locking keys used in sockets so that one set are
 used if the socket is created by userspace and the other set is used
 if the socket is created by the kernel.

 (2) Store the kern parameter passed to sk_alloc() in a variable in the
 sock struct (sk_kern_sock).  This informs sock_lock_init(),
 sock_init_data() and sk_clone_lock() as to the lock keys to be used.

 Note that the child created by sk_clone_lock() inherits the parent's
 kern setting.

 (3) Add a 'kern' parameter to ->accept() that is analogous to the one
 passed in to ->create() that distinguishes whether kernel_accept() or
 sys_accept4() was the caller and can be passed to sk_alloc().

 Note that a lot of accept functions merely dequeue an already
 allocated socket.  I haven't touched these as the new socket already
 exists before we get the parameter.

 Note also that there are a couple of places where I've made the accepted
 socket unconditionally kernel-based:

irda_accept()
rds_tcp_accept_one()
tcp_accept_from_sock()

 because they follow a sock_create_kern() and accept off of that.

Whilst creating this, I noticed that lustre and ocfs don't create sockets
through sock_create_kern() and thus they aren't marked as for-kernel,
though they appear to be internal.  I wonder if these should do that so
that they use the new set of lock keys.

Signed-off-by: David Howells 
---

 crypto/af_alg.c   |9 +-
 crypto/algif_hash.c   |9 +-
 drivers/staging/lustre/lnet/lnet/lib-socket.c |4 -
 fs/dlm/lowcomms.c |2 
 fs/ocfs2/cluster/tcp.c|2 
 include/crypto/if_alg.h   |2 
 include/linux/net.h   |2 
 include/net/inet_common.h |3 -
 include/net/inet_connection_sock.h|2 
 include/net/sctp/structs.h|3 -
 include/net/sock.h|7 +-
 net/atm/svc.c |5 +
 net/ax25/af_ax25.c|3 -
 net/bluetooth/l2cap_sock.c|2 
 net/bluetooth/rfcomm/sock.c   |3 -
 net/bluetooth/sco.c   |2 
 net/core/sock.c   |  106 +
 net/decnet/af_decnet.c|5 +
 net/ipv4/af_inet.c|5 +
 net/ipv4/inet_connection_sock.c   |2 
 net/irda/af_irda.c|5 +
 net/iucv/af_iucv.c|2 
 net/llc/af_llc.c  |4 +
 net/netrom/af_netrom.c|3 -
 net/nfc/llcp_sock.c   |2 
 net/phonet/pep.c  |6 +
 net/phonet/socket.c   |4 -
 net/rds/tcp_listen.c  |2 
 net/rose/af_rose.c|3 -
 net/sctp/ipv6.c   |5 +
 net/sctp/protocol.c   |5 +
 net/sctp/socket.c |4 -
 net/smc/af_smc.c  |2 
 net/socket.c  |4 -
 net/tipc/socket.c |8 +-
 net/unix/af_unix.c|5 +
 net/vmw_vsock/af_vsock.c  |3 -
 net/x25/af_x25.c  |3 -
 38 files changed, 141 insertions(+), 107 deletions(-)

diff --git 

[PATCH net-next] net: ipv4: add support for ECMP hash policy choice

2017-03-06 Thread Nikolay Aleksandrov
This patch adds support for ECMP hash policy choice via a new sysctl
called fib_multipath_hash_policy and also adds support for L4 hashes.
The current values for fib_multipath_hash_policy are:
 0 - layer 3
 1 - layer 4 (new default)
If there's an skb hash already set and it matches the chosen policy then it
will be used instead of being calculated. The use of ICMP inner IP addresses
is removed, and we switch to the L4 default for better distribution.

Signed-off-by: Nikolay Aleksandrov 
---
I'm not happy with using an integer, but it produces the smallest churn.
Just let me know if you'd like to switch to a string sysctl.

 Documentation/networking/ip-sysctl.txt |  8 +++
 include/net/ip_fib.h   | 14 ++---
 include/net/netns/ipv4.h   |  1 +
 include/net/route.h|  5 +-
 net/ipv4/fib_frontend.c|  3 ++
 net/ipv4/fib_semantics.c   | 11 ++--
 net/ipv4/icmp.c| 19 +--
 net/ipv4/route.c   | 93 ++
 net/ipv4/sysctl_net_ipv4.c |  9 
 9 files changed, 81 insertions(+), 82 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index fc73eeb7b3b8..15810ca7d8b0 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
0 - disabled
1 - enabled
 
+fib_multipath_hash_policy - INTEGER
+   Controls which hash policy to use for multipath routes. Only valid
+   for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
+   Default: 1 (Layer 4)
+   Possible values:
+   0 - Layer 3
+   1 - Layer 4
+
 route/max_size - INTEGER
Maximum number of routes allowed in the kernel.  Increase
this when using large numbers of interfaces and/or routes.
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 368bb4024b78..8ac9bec053c5 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -371,17 +371,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned 
long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
-extern u32 fib_multipath_secret __read_mostly;
-
-static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
-{
-   return jhash_2words((__force u32)saddr, (__force u32)daddr,
-   fib_multipath_secret) >> 1;
-}
-
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+  const struct sk_buff *skb);
+#endif
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
-struct flowi4 *fl4, int mp_hash);
+struct flowi4 *fl4);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 622d2da27135..70a1d4251790 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -152,6 +152,7 @@ struct netns_ipv4 {
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
int sysctl_fib_multipath_use_neigh;
+   int sysctl_fib_multipath_hash_policy;
 #endif
 
unsigned intfib_seq;/* protected by rtnl_mutex */
diff --git a/include/net/route.h b/include/net/route.h
index c0874c87c173..77a5c613a290 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -113,13 +113,12 @@ struct in_device;
 int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
 void rt_flush_dev(struct net_device *dev);
-struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
- int mp_hash);
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *flp);
 
 static inline struct rtable *__ip_route_output_key(struct net *net,
   struct flowi4 *flp)
 {
-   return __ip_route_output_key_hash(net, flp, -1);
+   return __ip_route_output_key_hash(net, flp);
 }
 
 struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 42bfd08109dd..bba87195cbf4 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1233,6 +1233,9 @@ static int __net_init ip_fib_net_init(struct net *net)
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+   net->ipv4.sysctl_fib_multipath_hash_policy = 1;
+#endif
net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
if (!net->ipv4.fib_table_hash)
return -ENOMEM;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 317026a39cfa..6601bd9744c9 100644
--- 

Re: [PATCH] 4.9.13 brcmfmac: fix use-after-free on resume

2017-03-06 Thread Daniel J Blueman
On 6 March 2017 at 21:00, Arend Van Spriel  wrote:
> + linux-wireless
>
> On 6-3-2017 8:14, Daniel J Blueman wrote:
>> KASAN reported 'struct wireless_dev wdev' was read after being freed.
>> Fix by freeing after the access.
>
> I would rather like to see the KASAN report, because something is off
> here. This function is called with wdev as a parameter so how can it be
> accessed after free here? brcmf_remove_interface() does not free the
> wdev nor the brcmf_cfg80211_vif instance which contains the wdev.
>
> Regards,
> Arend
>
>> Signed-off-by: Daniel J Blueman 
>>
>> diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
>> b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
>> index de19c7c..aa0f470 100644
>> --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
>> +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
>> @@ -2288,12 +2288,13 @@ int brcmf_p2p_del_vif(struct wiphy *wiphy,
>> struct wireless_dev *wdev)
>> else
>> err = 0;
>> }
>> -   brcmf_remove_interface(vif->ifp, true);
>>
>> -   brcmf_cfg80211_arm_vif_event(cfg, NULL);
>> if (vif->wdev.iftype != NL80211_IFTYPE_P2P_DEVICE)
>> p2p->bss_idx[P2PAPI_BSSCFG_CONNECTION].vif = NULL;
>>
>> +   brcmf_remove_interface(vif->ifp, true);
>> +   brcmf_cfg80211_arm_vif_event(cfg, NULL);
>> +
>> return err;
>>  }

Sure, https://quora.org/kernel/brcmfmac/dmesg.txt

vmlinux, cfg80211.o, brcmfmac.o and config are in the same path; this
is against v4.9.13 stock.

Thanks,
  Daniel
-- 
Daniel J Blueman


Re: [PATCH v3 17/20] usb: gadget: pch_udc: Replace PCI pool old API

2017-03-06 Thread Felipe Balbi
Peter Senna Tschudin  writes:
> On Sun, Feb 26, 2017 at 08:24:22PM +0100, Romain Perier wrote:
> >> The PCI pool API is deprecated. This commit replaces the old PCI pool
> >> API with the appropriate functions from the DMA pool API.
>> 
> Reviewed-by: Peter Senna Tschudin 

Fine by me:

Acked-by: Felipe Balbi 

-- 
balbi


signature.asc
Description: PGP signature


[PATCH net 3/7 v2] bnx2x: fix possible overrun of VFPF multicast addresses array

2017-03-06 Thread Michal Schmidt

It is too late to check for the limit of the number of VF multicast
addresses after they have already been copied to the req->multicast[]
array, possibly overflowing it.

Do the check before copying.

Checking early also avoids having to (and forgetting to) unlock
vf2pf_mutex.

While we're looking at the error paths in the function, also return
an error code from it when the PF responds with an error, even though
the caller ignores it.

v2: Move the check before bnx2x_vfpf_prep() as suggested by Yuval.

Signed-off-by: Michal Schmidt 
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index bfae300..2b2ae92 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -864,46 +864,44 @@ int bnx2x_vfpf_config_rss(struct bnx2x *bp,
 }
 
 int bnx2x_vfpf_set_mcast(struct net_device *dev)

 {
struct bnx2x *bp = netdev_priv(dev);
struct vfpf_set_q_filters_tlv *req = >vf2pf_mbox->req.set_q_filters;
struct pfvf_general_resp_tlv *resp = >vf2pf_mbox->resp.general_resp;
-   int rc, i = 0;
+   int rc = 0, i = 0;
struct netdev_hw_addr *ha;
 
 	if (bp->state != BNX2X_STATE_OPEN) {

DP(NETIF_MSG_IFUP, "state is %x, returning\n", bp->state);
return -EINVAL;
}
 
+	/* We support PFVF_MAX_MULTICAST_PER_VF mcast addresses tops */

+   if (netdev_mc_count(dev) > PFVF_MAX_MULTICAST_PER_VF) {
+   DP(NETIF_MSG_IFUP,
+  "VF supports not more than %d multicast MAC addresses\n",
+  PFVF_MAX_MULTICAST_PER_VF);
+   return -EINVAL;
+   }
+
/* clear mailbox and prep first tlv */
bnx2x_vfpf_prep(bp, >first_tlv, CHANNEL_TLV_SET_Q_FILTERS,
sizeof(*req));
 
 	/* Get Rx mode requested */

DP(NETIF_MSG_IFUP, "dev->flags = %x\n", dev->flags);
 
 	netdev_for_each_mc_addr(ha, dev) {

DP(NETIF_MSG_IFUP, "Adding mcast MAC: %pM\n",
   bnx2x_mc_addr(ha));
memcpy(req->multicast[i], bnx2x_mc_addr(ha), ETH_ALEN);
i++;
}
 
-	/* We support four PFVF_MAX_MULTICAST_PER_VF mcast

- * addresses tops
- */
-   if (i >= PFVF_MAX_MULTICAST_PER_VF) {
-   DP(NETIF_MSG_IFUP,
-  "VF supports not more than %d multicast MAC addresses\n",
-  PFVF_MAX_MULTICAST_PER_VF);
-   return -EINVAL;
-   }
-
req->n_multicast = i;
req->flags |= VFPF_SET_Q_FILTERS_MULTICAST_CHANGED;
req->vf_qid = 0;
 
 	/* add list termination tlv */

bnx2x_add_tlv(bp, req, req->first_tlv.tl.length, CHANNEL_TLV_LIST_END,
  sizeof(struct channel_list_end_tlv));
@@ -920,15 +918,15 @@ int bnx2x_vfpf_set_mcast(struct net_device *dev)
BNX2X_ERR("Set Rx mode/multicast failed: %d\n",
  resp->hdr.status);
rc = -EINVAL;
}
 out:
bnx2x_vfpf_finalize(bp, >first_tlv);
 
-	return 0;

+   return rc;
 }
 
 /* request pf to add a vlan for the vf */

 int bnx2x_vfpf_update_vlan(struct bnx2x *bp, u16 vid, u8 vf_qid, bool add)
 {
struct vfpf_set_q_filters_tlv *req = >vf2pf_mbox->req.set_q_filters;
struct pfvf_general_resp_tlv *resp = >vf2pf_mbox->resp.general_resp;
--
2.9.3



[PATCH 18/29] drivers, s390: convert urdev.ref_count from atomic_t to refcount_t

2017-03-06 Thread Elena Reshetova
The refcount_t type and its corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This helps avoid accidental
reference counter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 drivers/s390/char/vmur.c | 8 
 drivers/s390/char/vmur.h | 4 +++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c
index 04aceb6..ced8151 100644
--- a/drivers/s390/char/vmur.c
+++ b/drivers/s390/char/vmur.c
@@ -110,7 +110,7 @@ static struct urdev *urdev_alloc(struct ccw_device *cdev)
mutex_init(>io_mutex);
init_waitqueue_head(>wait);
spin_lock_init(>open_lock);
-   atomic_set(>ref_count,  1);
+   refcount_set(>ref_count,  1);
urd->cdev = cdev;
get_device(>dev);
return urd;
@@ -126,7 +126,7 @@ static void urdev_free(struct urdev *urd)
 
 static void urdev_get(struct urdev *urd)
 {
-   atomic_inc(>ref_count);
+   refcount_inc(>ref_count);
 }
 
 static struct urdev *urdev_get_from_cdev(struct ccw_device *cdev)
@@ -159,7 +159,7 @@ static struct urdev *urdev_get_from_devno(u16 devno)
 
 static void urdev_put(struct urdev *urd)
 {
-   if (atomic_dec_and_test(>ref_count))
+   if (refcount_dec_and_test(>ref_count))
urdev_free(urd);
 }
 
@@ -946,7 +946,7 @@ static int ur_set_offline_force(struct ccw_device *cdev, 
int force)
rc = -EBUSY;
goto fail_urdev_put;
}
-   if (!force && (atomic_read(>ref_count) > 2)) {
+   if (!force && (refcount_read(>ref_count) > 2)) {
/* There is still a user of urd (e.g. ur_open) */
TRACE("ur_set_offline: BUSY\n");
rc = -EBUSY;
diff --git a/drivers/s390/char/vmur.h b/drivers/s390/char/vmur.h
index fa320ad..35ea9d1 100644
--- a/drivers/s390/char/vmur.h
+++ b/drivers/s390/char/vmur.h
@@ -11,6 +11,8 @@
 #ifndef _VMUR_H_
 #define _VMUR_H_
 
+#include 
+
 #define DEV_CLASS_UR_I 0x20 /* diag210 unit record input device class */
 #define DEV_CLASS_UR_O 0x10 /* diag210 unit record output device class */
 /*
@@ -69,7 +71,7 @@ struct urdev {
size_t reclen;  /* Record length for *write* CCWs */
int class;  /* VM device class */
int io_request_rc;  /* return code from I/O request */
-   atomic_t ref_count; /* reference counter */
+   refcount_t ref_count;   /* reference counter */
wait_queue_head_t wait; /* wait queue to serialize open */
int open_flag;  /* "urdev is open" flag */
spinlock_t open_lock;   /* serialize critical sections */
-- 
2.7.4



[PATCH 01/29] drivers, block: convert xen_blkif.refcnt from atomic_t to refcount_t

2017-03-06 Thread Elena Reshetova
The refcount_t type and its corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This helps avoid accidental
reference counter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 drivers/block/xen-blkback/common.h | 7 ---
 drivers/block/xen-blkback/xenbus.c | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkback/common.h 
b/drivers/block/xen-blkback/common.h
index dea61f6..2ccfd62 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -333,7 +334,7 @@ struct xen_blkif {
struct xen_vbd  vbd;
/* Back pointer to the backend_info. */
struct backend_info *be;
-   atomic_trefcnt;
+   refcount_t  refcnt;
/* for barrier (drain) requests */
struct completion   drain_complete;
atomic_tdrain;
@@ -386,10 +387,10 @@ struct pending_req {
 (_v)->bdev->bd_part->nr_sects : \
  get_capacity((_v)->bdev->bd_disk))
 
-#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define xen_blkif_get(_b) (refcount_inc(&(_b)->refcnt))
 #define xen_blkif_put(_b)  \
do {\
-   if (atomic_dec_and_test(&(_b)->refcnt)) \
+   if (refcount_dec_and_test(&(_b)->refcnt))   \
schedule_work(&(_b)->free_work);\
} while (0)
 
diff --git a/drivers/block/xen-blkback/xenbus.c 
b/drivers/block/xen-blkback/xenbus.c
index 8fe61b5..9f89be3 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -176,7 +176,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
return ERR_PTR(-ENOMEM);
 
blkif->domid = domid;
-   atomic_set(>refcnt, 1);
+   refcount_set(>refcnt, 1);
init_completion(>drain_complete);
INIT_WORK(>free_work, xen_blkif_deferred_free);
 
-- 
2.7.4



  1   2   >