[v3 PATCH] netlink: Do not schedule work from sk_destruct

2016-12-05 Thread Herbert Xu
On Mon, Dec 05, 2016 at 03:26:00PM +0800, Herbert Xu wrote:
> On Mon, Dec 05, 2016 at 03:19:46PM +0800, Herbert Xu wrote:
> >
> > Thanks for the patch.  It'll obviously work but I wanted avoid that
> > because it penalises the common path for the rare case.
> > 
> > Andrey, please try this patch and let me know if it's any better.
> > 
> > ---8<---
> > Subject: netlink: Do not schedule work from sk_destruct
> 
> Crap, I screwed it up again.  Here is a v2 which moves the atomic
> call into the RCU callback as otherwise the socket can be freed from
> another path while we await the RCU callback.

With the move it no longer makes sense to rename deferred_put_nlk_sk
so here is v3 which restores the original name.

---8<---
It is wrong to schedule a work from sk_destruct using the socket
as the memory reserve because the socket will be freed immediately
after the return from sk_destruct.

Instead we should do the deferral prior to sk_free.

This patch does just that.

Fixes: 707693c8a498 ("netlink: Call cb->done from a worker thread")
Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 602e5eb..246f29d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -322,11 +322,13 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, 
struct sock *sk)
sk_mem_charge(sk, skb->truesize);
 }
 
-static void __netlink_sock_destruct(struct sock *sk)
+static void netlink_sock_destruct(struct sock *sk)
 {
struct netlink_sock *nlk = nlk_sk(sk);
 
if (nlk->cb_running) {
+   if (nlk->cb.done)
+   nlk->cb.done(>cb);
module_put(nlk->cb.module);
kfree_skb(nlk->cb.skb);
}
@@ -348,21 +350,7 @@ static void netlink_sock_destruct_work(struct work_struct 
*work)
struct netlink_sock *nlk = container_of(work, struct netlink_sock,
work);
 
-   nlk->cb.done(>cb);
-   __netlink_sock_destruct(>sk);
-}
-
-static void netlink_sock_destruct(struct sock *sk)
-{
-   struct netlink_sock *nlk = nlk_sk(sk);
-
-   if (nlk->cb_running && nlk->cb.done) {
-   INIT_WORK(>work, netlink_sock_destruct_work);
-   schedule_work(>work);
-   return;
-   }
-
-   __netlink_sock_destruct(sk);
+   sk_free(>sk);
 }
 
 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
@@ -667,8 +655,18 @@ static int netlink_create(struct net *net, struct socket 
*sock, int protocol,
 static void deferred_put_nlk_sk(struct rcu_head *head)
 {
struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
+   struct sock *sk = >sk;
+
+   if (!atomic_dec_and_test(>sk_refcnt))
+   return;
+
+   if (nlk->cb_running && nlk->cb.done) {
+   INIT_WORK(>work, netlink_sock_destruct_work);
+   schedule_work(>work);
+   return;
+   }
 
-   sock_put(>sk);
+   sk_free(sk);
 }
 
 static int netlink_release(struct socket *sock)
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[v2 PATCH] netlink: Do not schedule work from sk_destruct

2016-12-04 Thread Herbert Xu
On Mon, Dec 05, 2016 at 03:19:46PM +0800, Herbert Xu wrote:
>
> Thanks for the patch.  It'll obviously work but I wanted avoid that
> because it penalises the common path for the rare case.
> 
> Andrey, please try this patch and let me know if it's any better.
> 
> ---8<---
> Subject: netlink: Do not schedule work from sk_destruct

Crap, I screwed it up again.  Here is a v2 which moves the atomic
call into the RCU callback as otherwise the socket can be freed from
another path while we await the RCU callback.

---8<---
It is wrong to schedule a work from sk_destruct using the socket
as the memory reserve because the socket will be freed immediately
after the return from sk_destruct.

Instead we should do the deferral prior to sk_free.

This patch does just that.

Fixes: 707693c8a498 ("netlink: Call cb->done from a worker thread")
Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 602e5eb..463f5cf 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -322,11 +322,13 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, 
struct sock *sk)
sk_mem_charge(sk, skb->truesize);
 }
 
-static void __netlink_sock_destruct(struct sock *sk)
+static void netlink_sock_destruct(struct sock *sk)
 {
struct netlink_sock *nlk = nlk_sk(sk);
 
if (nlk->cb_running) {
+   if (nlk->cb.done)
+   nlk->cb.done(>cb);
module_put(nlk->cb.module);
kfree_skb(nlk->cb.skb);
}
@@ -348,21 +350,7 @@ static void netlink_sock_destruct_work(struct work_struct 
*work)
struct netlink_sock *nlk = container_of(work, struct netlink_sock,
work);
 
-   nlk->cb.done(>cb);
-   __netlink_sock_destruct(>sk);
-}
-
-static void netlink_sock_destruct(struct sock *sk)
-{
-   struct netlink_sock *nlk = nlk_sk(sk);
-
-   if (nlk->cb_running && nlk->cb.done) {
-   INIT_WORK(>work, netlink_sock_destruct_work);
-   schedule_work(>work);
-   return;
-   }
-
-   __netlink_sock_destruct(sk);
+   sk_free(>sk);
 }
 
 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
@@ -664,11 +652,21 @@ static int netlink_create(struct net *net, struct socket 
*sock, int protocol,
goto out;
 }
 
-static void deferred_put_nlk_sk(struct rcu_head *head)
+static void deferred_free_nlk_sk(struct rcu_head *head)
 {
struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
+   struct sock *sk = >sk;
+
+   if (!atomic_dec_and_test(>sk_refcnt))
+   return;
+
+   if (nlk->cb_running && nlk->cb.done) {
+   INIT_WORK(>work, netlink_sock_destruct_work);
+   schedule_work(>work);
+   return;
+   }
 
-   sock_put(>sk);
+   sk_free(sk);
 }
 
 static int netlink_release(struct socket *sock)
@@ -743,7 +741,7 @@ static int netlink_release(struct socket *sock)
local_bh_disable();
sock_prot_inuse_add(sock_net(sk), _proto, -1);
local_bh_enable();
-   call_rcu(>rcu, deferred_put_nlk_sk);
+   call_rcu(>rcu, deferred_free_nlk_sk);
return 0;
 }
 
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: net: use-after-free in worker_thread

2016-12-04 Thread Herbert Xu
On Sat, Dec 03, 2016 at 05:49:07AM -0800, Eric Dumazet wrote:
>
> @@ -600,6 +600,7 @@ static int __netlink_create(struct net *net, struct 
> socket *sock,
>   }
>   init_waitqueue_head(>wait);
>  
> + sock_set_flag(sk, SOCK_RCU_FREE);
>   sk->sk_destruct = netlink_sock_destruct;
>   sk->sk_protocol = protocol;
>   return 0;

It's not necessarily a big deal but I just wanted to point out
that SOCK_RCU_FREE is not equivalent to the call_rcu thing that
netlink does.  The latter only does the RCU deferral for the socket
release call which is the only place where it's needed while
SOCK_RCU_FREE will force every path to do an RCU deferral.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: net: use-after-free in worker_thread

2016-12-04 Thread Herbert Xu
On Sat, Dec 03, 2016 at 10:14:48AM -0800, Cong Wang wrote:
> On Sat, Dec 3, 2016 at 9:41 AM, Cong Wang <xiyou.wangc...@gmail.com> wrote:
> > On Sat, Dec 3, 2016 at 4:56 AM, Andrey Konovalov <andreyk...@google.com> 
> > wrote:
> >> Hi!
> >>
> >> I'm seeing lots of the following error reports while running the
> >> syzkaller fuzzer.
> >>
> >> Reports appeared when I updated to 3c49de52 (Dec 2) from 2caceb32 (Dec 1).
> >>
> >> ==
> >> BUG: KASAN: use-after-free in worker_thread+0x17d8/0x18a0
> >> Read of size 8 at addr 880067f3ecd8 by task kworker/3:1/774
> >>
> >> page:ea00019fce00 count:1 mapcount:0 mapping:  (null)
> >> index:0x880067f39c10 compound_mapcount: 0
> >> flags: 0x5004080(slab|head)
> >> page dumped because: kasan: bad access detected
> >>
> >> CPU: 3 PID: 774 Comm: kworker/3:1 Not tainted 4.9.0-rc7+ #66
> >> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 
> >> 01/01/2011
> >>  88006c267838 81f882da 6c25e338 11000d84ce9a
> >>  ed000d84ce92 88006c25e340 41b58ab3 8541e198
> >>  81f88048 0001 41b58ab3 853d3ee8
> >> Call Trace:
> >>  [< inline >] __dump_stack lib/dump_stack.c:15
> >>  [] dump_stack+0x292/0x398 lib/dump_stack.c:51
> >>  [< inline >] describe_address mm/kasan/report.c:262
> >>  [] kasan_report_error+0x121/0x560 mm/kasan/report.c:368
> >>  [< inline >] kasan_report mm/kasan/report.c:390
> >>  [] __asan_report_load8_noabort+0x3e/0x40
> >> mm/kasan/report.c:411
> >>  [] worker_thread+0x17d8/0x18a0 kernel/workqueue.c:2228
> >>  [] kthread+0x323/0x3e0 kernel/kthread.c:209
> >>  [] ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433
> >
> > Heck... this is the pending work vs. sk_destruct() race. :-/
> > We can't wait for the work in RCU callback, let me think about it...

Sorry, my patch was obviously crap as it was trying to delay the
freeing of a socket from sk_destruct which can't be done.

> Please try the attached patch, I only did compile test, I can't access
> my desktop now, so can't do further tests.

Thanks for the patch.  It'll obviously work but I wanted avoid that
because it penalises the common path for the rare case.

Andrey, please try this patch and let me know if it's any better.

---8<---
Subject: netlink: Do not schedule work from sk_destruct

It is wrong to schedule a work from sk_destruct using the socket
as the memory reserve because the socket will be freed immediately
after the return from sk_destruct.

Instead we should do the deferral prior to sk_free.

This patch does just that.

Fixes: 707693c8a498 ("netlink: Call cb->done from a worker thread")
Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 602e5eb..8a642c5 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -322,11 +322,13 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, 
struct sock *sk)
sk_mem_charge(sk, skb->truesize);
 }
 
-static void __netlink_sock_destruct(struct sock *sk)
+static void netlink_sock_destruct(struct sock *sk)
 {
struct netlink_sock *nlk = nlk_sk(sk);
 
if (nlk->cb_running) {
+   if (nlk->cb.done)
+   nlk->cb.done(>cb);
module_put(nlk->cb.module);
kfree_skb(nlk->cb.skb);
}
@@ -348,21 +350,7 @@ static void netlink_sock_destruct_work(struct work_struct 
*work)
struct netlink_sock *nlk = container_of(work, struct netlink_sock,
work);
 
-   nlk->cb.done(>cb);
-   __netlink_sock_destruct(>sk);
-}
-
-static void netlink_sock_destruct(struct sock *sk)
-{
-   struct netlink_sock *nlk = nlk_sk(sk);
-
-   if (nlk->cb_running && nlk->cb.done) {
-   INIT_WORK(>work, netlink_sock_destruct_work);
-   schedule_work(>work);
-   return;
-   }
-
-   __netlink_sock_destruct(sk);
+   sk_free(>sk);
 }
 
 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
@@ -664,11 +652,17 @@ static int netlink_create(struct net *net, struct socket 
*sock, int protocol,
goto out;
 }
 
-static void deferred_put_nlk_sk(struct rcu_head *head)
+static void deferred_free_nlk_sk(struct rcu_head *head)
 {
struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
 
-   sock_put(>sk);
+   if (nl

Re: [PATCH net 1/2] esp4: Fix integrity verification when ESN are used

2016-11-30 Thread Herbert Xu
On Tue, Nov 29, 2016 at 05:05:20PM +0100, Tobias Brunner wrote:
> When handling inbound packets, the two halves of the sequence number
> stored on the skb are already in network order.
> 
> Fixes: 7021b2e1cddd ("esp4: Switch to new AEAD interface")
> Signed-off-by: Tobias Brunner <tob...@strongswan.org>

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>

Thanks for catching this!
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH net 2/2] esp6: Fix integrity verification when ESN are used

2016-11-30 Thread Herbert Xu
On Tue, Nov 29, 2016 at 05:05:25PM +0100, Tobias Brunner wrote:
> When handling inbound packets, the two halves of the sequence number
> stored on the skb are already in network order.
> 
> Fixes: 000ae7b2690e ("esp6: Switch to new AEAD interface")
> Signed-off-by: Tobias Brunner <tob...@strongswan.org>

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>

Thanks.
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Crash due to mutex genl_lock called from RCU context

2016-11-28 Thread Herbert Xu
On Sun, Nov 27, 2016 at 10:53:21PM -0800, Cong Wang wrote:
>
> I just took a deeper look, some user calls rhashtable_destroy() in ->done(),
> so even removing that genl lock is not enough, perhaps we should just
> move it to a work struct like what Daniel does for the tcf_proto, but that is
> ugly... I don't know if RCU provides any API to execute the callback in 
> process
> context.

I looked into doing it without a worker struct, but basically it
means that we'd have to add more crap to the common code path for
what is essentially a very rare case.

So I think we should go with the worker struct because all we lose
is a bit of memory.

---8<---
netlink: Call cb->done from a worker thread

The cb->done interface expects to be called in process context.
This was broken by the netlink RCU conversion.  This patch fixes
it by adding a worker struct to make the cb->done call where
necessary.

Fixes: 21e4902aea80 ("netlink: Lockless lookup with RCU grace...")
Reported-by: Subash Abhinov Kasiviswanathan <subas...@codeaurora.org>
Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 62bea45..602e5eb 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -322,14 +322,11 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, 
struct sock *sk)
sk_mem_charge(sk, skb->truesize);
 }
 
-static void netlink_sock_destruct(struct sock *sk)
+static void __netlink_sock_destruct(struct sock *sk)
 {
struct netlink_sock *nlk = nlk_sk(sk);
 
if (nlk->cb_running) {
-   if (nlk->cb.done)
-   nlk->cb.done(>cb);
-
module_put(nlk->cb.module);
kfree_skb(nlk->cb.skb);
}
@@ -346,6 +343,28 @@ static void netlink_sock_destruct(struct sock *sk)
WARN_ON(nlk_sk(sk)->groups);
 }
 
+static void netlink_sock_destruct_work(struct work_struct *work)
+{
+   struct netlink_sock *nlk = container_of(work, struct netlink_sock,
+   work);
+
+   nlk->cb.done(>cb);
+   __netlink_sock_destruct(>sk);
+}
+
+static void netlink_sock_destruct(struct sock *sk)
+{
+   struct netlink_sock *nlk = nlk_sk(sk);
+
+   if (nlk->cb_running && nlk->cb.done) {
+   INIT_WORK(>work, netlink_sock_destruct_work);
+   schedule_work(>work);
+   return;
+   }
+
+   __netlink_sock_destruct(sk);
+}
+
 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
  * SMP. Look, when several writers sleep and reader wakes them up, all but one
  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 3cfd6cc..4fdb383 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -3,6 +3,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
@@ -33,6 +34,7 @@ struct netlink_sock {
 
struct rhash_head   node;
struct rcu_head rcu;
+   struct work_struct  work;
 };
 
 static inline struct netlink_sock *nlk_sk(struct sock *sk)

-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: rhashtable: how to use insecure_elasticity of rhashtable_params

2016-11-07 Thread Herbert Xu
On Sun, Nov 06, 2016 at 09:15:07PM +0800, Xin Long wrote:
> Now when I don't set  insecure_elasticity, ht->elasticity would be
> set 16, it would be checked in  the loop of __rhashtable_insert_fast
> and rhashtable_lookup_one.

This is now obsolete.  Please use the new rhlist interface.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] crypto: ccm - avoid scatterlist for MAC encryption

2016-10-18 Thread Herbert Xu
On Mon, Oct 17, 2016 at 06:21:14PM +0100, Ard Biesheuvel wrote:
>
> Annoyingly, all this complication with scatterlists etc is for doing
> asynchronous crypto via DMA capable crypto accelerators, and the
> networking code (ipsec as well as mac80211, afaik) only allow
> synchronous in the first place, given that they execute in softirq
> context.

I'm still thinking about the issue (in particular, whether we
should continue to rely on the request context being SG-capable
or allow it to be on the stack for AEAD).

But IPsec definitely supports async crypto.  In fact it was the
very first user of async crypto.

mac80211 on the other hand is currently sync-only.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH net-next 5/7] rhashtable: abstract out function to get hash

2016-09-20 Thread Herbert Xu
On Tue, Sep 20, 2016 at 07:58:03PM +0200, Thomas Graf wrote:
> 
> I understand this particular patch as an effort not to duplicate
> hash function selection such as jhash vs jhash2 based on key_len.

If the rhashtable params stay non-const as is then this is going
to produce some monstrous code which will be worse than using
jhash unconditionally.

If the rhashtable params are made const then you'll already know
whether jhash or jhash2 is used.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: skb_splice_bits() and large chunks in pipe (was Re: xfs_file_splice_read: possible circular locking dependency detected

2016-09-20 Thread Herbert Xu
Al Viro <v...@zeniv.linux.org.uk> wrote:
>
>* shoved into scatterlist, which gets fed into crypto/*.c machinery.
> No way for a pipe_buffer stuff to get there, fortunately, because I would
> be very surprised if it works correctly with compound pages and large
> ranges in those.

FWIW the crypto API has always been supposed to handle SG entries
that cross page boundaries.  There were a couple of bugs in this
area but AFAIK they've all been fixed.

Of course I cannot guarantee that every crypto driver also handles
it correctly, but at least we have a few test vectors which test
the page-crossing case specifically.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v2 net-next 0/7] net: ILA resolver and generic resolver backend

2016-09-20 Thread Herbert Xu
David Miller <da...@davemloft.net> wrote:
> 
> Can you please repost this series with Herbert Xu properly CC:'d
> since he maintains rhashtable and is making changes to it recently
> which might conflict with what you are proposing here?

There is no need to respost this series as is since I've just found
it :)

But I'd like to hear from Tom regarding

1) Why couldn't jhash be called directly instead of through rhashtable?
2) Making rhashtable parameters constanst since otherwise the inlining
will suck.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH net-next 6/7] net: Generic resolver backend

2016-09-20 Thread Herbert Xu
Tom Herbert <t...@herbertland.com> wrote:
>
> +   nrslv->params.head_offset = offsetof(struct net_rslv_ent, node);
> +   nrslv->params.key_offset = offsetof(struct net_rslv_ent, object);
> +   nrslv->params.key_len = key_len;
> +   nrslv->params.max_size = max_size;
> +   nrslv->params.min_size = 256;
> +   nrslv->params.automatic_shrinking = true;
> +   nrslv->params.obj_cmpfn = cmp_fn ? net_rslv_cmp : NULL;

This completely defeats the rhashtable inlining since that relies
on the parameter being constant.

Looking at your next patch you have exactly one user for this.  So
who is going to be the next user and do we really need all these
fields to be variable?

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH net-next 5/7] rhashtable: abstract out function to get hash

2016-09-20 Thread Herbert Xu
Tom Herbert <t...@herbertland.com> wrote:
> Split out most of rht_key_hashfn which is calculating the hash into
> its own function. This way the hash function can be called separately to
> get the hash value.
> 
> Acked-by: Thomas Graf <tg...@suug.ch>
> Signed-off-by: Tom Herbert <t...@herbertland.com>

I don't get this one.  You're just using jhash, right? Why not
call jhash directly instead of rht_get_key_hashfn?

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH net-next 3/7] rhashtable: Call library function alloc_bucket_locks

2016-09-20 Thread Herbert Xu
Tom Herbert <t...@herbertland.com> wrote:
> To allocate the array of bucket locks for the hash table we now
> call library function alloc_bucket_spinlocks. This function is
> based on the old alloc_bucket_locks in rhashtable and should
> produce the same effect.
> 
> Acked-by: Thomas Graf <tg...@suug.ch>
> Signed-off-by: Tom Herbert <t...@herbertland.com>

On second thought when I do the nested allocation I could simply
restore this function.  So

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [v3 PATCH 1/2] rhashtable: Add rhlist interface

2016-09-19 Thread Herbert Xu
On Mon, Sep 19, 2016 at 11:16:21PM +0200, Thomas Graf wrote:
>
> Nice, I like how this simplifies users! Is this suitable for
> ILA as well?

Does it have duplicate objects and use inelastic_security? If so
then yes it should switch over to rhlist.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH RFC 2/6] rhashtable: Call library function alloc_bucket_locks

2016-09-19 Thread Herbert Xu
Tom Herbert <t...@herbertland.com> wrote:
> To allocate the array of bucket locks for the hash table we now
> call library function alloc_bucket_spinlocks. This function is
> based on the old alloc_bucket_locks in rhashtable and should
> produce the same effect.
> 
> Signed-off-by: Tom Herbert <t...@herbertland.com>

This conflicts with the work I'm doing to fix the resize ENOMEM
issue.  I'll be making the hashtable as well as the spinlock table
nested, in which case you must not directly dereference it as an
array.

If you're just trying to share the spinlocks for another purpose,
what we can do is provide a helper function to return the right
lock for a given key/object.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[v3 PATCH 1/2] rhashtable: Add rhlist interface

2016-09-19 Thread Herbert Xu
The insecure_elasticity setting is an ugly wart brought out by
users who need to insert duplicate objects (that is, distinct
objects with identical keys) into the same table.

In fact, those users have a much bigger problem.  Once those
duplicate objects are inserted, they don't have an interface to
find them (unless you count the walker interface which walks
over the entire table).

Some users have resorted to doing a manual walk over the hash
table which is of course broken because they don't handle the
potential existence of multiple hash tables.  The result is that
they will break sporadically when they encounter a hash table
resize/rehash.

This patch provides a way out for those users, at the expense
of an extra pointer per object.  Essentially each object is now
a list of objects carrying the same key.  The hash table will
only see the lists so nothing changes as far as rhashtable is
concerned.

To use this new interface, you need to insert a struct rhlist_head
into your objects instead of struct rhash_head.  While the hash
table is unchanged, for type-safety you'll need to use struct
rhltable instead of struct rhashtable.  All the existing interfaces
have been duplicated for rhlist, including the hash table walker.

One missing feature is nulls marking because AFAIK the only potential
user of it does not need duplicate objects.  Should anyone need
this it shouldn't be too hard to add.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
---

 include/linux/rhashtable.h |  491 ++---
 lib/rhashtable.c   |  258 ++-
 2 files changed, 583 insertions(+), 166 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index fd82584..5c132d3 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1,7 +1,7 @@
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
- * Copyright (c) 2015 Herbert Xu <herb...@gondor.apana.org.au>
+ * Copyright (c) 2015-2016 Herbert Xu <herb...@gondor.apana.org.au>
  * Copyright (c) 2014-2015 Thomas Graf <tg...@suug.ch>
  * Copyright (c) 2008-2014 Patrick McHardy <ka...@trash.net>
  *
@@ -53,6 +53,11 @@ struct rhash_head {
struct rhash_head __rcu *next;
 };
 
+struct rhlist_head {
+   struct rhash_head   rhead;
+   struct rhlist_head __rcu*next;
+};
+
 /**
  * struct bucket_table - Table of hash buckets
  * @size: Number of hash buckets
@@ -137,6 +142,7 @@ struct rhashtable_params {
  * @key_len: Key length for hashfn
  * @elasticity: Maximum chain length before rehash
  * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
  * @run_work: Deferred worker to expand/shrink asynchronously
  * @mutex: Mutex to protect current/future table swapping
  * @lock: Spin lock to protect walker list
@@ -147,12 +153,21 @@ struct rhashtable {
unsigned intkey_len;
unsigned intelasticity;
struct rhashtable_paramsp;
+   boolrhlist;
struct work_struct  run_work;
struct mutexmutex;
spinlock_t  lock;
 };
 
 /**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+   struct rhashtable ht;
+};
+
+/**
  * struct rhashtable_walker - Hash table walker
  * @list: List entry on list of walkers
  * @tbl: The table that we were walking over
@@ -163,9 +178,10 @@ struct rhashtable_walker {
 };
 
 /**
- * struct rhashtable_iter - Hash table iterator, fits into netlink cb
+ * struct rhashtable_iter - Hash table iterator
  * @ht: Table to iterate through
  * @p: Current pointer
+ * @list: Current hash list pointer
  * @walker: Associated rhashtable walker
  * @slot: Current slot
  * @skip: Number of entries to skip in slot
@@ -173,6 +189,7 @@ struct rhashtable_walker {
 struct rhashtable_iter {
struct rhashtable *ht;
struct rhash_head *p;
+   struct rhlist_head *list;
struct rhashtable_walker walker;
unsigned int slot;
unsigned int skip;
@@ -339,13 +356,11 @@ static inline int lockdep_rht_bucket_is_held(const struct 
bucket_table *tbl,
 
 int rhashtable_init(struct rhashtable *ht,
const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+ const struct rhashtable_params *params);
 
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
-   const void *key,
-   struct rhash_head *obj,
-   struct bucket_table *old_tbl,
-   void **data);
-int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+

[v3 PATCH 2/2] mac80211: Use rhltable instead of rhashtable

2016-09-19 Thread Herbert Xu
mac80211 currently uses rhashtable with insecure_elasticity set
to true.  The latter is because of duplicate objects.  What's
more, mac80211 walks the rhashtable chains by hand which is broken
as rhashtable may contain multiple tables due to resizing or
rehashing.

This patch fixes it by converting it to the newly added rhltable
interface which is designed for use with duplicate objects.

With rhltable a lookup returns a list of objects instead of a
single one.  This is then fed into the existing for_each_sta_info
macro.

This patch also deletes the sta_addr_hash function since rhashtable
defaults to jhash.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
---

 net/mac80211/ieee80211_i.h |2 -
 net/mac80211/rx.c  |7 +-
 net/mac80211/sta_info.c|   52 ++---
 net/mac80211/sta_info.h|   19 ++--
 net/mac80211/status.c  |7 +-
 5 files changed, 33 insertions(+), 54 deletions(-)

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f56d342..1a52cd4 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1208,7 +1208,7 @@ struct ieee80211_local {
spinlock_t tim_lock;
unsigned long num_sta;
struct list_head sta_list;
-   struct rhashtable sta_hash;
+   struct rhltable sta_hash;
struct timer_list sta_cleanup;
int sta_generation;
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 9dce3b1..5e26dc6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3940,7 +3940,7 @@ static void __ieee80211_rx_handle_packet(struct 
ieee80211_hw *hw,
__le16 fc;
struct ieee80211_rx_data rx;
struct ieee80211_sub_if_data *prev;
-   struct rhash_head *tmp;
+   struct rhlist_head *tmp;
int err = 0;
 
fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
@@ -3983,13 +3983,10 @@ static void __ieee80211_rx_handle_packet(struct 
ieee80211_hw *hw,
goto out;
} else if (ieee80211_is_data(fc)) {
struct sta_info *sta, *prev_sta;
-   const struct bucket_table *tbl;
 
prev_sta = NULL;
 
-   tbl = rht_dereference_rcu(local->sta_hash.tbl, 
>sta_hash);
-
-   for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) {
+   for_each_sta_info(local, hdr->addr2, sta, tmp) {
if (!prev_sta) {
prev_sta = sta;
continue;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 19f14c9..198d0bd 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -67,12 +67,10 @@
 
 static const struct rhashtable_params sta_rht_params = {
.nelem_hint = 3, /* start small */
-   .insecure_elasticity = true, /* Disable chain-length checks. */
.automatic_shrinking = true,
.head_offset = offsetof(struct sta_info, hash_node),
.key_offset = offsetof(struct sta_info, addr),
.key_len = ETH_ALEN,
-   .hashfn = sta_addr_hash,
.max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
 };
 
@@ -80,8 +78,8 @@ static const struct rhashtable_params sta_rht_params = {
 static int sta_info_hash_del(struct ieee80211_local *local,
 struct sta_info *sta)
 {
-   return rhashtable_remove_fast(>sta_hash, >hash_node,
- sta_rht_params);
+   return rhltable_remove(>sta_hash, >hash_node,
+  sta_rht_params);
 }
 
 static void __cleanup_single_sta(struct sta_info *sta)
@@ -157,19 +155,22 @@ static void cleanup_single_sta(struct sta_info *sta)
sta_info_free(local, sta);
 }
 
+struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
+const u8 *addr)
+{
+   return rhltable_lookup(>sta_hash, addr, sta_rht_params);
+}
+
 /* protected by RCU */
 struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
  const u8 *addr)
 {
struct ieee80211_local *local = sdata->local;
+   struct rhlist_head *tmp;
struct sta_info *sta;
-   struct rhash_head *tmp;
-   const struct bucket_table *tbl;
 
rcu_read_lock();
-   tbl = rht_dereference_rcu(local->sta_hash.tbl, >sta_hash);
-
-   for_each_sta_info(local, tbl, addr, sta, tmp) {
+   for_each_sta_info(local, addr, sta, tmp) {
if (sta->sdata == sdata) {
rcu_read_unlock();
/* this is safe as the caller must already hold
@@ -190,14 +191,11 @@ struct sta_info *sta_info_get_bss(struct 
ieee80211_sub_if_data *sdata,
  const u8 *addr)
 {
struct ieee80211_local *local = sdata->local;
+   struct rhlist_head *tmp;
struct sta_info *sta;
-   struct r

[v3 PATCH 0/2] rhashtable: rhashtable with duplicate objects

2016-09-19 Thread Herbert Xu
v3 fixes a bug in the remove path that causes the element count
to decrease when it shouldn't, leading to a gigantic hash table
when it underflows.

v2 contains a reworked insertion slowpath to ensure that the
spinlock for the table we're inserting into is taken.

This series contains two patches.  The first adds the rhlist
interface and the second converts mac80211 to use it.  If this
works out I'll then proceed to convert the other insecure_elasticity
users over to this.

I've tested the rhlist code with test_rhashtable but I haven't
tested the mac80211 conversion.  So please give it a go and see
if it still works.

Thanks!
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [v2 PATCH 0/2] rhashtable: rhashtable with duplicate objects

2016-09-19 Thread Herbert Xu
On Mon, Sep 19, 2016 at 12:10:27PM +0200, Johannes Berg wrote:
> Btw, for debug I put
> 
> BUG_ON(atomic_read(>nelems) < 0);
> 
> after the atomic_dec() in __rhashtable_remove_fast_one(). That makes
> the kernel crash instantly on the buggy code, and I just have to run a
> single test ("wpas_ctrl_interface_add_many") to get there.

Aha I see the problem now.  The nelems logic on remove is broken.
I'll send out a v3.

Thanks!
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [v2 PATCH 0/2] rhashtable: rhashtable with duplicate objects

2016-09-19 Thread Herbert Xu
On Mon, Sep 19, 2016 at 11:27:24AM +0200, Johannes Berg wrote:
> 
> I have a feeling there's a bug with ht->nelems, since the crash is
> always in the grow worker, but I haven't quite put my finger on it yet.

Can you show me a stack trace?

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [v2 PATCH 0/2] rhashtable: rhashtable with duplicate objects

2016-09-19 Thread Herbert Xu
On Mon, Sep 19, 2016 at 11:15:30AM +0200, Johannes Berg wrote:
> On Mon, 2016-09-19 at 16:40 +0800, Herbert Xu wrote:
> 
> > I've tested the rhlist code with test_rhashtable but I haven't
> > tested the mac80211 conversion.  So please give it a go and see
> > if it still works.
> 
> This is still running out of memory on my test suite.
> 
> Somehow I don't see kmemleak kicking in, so I'll have to find the bug
> manually :)

What does your test suite actually do? Is it something that I
can run without special hardware?

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[v2 PATCH 1/2] rhashtable: Add rhlist interface

2016-09-19 Thread Herbert Xu
The insecure_elasticity setting is an ugly wart brought out by
users who need to insert duplicate objects (that is, distinct
objects with identical keys) into the same table.

In fact, those users have a much bigger problem.  Once those
duplicate objects are inserted, they don't have an interface to
find them (unless you count the walker interface which walks
over the entire table).

Some users have resorted to doing a manual walk over the hash
table which is of course broken because they don't handle the
potential existence of multiple hash tables.  The result is that
they will break sporadically when they encounter a hash table
resize/rehash.

This patch provides a way out for those users, at the expense
of an extra pointer per object.  Essentially each object is now
a list of objects carrying the same key.  The hash table will
only see the lists so nothing changes as far as rhashtable is
concerned.

To use this new interface, you need to insert a struct rhlist_head
into your objects instead of struct rhash_head.  While the hash
table is unchanged, for type-safety you'll need to use struct
rhltable instead of struct rhashtable.  All the existing interfaces
have been duplicated for rhlist, including the hash table walker.

One missing feature is nulls marking because AFAIK the only potential
user of it does not need duplicate objects.  Should anyone need
this it shouldn't be too hard to add.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
---

 include/linux/rhashtable.h |  490 ++---
 lib/rhashtable.c   |  258 ++-
 2 files changed, 582 insertions(+), 166 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index fd82584..dc7bea6 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1,7 +1,7 @@
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
- * Copyright (c) 2015 Herbert Xu <herb...@gondor.apana.org.au>
+ * Copyright (c) 2015-2016 Herbert Xu <herb...@gondor.apana.org.au>
  * Copyright (c) 2014-2015 Thomas Graf <tg...@suug.ch>
  * Copyright (c) 2008-2014 Patrick McHardy <ka...@trash.net>
  *
@@ -53,6 +53,11 @@ struct rhash_head {
struct rhash_head __rcu *next;
 };
 
+struct rhlist_head {
+   struct rhash_head   rhead;
+   struct rhlist_head __rcu*next;
+};
+
 /**
  * struct bucket_table - Table of hash buckets
  * @size: Number of hash buckets
@@ -137,6 +142,7 @@ struct rhashtable_params {
  * @key_len: Key length for hashfn
  * @elasticity: Maximum chain length before rehash
  * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
  * @run_work: Deferred worker to expand/shrink asynchronously
  * @mutex: Mutex to protect current/future table swapping
  * @lock: Spin lock to protect walker list
@@ -147,12 +153,21 @@ struct rhashtable {
unsigned intkey_len;
unsigned intelasticity;
struct rhashtable_paramsp;
+   boolrhlist;
struct work_struct  run_work;
struct mutexmutex;
spinlock_t  lock;
 };
 
 /**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+   struct rhashtable ht;
+};
+
+/**
  * struct rhashtable_walker - Hash table walker
  * @list: List entry on list of walkers
  * @tbl: The table that we were walking over
@@ -163,9 +178,10 @@ struct rhashtable_walker {
 };
 
 /**
- * struct rhashtable_iter - Hash table iterator, fits into netlink cb
+ * struct rhashtable_iter - Hash table iterator
  * @ht: Table to iterate through
  * @p: Current pointer
+ * @list: Current hash list pointer
  * @walker: Associated rhashtable walker
  * @slot: Current slot
  * @skip: Number of entries to skip in slot
@@ -173,6 +189,7 @@ struct rhashtable_walker {
 struct rhashtable_iter {
struct rhashtable *ht;
struct rhash_head *p;
+   struct rhlist_head *list;
struct rhashtable_walker walker;
unsigned int slot;
unsigned int skip;
@@ -339,13 +356,11 @@ static inline int lockdep_rht_bucket_is_held(const struct 
bucket_table *tbl,
 
 int rhashtable_init(struct rhashtable *ht,
const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+ const struct rhashtable_params *params);
 
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
-   const void *key,
-   struct rhash_head *obj,
-   struct bucket_table *old_tbl,
-   void **data);
-int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+

[v2 PATCH 0/2] rhashtable: rhashtable with duplicate objects

2016-09-19 Thread Herbert Xu
v2 contains a reworked insertion slowpath to ensure that the
spinlock for the table we're inserting into is taken.

This series contains one two patches.  The first adds the rhlist
interface and the second converts mac80211 to use it.  If this works
out I'll then proceed to convert the other insecure_elasticity
users over to this.

I've tested the rhlist code with test_rhashtable but I haven't
tested the mac80211 conversion.  So please give it a go and see
if it still works.

Thanks!
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[v2 PATCH 2/2] mac80211: Use rhltable instead of rhashtable

2016-09-19 Thread Herbert Xu
mac80211 currently uses rhashtable with insecure_elasticity set
to true.  The latter is because of duplicate objects.  What's
more, mac80211 walks the rhashtable chains by hand which is broken
as rhashtable may contain multiple tables due to resizing or
rehashing.

This patch fixes it by converting it to the newly added rhltable
interface which is designed for use with duplicate objects.

With rhltable a lookup returns a list of objects instead of a
single one.  This is then fed into the existing for_each_sta_info
macro.

This patch also deletes the sta_addr_hash function since rhashtable
defaults to jhash.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
---

 net/mac80211/ieee80211_i.h |2 -
 net/mac80211/rx.c  |7 +-
 net/mac80211/sta_info.c|   52 ++---
 net/mac80211/sta_info.h|   19 ++--
 net/mac80211/status.c  |7 +-
 5 files changed, 33 insertions(+), 54 deletions(-)

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f56d342..1a52cd4 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1208,7 +1208,7 @@ struct ieee80211_local {
spinlock_t tim_lock;
unsigned long num_sta;
struct list_head sta_list;
-   struct rhashtable sta_hash;
+   struct rhltable sta_hash;
struct timer_list sta_cleanup;
int sta_generation;
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 9dce3b1..5e26dc6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3940,7 +3940,7 @@ static void __ieee80211_rx_handle_packet(struct 
ieee80211_hw *hw,
__le16 fc;
struct ieee80211_rx_data rx;
struct ieee80211_sub_if_data *prev;
-   struct rhash_head *tmp;
+   struct rhlist_head *tmp;
int err = 0;
 
fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
@@ -3983,13 +3983,10 @@ static void __ieee80211_rx_handle_packet(struct 
ieee80211_hw *hw,
goto out;
} else if (ieee80211_is_data(fc)) {
struct sta_info *sta, *prev_sta;
-   const struct bucket_table *tbl;
 
prev_sta = NULL;
 
-   tbl = rht_dereference_rcu(local->sta_hash.tbl, 
>sta_hash);
-
-   for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) {
+   for_each_sta_info(local, hdr->addr2, sta, tmp) {
if (!prev_sta) {
prev_sta = sta;
continue;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 19f14c9..198d0bd 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -67,12 +67,10 @@
 
 static const struct rhashtable_params sta_rht_params = {
.nelem_hint = 3, /* start small */
-   .insecure_elasticity = true, /* Disable chain-length checks. */
.automatic_shrinking = true,
.head_offset = offsetof(struct sta_info, hash_node),
.key_offset = offsetof(struct sta_info, addr),
.key_len = ETH_ALEN,
-   .hashfn = sta_addr_hash,
.max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
 };
 
@@ -80,8 +78,8 @@ static const struct rhashtable_params sta_rht_params = {
 static int sta_info_hash_del(struct ieee80211_local *local,
 struct sta_info *sta)
 {
-   return rhashtable_remove_fast(>sta_hash, >hash_node,
- sta_rht_params);
+   return rhltable_remove(>sta_hash, >hash_node,
+  sta_rht_params);
 }
 
 static void __cleanup_single_sta(struct sta_info *sta)
@@ -157,19 +155,22 @@ static void cleanup_single_sta(struct sta_info *sta)
sta_info_free(local, sta);
 }
 
+struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
+const u8 *addr)
+{
+   return rhltable_lookup(>sta_hash, addr, sta_rht_params);
+}
+
 /* protected by RCU */
 struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
  const u8 *addr)
 {
struct ieee80211_local *local = sdata->local;
+   struct rhlist_head *tmp;
struct sta_info *sta;
-   struct rhash_head *tmp;
-   const struct bucket_table *tbl;
 
rcu_read_lock();
-   tbl = rht_dereference_rcu(local->sta_hash.tbl, >sta_hash);
-
-   for_each_sta_info(local, tbl, addr, sta, tmp) {
+   for_each_sta_info(local, addr, sta, tmp) {
if (sta->sdata == sdata) {
rcu_read_unlock();
/* this is safe as the caller must already hold
@@ -190,14 +191,11 @@ struct sta_info *sta_info_get_bss(struct 
ieee80211_sub_if_data *sdata,
  const u8 *addr)
 {
struct ieee80211_local *local = sdata->local;
+   struct rhlist_head *tmp;
struct sta_info *sta;
-   struct r

Re: [PATCH 0/2] rhashtable: rhashtable with duplicate objects

2016-09-19 Thread Herbert Xu
On Mon, Sep 19, 2016 at 10:25:18AM +0200, Johannes Berg wrote:
> 
> > Yes, it's passing all the wpa_supplicant tests, so
> > 
> > Acked-by: Johannes Berg <johan...@sipsolutions.net>
> > 
> 
> I take that back. I think it's leaking memory - my tests never used to
> run out of memory, but now they eventually do.
> 
> I'll try to figure out more.

Interesting.  The kernel test robot found a bug in the insertion
slowpath where we end up inserting without taking the inner spinlock
in case of a nested table.  Not sure whether that's the same issue
as you're seeing but I'll do a v2 posting.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH 2/2] mac80211: Use rhltable instead of rhashtable

2016-09-18 Thread Herbert Xu
mac80211 currently uses rhashtable with insecure_elasticity set
to true.  The latter is because of duplicate objects.  What's
more, mac80211 walks the rhashtable chains by hand which is broken
as rhashtable may contain multiple tables due to resizing or
rehashing.

This patch fixes it by converting it to the newly added rhltable
interface which is designed for use with duplicate objects.

With rhltable a lookup returns a list of objects instead of a
single one.  This is then fed into the existing for_each_sta_info
macro.

This patch also deletes the sta_addr_hash function since rhashtable
defaults to jhash.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
---

 net/mac80211/ieee80211_i.h |2 -
 net/mac80211/rx.c  |7 +-
 net/mac80211/sta_info.c|   52 ++---
 net/mac80211/sta_info.h|   19 ++--
 net/mac80211/status.c  |7 +-
 5 files changed, 33 insertions(+), 54 deletions(-)

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f56d342..1a52cd4 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1208,7 +1208,7 @@ struct ieee80211_local {
spinlock_t tim_lock;
unsigned long num_sta;
struct list_head sta_list;
-   struct rhashtable sta_hash;
+   struct rhltable sta_hash;
struct timer_list sta_cleanup;
int sta_generation;
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 9dce3b1..5e26dc6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3940,7 +3940,7 @@ static void __ieee80211_rx_handle_packet(struct 
ieee80211_hw *hw,
__le16 fc;
struct ieee80211_rx_data rx;
struct ieee80211_sub_if_data *prev;
-   struct rhash_head *tmp;
+   struct rhlist_head *tmp;
int err = 0;
 
fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
@@ -3983,13 +3983,10 @@ static void __ieee80211_rx_handle_packet(struct 
ieee80211_hw *hw,
goto out;
} else if (ieee80211_is_data(fc)) {
struct sta_info *sta, *prev_sta;
-   const struct bucket_table *tbl;
 
prev_sta = NULL;
 
-   tbl = rht_dereference_rcu(local->sta_hash.tbl, 
>sta_hash);
-
-   for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) {
+   for_each_sta_info(local, hdr->addr2, sta, tmp) {
if (!prev_sta) {
prev_sta = sta;
continue;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 19f14c9..198d0bd 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -67,12 +67,10 @@
 
 static const struct rhashtable_params sta_rht_params = {
.nelem_hint = 3, /* start small */
-   .insecure_elasticity = true, /* Disable chain-length checks. */
.automatic_shrinking = true,
.head_offset = offsetof(struct sta_info, hash_node),
.key_offset = offsetof(struct sta_info, addr),
.key_len = ETH_ALEN,
-   .hashfn = sta_addr_hash,
.max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
 };
 
@@ -80,8 +78,8 @@ static const struct rhashtable_params sta_rht_params = {
 static int sta_info_hash_del(struct ieee80211_local *local,
 struct sta_info *sta)
 {
-   return rhashtable_remove_fast(>sta_hash, >hash_node,
- sta_rht_params);
+   return rhltable_remove(>sta_hash, >hash_node,
+  sta_rht_params);
 }
 
 static void __cleanup_single_sta(struct sta_info *sta)
@@ -157,19 +155,22 @@ static void cleanup_single_sta(struct sta_info *sta)
sta_info_free(local, sta);
 }
 
+struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
+const u8 *addr)
+{
+   return rhltable_lookup(>sta_hash, addr, sta_rht_params);
+}
+
 /* protected by RCU */
 struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
  const u8 *addr)
 {
struct ieee80211_local *local = sdata->local;
+   struct rhlist_head *tmp;
struct sta_info *sta;
-   struct rhash_head *tmp;
-   const struct bucket_table *tbl;
 
rcu_read_lock();
-   tbl = rht_dereference_rcu(local->sta_hash.tbl, >sta_hash);
-
-   for_each_sta_info(local, tbl, addr, sta, tmp) {
+   for_each_sta_info(local, addr, sta, tmp) {
if (sta->sdata == sdata) {
rcu_read_unlock();
/* this is safe as the caller must already hold
@@ -190,14 +191,11 @@ struct sta_info *sta_info_get_bss(struct 
ieee80211_sub_if_data *sdata,
  const u8 *addr)
 {
struct ieee80211_local *local = sdata->local;
+   struct rhlist_head *tmp;
struct sta_info *sta;
-   struct r

[PATCH 1/2] rhashtable: Add rhlist interface

2016-09-18 Thread Herbert Xu
The insecure_elasticity setting is an ugly wart brought out by
users who need to insert duplicate objects (that is, distinct
objects with identical keys) into the same table.

In fact, those users have a much bigger problem.  Once those
duplicate objects are inserted, they don't have an interface to
find them (unless you count the walker interface which walks
over the entire table).

Some users have resorted to doing a manual walk over the hash
table which is of course broken because they don't handle the
potential existence of multiple hash tables.  The result is that
they will break sporadically when they encounter a hash table
resize/rehash.

This patch provides a way out for those users, at the expense
of an extra pointer per object.  Essentially each object is now
a list of objects carrying the same key.  The hash table will
only see the lists so nothing changes as far as rhashtable is
concerned.

To use this new interface, you need to insert a struct rhlist_head
into your objects instead of struct rhash_head.  While the hash
table is unchanged, for type-safety you'll need to use struct
rhltable instead of struct rhashtable.  All the existing interfaces
have been duplicated for rhlist, including the hash table walker.

One missing feature is nulls marking because AFAIK the only potential
user of it does not need duplicate objects.  Should anyone need
this it shouldn't be too hard to add.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
---

 include/linux/rhashtable.h |  490 ++---
 lib/rhashtable.c   |  231 -
 2 files changed, 560 insertions(+), 161 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index fd82584..dc7bea6 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1,7 +1,7 @@
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
- * Copyright (c) 2015 Herbert Xu <herb...@gondor.apana.org.au>
+ * Copyright (c) 2015-2016 Herbert Xu <herb...@gondor.apana.org.au>
  * Copyright (c) 2014-2015 Thomas Graf <tg...@suug.ch>
  * Copyright (c) 2008-2014 Patrick McHardy <ka...@trash.net>
  *
@@ -53,6 +53,11 @@ struct rhash_head {
struct rhash_head __rcu *next;
 };
 
+struct rhlist_head {
+   struct rhash_head   rhead;
+   struct rhlist_head __rcu*next;
+};
+
 /**
  * struct bucket_table - Table of hash buckets
  * @size: Number of hash buckets
@@ -137,6 +142,7 @@ struct rhashtable_params {
  * @key_len: Key length for hashfn
  * @elasticity: Maximum chain length before rehash
  * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
  * @run_work: Deferred worker to expand/shrink asynchronously
  * @mutex: Mutex to protect current/future table swapping
  * @lock: Spin lock to protect walker list
@@ -147,12 +153,21 @@ struct rhashtable {
unsigned intkey_len;
unsigned intelasticity;
struct rhashtable_paramsp;
+   boolrhlist;
struct work_struct  run_work;
struct mutexmutex;
spinlock_t  lock;
 };
 
 /**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+   struct rhashtable ht;
+};
+
+/**
  * struct rhashtable_walker - Hash table walker
  * @list: List entry on list of walkers
  * @tbl: The table that we were walking over
@@ -163,9 +178,10 @@ struct rhashtable_walker {
 };
 
 /**
- * struct rhashtable_iter - Hash table iterator, fits into netlink cb
+ * struct rhashtable_iter - Hash table iterator
  * @ht: Table to iterate through
  * @p: Current pointer
+ * @list: Current hash list pointer
  * @walker: Associated rhashtable walker
  * @slot: Current slot
  * @skip: Number of entries to skip in slot
@@ -173,6 +189,7 @@ struct rhashtable_walker {
 struct rhashtable_iter {
struct rhashtable *ht;
struct rhash_head *p;
+   struct rhlist_head *list;
struct rhashtable_walker walker;
unsigned int slot;
unsigned int skip;
@@ -339,13 +356,11 @@ static inline int lockdep_rht_bucket_is_held(const struct 
bucket_table *tbl,
 
 int rhashtable_init(struct rhashtable *ht,
const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+ const struct rhashtable_params *params);
 
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
-   const void *key,
-   struct rhash_head *obj,
-   struct bucket_table *old_tbl,
-   void **data);
-int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+

[PATCH 0/2] rhashtable: rhashtable with duplicate objects

2016-09-18 Thread Herbert Xu
On Fri, Aug 05, 2016 at 12:50:33PM +0200, Johannes Berg wrote:
> > My plan is to build support for this directly into rhashtable.
> > So I'm adding a struct rhlist_head that would be used in place
> > of rhash_head for these cases and it'll carry an extra pointer
> > for the list of identical entries.
> > 
> > I will then add an additional layer of insert/lookup interfaces
> > for rhlist_head.
> 
> Oh, ok.

OK, it's finally ready now.

This series contains one two patches.  The first adds the rhlist
interface and the second converts mac80211 to use it.  If this works
out I'll then proceed to convert the other insecure_elasticity
users over to this.

I've tested the rhlist code with test_rhashtable but I haven't
tested the mac80211 conversion.  So please give it a go and see
if it still works.

Thanks!
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH net-next v2] netlink: don't forget to release a rhashtable_iter structure

2016-09-06 Thread Herbert Xu
On Tue, Sep 06, 2016 at 09:31:17PM -0700, Andrei Vagin wrote:
> This bug was detected by kmemleak:
> unreferenced object 0x8804269cc3c0 (size 64):
>   comm "criu", pid 1042, jiffies 4294907360 (age 13.713s)
>   hex dump (first 32 bytes):
> a0 32 cc 2c 04 88 ff ff 00 00 00 00 00 00 00 00  .2.,
> 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de  
>   backtrace:
> [] kmemleak_alloc+0x4a/0xa0
> [] kmem_cache_alloc_trace+0x10f/0x280
> [] __netlink_diag_dump+0x26c/0x290 [netlink_diag]
> 
> v2: don't remove a reference on a rhashtable_iter structure to
> release it from netlink_diag_dump_done
> 
> Cc: Herbert Xu <herb...@gondor.apana.org.au>
> Fixes: ad202074320c ("netlink: Use rhashtable walk interface in diag dump")
> Signed-off-by: Andrei Vagin <ava...@openvz.org>

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>

Thanks for catching this!
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH net] rhashtable: fix a memory leak in alloc_bucket_locks()

2016-08-26 Thread Herbert Xu
On Fri, Aug 26, 2016 at 08:51:39AM -0700, Eric Dumazet wrote:
> From: Eric Dumazet <eduma...@google.com>
> 
> If vmalloc() was successful, do not attempt a kmalloc_array()
> 
> Fixes: 4cf0b354d92e ("rhashtable: avoid large lock-array allocations")
> Reported-by: CAI Qian <caiq...@redhat.com>
> Signed-off-by: Eric Dumazet <eduma...@google.com>
> Cc: Florian Westphal <f...@strlen.de>

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>

Thanks Eric!
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH -next v2] chcr: Fix non static symbol warning

2016-08-26 Thread Herbert Xu
On Fri, Aug 26, 2016 at 02:21:08PM +, Wei Yongjun wrote:
> From: Wei Yongjun <weiyongj...@huawei.com>
> 
> Fixes the following sparse warning:
> 
> drivers/crypto/chelsio/chcr_algo.c:593:5: warning:
>  symbol 'cxgb4_is_crypto_q_full' was not declared. Should it be static?
> 
> Signed-off-by: Wei Yongjun <weiyongj...@huawei.com>

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH nf-next,v2 1/2] rhashtable: add rhashtable_lookup_get_insert_key()

2016-08-26 Thread Herbert Xu
On Thu, Aug 25, 2016 at 04:41:26PM +0200, Pablo Neira Ayuso wrote:
> This patch modifies __rhashtable_insert_fast() so it returns the
> existing object that clashes with the one that you want to insert.
> In case the object is successfully inserted, NULL is returned.
> Otherwise, you get an error via ERR_PTR().
> 
> This patch adapts the existing callers of __rhashtable_insert_fast()
> so they handle this new logic, and it adds a new
> rhashtable_lookup_get_insert_key() interface to fetch this existing
> object.
> 
> nf_tables needs this change to improve handling of EEXIST cases via
> honoring the NLM_F_EXCL flag and by checking if the data part of the
> mapping matches what we have.
> 
> Cc: Herbert Xu <herb...@gondor.apana.org.au>
> Cc: Thomas Graf <tg...@suug.ch>
> Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [RFC PATCH] net: ip_finish_output_gso: Attempt gso_size clamping if segments exceed mtu

2016-08-26 Thread Herbert Xu
On Thu, Aug 25, 2016 at 12:05:33PM +0300, Shmulik Ladkani wrote:
>
> We have few alternatives for gso_size clamping:
> 
> 1 Fix 'skb_segment' arithmentics to support inputs that do not match
>   the "frag_list members terminate on exact MSS" assumption.
> 
> 2 Perform gso_size clamping in 'ip_finish_output_gso' for non-GROed skbs.
>   Other usecases will still benefit: (a) packets arriving from
>   virtualized interfaces, e.g. tap and friends; (b) packets arriving from
>   veth of other namespaces (packets are locally generated by TCP stack
>   on a different netns).
> 
> 3 Ditch the idea, again ;)
> 
> We can persue (1), especially if there are other benefits doing so.
> OTOH due to the current complexity of 'skb_segment' this is bit risky.
> 
> Going with (2) could be reasonable too, as it brings value for
> the virtualized environmnets that need gso_size clamping, while
> presenting minimal risk.

Please remember the overarching rule for GRO and that is it must
be completely transparent, i.e., the output must be as if it didn't
exist.

So if this is a DF packet and then we must generate ICMP packets
rather than downsizing the packets.  If it's not DF then we must
fragment only within each frag_list skb.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [v2 PATCH 3/3] netlink: Use rhashtable walk interface in diag dump

2016-08-19 Thread Herbert Xu
On Fri, Aug 19, 2016 at 06:32:37PM +0200, Thomas Graf wrote:
> On 08/19/16 at 04:21pm, Herbert Xu wrote:
> > This patch converts the diag dumping code to use the rhashtable
> > walk code instead of going through rhashtable by hand.  The lock
> > nl_table_lock is now only taken while we process the multicast
> > list as it's not needed for the rhashtable walk.
> > 
> > Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
> 
> LGTM. Curious, what did you use to test this? I've been struggling
> to find a good test case.

ss -A netlink

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 1/3] rhashtable: Remove GFP flag from rhashtable_walk_init

2016-08-19 Thread Herbert Xu
On Fri, Aug 19, 2016 at 06:25:04PM +0200, Thomas Graf wrote:
> On 08/18/16 at 04:50pm, Herbert Xu wrote:
> > +/* Obsolete function, do not use in new code. */
> > +static inline int rhashtable_walk_init(struct rhashtable *ht,
> > +  struct rhashtable_iter *iter, gfp_t gfp)
> > +{
> > +   rhashtable_walk_enter(ht, iter);
> > +   return 0;
> > +}
> 
> Converting the 5 users of rhashtable_walk_init() looks very straight
> forward, any reason to keep this around?

It is easy but it's needless churn, especially since we've just
converted all of them the other way.

Feel free to do a patch on top of this to get rid of them.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[v2 PATCH 3/3] netlink: Use rhashtable walk interface in diag dump

2016-08-19 Thread Herbert Xu
This patch converts the diag dumping code to use the rhashtable
walk code instead of going through rhashtable by hand.  The lock
nl_table_lock is now only taken while we process the multicast
list as it's not needed for the rhashtable walk.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
---

 net/netlink/diag.c |  103 +
 1 file changed, 73 insertions(+), 30 deletions(-)

diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 8dd836a..3e3e253 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -63,43 +63,75 @@ out_nlmsg_trim:
 static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback 
*cb,
int protocol, int s_num)
 {
+   struct rhashtable_iter *hti = (void *)cb->args[2];
struct netlink_table *tbl = _table[protocol];
-   struct rhashtable *ht = >hash;
-   const struct bucket_table *htbl = rht_dereference_rcu(ht->tbl, ht);
struct net *net = sock_net(skb->sk);
struct netlink_diag_req *req;
struct netlink_sock *nlsk;
struct sock *sk;
-   int ret = 0, num = 0, i;
+   int num = 2;
+   int ret = 0;
 
req = nlmsg_data(cb->nlh);
 
-   for (i = 0; i < htbl->size; i++) {
-   struct rhash_head *pos;
+   if (s_num > 1)
+   goto mc_list;
 
-   rht_for_each_entry_rcu(nlsk, pos, htbl, i, node) {
-   sk = (struct sock *)nlsk;
+   num--;
 
-   if (!net_eq(sock_net(sk), net))
-   continue;
-   if (num < s_num) {
-   num++;
+   if (!hti) {
+   hti = kmalloc(sizeof(*hti), GFP_KERNEL);
+   if (!hti)
+   return -ENOMEM;
+
+   cb->args[2] = (long)hti;
+   }
+
+   if (!s_num)
+   rhashtable_walk_enter(>hash, hti);
+
+   ret = rhashtable_walk_start(hti);
+   if (ret == -EAGAIN)
+   ret = 0;
+   if (ret)
+   goto stop;
+
+   while ((nlsk = rhashtable_walk_next(hti))) {
+   if (IS_ERR(nlsk)) {
+   ret = PTR_ERR(nlsk);
+   if (ret == -EAGAIN) {
+   ret = 0;
continue;
}
+   break;
+   }
 
-   if (sk_diag_fill(sk, skb, req,
-NETLINK_CB(cb->skb).portid,
-cb->nlh->nlmsg_seq,
-NLM_F_MULTI,
-sock_i_ino(sk)) < 0) {
-   ret = 1;
-   goto done;
-   }
+   sk = (struct sock *)nlsk;
 
-   num++;
+   if (!net_eq(sock_net(sk), net))
+   continue;
+
+   if (sk_diag_fill(sk, skb, req,
+NETLINK_CB(cb->skb).portid,
+cb->nlh->nlmsg_seq,
+NLM_F_MULTI,
+sock_i_ino(sk)) < 0) {
+   ret = 1;
+   break;
}
}
 
+stop:
+   rhashtable_walk_stop(hti);
+   if (ret)
+   goto done;
+
+   rhashtable_walk_exit(hti);
+   cb->args[2] = 0;
+   num++;
+
+mc_list:
+   read_lock(_table_lock);
sk_for_each_bound(sk, >mc_list) {
if (sk_hashed(sk))
continue;
@@ -116,13 +148,14 @@ static int __netlink_diag_dump(struct sk_buff *skb, 
struct netlink_callback *cb,
 NLM_F_MULTI,
 sock_i_ino(sk)) < 0) {
ret = 1;
-   goto done;
+   break;
}
num++;
}
+   read_unlock(_table_lock);
+
 done:
cb->args[0] = num;
-   cb->args[1] = protocol;
 
return ret;
 }
@@ -131,20 +164,20 @@ static int netlink_diag_dump(struct sk_buff *skb, struct 
netlink_callback *cb)
 {
struct netlink_diag_req *req;
int s_num = cb->args[0];
+   int err = 0;
 
req = nlmsg_data(cb->nlh);
 
-   rcu_read_lock();
-   read_lock(_table_lock);
-
if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
int i;
 
for (i = cb->args[1]; i < MAX_LINKS; i++) {
-   if (__netlink_diag_dump(skb, cb, i, s_num))
+   err = __netlink_diag_dump(skb, cb, i, s_num);
+   if (err)
break;
s_num = 0;
}
+

Re: [PATCH 3/3] netlink: Use rhashtable walk interface in diag dump

2016-08-19 Thread Herbert Xu
On Thu, Aug 18, 2016 at 04:50:58PM +0800, Herbert Xu wrote:
> This patch converts the diag dumping code to use the rhashtable
> walk code instead of going through rhashtable by hand.  The lock
> nl_table_lock is now only taken while we process the multicast
> list as it's not needed for the rhashtable walk.
> 
> Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

Please drop this patch.  While the current dump is already buggy
my new version seems worse and returns a lot less entries.  I'll
post a new version of this patch when I figure out why.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCHv3 net-next 3/4] chcr: Support for Chelsio's Crypto Hardware

2016-08-19 Thread Herbert Xu
On Wed, Aug 17, 2016 at 12:33:05PM +0530, Hariprasad Shenai wrote:
> The Chelsio's Crypto Hardware can perform the following operations:
> SHA1, SHA224, SHA256, SHA384 and SHA512, HMAC(SHA1), HMAC(SHA224),
> HMAC(SHA256), HMAC(SHA384), HAMC(SHA512), AES-128-CBC, AES-192-CBC,
> AES-256-CBC, AES-128-XTS, AES-256-XTS
> 
> This patch implements the driver for above mentioned features. This
> driver is an Upper Layer Driver which is attached to Chelsio's LLD
> (cxgb4) and uses the queue allocated by the LLD for sending the crypto
> requests to the Hardware and receiving the responses from it.
> 
> The crypto operations can be performed by Chelsio's hardware from the
> userspace applications and/or from within the kernel space using the
> kernel's crypto API.
> 
> The above mentioned crypto features have been tested using kernel's
> tests mentioned in testmgr.h. They also have been tested from user
> space using libkcapi and Openssl.
> 
> Signed-off-by: Atul Gupta <atul.gu...@chelsio.com>
> Signed-off-by: Hariprasad Shenai <haripra...@chelsio.com>

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCHv3 net-next 0/4] crypto/chcr: Add support for Chelsio Crypto Driver

2016-08-19 Thread Herbert Xu
On Thu, Aug 18, 2016 at 11:11:01PM -0700, David Miller wrote:
> From: Hariprasad Shenai <haripra...@chelsio.com>
> Date: Wed, 17 Aug 2016 12:33:02 +0530
> 
> > This patch series adds support for Chelsio Crypto driver. 
> 
> Herbert, what do you want to do with this?  I can push it via
> net-next if you like.

Sure thing, the crypto part looks good to me.  Thanks Dave!
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH 3/3] netlink: Use rhashtable walk interface in diag dump

2016-08-18 Thread Herbert Xu
This patch converts the diag dumping code to use the rhashtable
walk code instead of going through rhashtable by hand.  The lock
nl_table_lock is now only taken while we process the multicast
list as it's not needed for the rhashtable walk.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
---

 net/netlink/diag.c |  105 +
 1 file changed, 75 insertions(+), 30 deletions(-)

diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 8dd836a..d4e1eca 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -63,43 +63,77 @@ out_nlmsg_trim:
 static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback 
*cb,
int protocol, int s_num)
 {
+   struct rhashtable_iter *hti = (void *)cb->args[2];
struct netlink_table *tbl = _table[protocol];
-   struct rhashtable *ht = >hash;
-   const struct bucket_table *htbl = rht_dereference_rcu(ht->tbl, ht);
struct net *net = sock_net(skb->sk);
struct netlink_diag_req *req;
struct netlink_sock *nlsk;
+   struct rhash_head *obj;
struct sock *sk;
-   int ret = 0, num = 0, i;
+   int num = 2;
+   int ret = 0;
 
req = nlmsg_data(cb->nlh);
 
-   for (i = 0; i < htbl->size; i++) {
-   struct rhash_head *pos;
+   if (s_num > 1)
+   goto mc_list;
 
-   rht_for_each_entry_rcu(nlsk, pos, htbl, i, node) {
-   sk = (struct sock *)nlsk;
+   if (!hti) {
+   hti = kmalloc(sizeof(*hti), GFP_KERNEL);
+   if (!hti)
+   return -ENOMEM;
 
-   if (!net_eq(sock_net(sk), net))
-   continue;
-   if (num < s_num) {
-   num++;
+   cb->args[2] = (long)hti;
+   }
+
+   if (!s_num) {
+   rhashtable_walk_enter(>hash, hti);
+   num--;
+   }
+
+   ret = rhashtable_walk_start(hti);
+   if (ret == -EAGAIN)
+   ret = 0;
+   if (ret)
+   goto stop;
+
+   while ((obj = rhashtable_walk_next(hti))) {
+   if (IS_ERR(obj)) {
+   ret = PTR_ERR(obj);
+   if (ret == -EAGAIN) {
+   ret = 0;
continue;
}
+   break;
+   }
 
-   if (sk_diag_fill(sk, skb, req,
-NETLINK_CB(cb->skb).portid,
-cb->nlh->nlmsg_seq,
-NLM_F_MULTI,
-sock_i_ino(sk)) < 0) {
-   ret = 1;
-   goto done;
-   }
+   nlsk = container_of(obj, struct netlink_sock, node);
+   sk = (struct sock *)nlsk;
 
-   num++;
+   if (!net_eq(sock_net(sk), net))
+   continue;
+
+   if (sk_diag_fill(sk, skb, req,
+NETLINK_CB(cb->skb).portid,
+cb->nlh->nlmsg_seq,
+NLM_F_MULTI,
+sock_i_ino(sk)) < 0) {
+   ret = 1;
+   break;
}
}
 
+stop:
+   rhashtable_walk_stop(hti);
+   if (ret)
+   goto done;
+
+   rhashtable_walk_exit(hti);
+   cb->args[2] = 0;
+   num++;
+
+mc_list:
+   read_lock(_table_lock);
sk_for_each_bound(sk, >mc_list) {
if (sk_hashed(sk))
continue;
@@ -116,13 +150,14 @@ static int __netlink_diag_dump(struct sk_buff *skb, 
struct netlink_callback *cb,
 NLM_F_MULTI,
 sock_i_ino(sk)) < 0) {
ret = 1;
-   goto done;
+   break;
}
num++;
}
+   read_unlock(_table_lock);
+
 done:
cb->args[0] = num;
-   cb->args[1] = protocol;
 
return ret;
 }
@@ -131,20 +166,20 @@ static int netlink_diag_dump(struct sk_buff *skb, struct 
netlink_callback *cb)
 {
struct netlink_diag_req *req;
int s_num = cb->args[0];
+   int err = 0;
 
req = nlmsg_data(cb->nlh);
 
-   rcu_read_lock();
-   read_lock(_table_lock);
-
if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
int i;
 
for (i = cb->args[1]; i < MAX_LINKS; i++) {
-   if (__netlink_diag_dump(skb, cb, i, s_num))
+   err = __netlink_diag_dump(skb, cb, i, s_num);
+   if (err)
 

[PATCH 2/3] MAINTAINERS: Add extra rhashtable maintainer

2016-08-18 Thread Herbert Xu
As I'm working actively on rhashtable it helps if people CCed me
when they work on in.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
---

 MAINTAINERS |1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 5d813a3..db09498 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9641,6 +9641,7 @@ F:net/rfkill/
 
 RHASHTABLE
 M: Thomas Graf <tg...@suug.ch>
+M:     Herbert Xu <herb...@gondor.apana.org.au>
 L: netdev@vger.kernel.org
 S: Maintained
 F: lib/rhashtable.c


[PATCH 1/3] rhashtable: Remove GFP flag from rhashtable_walk_init

2016-08-18 Thread Herbert Xu
The commit 8f6fd83c6c5ec66a4a70c728535ddcdfef4f3697 ("rhashtable:
accept GFP flags in rhashtable_walk_init") added a GFP flag argument
to rhashtable_walk_init because some users wish to use the walker
in an unsleepable context.

In fact we don't need to allocate memory in rhashtable_walk_init
at all.  The walker is always paired with an iterator so we could
just stash ourselves there.

This patch does that by introducing a new enter function to replace
the existing init function.  This way we don't have to churn all
the existing users again.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>
---

 include/linux/rhashtable.h |   14 ++---
 lib/rhashtable.c   |   46 +
 2 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 3eef080..8b72ee7 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -173,7 +173,7 @@ struct rhashtable_walker {
 struct rhashtable_iter {
struct rhashtable *ht;
struct rhash_head *p;
-   struct rhashtable_walker *walker;
+   struct rhashtable_walker walker;
unsigned int slot;
unsigned int skip;
 };
@@ -346,8 +346,8 @@ struct bucket_table *rhashtable_insert_slow(struct 
rhashtable *ht,
struct bucket_table *old_tbl);
 int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
 
-int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
-gfp_t gfp);
+void rhashtable_walk_enter(struct rhashtable *ht,
+  struct rhashtable_iter *iter);
 void rhashtable_walk_exit(struct rhashtable_iter *iter);
 int rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU);
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
@@ -906,4 +906,12 @@ static inline int rhashtable_replace_fast(
return err;
 }
 
+/* Obsolete function, do not use in new code. */
+static inline int rhashtable_walk_init(struct rhashtable *ht,
+  struct rhashtable_iter *iter, gfp_t gfp)
+{
+   rhashtable_walk_enter(ht, iter);
+   return 0;
+}
+
 #endif /* _LINUX_RHASHTABLE_H */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 5d845ff..5b0a2b3 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -484,10 +484,9 @@ exit:
 EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
 
 /**
- * rhashtable_walk_init - Initialise an iterator
+ * rhashtable_walk_enter - Initialise an iterator
  * @ht:Table to walk over
  * @iter:  Hash table Iterator
- * @gfp:   GFP flags for allocations
  *
  * This function prepares a hash table walk.
  *
@@ -502,30 +501,22 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * This function may sleep so you must not call it from interrupt
  * context or with spin locks held.
  *
- * You must call rhashtable_walk_exit if this function returns
- * successfully.
+ * You must call rhashtable_walk_exit after this function returns.
  */
-int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
-gfp_t gfp)
+void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
 {
iter->ht = ht;
iter->p = NULL;
iter->slot = 0;
iter->skip = 0;
 
-   iter->walker = kmalloc(sizeof(*iter->walker), gfp);
-   if (!iter->walker)
-   return -ENOMEM;
-
spin_lock(>lock);
-   iter->walker->tbl =
+   iter->walker.tbl =
rcu_dereference_protected(ht->tbl, lockdep_is_held(>lock));
-   list_add(>walker->list, >walker->tbl->walkers);
+   list_add(>walker.list, >walker.tbl->walkers);
spin_unlock(>lock);
-
-   return 0;
 }
-EXPORT_SYMBOL_GPL(rhashtable_walk_init);
+EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
 
 /**
  * rhashtable_walk_exit - Free an iterator
@@ -536,10 +527,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init);
 void rhashtable_walk_exit(struct rhashtable_iter *iter)
 {
spin_lock(>ht->lock);
-   if (iter->walker->tbl)
-   list_del(>walker->list);
+   if (iter->walker.tbl)
+   list_del(>walker.list);
spin_unlock(>ht->lock);
-   kfree(iter->walker);
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
 
@@ -565,12 +555,12 @@ int rhashtable_walk_start(struct rhashtable_iter *iter)
rcu_read_lock();
 
spin_lock(>lock);
-   if (iter->walker->tbl)
-   list_del(>walker->list);
+   if (iter->walker.tbl)
+   list_del(>walker.list);
spin_unlock(>lock);
 
-   if (!iter->walker->tbl) {
-   iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht);
+   if (!iter->walker.tbl) {
+   iter->wal

[PATCH 0/3] rhashtable: Get rid of raw table walkers part 1

2016-08-18 Thread Herbert Xu
Hi:

This series starts the process of getting rid of all raw rhashtable
walkers (e.g., using any of the rht_for_each helpers) from the
kernel.

We need to do this before I can fix the resize kmalloc failure issue
by using multi-layered tables.

We should do this anyway because almost all raw table walkers are
already buggy in that they don't handle multiple rhashtables during
a resize.

Dave/Tomas, please keep an eye out for any new patches that try
to introduce raw table walkers and nack them.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] rhashtable: fix shift by 64 when shrinking

2016-08-15 Thread Herbert Xu
On Fri, Aug 12, 2016 at 08:10:44PM +0200, Vegard Nossum wrote:
> I got this:
> 
> 
> 
> UBSAN: Undefined behaviour in ./include/linux/log2.h:63:13
> shift exponent 64 is too large for 64-bit type 'long unsigned int'
> CPU: 1 PID: 721 Comm: kworker/1:1 Not tainted 4.8.0-rc1+ #87
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014
> Workqueue: events rht_deferred_worker
>   88011661f8d8 82344f50 41b58ab3
>  84f98000 82344ea4 88011661f900 88011661f8b0
>  0001 88011661f6b8 dc00 867f7640
> Call Trace:
>  [] dump_stack+0xac/0xfc
>  [] ? _atomic_dec_and_lock+0xc4/0xc4
>  [] ubsan_epilogue+0xd/0x8a
>  [] __ubsan_handle_shift_out_of_bounds+0x255/0x29a
>  [] ? __ubsan_handle_out_of_bounds+0x180/0x180
>  [] ? nl80211_req_set_reg+0x256/0x2f0
>  [] ? print_context_stack+0x8a/0x160
>  [] ? amd_pmu_reset+0x341/0x380
>  [] rht_deferred_worker+0x1618/0x1790
>  [] ? rht_deferred_worker+0x1618/0x1790
>  [] ? rhashtable_jhash2+0x370/0x370
>  [] ? process_one_work+0x6fd/0x1970
>  [] process_one_work+0x79f/0x1970
>  [] ? process_one_work+0x6fd/0x1970
>  [] ? try_to_grab_pending+0x4c0/0x4c0
>  [] ? worker_thread+0x1c4/0x1340
>  [] worker_thread+0x55f/0x1340
>  [] ? __schedule+0x4df/0x1d40
>  [] ? process_one_work+0x1970/0x1970
>  [] ? process_one_work+0x1970/0x1970
>  [] kthread+0x237/0x390
>  [] ? __kthread_parkme+0x280/0x280
>  [] ? _raw_spin_unlock_irq+0x33/0x50
>  [] ret_from_fork+0x1f/0x40
>  [] ? __kthread_parkme+0x280/0x280
> 
> 
> 
> roundup_pow_of_two() is undefined when called with an argument of 0, so
> let's avoid the call and just fall back to ht->p.min_size (which should
> never be smaller than HASH_MIN_SIZE).
> 
> Cc: Herbert Xu <herb...@gondor.apana.org.au>
> Signed-off-by: Vegard Nossum <vegard.nos...@oracle.com>

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Buggy rhashtable walking

2016-08-08 Thread Herbert Xu
On Fri, Aug 05, 2016 at 04:46:43AM -0700, Ben Greear wrote:
>
> It would not be fun to have to revert to the old way of hashing
> stations in mac80211...
> 
> I'll be happy to test the patches when you have them ready.

Thanks for the offer.  Unfortunately it'll be a few days before
I'm ready because I need to work through some crypto patches first.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Buggy rhashtable walking

2016-08-05 Thread Herbert Xu
On Fri, Aug 05, 2016 at 08:16:53AM +0200, Johannes Berg wrote:
> 
> Hm. Would you rather allocate a separate head entry for the hashtable,
> or chain the entries?

My plan is to build support for this directly into rhashtable.
So I'm adding a struct rhlist_head that would be used in place
of rhash_head for these cases and it'll carry an extra pointer
for the list of identical entries.

I will then add an additional layer of insert/lookup interfaces
for rhlist_head.

So bottom-line is that if you have no identical entries that you
only incur an extra 8 bytes per-object.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Buggy rhashtable walking

2016-08-04 Thread Herbert Xu
Hi:

While working on rhashtable I noticed that wireless is walking
rhashtables by hand using rht_for_each_*.  You must not do that
as an rhashtable can entail multiple hash tables when resizing.
If you walk it by hand then you may end up missing entries.

The correct way to do it is to use the rhashtable walk interface.
However, even this comes with the caveat that a given entry may
show up multiple times.  So if you cannot handle that then you
must construct your own data structure outside of rhashtable, like
we do in IPsec.

So the question is can wireless handle seeing an entry multiple
times? In particular, __ieee80211_rx_handle_packet would appear
to process the same packet multiple times if this were to happen.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Buggy rhashtable walking

2016-08-04 Thread Herbert Xu
On Thu, Aug 04, 2016 at 03:18:46PM +0800, Herbert Xu wrote:
> 
> So the question is can wireless handle seeing an entry multiple
> times? In particular, __ieee80211_rx_handle_packet would appear
> to process the same packet multiple times if this were to happen.

It's worse than I thought.  In fact it's not walking the table
at all, rather it's doing a hash lookup by hand!

This cannot possibly work given that rhashtable makes use of
multiple hash tables.

In fact this also demonstrates why putting multiple identical
objects into the same table is crap.  Because there is no sane
way of returning all objects corresponding to a single key, given
that they may be spread over multiple tables.

So I'm going to fix this by consolidating identical objects into
a single rhashtable entry which also lets us get rid of the
insecure_elasticity setting.

So the next time someone comes along and wants to add multiple
objects with the same key to one table, please just say no.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH ipsec] xfrm: Ignore socket policies when rebuilding hash tables

2016-07-29 Thread Herbert Xu
On Fri, Jul 29, 2016 at 09:57:32AM +0200, Tobias Brunner wrote:
> Whenever thresholds are changed the hash tables are rebuilt.  This is
> done by enumerating all policies and hashing and inserting them into
> the right table according to the thresholds and direction.
> 
> Because socket policies are also contained in net->xfrm.policy_all but
> no hash tables are defined for their direction (dir + XFRM_POLICY_MAX)
> this causes a NULL or invalid pointer dereference after returning from
> policy_hash_bysel() if the rebuild is done while any socket policies
> are installed.
> 
> Since the rebuild after changing thresholds is scheduled this crash
> could even occur if the userland sets thresholds seemingly before
> installing any socket policies.
> 
> Fixes: 53c2e285f970 ("xfrm: Do not hash socket policies")
> Signed-off-by: Tobias Brunner <tob...@strongswan.org>

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>

Good catch, thanks!
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] xfrm: use printk instead of WARN for bad policy reporting

2016-07-27 Thread Herbert Xu
On Wed, Jul 27, 2016 at 08:20:57AM +0200, Vegard Nossum wrote:
>
> Here's another patch to remove that too.
> 
> I don't actually *use* this code myself and I feel the justification
> I've given for removing the WARN to be a bit weak, so if you don't take
> the patch I'll just keep it in my local tree to keep it from showing up
> again during fuzzing.

Please just kill the whole else clause.  For soft policy expires
we simply need to relay a message to the KM and nothing more.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] xfrm: use printk instead of WARN for bad policy reporting

2016-07-26 Thread Herbert Xu
On Wed, Jul 20, 2016 at 01:53:12PM +0200, Vegard Nossum wrote:
> On 07/20/2016 10:32 AM, Vegard Nossum wrote:
> >AFAICT this message is just printed whenever input validation fails.
> >This is a normal failure and we shouldn't be dumping the stack over it.
> >
> >Looks like it was originally a printk that was maybe incorrectly
> >upgraded to a WARN:
> >
> >commit 62db5cfd70b1ef53aa21f144a806fe3b78c84fab
> >Author: stephen hemminger <shemmin...@vyatta.com>
> >Date:   Wed May 12 06:37:06 2010 +
> >
> > xfrm: add severity to printk
> 
> Just FYI I'm also running into the
> 
> // reset the timers here?
> WARN(1, "Don't know what to do with soft policy expire\n");
> 
> in xfrm_add_pol_expire() from the same commit, but that looks
> potentially somewhat more serious (or at least it looks like we might
> want to do some sort of cleaning up), so I won't touch it for now.

It certainly shouldn't be a WARN, it probably shouldn't print
anything either.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 3/3] crypto: Added Chelsio Menu to the Kconfig file

2016-07-12 Thread Herbert Xu
On Tue, Jul 12, 2016 at 03:30:41AM +0800, kbuild test robot wrote:
> Hi,
> 
> [auto build test WARNING on net-next/master]
> [also build test WARNING on v4.7-rc7 next-20160711]
> [if your patch is applied to the wrong git tree, please drop us a note to 
> help improve the system]

Yeshaswi, please fix these warnings/errors even though they're
compile-only.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 2/3] chcr: Support for Chelsio's Crypto Hardware

2016-07-12 Thread Herbert Xu
On Mon, Jul 11, 2016 at 11:28:07AM -0700, Yeshaswi M R Gowda wrote:
>
> + u_ctx = ULD_CTX(ctx);
> + if (cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0], ctx->tx_channel_id))
> + return -EBUSY;

You cannot just return -EBUSY.  If the request has the MAY_BACKLOG
bit set, it must be queued regardless, but you should return -EBUSY
in order to throttle the user and then call the completion function
with -EINPROGRESS once the queue can accept more requests from the
user.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] xfrm: fix crash in XFRM_MSG_GETSA netlink handler

2016-07-05 Thread Herbert Xu
On Tue, Jul 05, 2016 at 12:13:03PM -0700, David Miller wrote:
> From: Vegard Nossum <vegard.nos...@oracle.com>
> Date: Tue,  5 Jul 2016 10:18:08 +0200
> 
> > If we hit any of the error conditions inside xfrm_dump_sa(), then
> > xfrm_state_walk_init() never gets called. However, we still call
> > xfrm_state_walk_done() from xfrm_dump_sa_done(), which will crash
> > because the state walk was never initialized properly.
> > 
> > We can fix this by setting cb->args[0] only after we've processed the
> > first element and checking this before calling xfrm_state_walk_done().
> > 
> > Fixes: d3623099d3 ("ipsec: add support of limited SA dump")
> > Cc: Nicolas Dichtel <nicolas.dich...@6wind.com>
> > Cc: Steffen Klassert <steffen.klass...@secunet.com>
> > Signed-off-by: Vegard Nossum <vegard.nos...@oracle.com>
> 
> I assume Steffen will pick this up.

I think Steffen said that he is going to be on vacation for two
weeks starting this week.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v2 net-next] tcp: md5: use kmalloc() backed scratch areas

2016-06-29 Thread Herbert Xu
On Wed, Jun 29, 2016 at 03:39:37PM -0700, Andy Lutomirski wrote:
>
> I don't care about TCP MD5 performance at all.  Ease of maintenance is
> nice, though, and maybe there are other places in the kernel where
> performance does matter.

TCP MD5 is using ahash because it touches SG lists.  Touching
SG lists is a pretty reliable indicator for hashing a large amount
of data.  In fact TCP MD5 hashes the entire packet content so we're
talking about ~1000 bytes, just like IPsec.

Therefore it is completely pointless to use something like shash
for it as shash is only meant to be used for those hashing less
one block of data or less.

If you're aware of any other user in the kernel that is using
ahash and is hashing a small amount of data in aggregate (not
per update) please let me know.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v2 net-next] tcp: md5: use kmalloc() backed scratch areas

2016-06-29 Thread Herbert Xu
On Wed, Jun 29, 2016 at 08:26:43AM -0700, Andy Lutomirski wrote:
> 
> Two reasons:
> 
> 1. Code like tcp md5 would be simpler if it could pass a scatterlist
> to hash the skb but use a virtual address for the header.

True.  But I bet we can make it simpler in other ways without
creating special interfaces for it.  Look at how we do IPsec
encryption/hashing, this is what TCP md5 should look like.  But
nobody in their right mind would bother doing this because this
is dead code.

> 2. The actual calling sequence and the amount of indirection is much
> less for shash, so hashing short buffer is probably *much* faster.

Really?

Have you measured the speed difference between the ahash and shash
interfaces? Are you sure that this would matter when compared
against the speed of hashing a single MD5 block?

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v2 net-next] tcp: md5: use kmalloc() backed scratch areas

2016-06-29 Thread Herbert Xu
On Wed, Jun 29, 2016 at 07:59:22AM -0700, Andy Lutomirski wrote:
>> I suspect that, if you compare a synchronous implementation that can
> use virtual addresses to a DMA based implementation that can't, you'll
> find that, for short messages like tcp md5 uses, the synchronous
> implementation would win every time.  I'm wondering if shash should
> gain the ability to use scatterlists and, if it doesn't already have
> it, the ability to use synchronous hardware implementations (like
> SHA-NI, for example, although I don't think that's useful for MD5).

I don't understand, if you add SGs to shash you get ahash.  So
why wouldn't you just use ahash?

AFAICS tcp md5 already uses ahash in sync mode so there is nothing
asynchronous here at all.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v2 net-next] tcp: md5: use kmalloc() backed scratch areas

2016-06-28 Thread Herbert Xu
On Tue, Jun 28, 2016 at 10:35:31AM -0700, Andy Lutomirski wrote:
>
> Do you mean this code:

Yes.
 
> I'm wondering why support for scatterlists is all-or-nothing.  Why
> can't we initialize a hash object and then alternate between passing
> it scatterlists and pointers?

Because once you have started hashing the hash state is not stored
in a consistent format.  Our software code may maintain one format
while a hardware implementation could do something else altogether.
So you have to stick with one implementation throughout a particular
hashing session.

> I'm guessing that ahash enables async operation and shash is
> synchronous only.  If I'm right, I understand why ahash requires a
> scatterlist.  What I don't understand is why shash can't also accept a
> scatterlist.  It appears that most of the ahash users in the tree
> actually want synchronous crypto and are presumably using ahash for
> some other reason such as ahash's ability to hash via scatterlist (in
> this case, struct page *).

ahash is meant to be the interface everyone uses regardless of
whether they want sync-only or async.  shash should only be used
for small amounts of hashing on virtual addresses.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Doing crypto in small stack buffers (bluetooth vs vmalloc-stack crash, etc)

2016-06-28 Thread Herbert Xu
On Tue, Jun 28, 2016 at 10:32:12AM -0400, George Spelvin wrote:
> 
> - struct crypto_instance
> - struct crypto_spawn
> - struct crypto_blkcipher
> - struct blkcipher_desc
> - More on the context structures returned by crypto_tfm_ctx

blkcipher is obsolete and will be removed soon.  So if you are
going to write this then please document skcipher instead.

> Also not mentioned in the documentation is that some algorithms *do*
> have different implementations depending on key size.  SHA-2 is the
> classic example.

What do you mean by that? SHA has no keying at all.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Doing crypto in small stack buffers (bluetooth vs vmalloc-stack crash, etc)

2016-06-28 Thread Herbert Xu
On Tue, Jun 28, 2016 at 09:23:01AM -0400, George Spelvin wrote:
> 
> Wow, I should see how you do that.  I couldn't get it below 3
> blocks of temporary, and the dst SG list only gives you
> one and a half.

I don't mean that I'm using no temporary buffers at all, just
that the actual crypto only operates on the SG lists.  I'm still
doing the xoring and stitching in temp buffers.  I just counted
and I'm using three blocks like you.

> Is net/sunrpc/auth_gss/gss_krb5_mech.c doing something odd?

Yes gss_krb5_crypto.c is the one.

> I have a request of you: like Andy, I find the crypto layer an
> impenetrable thicket of wrapper structures.  I'm not suggesting there
> aren't reasons for it, but it's extremely hard to infer those reasons by
> looking at the code.  If I were to draft a (hilariously wrong) overview
> document, would you be willing to edit it into correctness?

We have actually gained quite a bit of documentation recently.
Have you looked at Documentation/DocBook/crypto-API.tmpl?

More is always welcome of course.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Doing crypto in small stack buffers (bluetooth vs vmalloc-stack crash, etc)

2016-06-28 Thread Herbert Xu
On Tue, Jun 28, 2016 at 08:37:43AM -0400, George Spelvin wrote:
> Just a note, crypto/cts.c also does a lot of sg_set_buf() in stack buffers.
> 
> I have a local patch (appended, if anyone wants) to reduce the wasteful
> amount of buffer space it uses (from 7 to 3 blocks on encrypt, from
> 6 to 3 blocks on decrypt), but it would take some rework to convert to
> crypto_cipher_encrypt_one() or avoid stack buffers entirely.

I'm currently working on cts and I'm removing the stack usage
altogether by having it operate on the src/dst SG lists only.

It's part of the skcipher conversion though so it'll have to go
through the crypto tree.

BTW, the only cts user in our tree appears to be implementing
CTS all over again and is only calling the crypto API cts for
the last two blocks.  Someone should fix that.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v2 net-next] tcp: md5: use kmalloc() backed scratch areas

2016-06-27 Thread Herbert Xu
On Mon, Jun 27, 2016 at 10:58:42AM -0700, Andy Lutomirski wrote:
>
> I wonder if it's worth switching from ahash to shash, though.  It
> would probably be simpler and faster.

No shash is not appropriate here because it needs to hash skb
frags which are SG lists.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v4 01/16] bluetooth: Switch SMP to crypto_cipher_encrypt_one()

2016-06-24 Thread Herbert Xu
On Thu, Jun 23, 2016 at 09:22:56PM -0700, Andy Lutomirski wrote:
> SMP does ECB crypto on stack buffers.  This is complicated and
> fragile, and it will not work if the stack is virtually allocated.
> 
> Switch to the crypto_cipher interface, which is simpler and safer.
> 
> Cc: Marcel Holtmann <mar...@holtmann.org>
> Cc: Gustavo Padovan <gust...@padovan.org>
> Cc: Johan Hedberg <johan.hedb...@gmail.com>
> Cc: "David S. Miller" <da...@davemloft.net>
> Cc: linux-blueto...@vger.kernel.org
> Cc: Herbert Xu <herb...@gondor.apana.org.au>
> Cc: netdev@vger.kernel.org
> Signed-off-by: Andy Lutomirski <l...@kernel.org>

Acked-by: Herbert Xu <herb...@gondor.apana.org.au>

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: esp: Fix ESN generation under UDP encapsulation

2016-06-23 Thread Herbert Xu
On Thu, Jun 23, 2016 at 11:52:45AM -0400, David Miller wrote:
> 
> Does the ipv6 side need the same fix?

Last I checked IPv6 didn't do IPsec UDP-encapsulation so we're
safe for now.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Doing crypto in small stack buffers (bluetooth vs vmalloc-stack crash, etc)

2016-06-23 Thread Herbert Xu
On Thu, Jun 23, 2016 at 11:48:25AM +0800, Herbert Xu wrote:
> 
> No we never had such an API in the kernel.  However, I see that
> rxkad does some pretty silly things and we should be able to avoid
> using the stack in pretty much all cases.  Let me try to come up with
> something.

Here it is:

---8<---
Subject: rxrpc: Avoid using stack memory in SG lists in rxkad

rxkad uses stack memory in SG lists which would not work if stacks
were allocated from vmalloc memory.  In fact, in most cases this
isn't even necessary as the stack memory ends up getting copied
over to kmalloc memory.

This patch eliminates all the unnecessary stack memory uses by
supplying the final destination directly to the crypto API.  In
two instances where a temporary buffer is actually needed we also
switch use the skb->cb area instead of the stack.

Finally there is no need to split a split-page buffer into two SG
entries so code dealing with that has been removed.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index f0b807a..8ee5933 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -277,6 +277,7 @@ struct rxrpc_connection {
struct key  *key;   /* security for this connection 
(client) */
struct key  *server_key;/* security for this service */
struct crypto_skcipher  *cipher;/* encryption handle */
+   struct rxrpc_crypt  csum_iv_head;   /* leading block for csum_iv */
struct rxrpc_crypt  csum_iv;/* packet checksum base */
unsigned long   events;
 #define RXRPC_CONN_CHALLENGE   0   /* send challenge packet */
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 6b726a0..ee142de 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -105,11 +105,9 @@ static void rxkad_prime_packet_security(struct 
rxrpc_connection *conn)
 {
struct rxrpc_key_token *token;
SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
-   struct scatterlist sg[2];
+   struct rxrpc_crypt *csum_iv;
+   struct scatterlist sg;
struct rxrpc_crypt iv;
-   struct {
-   __be32 x[4];
-   } tmpbuf __attribute__((aligned(16))); /* must all be in same page */
 
_enter("");
 
@@ -119,24 +117,21 @@ static void rxkad_prime_packet_security(struct 
rxrpc_connection *conn)
token = conn->key->payload.data[0];
memcpy(, token->kad->session_key, sizeof(iv));
 
-   tmpbuf.x[0] = htonl(conn->epoch);
-   tmpbuf.x[1] = htonl(conn->cid);
-   tmpbuf.x[2] = 0;
-   tmpbuf.x[3] = htonl(conn->security_ix);
+   csum_iv = >csum_iv_head;
+   csum_iv[0].x[0] = htonl(conn->epoch);
+   csum_iv[0].x[1] = htonl(conn->cid);
+   csum_iv[1].x[0] = 0;
+   csum_iv[1].x[1] = htonl(conn->security_ix);
 
-   sg_init_one([0], , sizeof(tmpbuf));
-   sg_init_one([1], , sizeof(tmpbuf));
+   sg_init_one(, csum_iv, 16);
 
skcipher_request_set_tfm(req, conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
-   skcipher_request_set_crypt(req, [1], [0], sizeof(tmpbuf), iv.x);
+   skcipher_request_set_crypt(req, , , 16, iv.x);
 
crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
 
-   memcpy(>csum_iv, [2], sizeof(conn->csum_iv));
-   ASSERTCMP((u32 __force)conn->csum_iv.n[0], ==, (u32 
__force)tmpbuf.x[2]);
-
_leave("");
 }
 
@@ -150,12 +145,9 @@ static int rxkad_secure_packet_auth(const struct 
rxrpc_call *call,
 {
struct rxrpc_skb_priv *sp;
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
+   struct rxkad_level1_hdr hdr;
struct rxrpc_crypt iv;
-   struct scatterlist sg[2];
-   struct {
-   struct rxkad_level1_hdr hdr;
-   __be32  first;  /* first four bytes of data and padding */
-   } tmpbuf __attribute__((aligned(8))); /* must all be in same page */
+   struct scatterlist sg;
u16 check;
 
sp = rxrpc_skb(skb);
@@ -165,24 +157,21 @@ static int rxkad_secure_packet_auth(const struct 
rxrpc_call *call,
check = sp->hdr.seq ^ sp->hdr.callNumber;
data_size |= (u32)check << 16;
 
-   tmpbuf.hdr.data_size = htonl(data_size);
-   memcpy(, sechdr + 4, sizeof(tmpbuf.first));
+   hdr.data_size = htonl(data_size);
+   memcpy(sechdr, , sizeof(hdr));
 
/* start the encryption afresh */
memset(, 0, sizeof(iv));
 
-   sg_init_one([0], , sizeof(tmpbuf));
-   sg_init_one([1], , sizeof(tmpbuf));
+   sg_init_one(, sechdr, 8);
 
skcipher_request_set_tfm(req, call->conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
-   skcipher_request_set_crypt(req, [1], [0], sizeof(tmpbuf), iv.x);
+   skcipher_request_set_crypt(req, , , 

esp: Fix ESN generation under UDP encapsulation

2016-06-17 Thread Herbert Xu
On Fri, Jun 17, 2016 at 12:24:29PM +0200, Steffen Klassert wrote:
> On Wed, Jun 15, 2016 at 12:44:54AM +, Blair Steven wrote:
> > The restoration is happening - but being actioned on the wrong location.
> > 
> > The destination IP address is being saved and restored, and the SPI 
> > being written directly after the destination IP address. From my 
> > understanding though, the ESN shuffling should have saved and restored 
> > the UDP source / dest ports + SPI.
> 
> Yes, looks like we copy with a wrong offset if udp encapsulation
> is used, skb_transport_header() does not point to the esp header
> in this case. Ccing Herbert, he changed this part when switching
> to the new AEAD interface with
> commit 7021b2e1cddd ("esp4: Switch to new AEAD interface").

Thanks for catching this!

I think rather than changing the transport header (which isn't
quite right because UDP still is the transport protocol), we can
just save the offset locally.  Something like this:

---8<---
Blair Steven noticed that ESN in conjunction with UDP encapsulation
is broken because we set the temporary ESP header to the wrong spot.

This patch fixes this by first of all using the right spot, i.e.,
4 bytes off the real ESP header, and then saving this information
so that after encryption we can restore it properly.

Fixes: 7021b2e1cddd ("esp4: Switch to new AEAD interface")
Reported-by: Blair Steven <blair.ste...@alliedtelesis.co.nz>
Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 4779374..d95631d 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -23,6 +23,11 @@ struct esp_skb_cb {
void *tmp;
 };
 
+struct esp_output_extra {
+   __be32 seqhi;
+   u32 esphoff;
+};
+
 #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
 
 static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
@@ -35,11 +40,11 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
  *
  * TODO: Use spare space in skb for this where possible.
  */
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int extralen)
 {
unsigned int len;
 
-   len = seqhilen;
+   len = extralen;
 
len += crypto_aead_ivsize(aead);
 
@@ -57,15 +62,16 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int 
nfrags, int seqhilen)
return kmalloc(len, GFP_ATOMIC);
 }
 
-static inline __be32 *esp_tmp_seqhi(void *tmp)
+static inline void *esp_tmp_extra(void *tmp)
 {
-   return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+   return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
 }
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int extralen)
 {
return crypto_aead_ivsize(aead) ?
-  PTR_ALIGN((u8 *)tmp + seqhilen,
-crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
+  PTR_ALIGN((u8 *)tmp + extralen,
+crypto_aead_alignmask(aead) + 1) : tmp + extralen;
 }
 
 static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 
*iv)
@@ -99,7 +105,7 @@ static void esp_restore_header(struct sk_buff *skb, unsigned 
int offset)
 {
struct ip_esp_hdr *esph = (void *)(skb->data + offset);
void *tmp = ESP_SKB_CB(skb)->tmp;
-   __be32 *seqhi = esp_tmp_seqhi(tmp);
+   __be32 *seqhi = esp_tmp_extra(tmp);
 
esph->seq_no = esph->spi;
esph->spi = *seqhi;
@@ -107,7 +113,11 @@ static void esp_restore_header(struct sk_buff *skb, 
unsigned int offset)
 
 static void esp_output_restore_header(struct sk_buff *skb)
 {
-   esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
+   void *tmp = ESP_SKB_CB(skb)->tmp;
+   struct esp_output_extra *extra = esp_tmp_extra(tmp);
+
+   esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff -
+   sizeof(__be32));
 }
 
 static void esp_output_done_esn(struct crypto_async_request *base, int err)
@@ -121,6 +131,7 @@ static void esp_output_done_esn(struct crypto_async_request 
*base, int err)
 static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 {
int err;
+   struct esp_output_extra *extra;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
struct aead_request *req;
@@ -137,8 +148,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff 
*skb)
int tfclen;
int nfrags;
int assoclen;
-   int seqhilen;
-   __be32 *seqhi;
+   int extralen;
__be64 seqno;
 
/* skb is pure payload to encrypt */
@@ -166,21 +176,21 @@ static int esp_output(struct xfrm_state *x, struct 
sk_buff *skb)
nfrags = err;
 
assoclen = sizeof(*esph

Re: rhashtable - rhashtable_insert_fast failed

2016-06-07 Thread Herbert Xu
On Tue, Jun 07, 2016 at 04:47:28PM +0200, Helge Deller wrote:
> On 07.06.2016 16:16, Herbert Xu wrote:
> > On Tue, Jun 07, 2016 at 04:13:50PM +0200, Helge Deller wrote:
> >>
> >> What warnings do you mean specifically? Some specific CONFIG_ option ?
> > 
> > Look for GFP_NOWARN in lib/rhashtable.c and delete it.
> 
> Ok, removed it.
> It generates a kernel warning: 

Thanks.  This is exactly the problem that I'm yet to fix, namely
we're trying to allocate a hash table that's too big to be done
using physically contiguous memory.
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: rhashtable - rhashtable_insert_fast failed

2016-06-07 Thread Herbert Xu
On Tue, Jun 07, 2016 at 04:13:50PM +0200, Helge Deller wrote:
>
> What warnings do you mean specifically? Some specific CONFIG_ option ?

Look for GFP_NOWARN in lib/rhashtable.c and delete it.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: rhashtable - rhashtable_insert_fast failed

2016-06-05 Thread Herbert Xu
On Sat, Jun 04, 2016 at 09:35:27AM +0200, Phil Sutter wrote:
> [Cc'ing other interested parties, therefore full-quoting.]
> 
> Hi Helge,
> 
> On Fri, Jun 03, 2016 at 10:22:26PM +0200, Helge Deller wrote:
> > Hi Phil,
> > 
> > I'm testing 4.7.0-rc1-64bit on a parisc/hppa machine and get
> > those message with CONFIG_TEST_RHASHTABLE=y.
> > 
> > Is this expected/normal ?
> 
> No, this shouldn't happen. Can you possibly bisect the issue?

This reminds me that I do still have to tackle the multi-page
kmalloc issue.  This may or may not be the problem here.  Enabling
warnings at the kmalloc call should be an easy way to check.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts

2016-05-31 Thread Herbert Xu
On Tue, May 31, 2016 at 09:19:37PM -0700, Cong Wang wrote:
>
> Hmm, why could this happen? The upper device should be linked
> with the lower device, where a refcount is already held.
> Also, the work is cancelled in ->uninit().

Of course it can happen.  We are talking about the source macvlan
device that we just looked up using the Ethernet address.  That
device has nothing to do with the packet now so it may be deleted
at any time.

We do flush the work but only when the all macvlan devices on a
port have been deleted.  Perhaps you're confusing the source
device with vlan->lowerdev which is confusingly the actual
hardware device?

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[v2 PATCH 2/2] macvlan: Avoid unnecessary multicast cloning

2016-05-31 Thread Herbert Xu
Currently we always queue a multicast packet for further processing,
even if none of the macvlan devices are subscribed to the address.

This patch optimises this by adding a global multicast filter for
a macvlan_port.

Note that this patch doesn't handle the broadcast addresses of the
individual macvlan devices correctly, if they are not all identical
to vlan->lowerdev.  However, this is already broken because there
is no mechanism in place to update the individual multicast filters
when you change the broadcast address.

If someone cares enough they should fix this by collecting all
broadcast addresses for a macvlan as we do for multicast and unicast.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index a71fa59..0c65bd9 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -49,6 +49,7 @@ struct macvlan_port {
boolpassthru;
int count;
struct hlist_head   vlan_source_hash[MACVLAN_HASH_SIZE];
+   DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
 };
 
 struct macvlan_source_entry {
@@ -419,6 +420,8 @@ static rx_handler_result_t macvlan_handle_frame(struct 
sk_buff **pskb)
 
port = macvlan_port_get_rcu(skb->dev);
if (is_multicast_ether_addr(eth->h_dest)) {
+   unsigned int hash;
+
skb = ip_check_defrag(dev_net(skb->dev), skb, 
IP_DEFRAG_MACVLAN);
if (!skb)
return RX_HANDLER_CONSUMED;
@@ -436,7 +439,9 @@ static rx_handler_result_t macvlan_handle_frame(struct 
sk_buff **pskb)
goto out;
}
 
-   macvlan_broadcast_enqueue(port, src, skb);
+   hash = mc_hash(NULL, eth->h_dest);
+   if (test_bit(hash, port->mc_filter))
+   macvlan_broadcast_enqueue(port, src, skb);
 
return RX_HANDLER_PASS;
}
@@ -722,12 +727,12 @@ static void macvlan_change_rx_flags(struct net_device 
*dev, int change)
}
 }
 
-static void macvlan_set_mac_lists(struct net_device *dev)
+static void macvlan_compute_filter(unsigned long *mc_filter,
+  struct net_device *dev,
+  struct macvlan_dev *vlan)
 {
-   struct macvlan_dev *vlan = netdev_priv(dev);
-
if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
-   bitmap_fill(vlan->mc_filter, MACVLAN_MC_FILTER_SZ);
+   bitmap_fill(mc_filter, MACVLAN_MC_FILTER_SZ);
} else {
struct netdev_hw_addr *ha;
DECLARE_BITMAP(filter, MACVLAN_MC_FILTER_SZ);
@@ -739,10 +744,33 @@ static void macvlan_set_mac_lists(struct net_device *dev)
 
__set_bit(mc_hash(vlan, dev->broadcast), filter);
 
-   bitmap_copy(vlan->mc_filter, filter, MACVLAN_MC_FILTER_SZ);
+   bitmap_copy(mc_filter, filter, MACVLAN_MC_FILTER_SZ);
}
+}
+
+static void macvlan_set_mac_lists(struct net_device *dev)
+{
+   struct macvlan_dev *vlan = netdev_priv(dev);
+
+   macvlan_compute_filter(vlan->mc_filter, dev, vlan);
+
dev_uc_sync(vlan->lowerdev, dev);
dev_mc_sync(vlan->lowerdev, dev);
+
+   /* This is slightly inaccurate as we're including the subscription
+* list of vlan->lowerdev too.
+*
+* Bug alert: This only works if everyone has the same broadcast
+* address as lowerdev.  As soon as someone changes theirs this
+* will break.
+*
+* However, this is already broken as when you change your broadcast
+* address we don't get called.
+*
+* The solution is to maintain a list of broadcast addresses like
+* we do for uc/mc, if you care.
+*/
+   macvlan_compute_filter(vlan->port->mc_filter, vlan->lowerdev, NULL);
 }
 
 static int macvlan_change_mtu(struct net_device *dev, int new_mtu)
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts

2016-05-31 Thread Herbert Xu
When we postpone a broadcast packet we save the source port in
the skb if it is local.  However, the source port can disappear
before we get a chance to process the packet.

This patch fixes this by holding a ref count on the netdev.

It also delays the skb->cb modification until after we allocate
the new skb as you should not modify shared skbs.

Fixes: 412ca1550cbe ("macvlan: Move broadcasts into a work queue")
Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 2bcf1f3..78a00e3 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -305,11 +305,14 @@ static void macvlan_process_broadcast(struct work_struct 
*w)
 
rcu_read_unlock();
 
+   if (src)
+   dev_put(src->dev);
kfree_skb(skb);
}
 }
 
 static void macvlan_broadcast_enqueue(struct macvlan_port *port,
+ const struct macvlan_dev *src,
  struct sk_buff *skb)
 {
struct sk_buff *nskb;
@@ -319,8 +322,12 @@ static void macvlan_broadcast_enqueue(struct macvlan_port 
*port,
if (!nskb)
goto err;
 
+   MACVLAN_SKB_CB(nskb)->src = src;
+
spin_lock(>bc_queue.lock);
if (skb_queue_len(>bc_queue) < MACVLAN_BC_QUEUE_LEN) {
+   if (src)
+   dev_hold(src->dev);
__skb_queue_tail(>bc_queue, nskb);
err = 0;
}
@@ -429,8 +436,7 @@ static rx_handler_result_t macvlan_handle_frame(struct 
sk_buff **pskb)
goto out;
}
 
-   MACVLAN_SKB_CB(skb)->src = src;
-   macvlan_broadcast_enqueue(port, skb);
+   macvlan_broadcast_enqueue(port, src, skb);
 
return RX_HANDLER_PASS;
}
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[v2 PATCH 0/2] macvlan: Avoid unnecessary multicast cloning

2016-05-31 Thread Herbert Xu
On Tue, May 31, 2016 at 02:07:13PM -0700, David Miller wrote:
> 
> I think you need to set the vlan->port->mc_filter to all 1's in the
> PROMISC/ALLMUTI branch here.
> 
> Otherwise packets won't properly pass your new hash test.

Good point.  Here's v2.

This patch tries to improve macvlan multicast performance by
maintaining a filter hash at the macvlan_port level so that we
can quickly determine whether a given packet is needed or not.

It is preceded by a patch that fixes a potential use-after-free
bug that I discovered while looking over this.

v2 fixed a bug where promiscuous/allmulti settings weren't handled
correctly.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning

2016-05-30 Thread Herbert Xu
On Mon, May 30, 2016 at 07:27:59PM +0300, Lennert Buytenhek wrote:
>
> That and stack switches to kworker threads and serialisation on
> the bc_queue queue lock.

My patch should resolve these problems too since the packet is
discarded if nobody is interested in it.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH 2/2] macvlan: Avoid unnecessary multicast cloning

2016-05-30 Thread Herbert Xu
Currently we always queue a multicast packet for further processing,
even if none of the macvlan devices are subscribed to the address.

This patch optimises this by adding a global multicast filter for
a macvlan_port.

Note that this patch doesn't handle the broadcast addresses of the
individual macvlan devices correctly, if they are not all identical.
However, this is already broken because there is no mechanism in
place to update the individual multicast filters when you change
the broadcast address.

If someone cares enough they should fix this by collecting all
broadcast addresses for a macvlan as we do for multicast and unicast.

Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index f55fe21..9fa4532 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -49,6 +49,7 @@ struct macvlan_port {
boolpassthru;
int count;
struct hlist_head   vlan_source_hash[MACVLAN_HASH_SIZE];
+   DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
 };
 
 struct macvlan_source_entry {
@@ -418,6 +419,8 @@ static rx_handler_result_t macvlan_handle_frame(struct 
sk_buff **pskb)
 
port = macvlan_port_get_rcu(skb->dev);
if (is_multicast_ether_addr(eth->h_dest)) {
+   unsigned int hash;
+
skb = ip_check_defrag(dev_net(skb->dev), skb, 
IP_DEFRAG_MACVLAN);
if (!skb)
return RX_HANDLER_CONSUMED;
@@ -435,7 +438,9 @@ static rx_handler_result_t macvlan_handle_frame(struct 
sk_buff **pskb)
goto out;
}
 
-   macvlan_broadcast_enqueue(port, src, skb);
+   hash = mc_hash(NULL, eth->h_dest);
+   if (test_bit(hash, port->mc_filter))
+   macvlan_broadcast_enqueue(port, src, skb);
 
return RX_HANDLER_PASS;
}
@@ -725,6 +730,8 @@ static void macvlan_set_mac_lists(struct net_device *dev)
 {
struct macvlan_dev *vlan = netdev_priv(dev);
 
+   dev_uc_sync(vlan->lowerdev, dev);
+   dev_mc_sync(vlan->lowerdev, dev);
if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
bitmap_fill(vlan->mc_filter, MACVLAN_MC_FILTER_SZ);
} else {
@@ -739,9 +746,31 @@ static void macvlan_set_mac_lists(struct net_device *dev)
__set_bit(mc_hash(vlan, dev->broadcast), filter);
 
bitmap_copy(vlan->mc_filter, filter, MACVLAN_MC_FILTER_SZ);
+
+   /* This is slightly inaccurate as we're including
+* the subscription list of vlan->lowerdev too.
+*/
+   bitmap_zero(filter, MACVLAN_MC_FILTER_SZ);
+   netdev_for_each_mc_addr(ha, vlan->lowerdev) {
+   __set_bit(mc_hash(NULL, ha->addr), filter);
+   }
+
+   /* Bug alert: This only works if everyone has the
+* same broadcast address.  As soon as someone
+* changes theirs this will break.
+*
+* However, this is already broken as when you
+* change your broadcast address we don't get
+* called.
+*
+* The solution is to maintain a list of broadcast
+* addresses like we do for uc/mc, if you care.
+*/
+   __set_bit(mc_hash(NULL, dev->broadcast), filter);
+
+   bitmap_copy(vlan->port->mc_filter, filter,
+   MACVLAN_MC_FILTER_SZ);
}
-   dev_uc_sync(vlan->lowerdev, dev);
-   dev_mc_sync(vlan->lowerdev, dev);
 }
 
 static int macvlan_change_mtu(struct net_device *dev, int new_mtu)
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH 1/2] macvlan: Fix potential use-after free for broadcasts

2016-05-30 Thread Herbert Xu
When we postpone a broadcast packet we save the source port in
the skb if it is local.  However, the source port can disappear
before we get a chance to process the packet.

This patch fixes this by holding a ref count on the netdev.

It also delays the skb->cb modification until after we allocate
the new skb as you should not modify shared skbs.

Fixes: 412ca1550cbe ("macvlan: Move broadcasts into a work queue")
Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 2bcf1f3..78a00e3 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -305,11 +305,14 @@ static void macvlan_process_broadcast(struct work_struct 
*w)
 
rcu_read_unlock();
 
+   if (src)
+   dev_put(src->dev);
kfree_skb(skb);
}
 }
 
 static void macvlan_broadcast_enqueue(struct macvlan_port *port,
+ const struct macvlan_dev *src,
  struct sk_buff *skb)
 {
struct sk_buff *nskb;
@@ -319,8 +322,12 @@ static void macvlan_broadcast_enqueue(struct macvlan_port 
*port,
if (!nskb)
goto err;
 
+   MACVLAN_SKB_CB(nskb)->src = src;
+
spin_lock(>bc_queue.lock);
if (skb_queue_len(>bc_queue) < MACVLAN_BC_QUEUE_LEN) {
+   if (src)
+   dev_hold(src->dev);
__skb_queue_tail(>bc_queue, nskb);
err = 0;
}
@@ -429,8 +436,7 @@ static rx_handler_result_t macvlan_handle_frame(struct 
sk_buff **pskb)
goto out;
}
 
-   MACVLAN_SKB_CB(skb)->src = src;
-   macvlan_broadcast_enqueue(port, skb);
+   macvlan_broadcast_enqueue(port, src, skb);
 
return RX_HANDLER_PASS;
}
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH 0/2] macvlan: Avoid unnecessary multicast cloning

2016-05-30 Thread Herbert Xu
On Fri, May 27, 2016 at 02:44:33AM +0300, Lennert Buytenhek wrote:
> Commit 412ca1550cbecb2c ("macvlan: Move broadcasts into a work queue")
> moved processing of all macvlan multicasts into a work queue.  This
> causes a noticable performance regression when there is heavy multicast
> traffic on the underlying interface for multicast groups that the
> macvlan subinterfaces are not members of, in which case we end up
> cloning all those packets and then freeing them again from a work queue
> without really doing any useful work with them in between.

OK so your motivation is to get rid of the unnecessary memory
allocation, right?

Here's my totally untested patch, it tries to resolve your problem
by maintaining a filter hash at the macvlan_port level so that we
can quickly determine whether a given packet is needed or not.

It is preceded by a patch that fixes a potential use-after-free
bug that I discovered while looking over this.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: BUG: net/netlink: KASAN: use-after-free in netlink_sock_destruct

2016-05-27 Thread Herbert Xu
On Fri, May 27, 2016 at 09:19:48AM -0700, Cong Wang wrote:
>
> This one looks different though, this time the bug is
> triggered in netlink_sock_destruct(), where all the sock
> ref should be gone, which means it is impossible to refer
> nlk->cb anywhere else. Hmm... I have no idea how
> could this happen.

netlink_sock_destruct is one of the two exit paths for cb->skb
so this is consistent with the previous trace.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: BUG: use-after-free in netlink_dump

2016-05-16 Thread Herbert Xu
On Sun, May 15, 2016 at 12:06:46PM -0700, Cong Wang wrote:
>
> Similar to what Richard reported, I think the problem is cb->skb,
> which is exposed to other thread since cb is per netlink socket
> (cb = >cb). IOW, the cb->skb is freed by one thread at the
> end of netlink_dump() meanwhile the other thread is still using
> it via NETLINK_CB(cb->skb).portid.

You're on the right track.  I think what's happening is that the
second thread is starting a new dump and the first thread ends up
freeing the skb of the new dump and leaking the old skb.

---8<---
Subject: netlink: Fix dump skb leak/double free

When we free cb->skb after a dump, we do it after releasing the
lock.  This means that a new dump could have started in the time
being and we'll end up freeing their skb instead of ours.

This patch saves the skb and module before we unlock so we free
the right memory.

Fixes: 16b304f3404f ("netlink: Eliminate kmalloc in netlink dump operation.")
Reported-by: Baozeng Ding <splovi...@gmail.com>
Signed-off-by: Herbert Xu <herb...@gondor.apana.org.au>

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 215fc08..f8b50c1 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2059,6 +2059,7 @@ static int netlink_dump(struct sock *sk)
struct netlink_callback *cb;
struct sk_buff *skb = NULL;
struct nlmsghdr *nlh;
+   struct module *module;
int len, err = -ENOBUFS;
int alloc_min_size;
int alloc_size;
@@ -2134,9 +2135,11 @@ static int netlink_dump(struct sock *sk)
cb->done(cb);
 
nlk->cb_running = false;
+   module = cb->module;
+   skb = cb->skb;
mutex_unlock(nlk->cb_mutex);
-   module_put(cb->module);
-   consume_skb(cb->skb);
+   module_put(module);
+   consume_skb(skb);
return 0;
 
 errout_skb:
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] macsec: fix crypto Kconfig dependency

2016-04-18 Thread Herbert Xu
On Mon, Apr 18, 2016 at 12:23:42PM +0200, Arnd Bergmann wrote:
> 
> No, that is not the problem.
> 
> These are the options from the randconfig file that showed the error:
> 
> # CONFIG_MODULES is not set
> # CONFIG_CRYPTO is not set
> CONFIG_CRYPTO_GCM=y
> CONFIG_CRYPTO_AES=y
> CONFIG_MACSEC=y
> # CONFIG_CFG80211 is not set
> 
> As you can see, none of them are modules, and CRYPTO is completely
> disabled.

Right, the problem is that nothing within crypto ever selects
CRYPTO since it's also used as a way of hiding the crypto menu
options.

It's kind of silly really now that IPv4 selects CRYPTO which
means that you can't really avoid seeing all those options which
most users won't care about.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] macsec: fix crypto Kconfig dependency

2016-04-18 Thread Herbert Xu
Arnd Bergmann <a...@arndb.de> wrote:
> The new MACsec driver uses the AES crypto algorithm, but can be configured
> even if CONFIG_CRYPTO is disabled, leading to a build error:
> 
> warning: (MAC80211 && MACSEC) selects CRYPTO_GCM which has unmet direct 
> dependencies (CRYPTO)
> warning: (BT && CEPH_LIB && INET && MAC802154 && MAC80211 && BLK_DEV_RBD && 
> MACSEC && AIRO_CS && LIBIPW && HOSTAP && USB_WUSB && RTLLIB_CRYPTO_CCMP && 
> FS_ENCRYPTION && EXT4_ENCRYPTION && CEPH_FS && BIG_KEYS && ENCRYPTED_KEYS) 
> selects CRYPTO_AES which has unmet direct dependencies (CRYPTO)
> crypto/built-in.o: In function `gcm_enc_copy_hash':
> aes_generic.c:(.text+0x2b8): undefined reference to `crypto_xor'
> aes_generic.c:(.text+0x2dc): undefined reference to `scatterwalk_map_and_copy'
> 
> This adds an explicit 'select CRYPTO' statement the way that other
> drivers handle it.
> 
> Signed-off-by: Arnd Bergmann <a...@arndb.de>
> Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver")

Thnis patch is bogus.  The build error is coming from the fact
that GCM is built-in but CRYPTO_ALGAPI is (presumably) only built
as a module.  The patch in question does nothing to address that
AFAICS.

In fact this strikes me as a kbuild bug because CRYPTO_GCM already
selects (indirectly) CRYPTO_ALGAPI so why is this happening at all?

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] netlink: don't send NETLINK_URELEASE for unbound sockets

2016-04-16 Thread Herbert Xu
Johannes Berg <johan...@sipsolutions.net> wrote:
>
> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
> index 215fc08c02ab..330ebd600f25 100644
> --- a/net/netlink/af_netlink.c
> +++ b/net/netlink/af_netlink.c
> @@ -688,7 +688,7 @@ static int netlink_release(struct socket *sock)
> 
>skb_queue_purge(>sk_write_queue);
> 
> -   if (nlk->portid) {
> +   if (nlk->portid && nlk->bound) {

Any reason why we're still testing portid at all? Isn't testing
bound enough?

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/3] crypto: af_alg - add TLS type encryption

2016-04-07 Thread Herbert Xu
On Wed, Apr 06, 2016 at 10:56:12AM -0700, Tadeusz Struk wrote:
> 
> The intend is to enable HW acceleration of the TLS protocol.
> The way it will work is that the user space will send a packet of data
> via AF_ALG and HW will authenticate and encrypt it in one go.

There have been suggestions to implement TLS data-path within
the kernel.  So we should decide whether we pursue that or go
with your approach before we start adding algorithms.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [net PATCH v2 2/2] ipv4/GRO: Make GRO conform to RFC 6864

2016-04-04 Thread Herbert Xu
On Mon, Apr 04, 2016 at 09:26:55PM -0700, Alexander Duyck wrote:
> 
> The problem is right now we are mangling the IP ID for outer headers
> on tunnels.  We end up totally ignoring the delta between the values
> so if you have two flows that get interleaved over the same tunnel GRO
> will currently mash the IP IDs for the two tunnels so that they end up
> overlapping.

Then it should be fixed.  I never reviewed those patches or I would
have objected at the time.

> The reason why I keep referencing RFC 6864 is because it specifies
> that the IP ID field must not be read if the DF bit is set, and that
> if we are manipulating headers we can handle the IP ID as though we
> are the transmitting station.  What this means is that if DF is not
> set we have to have unique values per packet, otherwise we can ignore
> the values if DF is set.

As I said GRO itself should not be visible.  The fact that it is
for tunnels is a bug.
 
> The question I would have is what are you really losing with increment
> from 0 versus fixed 0?  From what I see it is essentially just garbage
> in/garbage out.

GRO is meant to be lossless, that is, you should not be able to
detect its presence from the outside.  If you lose information then
you're breaking this rule and people will soon start asking for it
to be disabled in various situations.

I'm not against doing this per se but it should not be part of the
default configuration.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [net PATCH v2 2/2] ipv4/GRO: Make GRO conform to RFC 6864

2016-04-04 Thread Herbert Xu
On Mon, Apr 04, 2016 at 09:31:21AM -0700, Alexander Duyck wrote:
> RFC 6864 states that the IPv4 ID field MUST NOT be used for purposes other
> than fragmentation and reassembly.  Currently we are looking at this field
> as a way of identifying what frames can be aggregated and  which cannot for
> GRO.  While this is valid for frames that do not have DF set, it is invalid
> to do so if the bit is set.

This justification is bogus.  GRO is a completely local optimisation
that should have zero visibility to third parties.  So it makes no
sense to talk about RFC compliance of GRO.  The Linux network stack
as a whole is subject to RFC compliance, but not GRO per se.
 
> In the case of the non-incrementing IP ID we will end up losing the data
> that the IP ID is fixed.  However as per RFC 6864 we should be able to
> write any value into the IP ID when the DF bit is set so this should cause
> minimal harm.

No we should not do that, at least not by default.  GRO was designed
to be completely lossless, that is its main advantage of the various
forms of LRO which preceded it.

If you lose that people will start asking it to be disabled for
routers/bridges and we'll be back in the same old mess that we
had with LRO.

So please do this properly and preserve the information in the packet.
As I said earlier all it takes is one single bit, like we do with ECN.
If you put it in the feature bit you'll also allow us to distinguish
between TSO drivers that produce fixed IDs vs. incrementing IDs.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [net PATCH 2/2] ipv4/GRO: Make GRO conform to RFC 6864

2016-04-01 Thread Herbert Xu
On Fri, Apr 01, 2016 at 07:15:33PM -0700, Eric Dumazet wrote:
> On Sat, 2016-04-02 at 09:57 +0800, Herbert Xu wrote:
> > 
> > We could easily fix that by adding a feature bit to control this,
> > something like SKB_GSO_TCP_FIXEDID.
> 
> I understood the patch allowed to aggregate 4 segments having
> 
> ID=12   ID=40   ID=80  ID=1000

Right.  But I haven't seen any justification that we should aggregate
such packets at all.  The only valid case that I have seen is for
the case of unchanging IDs, no?

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [net PATCH 2/2] ipv4/GRO: Make GRO conform to RFC 6864

2016-04-01 Thread Herbert Xu
Eric Dumazet <eric.duma...@gmail.com> wrote:
>
> I do not particularly care, but it is worth mentioning that GRO+TSO
> would not be idempotent anymore.

We could easily fix that by adding a feature bit to control this,
something like SKB_GSO_TCP_FIXEDID.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Question on rhashtable in worst-case scenario.

2016-04-01 Thread Herbert Xu
On Fri, Apr 01, 2016 at 11:34:10PM +0200, Johannes Berg wrote:
>
> I was thinking about that one - it's not obvious to me from the code
> how this "explicitly checking for dups" would be done or let's say how
> rhashtable differentiates. But since it seems to work for Ben until
> hitting a certain number of identical keys, surely that's just me not
> understanding the code rather than anything else :)

It's really simple, rhashtable_insert_fast does not check for dups
while rhashtable_lookup_insert_* do.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Question on rhashtable in worst-case scenario.

2016-03-31 Thread Herbert Xu
On Thu, Mar 31, 2016 at 05:29:59PM +0200, Johannes Berg wrote:
> 
> Does removing this completely disable the "-EEXIST" error? I can't say
> I fully understand the elasticity stuff in __rhashtable_insert_fast().

What EEXIST error are you talking about? The only one that can be
returned on insertion is if you're explicitly checking for dups
which clearly can't be the case for you.

If you're talking about the EEXIST error due to a rehash then it is
completely hidden from you by rhashtable_insert_rehash.

If you actually meant EBUSY then yes this should prevent it from
occurring, unless your chain-length exceeds 2^32.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Question on rhashtable in worst-case scenario.

2016-03-31 Thread Herbert Xu
On Thu, Mar 31, 2016 at 09:46:45AM +0200, Johannes Berg wrote:
>
> In this case, I think perhaps you can just patch your local system with
> the many interfaces connecting to the same AP to add the parameter
> Herbert suggested (.insecure_elasticity = true in sta_rht_params). This
> is, after all, very much a case that "normal" operation doesn't even
> get close to.

I think you should just turn it on everywhere for mac80211.  Chain
length checks simply don't make sense when you allow duplicate
keys in the hash table.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Question on rhashtable in worst-case scenario.

2016-03-30 Thread Herbert Xu
On Wed, Mar 30, 2016 at 04:03:08PM +0200, Johannes Berg wrote:
> 
> But we really don't want that either - in the normal case where you
> don't create all these virtual interfaces for testing, you have a
> certain number of peers that can vary a lot (zero to hundreds, in
> theory thousands) where you *don't* have the same key, so we still want
> to have the rehashing if the chains get longer in that case.

insecure_elasticity only disables rehashing without growing, it
does not inhibit table expansion which is driven by the number of
objects in the whole table.

> It's really just the degenerate case that Ben is creating locally
> that's causing a problem, afaict, though it's a bit disconcerting that
> rhashtable in general can cause strange failures at delete time.

The failure is simple, rhashtable will rehash the table if a given
chain becomes too long.  This simply doesn't work if you hash many
objects with the same key since the chain will never get shorter
even after a rehash (or expansion).

Therefore if your hashtable has to do this then you have to disable
the rehash logic using the insecure_elasticity flag.

Alternatively you can construct your own linked list for objects
with the same key outside of rhashtable and hash the list instead.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Question on rhashtable in worst-case scenario.

2016-03-30 Thread Herbert Xu
On Wed, Mar 30, 2016 at 11:14:12AM +0200, Johannes Berg wrote:
> On Tue, 2016-03-29 at 09:16 -0700, Ben Greear wrote:
> > Looks like rhashtable has too much policy in it to properly deal with
> > cases where there are too many hash collisions, so I am going to work
> > on reverting it's use in mac80211.
> 
> I'm not really all that happy with that approach - can't we fix the
> rhashtable? It's a pretty rare corner case that many keys really are
> identical and no kind of hash algorithm, but it seems much better to
> still deal with it than to remove the rhashtable usage and go back to
> hand-rolling something.

Well to start with you should assess whether you really want to
hash multiple objects with the same key.  In particular, can an
adversary generate a large number of such objects?

If your conclusion is that yes you really want to do this, then
we have the parameter insecure_elasticity that you can use to
disable the rehashing based on chain length.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH net v2] xfrm: Fix crash observed during device unregistration and decryption

2016-03-23 Thread Herbert Xu
On Wed, Mar 23, 2016 at 10:29:25AM -0700, Eric Dumazet wrote:
>
> OK, but before calling netif_rx() are we properly testing dev->flags
> IFF_UP status ?
> 
> Otherwise, we still allow packets being queued after flush_backlog() had
> been called.

That's the first thing enqueue_to_backlog tests.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH net v2] xfrm: Fix crash observed during device unregistration and decryption

2016-03-23 Thread Herbert Xu
On Wed, Mar 23, 2016 at 06:25:47AM -0700, Eric Dumazet wrote:
>
> Wont this prevent device from being dismantled ?

This is only held while the crypto processing is ongoing.

> Where is this xfrm queue purged at device dismantle ?

There is no way to cancel an ongoing crypto processing so you'll
just have to wait it out.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] xfrm: don't segment UFO packets

2016-03-19 Thread Herbert Xu
On Thu, Mar 17, 2016 at 06:08:55PM +0100, Jiri Bohac wrote:
> On Thu, Mar 17, 2016 at 11:24:59AM +0100, Steffen Klassert wrote:
> > In IPv6 this check is missing, so this could be the
> > problem if this is IPv6.
> 
> indeed, this patch also fixes my problem:

Hmm, is this what you really want? If I understood you correctly,
you want the fragmentation to occur after IPsec.  So while this
might generate the same output, it is still going to prefragment
the data, only to merge them back for IPsec and then refragment
again.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] xfrm: don't segment UFO packets

2016-03-19 Thread Herbert Xu
On Wed, Mar 16, 2016 at 05:00:26PM +0100, Jiri Bohac wrote:
> xfrm_output() will segment GSO packets, including UDP (UFO) packets.
> this is wrong per RFC4303, section 3.3.4.  Fragmentation:
> 
>If necessary, fragmentation is performed after ESP
>processing within an IPsec implementation.  Thus,
>transport mode ESP is applied only to whole IP
>datagrams (not to IP fragments).
> 
> Prevent xfrm_output() from segmenting UFO packets so that they will be
> fragmented after the xfrm transforms.
> 
> Signed-off-by: Jiri Bohac <jbo...@suse.cz>

Fair enough.  But I wonder if this is enough.  Wouldn't UDP notice
that we're doing IPsec and prefragment the packet anyway? So I think
this check may also be needed in the UDP output path.

Thanks,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [RFC] xfrm: netdevice unregistration during decryption

2016-03-08 Thread Herbert Xu
On Tue, Mar 08, 2016 at 07:16:23PM -0700, subas...@codeaurora.org wrote:
>
> 2. Encrypted packet queued for decryption (asynchronous)
> 
> static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
> ...
>  aead_request_set_callback(req, 0, esp_input_done, skb);

I suppose we'll have to hold onto the device at this point.  We
may have to hold onto other resources too.

> Would it make sense here to detect the device going away here using a
> netdev notifier callback and free the packets after the asynchronous
> callback returns.

The same path is used for synchronous processing so you can't just
change it to netif_rx_ni unconditionally.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v3] net:Add sysctl_max_skb_frags

2016-02-03 Thread Herbert Xu
On Wed, Feb 03, 2016 at 09:26:57AM +0100, Hans Westgaard Ry wrote:
> Devices may have limits on the number of fragments in an skb they support.
> Current codebase uses a constant as maximum for number of fragments one
> skb can hold and use.
> When enabling scatter/gather and running traffic with many small messages
> the codebase uses the maximum number of fragments and may thereby violate
> the max for certain devices.
> The patch introduces a global variable as max number of fragments.
> 
> Signed-off-by: Hans Westgaard Ry <hans.westgaard...@oracle.com>
> Reviewed-by: Håkon Bugge <haakon.bu...@oracle.com>

I have to say this seems rather dirty.  I mean if taken to the
extreme wouldn't this mean that we should disable frags altogether
if some NIC can't handle them at all?

Someone suggested earlier to partially linearise the skb, why
couldn't we do that? IOW let's handle this craziness in the crazy
drivers and not in the general stack.

Cheers,
-- 
Email: Herbert Xu <herb...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


<    1   2   3   4   5   6   7   8   9   10   >