Re: [PATCH 06/33] netfilter: nf_tables: Support RULE_ID reference in new rule

2019-01-28 Thread Cong Wang
On Mon, Jan 28, 2019 at 4:00 PM Pablo Neira Ayuso  wrote:
>
> From: Phil Sutter 
>
> To allow for a batch to contain rules in arbitrary ordering, introduce
> NFTA_RULE_POSITION_ID attribute which works just like NFTA_RULE_POSITION
> but contains the ID of another rule within the same batch. This helps
> iptables-nft-restore handling dumps with mixed insert/append commands
> correctly.
>
> Note that NFTA_RULE_POSITION takes precedence over
> NFTA_RULE_POSITION_ID, so if the former is present, the latter is
> ignored.

It looks like you forgot to add NFTA_RULE_POSITION_ID into
nft_rule_policy[]?


[Patch nf-next] nf_conntrack: fix error path in nf_conntrack_pernet_init()

2019-01-23 Thread Cong Wang
When nf_ct_netns_get() fails, it should clean up itself,
its caller doesn't need to call nf_conntrack_fini_net().

nf_conntrack_init_net() is called after registering sysctl
and proc, so its cleanup function should be called before
unregistering sysctl and proc.

Fixes: ba3fbe663635 ("netfilter: nf_conntrack: provide modparam to always 
register conntrack hooks")
Fixes: b884fa461776 ("netfilter: conntrack: unify sysctl handling")
Reported-and-tested-by: syzbot+fcee88b2d87f0539d...@syzkaller.appspotmail.com
Cc: Pablo Neira Ayuso 
Cc: Jozsef Kadlecsik 
Cc: Florian Westphal 
Signed-off-by: Cong Wang 
---
 net/netfilter/nf_conntrack_standalone.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_standalone.c 
b/net/netfilter/nf_conntrack_standalone.c
index 8928a4d0933e..c2ae14c720b4 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -1115,11 +1115,11 @@ static int nf_conntrack_pernet_init(struct net *net)
return 0;
 
 out_hooks:
-   nf_conntrack_fini_net(net);
+   nf_conntrack_cleanup_net(net);
 out_init_net:
nf_conntrack_standalone_fini_proc(net);
 out_proc:
-   nf_conntrack_cleanup_net(net);
+   nf_conntrack_standalone_fini_sysctl(net);
return ret;
 }
 
-- 
2.20.1



[Patch nf] xt_hashlimit: use s->file instead of s->private

2018-09-05 Thread Cong Wang
After switching to the new procfs API, it is supposed to
retrieve the private pointer from PDE_DATA(file_inode(s->file)),
s->private is no longer referred.

Fixes: 1cd671827290 ("netfilter/x_tables: switch to proc_create_seq_private")
Reported-by: Sami Farin 
Cc: Christoph Hellwig 
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/netfilter/xt_hashlimit.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9b16402f29af..3e7d259e5d8d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -1057,7 +1057,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = 
{
 static void *dl_seq_start(struct seq_file *s, loff_t *pos)
__acquires(htable->lock)
 {
-   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket;
 
spin_lock_bh(&htable->lock);
@@ -1074,7 +1074,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos)
 
 static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
-   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket = v;
 
*pos = ++(*bucket);
@@ -1088,7 +1088,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, 
loff_t *pos)
 static void dl_seq_stop(struct seq_file *s, void *v)
__releases(htable->lock)
 {
-   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket = v;
 
if (!IS_ERR(bucket))
@@ -1130,7 +1130,7 @@ static void dl_seq_print(struct dsthash_ent *ent, 
u_int8_t family,
 static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
   struct seq_file *s)
 {
-   struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+   struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
 
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1145,7 +1145,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, 
u_int8_t family,
 static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
   struct seq_file *s)
 {
-   struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+   struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
 
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1160,7 +1160,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, 
u_int8_t family,
 static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
 {
-   struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+   struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
 
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1174,7 +1174,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, 
u_int8_t family,
 
 static int dl_seq_show_v2(struct seq_file *s, void *v)
 {
-   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket = (unsigned int *)v;
struct dsthash_ent *ent;
 
@@ -1188,7 +1188,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v)
 
 static int dl_seq_show_v1(struct seq_file *s, void *v)
 {
-   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
 
@@ -1202,7 +1202,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
 
 static int dl_seq_show(struct seq_file *s, void *v)
 {
-   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+   struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
 
-- 
2.14.4



Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-24 Thread Cong Wang
On Tue, Jul 24, 2018 at 8:14 AM David Ahern  wrote:
>
> On 7/19/18 11:12 AM, Cong Wang wrote:
> > On Thu, Jul 19, 2018 at 9:16 AM David Ahern  wrote:
> >>
> >> Chatting with Nikolay about this and he brought up a good corollary - ip
> >> fragmentation. It really is a similar problem in that memory is consumed
> >> as a result of packets received from an external entity. The ipfrag
> >> sysctls are per namespace with a limit that non-init_net namespaces can
> >> not set high_thresh > the current value of init_net. Potential memory
> >> consumed by fragments scales with the number of namespaces which is the
> >> primary concern with making neighbor tables per namespace.
> >
> > Nothing new, already discussed:
> > https://marc.info/?l=linux-netdev&m=140391416215988&w=2
> >
> > :)
> >
>
> Neighbor tables, bridge fdbs, vxlan fdbs and ip fragments all consume
> local memory resources due to received packets. bridge and vxlan fdb's
> are fairly straightforward analogs to neighbor entries; they are per
> device with no limits on the number of entries. Fragments have memory
> limits per namespace. So neighbor tables are the only ones with this
> strict limitation and concern on memory consumption.
>
> I get the impression there is no longer a strong resistance against
> moving the tables to per namespace, but deciding what is the right
> approach to handle backwards compatibility. Correct? Changing the
> accounting is inevitably going to be noticeable to some use case(s), but
> with sysctl settings it is a simple runtime update once the user knows
> to make the change.

This question definitely should go to Eric Biederman who was against
my proposal.

Let's add Eric into CC.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-19 Thread Cong Wang
On Thu, Jul 19, 2018 at 9:16 AM David Ahern  wrote:
>
> Chatting with Nikolay about this and he brought up a good corollary - ip
> fragmentation. It really is a similar problem in that memory is consumed
> as a result of packets received from an external entity. The ipfrag
> sysctls are per namespace with a limit that non-init_net namespaces can
> not set high_thresh > the current value of init_net. Potential memory
> consumed by fragments scales with the number of namespaces which is the
> primary concern with making neighbor tables per namespace.

Nothing new, already discussed:
https://marc.info/?l=linux-netdev&m=140391416215988&w=2

:)
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-17 Thread Cong Wang
On Tue, Jul 17, 2018 at 12:02 PM David Ahern  wrote:
> As for the per-namespace tables, it is 4 years later and over that time
> Linux supports a number of features: EVPN which is very mac heavy, VRR
> which doubles mac entries (one against the VRR device and one against
> the lower device) and NOS level features such as mlxsw which has to
> ensure mac entries for nexthop gateaways stay active. In addition there
> are other features on the horizon - like the ability to use namespaces
> to create virtual switches (what Cisco calls a VDC) where you absolutely
> want isolation and not allowing entries from virtual switch to evict
> entries from another. And of course the continued proliferation of
> containerized workloads where isolation is desired.

As long as no change in neigh table code base itself, these can't
address the concern people raised before.


>
> I understand the concern about global resource and limits: as it stands
> you have to increase the limits in init_net to the max expected and hope
> for the best. With per namespace limits you can lower the limits of each
> namespace better control the total impact on the total memory used.

The problem is that the number of containers in a host is usually
not predictable.

Of course, you can say containers limit kernel memory too, but
memcg is not part of netns. I once told David Miller cpuset is the
isolation for isolating per-CPU softnet_data, he didn't like it. Based
on that I don't think you can convince him with memcg as a solution
here.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-17 Thread Cong Wang
On Tue, Jul 17, 2018 at 10:43 AM David Ahern  wrote:
>
> On 7/17/18 11:40 AM, Cong Wang wrote:
> > On Tue, Jul 17, 2018 at 5:11 AM  wrote:
> >>
> >> From: David Ahern 
> >>
> >> Nikita Leshenko reported that neighbor entries in one namespace can
> >> evict neighbor entries in another. The problem is that the neighbor
> >> tables have entries across all namespaces without separate accounting
> >> and with global limits on when to scan for entries to evict.
> >
> > It is nothing new, people including me already noticed this before.
> >
> >
> >>
> >> Resolve by making the neighbor tables for ipv4, ipv6 and decnet per
> >> namespace and making the accounting and threshold limits per namespace.
> >
> >
> > The last discussion about this a long time ago concluded that neigh
> > table entries are controllable by remote, so after moving it to per netns,
> > it would be easier to DOS the host.
> >
>
> There are still limits on the total number of entries and with
> per-namespace limits an admin has better control.

Per-netns limit is *exactly* the problem here.

Quote from David Miller:
"
From: ebied...@xmission.com (Eric W. Biederman)
Date: Wed, 25 Jun 2014 18:17:08 -0700

> I disagree that removing a global DOS prevention check is a benefit.
> Certainly large semantics changes like that should not happen without
> being discussed in the patch description.

Agreed, this is the most important core issue.

If we just make these things per netns, then as a result if you create
N namespaces we will allow N times more neighbour entries to be
sitting in the system at once.

Actually, I'm really surprised the limits get hit and this actually
causes problems.
"

You can see the original discussion here:
https://marc.info/?l=linux-netdev&m=140356141019653&w=2
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-17 Thread Cong Wang
On Tue, Jul 17, 2018 at 5:11 AM  wrote:
>
> From: David Ahern 
>
> Nikita Leshenko reported that neighbor entries in one namespace can
> evict neighbor entries in another. The problem is that the neighbor
> tables have entries across all namespaces without separate accounting
> and with global limits on when to scan for entries to evict.

It is nothing new, people including me already noticed this before.


>
> Resolve by making the neighbor tables for ipv4, ipv6 and decnet per
> namespace and making the accounting and threshold limits per namespace.


The last discussion about this a long time ago concluded that neigh
table entries are controllable by remote, so after moving it to per netns,
it would be easier to DOS the host.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next 0/2] net: preserve sock reference when scrubbing the skb.

2018-06-29 Thread Cong Wang
On Thu, Jun 28, 2018 at 7:22 PM David Miller  wrote:
>
> From: Cong Wang 
> Date: Thu, 28 Jun 2018 14:53:09 -0700
>
> > I will send a revert with quote of the above.
>
> And it will go to /dev/null as far as I am concerned.  I read it the
> first time, so posting it again will not change my opinion of what you
> have to say.

David, you claim you read it, now tell me, where is "cgroups" or "cpu"
from?

This is the link of my reply you quoted:
https://marc.info/?l=linux-netdev&m=153013948711582&w=2

I did mention cgroups to Eric because of isolation, the softnet_data
is per-CPU, and CPU is not isolated by netns apparently, therefore
sd->input_pkt_queue can't be totally isolated for netns without cpuset.

But this is never the reason why I dislike it, this is why I never even
mentioned it in the link above.

>
> Cong, you really need to calm down and understand that people perhaps
> simply fundamentally disagree with you.

1. Eric's "forwarding to eth0" is missing, never brought up until in his
private reply. Without this information, XPS makes no sense at all in
this patchset. For the record, I provide a different solution for Eric.

2. No one responses to:
https://marc.info/?l=linux-netdev&m=153013948711582&w=2
I never expect you agree with me on all of them, but no one
gives me any response to my concerns.

3. I will write a blog post to draw you some pictures, since it
is so hard to understand the isolation...
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-28 Thread Cong Wang
On Mon, Jun 25, 2018 at 8:59 AM Flavio Leitner  wrote:
> XPS breaks because the queue mapping stored in the socket is not
> available, so another random queue might be selected when the stack
> needs to transmit something like a TCP ACK, or TCP Retransmissions.
> That causes packet re-ordering and/or performance issues.

Now let me look at the XPS part, a key question first:

By queue mapping stored in socket, you mean sk_tx_queue_get(),
which is only called in __netdev_pick_tx(), and of course even before
hitting qdisc layer.

However, veth device orphans the skb inside its veth_xmit(),
(dev_forward_skb()), which is after going through qdisc layer.

So, how could the skb_orphan() called _after_ XPS break XPS?

We are talking about a simple netns-to-netns case, so XPS won't
be hit again once leaves it.

Another _dumb_ question:

veth is virtual device, it has literally no queues, I know technically
there is a queue for installing qdisc.

So, why does even queue mapping matters here???
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next 0/2] net: preserve sock reference when scrubbing the skb.

2018-06-28 Thread Cong Wang
On Wed, Jun 27, 2018 at 12:39 PM Cong Wang  wrote:
>
> Let me rephrase why I don't like this patchset:
>
> 1. Let's forget about TSQ for a moment, skb_orphan() before leaving
> the stack is not just reasonable but also aligning to network isolation
> design. You can't claim skb_orphan() is broken from beginning, it is
> designed in this way and it is intentional.
>
> 2. Now, let's consider the current TSQ behavior (without any patch):
>
> 2a. For packets leaving the host or just leaving the stack to another
> netns, there is no difference, and this should be expected from user's
> point of view, because I don't need to think about its destination to
> decide how I should configure tcp_limit_output_bytes.
>
> 2b. The hidden pipeline behind TSQ is well defined, that is, any
> queues in between L4 and L2, most importantly qdisc. I can easily
> predict the number of queues my packets will go through with a
> given configuration. This also aligns with 2a.
>
> 2c. Isolation is respected as it should. TCP sockets in this netns
> won't be influenced by any factor in another netns.
>
> Now with your patchset:
>
> 2a. There is an apparent difference for packets leaving the host
> and for packets just leaving this stack.
>
> 2b. You extend the pipeline to another netns's L3, which means
> the number of queues is now unpredictable.
>
> 2c. Isolation is now slightly broken, the other netns could influence
> the source netns.
>
> I don't see you have any good argument on any of these 3 points.

No one finishes reading this.
I will send a revert with quote of the above.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-28 Thread Cong Wang
On Wed, Jun 27, 2018 at 1:19 PM Flavio Leitner  wrote:
>
> On Wed, Jun 27, 2018 at 12:06:16PM -0700, Cong Wang wrote:
> > On Wed, Jun 27, 2018 at 5:32 AM Flavio Leitner  wrote:
> > >
> > > On Tue, Jun 26, 2018 at 06:28:27PM -0700, Cong Wang wrote:
> > > > On Tue, Jun 26, 2018 at 5:39 PM Flavio Leitner  wrote:
> > > > >
> > > > > On Tue, Jun 26, 2018 at 05:29:51PM -0700, Cong Wang wrote:
> > > > > > On Tue, Jun 26, 2018 at 4:33 PM Flavio Leitner  
> > > > > > wrote:
> > > > > > >
> > > > > > > It is still isolated, the sk carries the netns info and it is
> > > > > > > orphaned when it re-enters the stack.
> > > > > >
> > > > > > Then what difference does your patch make?
> > > > >
> > > > > Don't forget it is fixing two issues.
> > > >
> > > > Sure. I am only talking about TSQ from the very beginning.
> > > > Let me rephrase my above question:
> > > > What difference does your patch make to TSQ?
> > >
> > > It avoids burstiness.
> >
> > Never even mentioned in changelog or in your patch. :-/
>
> It's part of queueing and helping the bufferbloat problem in the
> commit message.

Please don't add all queues in this scope. Are you really
going to put all queues in networking into your "bufferbloat" claim?
Seriously? Please get it defined, seriously. You really need to
read into the other reply from me, none of you or David even
seriously finish reading it.


>
> > > > > > Before your patch:
> > > > > > veth orphans skb in its xmit
> > > > > >
> > > > > > After your patch:
> > > > > > RX orphans it when re-entering stack (as you claimed, I don't know)
> > > > >
> > > > > ip_rcv, and equivalents.
> > > >
> > > > ip_rcv() is L3, we enter a stack from L1. So your above claim is 
> > > > incorrect. :)
> > >
> > > Maybe you found a problem, could you please point me to where in
> > > between L1 to L3 the socket is relevant?
> >
> > Of course, ingress qdisc is in L2. Do I need to say more? This
> > is where we can re-route the packets, for example, redirecting it to
> > yet another netns. This is in fact what we use in production, not anything
> > that only in my imagination.
> >
> > You really have to think about why you allow a different netns influence
> > another netns by holding the skb to throttle the source TCP socket.
>
> Maybe I wasn't clear and you didn't understand the question. Please find
> a spot where the preserved socket is used incorrectly.


It's sad you still don't get what I mean, I never complain you leak skb->sk,
I complain you break TSQ. Dragging discussion into skb->sk doesn't
even help.


>
> > > > which means you have to update Documentation/networking/ip-sysctl.txt
> > > > too.
> > >
> > > How it is never targeted? Whole point is to avoid queueing traffic.
> >
> > What queues? You really need to define it, seriously.
> >
> >
> > > Would you be okay if I include this chunk?
> >
> > No, still lack of an explanation why it comes across netns for
> > a good reason.
>
> Because it doesn't. Since you talk more about veth, let's pick it
> as an example. The TX is nothing more than add to the CPU backlog,


That's RX, assume "CPU backlog" here still means softnet_data.


> right? That is netns agnostic. The same for processing that queue
> which will push the skb anyways and will call skb_orphan().


Once it leaves TX, it leaves the stack. skb_orphan() called
in L3 (as you claimed) is already in yet another stack.


>
> How can one netns avoid/delay the skb_orphan()? And even if does
> that, what gain will you have to allow queuing of more and more
> packets in the CPU backlog? It is stalled.

Please read the other reply from me, you don't even understand
what a boundary of a stack is.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next 0/2] net: preserve sock reference when scrubbing the skb.

2018-06-28 Thread Cong Wang
On Thu, Jun 28, 2018 at 6:20 AM David Miller  wrote:
>
> From: Cong Wang 
> Date: Wed, 27 Jun 2018 12:39:01 -0700
>
> > Let me rephrase why I don't like this patchset:
>
> Cong, I don't think you are seeing the situation clearly and
> I am certainly going to apply this patch series even in the
> face of your objections.
>
> Suggesting that solving the lack of back pressure on a UDP
> socket caused by this problem by using cgroups or cpu
> usage controllers is just complete and utter madness.

Pretty sure you didn't even read the rest of my reply,
I can't help you if you just stop at where you quoted.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-27 Thread Cong Wang
On Wed, Jun 27, 2018 at 12:33 PM Eric Dumazet  wrote:
>
>
>
> On 06/27/2018 11:59 AM, Cong Wang wrote:
>
> >
> > IIRC, this skb_orphan() was introduced much earlier than TSQ, probably
> > from the beginning of veth.
>
> Sigh
>
> SO_SNDBUF was invented years ago before veth.

Yeah, probably when there was only one stack on one host.
SO_SNDBUF is aligned to networking stack basis.

>
> You focus on TSQ while it is only one of the many things that are broken.
>

I think it is the opposite: this patchset _potentially_ breaks things, not fixes
anything.


> >
> > Leaving the stack should be effectively equivalent to leaving the host,
> > from the view of network isolation.
> >
>
>
> Having a UDP socket being able to burn a cpu and fill a qdisc is a major bug.
>


Very true, network isolation never isolates CPU or memory.
It is cpuset's job to provide physical CPU isolation, not networking
namespace. I don't want to defend this, but it is the current design.


> Bu default (blocking send() syscalls) the following loop should
> block the thread if socket sk_wmem_alloc hits sk_sndbuf, this is
> the beauty of backpressure.
>
> while (1)
> send(fd, ...);
>
> With skb_orphan(), sk_wmem_alloc will stay around 0, so the loop will burn a 
> cpu
> and fill a qdisc, eventually breaking "network isolation", since other sockets
> might be unable to send a single packet.

Won't the same happen when congestion on a physical connection
between two hosts? Does 'host isolation' break too?

>
> If you have a concrete case where the skb_orphan() is needed, then you will 
> have
> to add a parameter to let the admin opt-in for this.

Please see the other reply from me, where I list 3 or 4 reasons.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next 0/2] net: preserve sock reference when scrubbing the skb.

2018-06-27 Thread Cong Wang
Let me rephrase why I don't like this patchset:

1. Let's forget about TSQ for a moment, skb_orphan() before leaving
the stack is not just reasonable but also aligning to network isolation
design. You can't claim skb_orphan() is broken from beginning, it is
designed in this way and it is intentional.

2. Now, let's consider the current TSQ behavior (without any patch):

2a. For packets leaving the host or just leaving the stack to another
netns, there is no difference, and this should be expected from user's
point of view, because I don't need to think about its destination to
decide how I should configure tcp_limit_output_bytes.

2b. The hidden pipeline behind TSQ is well defined, that is, any
queues in between L4 and L2, most importantly qdisc. I can easily
predict the number of queues my packets will go through with a
given configuration. This also aligns with 2a.

2c. Isolation is respected as it should. TCP sockets in this netns
won't be influenced by any factor in another netns.

Now with your patchset:

2a. There is an apparent difference for packets leaving the host
and for packets just leaving this stack.

2b. You extend the pipeline to another netns's L3, which means
the number of queues is now unpredictable.

2c. Isolation is now slightly broken, the other netns could influence
the source netns.

I don't see you have any good argument on any of these 3 points.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-27 Thread Cong Wang
On Wed, Jun 27, 2018 at 5:32 AM Flavio Leitner  wrote:
>
> On Tue, Jun 26, 2018 at 06:28:27PM -0700, Cong Wang wrote:
> > On Tue, Jun 26, 2018 at 5:39 PM Flavio Leitner  wrote:
> > >
> > > On Tue, Jun 26, 2018 at 05:29:51PM -0700, Cong Wang wrote:
> > > > On Tue, Jun 26, 2018 at 4:33 PM Flavio Leitner  wrote:
> > > > >
> > > > > It is still isolated, the sk carries the netns info and it is
> > > > > orphaned when it re-enters the stack.
> > > >
> > > > Then what difference does your patch make?
> > >
> > > Don't forget it is fixing two issues.
> >
> > Sure. I am only talking about TSQ from the very beginning.
> > Let me rephrase my above question:
> > What difference does your patch make to TSQ?
>
> It avoids burstiness.

Never even mentioned in changelog or in your patch. :-/


>
> > > > Before your patch:
> > > > veth orphans skb in its xmit
> > > >
> > > > After your patch:
> > > > RX orphans it when re-entering stack (as you claimed, I don't know)
> > >
> > > ip_rcv, and equivalents.
> >
> > ip_rcv() is L3, we enter a stack from L1. So your above claim is incorrect. 
> > :)
>
> Maybe you found a problem, could you please point me to where in
> between L1 to L3 the socket is relevant?
>

Of course, ingress qdisc is in L2. Do I need to say more? This
is where we can re-route the packets, for example, redirecting it to
yet another netns. This is in fact what we use in production, not anything
that only in my imagination.

You really have to think about why you allow a different netns influence
another netns by holding the skb to throttle the source TCP socket.


>
> > > > And for veth pair:
> > > > xmit from one side is RX for the other side
> > > > So, where is the queueing? Where is the buffer bloat? GRO list??
> > >
> > > CPU backlog.
> >
> > Yeah, but this is never targeted by TSQ:
> >
> > tcp_limit_output_bytes limits the number of bytes on qdisc
> > or device to reduce artificial RTT/cwnd and reduce bufferbloat.
> >
> > which means you have to update Documentation/networking/ip-sysctl.txt
> > too.
>
> How it is never targeted? Whole point is to avoid queueing traffic.

What queues? You really need to define it, seriously.


> Would you be okay if I include this chunk?

No, still lack of an explanation why it comes across netns for
a good reason.

>
> diff --git a/Documentation/networking/ip-sysctl.txt 
> b/Documentation/networking/ip-sysctl.txt
> index ce8fbf5aa63c..f4c042be0216 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -733,11 +733,11 @@ tcp_limit_output_bytes - INTEGER
> Controls TCP Small Queue limit per tcp socket.
> TCP bulk sender tends to increase packets in flight until it
> gets losses notifications. With SNDBUF autotuning, this can
> -   result in a large amount of packets queued in qdisc/device
> -   on the local machine, hurting latency of other flows, for
> -   typical pfifo_fast qdiscs.
> -   tcp_limit_output_bytes limits the number of bytes on qdisc
> -   or device to reduce artificial RTT/cwnd and reduce bufferbloat.
> +   result in a large amount of packets queued on the local machine
> +   (e.g.: qdiscs, CPU backlog, or device) hurting latency of other


Apparently CPU backlog never happens when leaving the host.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-27 Thread Cong Wang
On Tue, Jun 26, 2018 at 7:35 PM Eric Dumazet  wrote:
>
>
>
> On 06/26/2018 05:44 PM, Cong Wang wrote:
>
> > With this, a netns could totally throttle a TCP socket in a different
> > netns by holding the packets infinitely (e.g. putting them in a loop).
> > This is where the isolation breaks.
> >
>
> That is fine, really. Admin error -> Working as intended.

The point is never it is an error or not, the point is one netns could
influence another one with this change.

>
> The current scrubbing is simply wrong, not documented, and added by someone
> who had absolutely not intended all the side effects.
>

IIRC, this skb_orphan() was introduced much earlier than TSQ, probably
from the beginning of veth.

Leaving the stack should be effectively equivalent to leaving the host,
from the view of network isolation.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-26 Thread Cong Wang
On Tue, Jun 26, 2018 at 5:39 PM Flavio Leitner  wrote:
>
> On Tue, Jun 26, 2018 at 05:29:51PM -0700, Cong Wang wrote:
> > On Tue, Jun 26, 2018 at 4:33 PM Flavio Leitner  wrote:
> > >
> > > It is still isolated, the sk carries the netns info and it is
> > > orphaned when it re-enters the stack.
> >
> > Then what difference does your patch make?
>
> Don't forget it is fixing two issues.

Sure. I am only talking about TSQ from the very beginning.
Let me rephrase my above question:
What difference does your patch make to TSQ?


>
> > Before your patch:
> > veth orphans skb in its xmit
> >
> > After your patch:
> > RX orphans it when re-entering stack (as you claimed, I don't know)
>
> ip_rcv, and equivalents.


ip_rcv() is L3, we enter a stack from L1. So your above claim is incorrect. :)

>
> > And for veth pair:
> > xmit from one side is RX for the other side
> > So, where is the queueing? Where is the buffer bloat? GRO list??
>
> CPU backlog.

Yeah, but this is never targeted by TSQ:

tcp_limit_output_bytes limits the number of bytes on qdisc
or device to reduce artificial RTT/cwnd and reduce bufferbloat.

which means you have to update Documentation/networking/ip-sysctl.txt
too.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-26 Thread Cong Wang
On Tue, Jun 26, 2018 at 4:53 PM Eric Dumazet  wrote:
>
>
>
> On 06/26/2018 03:47 PM, Cong Wang wrote:
> >
> > You need to justify why you want to break the TSQ's scope here,
> > which is obviously not compatible with netns design.
>
> You have to explain why you do not want us to fix this buggy behavior.
>
> Right now TSQ (and more generally back pressure) is broken by this 
> skb_orphan()
>
> So we want to restore TSQ (and back pressure)
>
> TSQ scope never mentioned netns.

Conceptually, it is limiting the pipeline from L4 to L2 within one stack, now
you are extending it to NS1 (L4...L2)...NS2 (L2...L4) which could across
multiple stacks and _in theory_ could be infinitely long.

And TSQ setting is per-netns:

2180 static bool tcp_small_queue_check(struct sock *sk, const struct
sk_buff *skb,
2181   unsigned int factor)
2182 {
2183 unsigned int limit;
2184
2185 limit = max(2 * skb->truesize, sk->sk_pacing_rate >>
sk->sk_pacing_shift);
2186 limit = min_t(u32, limit,
2187   sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
2188 limit <<= factor;

Should I expect to increase sysctl_tcp_limit_output_bytes based on the
number of stacks it crosses?

> We (TCP stack TSQ handler) want to be notified when this packet leaves the 
> host,
> even if it had to traverse multiple netns (for whatever reasons).
>
> _If_ a packet is locally 'consumed' (like on loopback device, or veth pair),
> then the skb_orphan() will automatically be done.
>
> If you have a case where this skb_orphan() is needed, please add it at the 
> needed place.


With this, a netns could totally throttle a TCP socket in a different
netns by holding the packets infinitely (e.g. putting them in a loop).
This is where the isolation breaks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-26 Thread Cong Wang
On Tue, Jun 26, 2018 at 4:33 PM Flavio Leitner  wrote:
>
> It is still isolated, the sk carries the netns info and it is
> orphaned when it re-enters the stack.

Then what difference does your patch make?

Before your patch:
veth orphans skb in its xmit

After your patch:
RX orphans it when re-entering stack (as you claimed, I don't know)

And for veth pair:
xmit from one side is RX for the other side

So, where is the queueing? Where is the buffer bloat? GRO list??
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-26 Thread Cong Wang
On Tue, Jun 26, 2018 at 3:03 PM Flavio Leitner  wrote:
>
> On Tue, Jun 26, 2018 at 02:48:47PM -0700, Cong Wang wrote:
> > On Mon, Jun 25, 2018 at 11:41 PM Eric Dumazet  
> > wrote:
> > > When a packet is attached to a socket, we should keep the association as 
> > > much as possible.
> >
> > As much as possible within one stack, I agree. I still don't understand
> > why we should keep it across the stack boundary.
> >
> > > Only when a new association needs to be done, skb_orphan() needs to be 
> > > called.
> > >
> > > Doing this skb_orphan() too soon breaks back pressure in general, this is 
> > > bad, since a socket
> > > can evades SO_SNDBUF limits.
> >
> > Right before leaving the stack is not too soon, it is the latest
> > actually, for veth case.
>
> Depends on how you view things - it's the same host/stack sharing the
> same resources, so why should we not keep it?

Because stacks are supposed to be independent, netdevices are
isolated, iptables and route tables too. This is how netns is designed
from the beginning. The trend today is actually more isolation instead
of more sharing, given the popularity of containers.

You need to justify why you want to break the TSQ's scope here,
which is obviously not compatible with netns design.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-26 Thread Cong Wang
On Mon, Jun 25, 2018 at 11:41 PM Eric Dumazet  wrote:
>
>
>
> On 06/25/2018 09:15 PM, Cong Wang wrote:
> > On Mon, Jun 25, 2018 at 8:59 AM Flavio Leitner  wrote:
> >>
> >> The sock reference is lost when scrubbing the packet and that breaks
> >> TSQ (TCP Small Queues) and XPS (Transmit Packet Steering) causing
> >> performance impacts of about 50% in a single TCP stream when crossing
> >> network namespaces.
> >>
> >> XPS breaks because the queue mapping stored in the socket is not
> >> available, so another random queue might be selected when the stack
> >> needs to transmit something like a TCP ACK, or TCP Retransmissions.
> >> That causes packet re-ordering and/or performance issues.
> >>
> >> TSQ breaks because it orphans the packet while it is still in the
> >> host, so packets are queued contributing to the buffer bloat problem.
> >
> > Why should TSQ in one stack care about buffer bloat in another stack?
> >
> > Actually, I think the current behavior is correct, once the packet leaves
> > its current stack (or netns), it should relief the backpressure on TCP
> > socket in this stack, whether it will be queued in another stack is beyond
> > its concern. This breaks the isolation between networking stacks.
> >
>
> We discussed about this during netconf Cong, nobody was against this planned 
> removal.

I agreed to keep skb->sk, but didn't realize it actually impacts TSQ too.


>
> When a packet is attached to a socket, we should keep the association as much 
> as possible.

As much as possible within one stack, I agree. I still don't understand
why we should keep it across the stack boundary.

>
> Only when a new association needs to be done, skb_orphan() needs to be called.
>
> Doing this skb_orphan() too soon breaks back pressure in general, this is 
> bad, since a socket
> can evades SO_SNDBUF limits.

Right before leaving the stack is not too soon, it is the latest
actually, for veth case.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.

2018-06-25 Thread Cong Wang
On Mon, Jun 25, 2018 at 8:59 AM Flavio Leitner  wrote:
>
> The sock reference is lost when scrubbing the packet and that breaks
> TSQ (TCP Small Queues) and XPS (Transmit Packet Steering) causing
> performance impacts of about 50% in a single TCP stream when crossing
> network namespaces.
>
> XPS breaks because the queue mapping stored in the socket is not
> available, so another random queue might be selected when the stack
> needs to transmit something like a TCP ACK, or TCP Retransmissions.
> That causes packet re-ordering and/or performance issues.
>
> TSQ breaks because it orphans the packet while it is still in the
> host, so packets are queued contributing to the buffer bloat problem.

Why should TSQ in one stack care about buffer bloat in another stack?

Actually, I think the current behavior is correct, once the packet leaves
its current stack (or netns), it should relief the backpressure on TCP
socket in this stack, whether it will be queued in another stack is beyond
its concern. This breaks the isolation between networking stacks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/14] Modify action API for implementing lockless actions

2018-05-25 Thread Cong Wang
On Fri, May 25, 2018 at 1:39 PM, Vlad Buslov  wrote:
>
> On Thu 24 May 2018 at 23:34, Cong Wang  wrote:
>> On Mon, May 14, 2018 at 7:27 AM, Vlad Buslov  wrote:
>>> Currently, all netlink protocol handlers for updating rules, actions and
>>> qdiscs are protected with single global rtnl lock which removes any
>>> possibility for parallelism. This patch set is a first step to remove
>>> rtnl lock dependency from TC rules update path. It updates act API to
>>> use atomic operations, rcu and spinlocks for fine-grained locking. It
>>> also extend API with functions that are needed to update existing
>>> actions for parallel execution.
>>
>> Can you give a summary here for what and how it is achieved?
>
> Got it, will expand cover letter in V2 with summary.
>>
>> You said this is the first step, what do you want to achieve in this
>> very first step? And how do you achieve it? Do you break the RTNL
>
> But aren't this questions answered in paragraph you quoted?


Obviously not, you said to remove it, but never explains why it can
be removed and how it is removed. This is crucial for review.

"use atomic operations, rcu and spinlocks for fine-grained locking"
is literately nothing, why atomic/rcu makes RTNL unnecessary?
How RCU is used? What spinlocks are you talking about? What
do these spinlocks protect after removing RTNL? Why are they
safe with other netdevice and netns operations?

You explain _nothing_ here. Really. Please don't force people to
read 14 patches to understand how it works. In fact, no one wants
to read the code unless there is some high-level explanation that
makes basic sense.


> What: Change act API to not rely on one-big-global-RTNL-lock and to use
> more fine-grained synchronization methods to allow safe concurrent
> execution.

Sure, how fine-grained it is after your patchset? Why this fine-grained
lock could safely replace RTNL?

Could you stop letting us guess your puzzle words? It would save your
time from exchanging emails with me, it would save my time from
guessing you too. It is a win-win.


> How: Refactor act API code to use atomics, rcu and spinlocks, etc. for
> protecting shared data structures, add new functions required to update


What shared data structures? The per-netns idr which is already protected
by a spinlock? The TC hierarchy? The shared standalone actions? Hey,
why do I have to guess? :-/


> specific actions implementation for parallel execution. (step 2)


Claim is easy, prove is hard. I can easily claim I break RTNL down
to a per-netns lock, but I can't prove it really works. :-D


>
> If you feel that this cover letter is too terse, I will add outline of
> changes in V2.

It is not my rule, it is how you have to help people to review your
14 patches. I think it is a fair game: you help people like me to
review your patches, we help you to get them reviewed and merged
if they all make sense.



>
>> lock down to, for a quick example, a per-device lock? Or perhaps you
>> completely remove it because of what reason?
>
> I want to remove RTNL _dependency_ from act API data structures and
> code. I probably should me more specific in this case:
>
> Florian recently made a change that allows registering netlink protocol
> handlers with flag RTNL_FLAG_DOIT_UNLOCKED. Handlers registered with
> this flag are called without RTNL taken. My end goal is to have rule
> update handlers(RTM_NEWTFILTER, RTM_DELTFILTER, etc.) to be registered
> with UNLOCKED flag to allow parallel execution.


Please add this paragraph in your cover letter, it is very important for review.

>
> I do not intend to globally remove or break RTNL.
>
>>
>> I go through all the descriptions of your 14 patches (but not any code),
>> I still have no clue how you successfully avoid RTNL. Please don't
>> let me read into your code to understand that, there must be some
>> high-level justification on how it works. Without it, I don't event want
>> to read into the code.
>
> On internal code review I've been asked not to duplicate info from
> commit messages in cover letter, but I guess I can expand it with some
> high level outline in V2.

In cover letter, you should put a high-level overview of "why" and "how".
If, in the worst case, on high-level it doesn't make sense, why should
we bother to read the code? In short, you have to convince people to
read your code here.

In each patch description, you should explain what a single patch does.
I don't see any duplication.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/14] Modify action API for implementing lockless actions

2018-05-24 Thread Cong Wang
On Mon, May 14, 2018 at 7:27 AM, Vlad Buslov  wrote:
> Currently, all netlink protocol handlers for updating rules, actions and
> qdiscs are protected with single global rtnl lock which removes any
> possibility for parallelism. This patch set is a first step to remove
> rtnl lock dependency from TC rules update path. It updates act API to
> use atomic operations, rcu and spinlocks for fine-grained locking. It
> also extend API with functions that are needed to update existing
> actions for parallel execution.

Can you give a summary here for what and how it is achieved?

You said this is the first step, what do you want to achieve in this
very first step? And how do you achieve it? Do you break the RTNL
lock down to, for a quick example, a per-device lock? Or perhaps you
completely remove it because of what reason?

I go through all the descriptions of your 14 patches (but not any code),
I still have no clue how you successfully avoid RTNL. Please don't
let me read into your code to understand that, there must be some
high-level justification on how it works. Without it, I don't event want
to read into the code.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch nf] ipvs: initialize tbl->entries in ip_vs_lblc_init_svc()

2018-04-23 Thread Cong Wang
Similarly, tbl->entries is not initialized after kmalloc(),
therefore causes an uninit-value warning in ip_vs_lblc_check_expire(),
as reported by syzbot.

Reported-by: 
Cc: Simon Horman 
Cc: Julian Anastasov 
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/netfilter/ipvs/ip_vs_lblc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 3057e453bf31..83918119ceb8 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -371,6 +371,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
tbl->counter = 1;
tbl->dead = false;
tbl->svc = svc;
+   atomic_set(&tbl->entries, 0);
 
/*
 *Hook periodic timer for garbage collection
-- 
2.13.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch nf] ipvs: initialize tbl->entries after allocation

2018-04-23 Thread Cong Wang
tbl->entries is not initialized after kmalloc(), therefore
causes an uninit-value warning in ip_vs_lblc_check_expire()
as reported by syzbot.

Reported-by: 
Cc: Simon Horman 
Cc: Julian Anastasov 
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/netfilter/ipvs/ip_vs_lblcr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 92adc04557ed..bc2bc5eebcb8 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -534,6 +534,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
tbl->counter = 1;
tbl->dead = false;
tbl->svc = svc;
+   atomic_set(&tbl->entries, 0);
 
/*
 *Hook periodic timer for garbage collection
-- 
2.13.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: linux-next: build failure after merge of the netfilter tree

2018-04-16 Thread Cong Wang
On Mon, Apr 16, 2018 at 4:28 PM, Stephen Rothwell  wrote:
> Hi all,
>
> After merging the netfilter tree, today's linux-next build (powerpc
> ppc64_defconfig) failed like this:
>
> net/netfilter/nf_conntrack_extend.c: In function 'nf_ct_ext_
> add':
> net/netfilter/nf_conntrack_extend.c:74:2: error: implicit declaration of 
> function 'kmemleak_not_leak' [-Werror=implicit-function-declaration]
>   kmemleak_not_leak(old);
>   ^
> cc1: some warnings being treated as errors
>
> Caused by commit
>
>   114aa35d06d4 ("netfilter: conntrack: silent a memory leak warning")
>
> I have added this patch for today:

Ouch, this is actually a correct fix.

Can you please send it formally or maybe Pablo can just take it directly?

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch nf] nf_conntrack_extend: silent a memory leak warning

2018-03-30 Thread Cong Wang
The following memory leak is false postive:

unreferenced object 0x8f37f156fb38 (size 128):
  comm "softirq", pid 0, jiffies 4294899665 (age 11.292s)
  hex dump (first 32 bytes):
6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  
00 00 00 00 30 00 20 00 48 6b 6b 6b 6b 6b 6b 6b  0. .Hkkk
  backtrace:
[<4fda266a>] __kmalloc_track_caller+0x10d/0x141
[<7b0a7e3c>] __krealloc+0x45/0x62
[<d08e0bfb>] nf_ct_ext_add+0xdc/0x133
[<99b47fd8>] init_conntrack+0x1b1/0x392
[<86dc36ec>] nf_conntrack_in+0x1ee/0x34b
[<940592de>] nf_hook_slow+0x36/0x95
[<d1bd4da7>] nf_hook.constprop.43+0x1c3/0x1dd
[<c3673266>] __ip_local_out+0xae/0xb4
[<3e4192a6>] ip_local_out+0x17/0x33
[<b64356de>] igmp_ifc_timer_expire+0x23e/0x26f
[<6a8f3032>] call_timer_fn+0x14c/0x2a5
[<650c1725>] __run_timers.part.34+0x150/0x182
[<90e6946e>] run_timer_softirq+0x2a/0x4c
[<4d1e7293>] __do_softirq+0x1d1/0x3c2
[<4643557d>] irq_exit+0x53/0xa2
[<29ddee8f>] smp_apic_timer_interrupt+0x22a/0x235

because __krealloc() is not supposed to release the old
memory and it is released later via kfree_rcu(). Since this is
the only external user of __krealloc(), just mark it as not leak
here.

Cc: Pablo Neira Ayuso 
Cc: Jozsef Kadlecsik 
Cc: Florian Westphal 
Signed-off-by: Cong Wang 
---
 net/netfilter/nf_conntrack_extend.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/nf_conntrack_extend.c 
b/net/netfilter/nf_conntrack_extend.c
index 9fe0ddc333fb..bd71a828ebde 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -71,6 +71,7 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, 
gfp_t gfp)
rcu_read_unlock();
 
alloc = max(newlen, NF_CT_EXT_PREALLOC);
+   kmemleak_not_leak(old);
new = __krealloc(old, alloc, gfp);
if (!new)
return NULL;
-- 
2.13.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: WARNING in __proc_create

2018-03-09 Thread Cong Wang
On Fri, Mar 9, 2018 at 3:21 PM, Eric Dumazet  wrote:
>
>
> On 03/09/2018 03:05 PM, Cong Wang wrote:
>>
>>
>> BTW, the warning itself is all about empty names, so perhaps
>> it's better to fix them separately.
>
>
> Huh ? You want more syzbot reports ? I do not.

I always prefer one patch to fix one problem, and, as I already said,
checking "." and ".." is probably not as trivial as checking empty.

This why I believe 2 (or more) patches here are better than 1 single
patch.

BTW, I don't have any patch, it is so trivial that I don't want to spend
my time on it.

>
> I unblocked this report today [1], you can be sure that as soon
> as syzbot gets the correct tag (Reported-by: ...), it will find the other
> problems, which are currently on hold to avoid spam.
>
> [1] It has been sitting for a while here at Google, since January 28th.
>At some point you have to ping Pablo/Florian again ;)

Sure, it can always find more problems... why just complain about
this one alone? ;)
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: WARNING in __proc_create

2018-03-09 Thread Cong Wang
On Fri, Mar 9, 2018 at 2:58 PM, Eric Dumazet  wrote:
>
>
> On 03/09/2018 02:56 PM, Eric Dumazet wrote:
>
>>
>> I sent a patch a while back, but Pablo/Florian wanted more than that
>> simple fix.
>>
>> We also need to filter special characters like '/'

proc_create_data() itself accepts '/', so it must be xt_hashlimit doesn't
want it.

>>
>> Or maybe I am mixing with something else.
>
>
> Yes, Florian mentioned that we also had to reject "."  and ".."
>

It could be, but looks like not as trivial as just strstr(".")?

BTW, the warning itself is all about empty names, so perhaps
it's better to fix them separately.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: WARNING in __proc_create

2018-03-09 Thread Cong Wang
On Fri, Mar 9, 2018 at 1:59 PM, syzbot
 wrote:
> Hello,
>
> syzbot hit the following crash on net-next commit
> 617aebe6a97efa539cc4b8a52adccd89596e6be0 (Sun Feb 4 00:25:42 2018 +)
> Merge tag 'usercopy-v4.16-rc1' of
> git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux
>
> So far this crash happened 12 times on net-next, upstream.
> C reproducer is attached.
> syzkaller reproducer is attached.
> Raw console output is attached.
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached.
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+0502b00edac2a0680...@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for
> details.
> If you forward the report, please keep this part and the footer.
>
> audit: type=1400 audit(1518223777.419:7): avc:  denied  { map } for
> pid=4159 comm="syzkaller598581" path="/root/syzkaller598581546" dev="sda1"
> ino=16481 scontext=unconfined_u:system_r:insmod_t:s0-s0:c0.c1023
> tcontext=unconfined_u:object_r:user_home_t:s0 tclass=file permissive=1
> [ cut here ]
> name len 0
> WARNING: CPU: 0 PID: 4159 at fs/proc/generic.c:354 __proc_create+0x696/0x880
> fs/proc/generic.c:354

We need to reject empty names.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch nf-next] netfilter: make xt_rateest hash table per net

2018-03-01 Thread Cong Wang
As suggested by Eric, we need to make the xt_rateest
hash table and its lock per netns to reduce lock
contentions.

Cc: Florian Westphal 
Cc: Eric Dumazet 
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 include/net/netfilter/xt_rateest.h |  4 +-
 net/netfilter/xt_RATEEST.c | 91 +++---
 net/netfilter/xt_rateest.c | 10 ++---
 3 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/include/net/netfilter/xt_rateest.h 
b/include/net/netfilter/xt_rateest.h
index b1db13772554..832ab69efda5 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -21,7 +21,7 @@ struct xt_rateest {
struct net_rate_estimator __rcu *rate_est;
 };
 
-struct xt_rateest *xt_rateest_lookup(const char *name);
-void xt_rateest_put(struct xt_rateest *est);
+struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name);
+void xt_rateest_put(struct net *net, struct xt_rateest *est);
 
 #endif /* _XT_RATEEST_H */
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 141c295191f6..dec843cadf46 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -14,15 +14,21 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 
-static DEFINE_MUTEX(xt_rateest_mutex);
-
 #define RATEEST_HSIZE  16
-static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly;
+
+struct xt_rateest_net {
+   struct mutex hash_lock;
+   struct hlist_head hash[RATEEST_HSIZE];
+};
+
+static unsigned int xt_rateest_id;
+
 static unsigned int jhash_rnd __read_mostly;
 
 static unsigned int xt_rateest_hash(const char *name)
@@ -31,21 +37,23 @@ static unsigned int xt_rateest_hash(const char *name)
   (RATEEST_HSIZE - 1);
 }
 
-static void xt_rateest_hash_insert(struct xt_rateest *est)
+static void xt_rateest_hash_insert(struct xt_rateest_net *xn,
+  struct xt_rateest *est)
 {
unsigned int h;
 
h = xt_rateest_hash(est->name);
-   hlist_add_head(&est->list, &rateest_hash[h]);
+   hlist_add_head(&est->list, &xn->hash[h]);
 }
 
-static struct xt_rateest *__xt_rateest_lookup(const char *name)
+static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn,
+ const char *name)
 {
struct xt_rateest *est;
unsigned int h;
 
h = xt_rateest_hash(name);
-   hlist_for_each_entry(est, &rateest_hash[h], list) {
+   hlist_for_each_entry(est, &xn->hash[h], list) {
if (strcmp(est->name, name) == 0) {
est->refcnt++;
return est;
@@ -55,20 +63,23 @@ static struct xt_rateest *__xt_rateest_lookup(const char 
*name)
return NULL;
 }
 
-struct xt_rateest *xt_rateest_lookup(const char *name)
+struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name)
 {
+   struct xt_rateest_net *xn = net_generic(net, xt_rateest_id);
struct xt_rateest *est;
 
-   mutex_lock(&xt_rateest_mutex);
-   est = __xt_rateest_lookup(name);
-   mutex_unlock(&xt_rateest_mutex);
+   mutex_lock(&xn->hash_lock);
+   est = __xt_rateest_lookup(xn, name);
+   mutex_unlock(&xn->hash_lock);
return est;
 }
 EXPORT_SYMBOL_GPL(xt_rateest_lookup);
 
-void xt_rateest_put(struct xt_rateest *est)
+void xt_rateest_put(struct net *net, struct xt_rateest *est)
 {
-   mutex_lock(&xt_rateest_mutex);
+   struct xt_rateest_net *xn = net_generic(net, xt_rateest_id);
+
+   mutex_lock(&xn->hash_lock);
if (--est->refcnt == 0) {
hlist_del(&est->list);
gen_kill_estimator(&est->rate_est);
@@ -78,7 +89,7 @@ void xt_rateest_put(struct xt_rateest *est)
 */
kfree_rcu(est, rcu);
}
-   mutex_unlock(&xt_rateest_mutex);
+   mutex_unlock(&xn->hash_lock);
 }
 EXPORT_SYMBOL_GPL(xt_rateest_put);
 
@@ -98,6 +109,7 @@ xt_rateest_tg(struct sk_buff *skb, const struct 
xt_action_param *par)
 
 static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 {
+   struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id);
struct xt_rateest_target_info *info = par->targinfo;
struct xt_rateest *est;
struct {
@@ -108,10 +120,10 @@ static int xt_rateest_tg_checkentry(const struct 
xt_tgchk_param *par)
 
net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
 
-   mutex_lock(&xt_rateest_mutex);
-   est = __xt_rateest_lookup(info->name);
+   mutex_lock(&xn->hash_lock);
+   est = __xt_rateest_lookup(xn, info->name);
if (est) {
-   mutex_unlock(&xt_rateest_mutex);
+   mutex_unlock(&xn->hash_lock);
/*
 * If estimator parameters are specified, they

[Patch net v2] ipt_CLUSTERIP: fix a refcount bug in clusterip_config_find_get()

2018-02-08 Thread Cong Wang
In clusterip_config_find_get() we hold RCU read lock so it could
run concurrently with clusterip_config_entry_put(), as a result,
the refcnt could go back to 1 from 0, which leads to a double
list_del()... Just replace refcount_inc() with
refcount_inc_not_zero(), as for c->refcount.

Fixes: d73f33b16883 ("netfilter: CLUSTERIP: RCU conversion")
Cc: Eric Dumazet 
Cc: Pablo Neira Ayuso 
Cc: Florian Westphal 
Signed-off-by: Cong Wang 
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c 
b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1ff72b87a066..4b02ab39ebc5 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -154,8 +154,12 @@ clusterip_config_find_get(struct net *net, __be32 
clusterip, int entry)
 #endif
if (unlikely(!refcount_inc_not_zero(&c->refcount)))
c = NULL;
-   else if (entry)
-   refcount_inc(&c->entries);
+   else if (entry) {
+   if (unlikely(!refcount_inc_not_zero(&c->entries))) {
+   clusterip_config_put(c);
+   c = NULL;
+   }
+   }
}
rcu_read_unlock_bh();
 
-- 
2.13.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch net] ipt_CLUSTERIP: fix a refcount bug in clusterip_config_find_get()

2018-02-08 Thread Cong Wang
On Thu, Feb 8, 2018 at 12:01 AM, Florian Westphal  wrote:
> Cong Wang  wrote:
>> In clusterip_config_find_get() we hold RCU read lock so it could
>> run concurrently with clusterip_config_entry_put(), as a result,
>> the refcnt could go back to 1 from 0, which leads to a double
>> list_del()... Just replace refcount_inc() with
>> refcount_inc_not_zero(), as for c->refcount.
>>
>> Fixes: d73f33b16883 ("netfilter: CLUSTERIP: RCU conversion")
>> Cc: Eric Dumazet 
>> Cc: Pablo Neira Ayuso 
>> Signed-off-by: Cong Wang 
>> ---
>>  net/ipv4/netfilter/ipt_CLUSTERIP.c | 6 --
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c 
>> b/net/ipv4/netfilter/ipt_CLUSTERIP.c
>> index 1ff72b87a066..4537b1686c7c 100644
>> --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
>> +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
>> @@ -154,8 +154,10 @@ clusterip_config_find_get(struct net *net, __be32 
>> clusterip, int entry)
>>  #endif
>>   if (unlikely(!refcount_inc_not_zero(&c->refcount)))
>>   c = NULL;
>> - else if (entry)
>> - refcount_inc(&c->entries);
>> + else if (entry) {
>> + if (unlikely(!refcount_inc_not_zero(&c->entries)))
>
> this needs to call clusterip_config_put(c); too, else we leak one
> reference.
>
> Other than that this looks good.

Right, good catch! I will send v2.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch net] ipt_CLUSTERIP: fix a refcount bug in clusterip_config_find_get()

2018-02-07 Thread Cong Wang
In clusterip_config_find_get() we hold RCU read lock so it could
run concurrently with clusterip_config_entry_put(), as a result,
the refcnt could go back to 1 from 0, which leads to a double
list_del()... Just replace refcount_inc() with
refcount_inc_not_zero(), as for c->refcount.

Fixes: d73f33b16883 ("netfilter: CLUSTERIP: RCU conversion")
Cc: Eric Dumazet 
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c 
b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1ff72b87a066..4537b1686c7c 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -154,8 +154,10 @@ clusterip_config_find_get(struct net *net, __be32 
clusterip, int entry)
 #endif
if (unlikely(!refcount_inc_not_zero(&c->refcount)))
c = NULL;
-   else if (entry)
-   refcount_inc(&c->entries);
+   else if (entry) {
+   if (unlikely(!refcount_inc_not_zero(&c->entries)))
+   c = NULL;
+   }
}
rcu_read_unlock_bh();
 
-- 
2.13.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch net] ipt_CLUSTERIP: fix a race condition of proc file creation

2018-02-07 Thread Cong Wang
There is a race condition between clusterip_config_entry_put()
and clusterip_config_init(), after we release the spinlock in
clusterip_config_entry_put(), a new proc file with a same IP could
be created immediately since it is already removed from the configs
list, therefore it triggers this warning:

[ cut here ]
proc_dir_entry 'ipt_CLUSTERIP/172.20.0.170' already registered
WARNING: CPU: 1 PID: 4152 at fs/proc/generic.c:330 proc_register+0x2a4/0x370 
fs/proc/generic.c:329
Kernel panic - not syncing: panic_on_warn set ...

As a quick fix, just move the proc_remove() inside the spinlock.

Reported-by: 
Fixes: 6c5d5cfbe3c5 ("netfilter: ipt_CLUSTERIP: check duplicate config when 
initializing")
Tested-by: Paolo Abeni 
Cc: Xin Long 
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c 
b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3a84a60f6b39..1ff72b87a066 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -107,12 +107,6 @@ clusterip_config_entry_put(struct net *net, struct 
clusterip_config *c)
 
local_bh_disable();
if (refcount_dec_and_lock(&c->entries, &cn->lock)) {
-   list_del_rcu(&c->list);
-   spin_unlock(&cn->lock);
-   local_bh_enable();
-
-   unregister_netdevice_notifier(&c->notifier);
-
/* In case anyone still accesses the file, the open/close
 * functions are also incrementing the refcount on their own,
 * so it's safe to remove the entry even if it's in use. */
@@ -120,6 +114,12 @@ clusterip_config_entry_put(struct net *net, struct 
clusterip_config *c)
if (cn->procdir)
proc_remove(c->pde);
 #endif
+   list_del_rcu(&c->list);
+   spin_unlock(&cn->lock);
+   local_bh_enable();
+
+   unregister_netdevice_notifier(&c->notifier);
+
return;
}
local_bh_enable();
-- 
2.13.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: WARNING: proc registration bug in clusterip_tg_check

2018-02-06 Thread Cong Wang
On Tue, Feb 6, 2018 at 6:27 AM, syzbot
 wrote:
> Hello,
>
> syzbot hit the following crash on net-next commit
> 617aebe6a97efa539cc4b8a52adccd89596e6be0 (Sun Feb 4 00:25:42 2018 +)
> Merge tag 'usercopy-v4.16-rc1' of
> git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux
>
> So far this crash happened 5 times on net-next, upstream.
> C reproducer is attached.
> syzkaller reproducer is attached.
> Raw console output is attached.
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached.
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+03218bcdba6aa7644...@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for
> details.
> If you forward the report, please keep this part and the footer.
>
> x_tables: ip_tables: osf match: only valid for protocol 6
> x_tables: ip_tables: osf match: only valid for protocol 6
> x_tables: ip_tables: osf match: only valid for protocol 6
> [ cut here ]
> proc_dir_entry 'ipt_CLUSTERIP/172.20.0.170' already registered
> WARNING: CPU: 1 PID: 4152 at fs/proc/generic.c:330 proc_register+0x2a4/0x370
> fs/proc/generic.c:329
> Kernel panic - not syncing: panic_on_warn set ...
>
> CPU: 1 PID: 4152 Comm: syzkaller851476 Not tainted 4.15.0+ #221
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x257 lib/dump_stack.c:53
>  panic+0x1e4/0x41c kernel/panic.c:183
>  __warn+0x1dc/0x200 kernel/panic.c:547
>  report_bug+0x211/0x2d0 lib/bug.c:184
>  fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178
>  fixup_bug arch/x86/kernel/traps.c:247 [inline]
>  do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296
>  do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
>  invalid_op+0x22/0x40 arch/x86/entry/entry_64.S:1097
> RIP: 0010:proc_register+0x2a4/0x370 fs/proc/generic.c:329
> RSP: 0018:8801cbd6ee20 EFLAGS: 00010286
> RAX: dc08 RBX: 8801d2181038 RCX: 815a57ae
> RDX:  RSI: 1100397add74 RDI: 1100397add49
> RBP: 8801cbd6ee70 R08: 1100397add0b R09: 
> R10: 8801cbd6ecd8 R11:  R12: 8801b2bb1cc0
> R13: dc00 R14: 8801b0d8dbc8 R15: 8801b2bb1d81
>  proc_create_data+0xf8/0x180 fs/proc/generic.c:494
>  clusterip_config_init net/ipv4/netfilter/ipt_CLUSTERIP.c:250 [inline]

I think there is probably a race condition between clusterip_config_entry_put()
and clusterip_config_init(), after we release the spinlock, a new proc
with the same IP could be created therefore triggers this warning

I am not sure if it is enough to just move the proc_remove() under
spinlock...


diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c
b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3a84a60f6b39..1ff72b87a066 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -107,12 +107,6 @@ clusterip_config_entry_put(struct net *net,
struct clusterip_config *c)

local_bh_disable();
if (refcount_dec_and_lock(&c->entries, &cn->lock)) {
-   list_del_rcu(&c->list);
-   spin_unlock(&cn->lock);
-   local_bh_enable();
-
-   unregister_netdevice_notifier(&c->notifier);
-
/* In case anyone still accesses the file, the open/close
 * functions are also incrementing the refcount on their own,
 * so it's safe to remove the entry even if it's in use. */
@@ -120,6 +114,12 @@ clusterip_config_entry_put(struct net *net,
struct clusterip_config *c)
if (cn->procdir)
proc_remove(c->pde);
 #endif
+   list_del_rcu(&c->list);
+   spin_unlock(&cn->lock);
+   local_bh_enable();
+
+   unregister_netdevice_notifier(&c->notifier);
+
return;
}
local_bh_enable();


>  clusterip_tg_check+0xf9c/0x16d0 net/ipv4/netfilter/ipt_CLUSTERIP.c:488
>  xt_check_target+0x22c/0x7d0 net/netfilter/x_tables.c:850
>  check_target net/ipv4/netfilter/ip_tables.c:513 [inline]
>  find_check_entry.isra.8+0x8c8/0xcb0 net/ipv4/netfilter/ip_tables.c:554
>  translate_table+0xed1/0x1610 net/ipv4/netfilter/ip_tables.c:725
>  do_replace net/ipv4/netfilter/ip_tables.c:1141 [inline]
>  do_ipt_set_ctl+0x370/0x5f0 net/ipv4/netfilter/ip_tables.c:1675
>  nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
>  nf_setsockopt+0x67/0xc0 net/netfilter/nf_sockopt.c:115
>  ip_setsockopt+0x97/0xa0 net/ipv4/ip_sockglue.c:1259
>  sctp_setsockopt+0x2b6/0x61d0 net/sctp/socket.c:4104
>  sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2975
>  SYSC_setsockopt net/socket.c:1849 [inline]
>  SyS_setsockopt+0x189/0x360 net/socket.c:1828
>  entry_SYSCALL_64_fastpath+0x29/0xa0
> RIP: 0033:0x446839
> RSP: 002b:7f0309d0fdb8 EFLAGS: 0246 ORIG_RAX: 0036
> RAX: ffda RBX: 00

[Patch net v2] xt_RATEEST: acquire xt_rateest_mutex for hash insert

2018-02-05 Thread Cong Wang
rateest_hash is supposed to be protected by xt_rateest_mutex,
and, as suggested by Eric, lookup and insert should be atomic,
so we should acquire the xt_rateest_mutex once for both.

So introduce a non-locking helper for internal use and keep the
locking one for external.

Reported-by: 
Fixes: 5859034d7eb8 ("[NETFILTER]: x_tables: add RATEEST target")
Cc: Pablo Neira Ayuso 
Cc: Eric Dumazet 
Signed-off-by: Cong Wang 
---
 net/netfilter/xt_RATEEST.c | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 498b54fd04d7..141c295191f6 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -39,23 +39,31 @@ static void xt_rateest_hash_insert(struct xt_rateest *est)
hlist_add_head(&est->list, &rateest_hash[h]);
 }
 
-struct xt_rateest *xt_rateest_lookup(const char *name)
+static struct xt_rateest *__xt_rateest_lookup(const char *name)
 {
struct xt_rateest *est;
unsigned int h;
 
h = xt_rateest_hash(name);
-   mutex_lock(&xt_rateest_mutex);
hlist_for_each_entry(est, &rateest_hash[h], list) {
if (strcmp(est->name, name) == 0) {
est->refcnt++;
-   mutex_unlock(&xt_rateest_mutex);
return est;
}
}
-   mutex_unlock(&xt_rateest_mutex);
+
return NULL;
 }
+
+struct xt_rateest *xt_rateest_lookup(const char *name)
+{
+   struct xt_rateest *est;
+
+   mutex_lock(&xt_rateest_mutex);
+   est = __xt_rateest_lookup(name);
+   mutex_unlock(&xt_rateest_mutex);
+   return est;
+}
 EXPORT_SYMBOL_GPL(xt_rateest_lookup);
 
 void xt_rateest_put(struct xt_rateest *est)
@@ -100,8 +108,10 @@ static int xt_rateest_tg_checkentry(const struct 
xt_tgchk_param *par)
 
net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
 
-   est = xt_rateest_lookup(info->name);
+   mutex_lock(&xt_rateest_mutex);
+   est = __xt_rateest_lookup(info->name);
if (est) {
+   mutex_unlock(&xt_rateest_mutex);
/*
 * If estimator parameters are specified, they must match the
 * existing estimator.
@@ -139,11 +149,13 @@ static int xt_rateest_tg_checkentry(const struct 
xt_tgchk_param *par)
 
info->est = est;
xt_rateest_hash_insert(est);
+   mutex_unlock(&xt_rateest_mutex);
return 0;
 
 err2:
kfree(est);
 err1:
+   mutex_unlock(&xt_rateest_mutex);
return ret;
 }
 
-- 
2.13.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch net] xt_RATEEST: acquire xt_rateest_mutex for hash insert

2018-02-01 Thread Cong Wang
On Wed, Jan 31, 2018 at 5:44 PM, Eric Dumazet  wrote:
> On Wed, 2018-01-31 at 16:26 -0800, Cong Wang wrote:
>> rateest_hash is supposed to be protected by xt_rateest_mutex.
>>
>> Reported-by: 
>> Fixes: 5859034d7eb8 ("[NETFILTER]: x_tables: add RATEEST target")
>> Cc: Pablo Neira Ayuso 
>> Signed-off-by: Cong Wang 
>> ---
>>  net/netfilter/xt_RATEEST.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
>> index 498b54fd04d7..83ec3a282755 100644
>> --- a/net/netfilter/xt_RATEEST.c
>> +++ b/net/netfilter/xt_RATEEST.c
>> @@ -36,7 +36,9 @@ static void xt_rateest_hash_insert(struct xt_rateest *est)
>>   unsigned int h;
>>
>>   h = xt_rateest_hash(est->name);
>> + mutex_lock(&xt_rateest_mutex);
>>   hlist_add_head(&est->list, &rateest_hash[h]);
>> + mutex_unlock(&xt_rateest_mutex);
>>  }
>
> We probably should make this module netns aware, otherwise bad things
> will happen.

Right, both the lock and the hashtable. I can do it for net-next,
if you don't.

>
> (It seems multiple threads could run, so getting the mutex twice
> (xt_rateest_lookup then xt_rateest_hash_insert() is racy)

Yeah, need to merge these two critical sections.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch net] xt_RATEEST: acquire xt_rateest_mutex for hash insert

2018-01-31 Thread Cong Wang
rateest_hash is supposed to be protected by xt_rateest_mutex.

Reported-by: 
Fixes: 5859034d7eb8 ("[NETFILTER]: x_tables: add RATEEST target")
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/netfilter/xt_RATEEST.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 498b54fd04d7..83ec3a282755 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -36,7 +36,9 @@ static void xt_rateest_hash_insert(struct xt_rateest *est)
unsigned int h;
 
h = xt_rateest_hash(est->name);
+   mutex_lock(&xt_rateest_mutex);
hlist_add_head(&est->list, &rateest_hash[h]);
+   mutex_unlock(&xt_rateest_mutex);
 }
 
 struct xt_rateest *xt_rateest_lookup(const char *name)
-- 
2.13.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch net] xt_cgroup: initialize info->priv in cgroup_mt_check_v1()

2018-01-31 Thread Cong Wang
xt_cgroup_info_v1->priv is an internal pointer only used for kernel,
we should not trust what user-space provides.

Reported-by: 
Fixes: c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/netfilter/xt_cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 1db1ce59079f..891f4e7e8ea7 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -52,6 +52,7 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param 
*par)
return -EINVAL;
}
 
+   info->priv = NULL;
if (info->has_path) {
cgrp = cgroup_get_from_path(info->path);
if (IS_ERR(cgrp)) {
-- 
2.13.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Memory leaks in conntrack

2017-09-13 Thread Cong Wang
On Wed, Sep 13, 2017 at 9:45 AM, Cong Wang  wrote:
> On Wed, Sep 13, 2017 at 1:05 AM, Florian Westphal  wrote:
>> Cong Wang  wrote:
>>> While testing my TC filter patches (so not related to conntrack), the
>>> following memory leaks are shown up:
>>>
>>> unreferenced object 0x9b19ba551228 (size 128):
>>>   comm "chronyd", pid 338, jiffies 4294910829 (age 53.188s)
>>>   hex dump (first 32 bytes):
>>> 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  
>>> 00 00 00 00 18 00 00 30 00 00 00 00 00 00 00 00  ...0
>>>   backtrace:
>>> [] create_object+0x169/0x2aa
>>> [] kmemleak_alloc+0x25/0x41
>>> [] slab_post_alloc_hook+0x44/0x65
>>> [] __kmalloc_track_caller+0x113/0x146
>>> [] __krealloc+0x4a/0x69
>>> [] nf_ct_ext_add+0xe1/0x145
>>> [] init_conntrack+0x1f7/0x36e
>>> [] nf_conntrack_in+0x1d3/0x326
>>> [] ipv4_conntrack_local+0x4d/0x50
>>> [] nf_hook_slow+0x3c/0x9b
>>> [] nf_hook.constprop.40+0xbe/0xd8
>>> [] __ip_local_out+0xb3/0xbf
>>> [] ip_local_out+0x1c/0x36
>>> [] ip_send_skb+0x19/0x3d
>>> [] udp_send_skb+0x17e/0x1df
>>> [] udp_sendmsg+0x5a2/0x77c
>>> unreferenced object 0x9b19a69b3340 (size 336):
>>>   comm "chronyd", pid 338, jiffies 4294910868 (age 53.032s)
>>>   hex dump (first 32 bytes):
>>> 01 00 00 00 5a 5a 5a 5a 00 00 00 00 ad 4e ad de  .N..
>>> ff ff ff ff 5a 5a 5a 5a ff ff ff ff ff ff ff ff  
>>>   backtrace:
>>> [] create_object+0x169/0x2aa
>>> [] kmemleak_alloc+0x25/0x41
>>> [] slab_post_alloc_hook+0x44/0x65
>>> [] kmem_cache_alloc+0xd7/0x1f1
>>> [] __nf_conntrack_alloc+0xa2/0x146
>>> [] init_conntrack+0xb2/0x36e
>>> [] nf_conntrack_in+0x1d3/0x326
>>> [] ipv4_conntrack_local+0x4d/0x50
>>> [] nf_hook_slow+0x3c/0x9b
>>> [] nf_hook.constprop.40+0xbe/0xd8
>>> [] __ip_local_out+0xb3/0xbf
>>> [] ip_local_out+0x1c/0x36
>>> [] ip_send_skb+0x19/0x3d
>>> [] udp_send_skb+0x17e/0x1df
>>> [] udp_sendmsg+0x5a2/0x77c
>>> [] inet_sendmsg+0x37/0x5e
>>>
>>> I don't touch chronyd in my VM, so I have no idea why it sends out UDP
>>> packets, my guess is it is some periodical packet.
>>>
>>> I don't think I use conntrack either, since /proc/net/ip_conntrack
>>> does not exist.
>>
>> You probably do, can you try "cat /proc/net/nf_conntrack" instead?
>>
>> (otherwise there should be no ipv4_conntrack_local() invocation
>>  since we would not register this hook at all).
>
> Yeah it is very weird but it is true:
>
> [root@localhost ~]# echo scan > /sys/kernel/debug/kmemleak
> [  133.450823] kmemleak: 18 new suspected memory leaks (see
> /sys/kernel/debug/kmemleak)
> [root@localhost ~]# cat /proc/net/ip_conntrack
> cat: /proc/net/ip_conntrack: No such file or directory
> [root@localhost ~]# cat /sys/kernel/debug/kmemleak
> unreferenced object 0x95c1e0b24040 (size 336):
> ...

Oops, you mean nf_conntrack... Here we go:

[root@localhost ~]# cat /proc/net/nf_conntrack
ipv4 2 udp  17 116 src=192.168.124.6 dst=204.2.134.162
sport=123 dport=123 src=204.2.134.162 dst=192.168.124.6 sport=123
dport=123 [ASSURED] mark=0 zone=0 use=2
ipv4 2 udp  17 117 src=192.168.124.6 dst=45.79.187.10
sport=123 dport=123 src=45.79.187.10 dst=192.168.124.6 sport=123
dport=123 [ASSURED] mark=0 zone=0 use=2
ipv4 2 udp  17 110 src=192.168.124.6 dst=192.168.124.1
sport=35486 dport=53 src=192.168.124.1 dst=192.168.124.6 sport=53
dport=35486 [ASSURED] mark=0 zone=0 use=2
ipv4 2 udp  17 110 src=192.168.124.6 dst=192.168.124.1
sport=52373 dport=53 src=192.168.124.1 dst=192.168.124.6 sport=53
dport=52373 [ASSURED] mark=0 zone=0 use=2
ipv4 2 unknown  2 518 src=192.168.124.6 dst=224.0.0.22 [UNREPLIED]
src=224.0.0.22 dst=192.168.124.6 mark=0 zone=0 use=2
ipv4 2 udp  17 110 src=192.168.124.6 dst=192.168.124.1
sport=43242 dport=53 src=192.168.124.1 dst=192.168.124.6 sport=53
dport=43242 [ASSURED] mark=0 zone=0 use=2
ipv4 2 udp  17 116 src=192.168.124.6 dst=96.226.123.196
sport=123 dport=123 src=96.226.123.196 dst=192.168.124.6 sport=123
dport=123 [ASSURED] mark=0 zone=0 use=2
ipv4 2 udp  17 110 src=192.168.124.6 dst=192.168.124.1
sport=42838 dport=53 src=192.168.124.1 dst=192.168.124.6 sport=53
dport=42838 [ASSURED] mark=0 zone=0 use=2
ipv4 2 udp  17 117 src=192.168.124.6 dst=97.127.104.4
sport=123 dport=123 src=97.127.104.4 dst=192.168.124.6 sport=123
dport=123 [ASSURED] mark=0 zone=0 use=2
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Memory leaks in conntrack

2017-09-13 Thread Cong Wang
On Wed, Sep 13, 2017 at 1:05 AM, Florian Westphal  wrote:
> Cong Wang  wrote:
>> While testing my TC filter patches (so not related to conntrack), the
>> following memory leaks are shown up:
>>
>> unreferenced object 0x9b19ba551228 (size 128):
>>   comm "chronyd", pid 338, jiffies 4294910829 (age 53.188s)
>>   hex dump (first 32 bytes):
>> 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  
>> 00 00 00 00 18 00 00 30 00 00 00 00 00 00 00 00  ...0
>>   backtrace:
>> [] create_object+0x169/0x2aa
>> [] kmemleak_alloc+0x25/0x41
>> [] slab_post_alloc_hook+0x44/0x65
>> [] __kmalloc_track_caller+0x113/0x146
>> [] __krealloc+0x4a/0x69
>> [] nf_ct_ext_add+0xe1/0x145
>> [] init_conntrack+0x1f7/0x36e
>> [] nf_conntrack_in+0x1d3/0x326
>> [] ipv4_conntrack_local+0x4d/0x50
>> [] nf_hook_slow+0x3c/0x9b
>> [] nf_hook.constprop.40+0xbe/0xd8
>> [] __ip_local_out+0xb3/0xbf
>> [] ip_local_out+0x1c/0x36
>> [] ip_send_skb+0x19/0x3d
>> [] udp_send_skb+0x17e/0x1df
>> [] udp_sendmsg+0x5a2/0x77c
>> unreferenced object 0x9b19a69b3340 (size 336):
>>   comm "chronyd", pid 338, jiffies 4294910868 (age 53.032s)
>>   hex dump (first 32 bytes):
>> 01 00 00 00 5a 5a 5a 5a 00 00 00 00 ad 4e ad de  .N..
>> ff ff ff ff 5a 5a 5a 5a ff ff ff ff ff ff ff ff  
>>   backtrace:
>> [] create_object+0x169/0x2aa
>> [] kmemleak_alloc+0x25/0x41
>> [] slab_post_alloc_hook+0x44/0x65
>> [] kmem_cache_alloc+0xd7/0x1f1
>> [] __nf_conntrack_alloc+0xa2/0x146
>> [] init_conntrack+0xb2/0x36e
>> [] nf_conntrack_in+0x1d3/0x326
>> [] ipv4_conntrack_local+0x4d/0x50
>> [] nf_hook_slow+0x3c/0x9b
>> [] nf_hook.constprop.40+0xbe/0xd8
>> [] __ip_local_out+0xb3/0xbf
>> [] ip_local_out+0x1c/0x36
>> [] ip_send_skb+0x19/0x3d
>> [] udp_send_skb+0x17e/0x1df
>> [] udp_sendmsg+0x5a2/0x77c
>> [] inet_sendmsg+0x37/0x5e
>>
>> I don't touch chronyd in my VM, so I have no idea why it sends out UDP
>> packets, my guess is it is some periodical packet.
>>
>> I don't think I use conntrack either, since /proc/net/ip_conntrack
>> does not exist.
>
> You probably do, can you try "cat /proc/net/nf_conntrack" instead?
>
> (otherwise there should be no ipv4_conntrack_local() invocation
>  since we would not register this hook at all).

Yeah it is very weird but it is true:

[root@localhost ~]# echo scan > /sys/kernel/debug/kmemleak
[  133.450823] kmemleak: 18 new suspected memory leaks (see
/sys/kernel/debug/kmemleak)
[root@localhost ~]# cat /proc/net/ip_conntrack
cat: /proc/net/ip_conntrack: No such file or directory
[root@localhost ~]# cat /sys/kernel/debug/kmemleak
unreferenced object 0x95c1e0b24040 (size 336):
...


>
> I tried to reproduce this but so far I had no success.
> If you can identify something that could give a hint when this
> is happening (only once after boot, periodically, only with udp, etc)
> please let us know.
>
> (A reproducer would be even better of course ;-) )

Actually, it is even simpler to reproduce, nothing is needed
but wait. I thought it is somewhat triggered by my tests, but
actually no. For me, just boot the VM and wait for several
seconds, memleak will show up.

(chronyd is started by systemd during boot, not me.)

>
> Is this with current net tree?

Yes, I just pulled DaveM's net tree and recompiled the kernel,
still 100% reproducible here.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Memory leaks in conntrack

2017-09-12 Thread Cong Wang
Hello,

While testing my TC filter patches (so not related to conntrack), the
following memory leaks are shown up:


unreferenced object 0x9b19ba551228 (size 128):
  comm "chronyd", pid 338, jiffies 4294910829 (age 53.188s)
  hex dump (first 32 bytes):
6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  
00 00 00 00 18 00 00 30 00 00 00 00 00 00 00 00  ...0
  backtrace:
[] create_object+0x169/0x2aa
[] kmemleak_alloc+0x25/0x41
[] slab_post_alloc_hook+0x44/0x65
[] __kmalloc_track_caller+0x113/0x146
[] __krealloc+0x4a/0x69
[] nf_ct_ext_add+0xe1/0x145
[] init_conntrack+0x1f7/0x36e
[] nf_conntrack_in+0x1d3/0x326
[] ipv4_conntrack_local+0x4d/0x50
[] nf_hook_slow+0x3c/0x9b
[] nf_hook.constprop.40+0xbe/0xd8
[] __ip_local_out+0xb3/0xbf
[] ip_local_out+0x1c/0x36
[] ip_send_skb+0x19/0x3d
[] udp_send_skb+0x17e/0x1df
[] udp_sendmsg+0x5a2/0x77c
unreferenced object 0x9b19a69b3340 (size 336):
  comm "chronyd", pid 338, jiffies 4294910868 (age 53.032s)
  hex dump (first 32 bytes):
01 00 00 00 5a 5a 5a 5a 00 00 00 00 ad 4e ad de  .N..
ff ff ff ff 5a 5a 5a 5a ff ff ff ff ff ff ff ff  
  backtrace:
[] create_object+0x169/0x2aa
[] kmemleak_alloc+0x25/0x41
[] slab_post_alloc_hook+0x44/0x65
[] kmem_cache_alloc+0xd7/0x1f1
[] __nf_conntrack_alloc+0xa2/0x146
[] init_conntrack+0xb2/0x36e
[] nf_conntrack_in+0x1d3/0x326
[] ipv4_conntrack_local+0x4d/0x50
[] nf_hook_slow+0x3c/0x9b
[] nf_hook.constprop.40+0xbe/0xd8
[] __ip_local_out+0xb3/0xbf
[] ip_local_out+0x1c/0x36
[] ip_send_skb+0x19/0x3d
[] udp_send_skb+0x17e/0x1df
[] udp_sendmsg+0x5a2/0x77c
[] inet_sendmsg+0x37/0x5e

This seems new because I never see this before.

I don't touch chronyd in my VM, so I have no idea why it sends out UDP
packets, my guess is it is some periodical packet.

I don't think I use conntrack either, since /proc/net/ip_conntrack
does not exist.

Here are some related config of my kernel:

$ grep CONNTRACK .config
CONFIG_NF_CONNTRACK=y
CONFIG_NF_CONNTRACK_MARK=y
CONFIG_NF_CONNTRACK_SECMARK=y
CONFIG_NF_CONNTRACK_ZONES=y
CONFIG_NF_CONNTRACK_PROCFS=y
CONFIG_NF_CONNTRACK_EVENTS=y
# CONFIG_NF_CONNTRACK_TIMEOUT is not set
CONFIG_NF_CONNTRACK_TIMESTAMP=y
CONFIG_NF_CONNTRACK_AMANDA=y
CONFIG_NF_CONNTRACK_FTP=y
CONFIG_NF_CONNTRACK_H323=y
CONFIG_NF_CONNTRACK_IRC=y
CONFIG_NF_CONNTRACK_BROADCAST=y
CONFIG_NF_CONNTRACK_NETBIOS_NS=y
CONFIG_NF_CONNTRACK_SNMP=y
CONFIG_NF_CONNTRACK_PPTP=y
CONFIG_NF_CONNTRACK_SANE=y
CONFIG_NF_CONNTRACK_SIP=y
CONFIG_NF_CONNTRACK_TFTP=y
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
CONFIG_NF_CONNTRACK_IPV4=y
CONFIG_NF_CONNTRACK_IPV6=y

Please let me know if you need any other information.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] net: sched: fix NULL pointer dereference when action calls some targets

2017-08-16 Thread Cong Wang
On Wed, Aug 16, 2017 at 1:39 AM, Xin Long  wrote:
> On Wed, Aug 9, 2017 at 7:33 AM, Cong Wang  wrote:
>> On Mon, Aug 7, 2017 at 7:33 PM, Xin Long  wrote:
>>> On Tue, Aug 8, 2017 at 9:15 AM, Cong Wang  wrote:
>>>> This looks like a completely API burden?
>>> netfilter xt targets are not really compatible with netsched action.
>>> I've got to say, the patch is just a way to make checkentry return
>>> false and avoid panic. like [1] said
>>
>> I don't doubt you fix a crash, I am thinking if we can
>> "fix" the API instead of fixing the caller.
> Hi, Cong,
>
> For now, I don't think it's possible to change APIs or  some of their targets
> for the panic caused by action xt calling.
>
> The common way should be fixed in net_sched side.
>
> Given that the issue is very easy to triggered,
> let's wait for netfilter's replies for another few days,
> otherwise I will repost the fix, agree ?

Yeah, no objections from me.

By the way, do you know how other callers of this API
use 'entryinfo'? Do they pass the address of the struct
on stack too?
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] net: sched: fix NULL pointer dereference when action calls some targets

2017-08-08 Thread Cong Wang
On Mon, Aug 7, 2017 at 7:33 PM, Xin Long  wrote:
> On Tue, Aug 8, 2017 at 9:15 AM, Cong Wang  wrote:
>> This looks like a completely API burden?
> netfilter xt targets are not really compatible with netsched action.
> I've got to say, the patch is just a way to make checkentry return
> false and avoid panic. like [1] said

I don't doubt you fix a crash, I am thinking if we can
"fix" the API instead of fixing the caller.

I am not familiar with this API, so just my 2 cents...
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] net: sched: fix NULL pointer dereference when action calls some targets

2017-08-07 Thread Cong Wang
(Cc'ing netfilter and Jamal)

On Sat, Aug 5, 2017 at 4:35 AM, Xin Long  wrote:
> As we know in some target's checkentry it may dereference par.entryinfo
> to check entry stuff inside. But when sched action calls xt_check_target,
> par.entryinfo is set with NULL. It would cause kernel panic when calling
> some targets.
>
> It can be reproduce with:
>   # tc qd add dev eth1 ingress handle :
>   # tc filter add dev eth1 parent : u32 match u32 0 0 action xt \
> -j ECN --ecn-tcp-remove
>
> It could also crash kernel when using target CLUSTERIP or TPROXY.
>
> By now there's no proper value for par.entryinfo in ipt_init_target,
> but it can not be set with NULL. This patch is to void all these
> panics by setting it with an ipt_entry obj with all members 0.
>
> Signed-off-by: Xin Long 
> ---
>  net/sched/act_ipt.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
> index 7c4816b..0f09f70 100644
> --- a/net/sched/act_ipt.c
> +++ b/net/sched/act_ipt.c
> @@ -41,6 +41,7 @@ static int ipt_init_target(struct net *net, struct 
> xt_entry_target *t,
>  {
> struct xt_tgchk_param par;
> struct xt_target *target;
> +   struct ipt_entry e;
> int ret = 0;
>
> target = xt_request_find_target(AF_INET, t->u.user.name,
> @@ -48,10 +49,11 @@ static int ipt_init_target(struct net *net, struct 
> xt_entry_target *t,
> if (IS_ERR(target))
> return PTR_ERR(target);
>
> +   memset(&e, 0, sizeof(e));
> t->u.kernel.target = target;
> par.net   = net;
> par.table = table;
> -   par.entryinfo = NULL;
> +   par.entryinfo = &e;

This looks like a completely API burden?
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: net-next STATUS page

2017-07-11 Thread Cong Wang
On Tue, Jul 11, 2017 at 7:24 AM, David Miller  wrote:
>
> It has gotten to the point that even casually walking around
> Faro, Portugal last week, random German tourists would stop
> me in the street and ask if net-next was open or not.
>
> Therefore, in order to avoid any and all confusion I have created this
> web site:
>
> http://vger.kernel.org/~davem/net-next.html
>
> Please refer to it if you are ever in doubt.

Can we update netdev-FAQ too? ;)
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH nf-next] netns: add and use net_ns_barrier

2017-06-13 Thread Cong Wang
On Tue, Jun 13, 2017 at 11:07 AM, Florian Westphal  wrote:
> Historically it wasn't needed because we just clear out the helper area
> in the affected conntracks (i.e, future packets are not inspected by
> the helper anymore).
>
> When conntracks were made per-netns this problem was added as we're not
> guaranteed to see all net namespace because module_exit and netns cleanup
> can run concurrently.
>
> We can still use the "old" model if we guarantee that we wait for
> netns cleanup to finish (which is what this patch does).
>
> The alternative, as you pointed out, is to take a module reference for
> each conntrack that uses the helper (and put again when connection is
> destroyed).

Yeah, this is exactly what I am suggesting and I fully expect this could
need more work than this barrier.

>
> I don't really care that much except that if we go for the latter
> solution users cannot "just rmmod" the module anymore but might have
> to manually remove the affected connections first.

This is not bad because the module is indeed being used in this scenario
so EBUSY is expected, or do we need to guarantee conntrack modules
are not held by existing connections?
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH nf-next] netns: add and use net_ns_barrier

2017-06-13 Thread Cong Wang
On Mon, Jun 12, 2017 at 11:16 PM, Florian Westphal  wrote:
> Cong Wang  wrote:
>> On Thu, Jun 1, 2017 at 1:52 AM, Florian Westphal  wrote:
>> > Joe described it nicely, problem is that after unload we may have
>> > conntracks that still have a nf_conn_help extension attached that
>> > has a pointer to a structure that resided in the (unloaded) module.
>>
>> Why not hold a refcnt for its module?
>
> That would work as well.
>
> I'm not sure its nice to disallow rmmod of helper modules if they are
> used by a connection however.

I am _not_ suggesting to disallow rmmod.

>
> Right now you can "rmmod nf_conntrack_foo" at any time and this should
> work just fine without first having to flush affected conntracks
> manually.

My point is that since netns wq could invoke code of that module,
why it doesn't hold a refcnt of that module?

I am not familiar with netfilter code base so not sure if that is
hard to do or not, but it looks more elegant than this barrier.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH nf-next] netns: add and use net_ns_barrier

2017-06-12 Thread Cong Wang
On Thu, Jun 1, 2017 at 1:52 AM, Florian Westphal  wrote:
> Joe described it nicely, problem is that after unload we may have
> conntracks that still have a nf_conn_help extension attached that
> has a pointer to a structure that resided in the (unloaded) module.

Why not hold a refcnt for its module?
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: net: possible deadlock in skb_queue_tail

2017-02-20 Thread Cong Wang
On Mon, Feb 20, 2017 at 5:29 AM, Andrey Konovalov  wrote:
> other info that might help us debug this:
>
>  Possible unsafe locking scenario:
>
>CPU0CPU1
>
>   lock(&(&pcpu->lock)->rlock);
>lock(&(&list->lock)->rlock#3);
>lock(&(&pcpu->lock)->rlock);
>   lock(&(&list->lock)->rlock#3);
>

They are different types of sockets and different lists of skb's,
one is netlink socket the other is udp socket, so I don't think
we could have a deadlock in this scenario, we probably need to
explicitly mark them as different lockdep classes.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: net: suspicious RCU usage in nf_hook

2017-02-02 Thread Cong Wang
On Wed, Feb 1, 2017 at 3:59 PM, Eric Dumazet  wrote:
> On Wed, 2017-02-01 at 15:48 -0800, Eric Dumazet wrote:
>> On Wed, Feb 1, 2017 at 3:29 PM, Cong Wang  wrote:
>>
>> > Not sure if it is better. The difference is caught up in 
>> > net_enable_timestamp(),
>> > which is called setsockopt() path and sk_clone() path, so we could be
>> > in netstamp_needed state for a long time too until user-space exercises
>> > these paths.
>> >
>> > I am feeling we probably need to get rid of netstamp_needed_deferred,
>> > and simply defer the whole static_key_slow_dec(), like the attached patch
>> > (compile only).
>> >
>> > What do you think?
>>
>> I think we need to keep the atomic.
>>
>> If two cpus call net_disable_timestamp() roughly at the same time, the
>> work will be scheduled once.

Good point! Yeah, the same work will not be schedule twice.

>
> Updated patch (but not tested yet)

I can't think out a better way to fix this. I expect jump_label to provide
an API for this, but it doesn't, static_key_slow_dec_deferred()
is just for batching. Probably we should introduce one to avoid these
ugly #ifdef HAVE_JUMP_LABEL here, but that is a -next material.

So, please feel free to send it formally.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: net: suspicious RCU usage in nf_hook

2017-02-01 Thread Cong Wang
On Wed, Feb 1, 2017 at 1:16 PM, Eric Dumazet  wrote:
> On Wed, 2017-02-01 at 12:51 -0800, Cong Wang wrote:
>> On Tue, Jan 31, 2017 at 7:44 AM, Eric Dumazet  wrote:
>> > On Mon, 2017-01-30 at 22:19 -0800, Cong Wang wrote:
>> >
>> >>
>> >> The context is process context (TX path before hitting qdisc), and
>> >> BH is not disabled, so in_interrupt() doesn't catch it. Hmm, this
>> >> makes me thinking maybe we really need to disable BH in this
>> >> case for nf_hook()? But it is called in RX path too, and BH is
>> >> already disabled there.
>> >
>> > ipt_do_table() and similar netfilter entry points disable BH.
>> >
>> > Maybe it is done too late.
>>
>> I think we need a fix like the following one for minimum impact.
>>
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 727b6fd..eee7d63 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -1720,12 +1720,10 @@ EXPORT_SYMBOL(net_enable_timestamp);
>>  void net_disable_timestamp(void)
>>  {
>>  #ifdef HAVE_JUMP_LABEL
>> -   if (in_interrupt()) {
>> -   atomic_inc(&netstamp_needed_deferred);
>> -   return;
>> -   }
>> -#endif
>> +   atomic_inc(&netstamp_needed_deferred);
>> +#else
>> static_key_slow_dec(&netstamp_needed);
>> +#endif
>>  }
>>  EXPORT_SYMBOL(net_disable_timestamp);
>
> This would permanently leave the kernel in the netstamp_needed state.
>
> I would prefer the patch using a process context to perform the
> cleanup ? Note there is a race window, but probably not a big deal.

Not sure if it is better. The difference is caught up in net_enable_timestamp(),
which is called setsockopt() path and sk_clone() path, so we could be
in netstamp_needed state for a long time too until user-space exercises
these paths.

I am feeling we probably need to get rid of netstamp_needed_deferred,
and simply defer the whole static_key_slow_dec(), like the attached patch
(compile only).

What do you think?
diff --git a/net/core/dev.c b/net/core/dev.c
index 727b6fd..0ef1734 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1694,38 +1694,28 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 #endif
 
 static struct static_key netstamp_needed __read_mostly;
-#ifdef HAVE_JUMP_LABEL
-/* We are not allowed to call static_key_slow_dec() from irq context
- * If net_disable_timestamp() is called from irq context, defer the
- * static_key_slow_dec() calls.
- */
-static atomic_t netstamp_needed_deferred;
-#endif
 
-void net_enable_timestamp(void)
+static void netstamp_clear(struct work_struct *work)
 {
-#ifdef HAVE_JUMP_LABEL
-   int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+   static_key_slow_dec(&netstamp_needed);
+}
 
-   if (deferred) {
-   while (--deferred)
-   static_key_slow_dec(&netstamp_needed);
-   return;
-   }
-#endif
+static DECLARE_WORK(netstamp_work, netstamp_clear);
+
+void net_enable_timestamp(void)
+{
+   flush_work(&netstamp_work);
static_key_slow_inc(&netstamp_needed);
 }
 EXPORT_SYMBOL(net_enable_timestamp);
 
 void net_disable_timestamp(void)
 {
-#ifdef HAVE_JUMP_LABEL
-   if (in_interrupt()) {
-   atomic_inc(&netstamp_needed_deferred);
-   return;
-   }
-#endif
-   static_key_slow_dec(&netstamp_needed);
+   /* We are not allowed to call static_key_slow_dec() from irq context
+* If net_disable_timestamp() is called from irq context, defer the
+* static_key_slow_dec() calls.
+*/
+   schedule_work(&netstamp_work);
 }
 EXPORT_SYMBOL(net_disable_timestamp);
 


Re: net: suspicious RCU usage in nf_hook

2017-02-01 Thread Cong Wang
On Tue, Jan 31, 2017 at 7:44 AM, Eric Dumazet  wrote:
> On Mon, 2017-01-30 at 22:19 -0800, Cong Wang wrote:
>
>>
>> The context is process context (TX path before hitting qdisc), and
>> BH is not disabled, so in_interrupt() doesn't catch it. Hmm, this
>> makes me thinking maybe we really need to disable BH in this
>> case for nf_hook()? But it is called in RX path too, and BH is
>> already disabled there.
>
> ipt_do_table() and similar netfilter entry points disable BH.
>
> Maybe it is done too late.

I think we need a fix like the following one for minimum impact.

diff --git a/net/core/dev.c b/net/core/dev.c
index 727b6fd..eee7d63 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1720,12 +1720,10 @@ EXPORT_SYMBOL(net_enable_timestamp);
 void net_disable_timestamp(void)
 {
 #ifdef HAVE_JUMP_LABEL
-   if (in_interrupt()) {
-   atomic_inc(&netstamp_needed_deferred);
-   return;
-   }
-#endif
+   atomic_inc(&netstamp_needed_deferred);
+#else
static_key_slow_dec(&netstamp_needed);
+#endif
 }
 EXPORT_SYMBOL(net_disable_timestamp);
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: net: suspicious RCU usage in nf_hook

2017-01-30 Thread Cong Wang
On Fri, Jan 27, 2017 at 5:31 PM, Eric Dumazet  wrote:
> On Fri, 2017-01-27 at 17:00 -0800, Cong Wang wrote:
>> On Fri, Jan 27, 2017 at 3:35 PM, Eric Dumazet  wrote:
>> > Oh well, I forgot to submit the official patch I think, Jan 9th.
>> >
>> > https://groups.google.com/forum/#!topic/syzkaller/BhyN5OFd7sQ
>> >
>>
>> Hmm, but why only fragments need skb_orphan()? It seems like
>> any kfree_skb() inside a nf hook needs to have a preceding
>> skb_orphan().
>
>
>>
>> Also, I am not convinced it is similar to commit 8282f27449bf15548
>> which is on RX path.
>
> Well, we clearly see IPv6 reassembly being part of the equation in both
> cases.

Yeah, of course. My worry is that this problem is more than just
IPv6 reassembly.

>
> I was replying to first part of the splat [1], which was already
> diagnosed and had a non official patch.
>
> use after free is also a bug, regardless of jump label being used or
> not.
>
> I still do not really understand this nf_hook issue, I thought we were
> disabling BH in netfilter.

It is a different warning from use-after-free, this one is about sleep
in atomic context, mutex lock is acquired with RCU read lock held.


>
> So the in_interrupt() check in net_disable_timestamp() should trigger,
> this was the intent of netstamp_needed_deferred existence.
>
> Not sure if we can test for rcu_read_lock() as well.
>

The context is process context (TX path before hitting qdisc), and
BH is not disabled, so in_interrupt() doesn't catch it. Hmm, this
makes me thinking maybe we really need to disable BH in this
case for nf_hook()? But it is called in RX path too, and BH is
already disabled there.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: net: suspicious RCU usage in nf_hook

2017-01-27 Thread Cong Wang
On Fri, Jan 27, 2017 at 3:35 PM, Eric Dumazet  wrote:
> Oh well, I forgot to submit the official patch I think, Jan 9th.
>
> https://groups.google.com/forum/#!topic/syzkaller/BhyN5OFd7sQ
>

Hmm, but why only fragments need skb_orphan()? It seems like
any kfree_skb() inside a nf hook needs to have a preceding
skb_orphan().

Also, I am not convinced it is similar to commit 8282f27449bf15548
which is on RX path.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: net: suspicious RCU usage in nf_hook

2017-01-27 Thread Cong Wang
On Fri, Jan 27, 2017 at 3:22 PM, Cong Wang  wrote:
> On Fri, Jan 27, 2017 at 1:15 PM, Dmitry Vyukov  wrote:
>> stack backtrace:
>> CPU: 2 PID: 23111 Comm: syz-executor14 Not tainted 4.10.0-rc5+ #192
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> Call Trace:
>>  __dump_stack lib/dump_stack.c:15 [inline]
>>  dump_stack+0x2ee/0x3ef lib/dump_stack.c:51
>>  lockdep_rcu_suspicious+0x139/0x180 kernel/locking/lockdep.c:4452
>>  rcu_preempt_sleep_check include/linux/rcupdate.h:560 [inline]
>>  ___might_sleep+0x560/0x650 kernel/sched/core.c:7748
>>  __might_sleep+0x95/0x1a0 kernel/sched/core.c:7739
>>  mutex_lock_nested+0x24f/0x1730 kernel/locking/mutex.c:752
>>  atomic_dec_and_mutex_lock+0x119/0x160 kernel/locking/mutex.c:1060
>>  __static_key_slow_dec+0x7a/0x1e0 kernel/jump_label.c:149
>>  static_key_slow_dec+0x51/0x90 kernel/jump_label.c:174
>>  net_disable_timestamp+0x3b/0x50 net/core/dev.c:1728
>>  sock_disable_timestamp+0x98/0xc0 net/core/sock.c:403
>>  __sk_destruct+0x27d/0x6b0 net/core/sock.c:1441
>>  sk_destruct+0x47/0x80 net/core/sock.c:1460
>
> jump label uses a mutex and we call jump label API in softIRQ context...
> Maybe we have to move it to a work struct as what we did for netlink.

Correct myself: process context but with RCU read lock held...
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: net: suspicious RCU usage in nf_hook

2017-01-27 Thread Cong Wang
On Fri, Jan 27, 2017 at 1:15 PM, Dmitry Vyukov  wrote:
> stack backtrace:
> CPU: 2 PID: 23111 Comm: syz-executor14 Not tainted 4.10.0-rc5+ #192
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:15 [inline]
>  dump_stack+0x2ee/0x3ef lib/dump_stack.c:51
>  lockdep_rcu_suspicious+0x139/0x180 kernel/locking/lockdep.c:4452
>  rcu_preempt_sleep_check include/linux/rcupdate.h:560 [inline]
>  ___might_sleep+0x560/0x650 kernel/sched/core.c:7748
>  __might_sleep+0x95/0x1a0 kernel/sched/core.c:7739
>  mutex_lock_nested+0x24f/0x1730 kernel/locking/mutex.c:752
>  atomic_dec_and_mutex_lock+0x119/0x160 kernel/locking/mutex.c:1060
>  __static_key_slow_dec+0x7a/0x1e0 kernel/jump_label.c:149
>  static_key_slow_dec+0x51/0x90 kernel/jump_label.c:174
>  net_disable_timestamp+0x3b/0x50 net/core/dev.c:1728
>  sock_disable_timestamp+0x98/0xc0 net/core/sock.c:403
>  __sk_destruct+0x27d/0x6b0 net/core/sock.c:1441
>  sk_destruct+0x47/0x80 net/core/sock.c:1460

jump label uses a mutex and we call jump label API in softIRQ context...
Maybe we have to move it to a work struct as what we did for netlink.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch net] ipvs: use IPVS_CMD_ATTR_MAX for family.maxattr

2016-11-03 Thread Cong Wang
family.maxattr is the max index for policy[], the size of
ops[] is determined with ARRAY_SIZE().

Reported-by: Andrey Konovalov 
Tested-by: Andrey Konovalov 
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/netfilter/ipvs/ip_vs_ctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c3c809b..a6e44ef 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2845,7 +2845,7 @@ static struct genl_family ip_vs_genl_family = {
.hdrsize= 0,
.name   = IPVS_GENL_NAME,
.version= IPVS_GENL_VERSION,
-   .maxattr= IPVS_CMD_MAX,
+   .maxattr= IPVS_CMD_ATTR_MAX,
.netnsok= true, /* Make ipvsadm to work on netns */
 };
 
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] net: ipv4 -- Introduce ifa limit per net

2016-03-10 Thread Cong Wang
On Thu, Mar 10, 2016 at 11:55 AM, David Miller  wrote:
> Indeed, good catch.  Therefore:
>
> 1) Keep the masq netdev notifier.  That will flush the conntrack table
>for the inetdev_destroy event.
>
> 2) Make the inetdev notifier only do something if inetdev->dead is
>false.  (ie. we are flushing an individual address)
>
> And then we don't need the NETDEV_UNREGISTER thing at all:


This makes sense to me. I guess similar thing needs to do for IPv6 masq too.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] net: ipv4 -- Introduce ifa limit per net

2016-03-10 Thread Cong Wang
On Thu, Mar 10, 2016 at 10:01 AM, David Miller  wrote:
> I'm tempted to say that we should provide these notifier handlers with
> the information they need, explicitly, to handle this case.
>
> Most intdev notifiers actually want to know the individual addresses
> that get removed, one by one.  That's handled by the existing
> NETDEV_DOWN event and the ifa we pass to that.
>
> But some, like this netfilter masq case, would be satisfied with a
> single event that tells them the whole inetdev instance is being torn
> down.  Which is the case we care about here.
>
> We currently don't use NETDEV_UNREGISTER for inetdev notifiers, so
> maybe we could use that.
>
> And that is consistent with the core netdev notifier that triggers
> this call chain in the first place.
>
> Roughly, something like this:
>
> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> index 8c3df2c..6eee5cb 100644
> --- a/net/ipv4/devinet.c
> +++ b/net/ipv4/devinet.c
> @@ -292,6 +292,11 @@ static void inetdev_destroy(struct in_device *in_dev)
>
> in_dev->dead = 1;
>
> +   if (in_dev->ifa_list)
> +   blocking_notifier_call_chain(&inetaddr_chain,
> +NETDEV_UNREGISTER,
> +in_dev->ifa_list);
> +
> ip_mc_destroy_dev(in_dev);


Hmm, but inetdev_destroy() is only called when NETDEV_UNREGISTER
is happening and masq already registers a netdev notifier...



>
> while ((ifa = in_dev->ifa_list) != NULL) {
> diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c 
> b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> index c6eb421..1bb8026 100644
> --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> @@ -111,6 +111,10 @@ static int masq_inet_event(struct notifier_block *this,
> struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
> struct netdev_notifier_info info;
>
> +   if (event != NETDEV_UNREGISTER)
> +   return NOTIFY_DONE;
> +   event = NETDEV_DOWN;
> +
> netdev_notifier_info_init(&info, dev);
> return masq_device_event(this, event, &info);
>  }

If masq really doesn't care about inetdev destroy or inetaddr removal,
we should just remove its inetaddr notifier.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html