Re: [PATCH net] sctp: always set frag_point on pmtu change

2018-11-18 Thread Xin Long
On Mon, Nov 19, 2018 at 5:49 AM Jakub Audykowicz
 wrote:
>
> Calling send on a connected SCTP socket results in kernel panic if
> spp_pathmtu was configured manually before an association is established
> and it was not reconfigured to another value once the association is
> established.
>
> Steps to reproduce:
> 1. Set up a listening SCTP server socket.
> 2. Set up an SCTP client socket.
> 3. Configure client socket using setsockopt SCTP_PEER_ADDR_PARAMS with
> spp_pathmtu set to a legal value (e.g. 1000) and
> SPP_PMTUD_DISABLE set in spp_flags.
> 4. Connect client to server.
> 5. Send message from client to server.
>
> At this point oom-killer is invoked, which will eventually lead to:
> [5.197262] Out of memory and no killable processes...
> [5.198107] Kernel panic - not syncing: System is deadlocked on memory
>
> Commit 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> introduces sctp_assoc_update_frag_point, but this function is not called
> in this case, causing frag_point to be zero:
>  void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
>  {
> -   if (asoc->pathmtu != pmtu)
> +   if (asoc->pathmtu != pmtu) {
> asoc->pathmtu = pmtu;
> +   sctp_assoc_update_frag_point(asoc);
> +   }
>
> In this scenario, on association establishment, asoc->pathmtu is already
> 1000 and pmtu will be as well. Before this commit the frag_point was being
> correctly set in the scenario described. Moving the call outside the if
> block fixes the issue.
>
> I will be providing a separate patch to lksctp-tools with a simple test
> reproducing this problem ("func_tests: frag_point should never be zero").
>
> I have also taken the liberty to introduce a sanity check in chunk.c to
> set the frag_point to a positive value in order to avoid chunking
> endlessly (which is the reason for the eventual panic).
>
> Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> Signed-off-by: Jakub Audykowicz 
> ---
>  include/net/sctp/constants.h |  3 +++
>  net/sctp/associola.c | 13 +++--
>  net/sctp/chunk.c |  6 ++
>  3 files changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
> index 8dadc74c22e7..90316fab6f04 100644
> --- a/include/net/sctp/constants.h
> +++ b/include/net/sctp/constants.h
> @@ -293,6 +293,9 @@ enum { SCTP_MAX_GABS = 16 };
>  */
>  #define SCTP_DEFAULT_MINSEGMENT 512/* MTU size ... if no mtu disc */
>
> +/* An association's fragmentation point should never be non-positive */
> +#define SCTP_FRAG_POINT_MIN 1
> +
>  #define SCTP_SECRET_SIZE 32/* Number of octets in a 256 bits. */
>
>  #define SCTP_SIGNATURE_SIZE 20 /* size of a SLA-1 signature */
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index 6a28b96e779e..44d71a1af62e 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -1431,13 +1431,14 @@ void sctp_assoc_update_frag_point(struct 
> sctp_association *asoc)
>
>  void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
>  {
> -   if (asoc->pathmtu != pmtu) {
> -   asoc->pathmtu = pmtu;
> -   sctp_assoc_update_frag_point(asoc);
> -   }
> +   pr_debug("%s: before asoc:%p, pmtu:%d, frag_point:%d\n",
> +   __func__, asoc, asoc->pathmtu, asoc->frag_point);
> +
> +   asoc->pathmtu = pmtu;
> +   sctp_assoc_update_frag_point(asoc);
>
> -   pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
> -asoc->pathmtu, asoc->frag_point);
> +   pr_debug("%s: after asoc:%p, pmtu:%d, frag_point:%d\n",
> +   __func__, asoc, asoc->pathmtu, asoc->frag_point);
>  }
The idea was that whenever asoc->pathmtu changes, frag_point should
be updated, but we missed one place in sctp_association_init().

Another issue is that after the 4-way handshake, the client's
asoc->intl_enable may be changed from 0 to 1, which means the
frag_point should also be updated, since [1]:

void sctp_assoc_update_frag_point(struct sctp_association *asoc)
{
int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu,
sctp_datachk_len(&asoc->stream)); <--- [1]

So one fix for both issues is:

diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 0a78cdf..19d596d 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1327,4 +1327,5 @@ void sctp_stream_interleave_init(struct
sctp_stream *stream)
asoc = container_of(stream, struct sctp_association, stream);
stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
   : &sctp_stream_interleave_0;
+   sctp_assoc_update_frag_point(asoc);
 }


>
>  /* Update the association's pmtu and frag_point by going through all the
> diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> index ce8

RE: [PATCH v2 2/2] dpaa_eth: add ethtool coalesce control

2018-11-18 Thread Madalin-cristian Bucur
> -Original Message-
> From: David Miller 
> Sent: Saturday, November 17, 2018 5:42 AM
> To: Madalin-cristian Bucur 
> Subject: Re: [PATCH v2 2/2] dpaa_eth: add ethtool coalesce control
> 
> From: Madalin Bucur 
> Date: Tue, 13 Nov 2018 18:29:51 +0200
> 
> > +   for_each_cpu(cpu, cpus) {
> > +   portal = qman_get_affine_portal(cpu);
> > +   res = qman_portal_set_iperiod(portal, period);
> > +   if (res)
> > +   return res;
> > +   res = qman_dqrr_set_ithresh(portal, thresh);
> > +   if (res)
> > +   return res;
> 
> Nope, you can't do it like this.
> 
> If any intermediate change fails, you have to unwind all of the
> changes made up until that point.
> 
> Which means you'll have to store the previous setting somewhere
> and reinstall those saved values in the error path.

Thank you, I'll come back with a v3.

Madalin


Re: selftests: net: udpgro.sh hangs on DUT devices running Linux -next

2018-11-18 Thread Naresh Kamboju
Hi Paolo,

On Sun, 18 Nov 2018 at 05:01, Paolo Abeni  wrote:
>
> Hi,
>
> On Fri, 2018-11-16 at 14:55 +0530, Naresh Kamboju wrote:
> > Kernel selftests: net: udpgro.sh hangs / waits forever on x86_64 and
> > arm32 devices running Linux -next. Test getting PASS on arm64 devices.
> >
> > Do you see this problem ?
> >
> > Short error log:
> > -
> > ip6tables v1.6.1: can't initialize ip6tables table `nat': Table does
> > not exist (do you need to insmod?)
>
> Thank you for the report.
>
> It looks like your kernel config has
>
> # CONFIG_NF_NAT_IPV6 is not set
>
> Can you please confirm ?

 # CONFIG_NF_NAT_IPV6 is not set on the devices where the test hangs.

>
> net selftests do not explicitly ask for that, despite using such
> functionality (my bad).
>
> I'll be travelling up to Monday (included). I'll have a better look
> after that.

Thank you.

- Naresh

>
> Cheers,
>
> Paolo
>


[PATCH net-next] MAINTAINERS: Add myself as third phylib maintainer

2018-11-18 Thread Heiner Kallweit
Add myself as third phylib maintainer.

Signed-off-by: Heiner Kallweit 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 7ff8865a9..2988ecbf6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5540,6 +5540,7 @@ F:net/bridge/
 ETHERNET PHY LIBRARY
 M: Andrew Lunn 
 M: Florian Fainelli 
+M: Heiner Kallweit 
 L: netdev@vger.kernel.org
 S: Maintained
 F: Documentation/ABI/testing/sysfs-bus-mdio
-- 
2.19.1



Re: "WARNING: CPU: 4 PID: 1 at ./include/net/genetlink.h:294 genlmsg_multicast_netns.isra.10.constprop.19+0x34/0x5e" with mlx4 under 4.9.137

2018-11-18 Thread Krzysztof Olędzki

-> j...@mellanox.com


Hi,

I just upgraded kernel from 4.4.161 to 4.9.137 on one of my servers (Dell 
PowerEdge T110 II) and found this in the dmesg:

[0.796066] mlx4_core: Mellanox ConnectX core driver v2.2-1 (Feb, 2014)
[0.796251] [ cut here ]
[0.796435] WARNING: CPU: 4 PID: 1 at ./include/net/genetlink.h:294 
genlmsg_multicast_netns.isra.10.constprop.19+0x34/0x5e
[0.796724] CPU: 4 PID: 1 Comm: swapper/0 Not tainted 4.9.137-o2 #1
[0.796898] Hardware name: Dell Inc. PowerEdge T110 II/0PM2CW, BIOS 2.10.0 
05/24/2018
[0.797188]  c9013c08 813ea8f6  

[0.797679]  c9013c48 8110df65 01260ec0 
ffea
[0.798157]  88081a32 0003 fff4 
818eb360
[0.798648] Call Trace:
[0.798819]  [] dump_stack+0x4f/0x65
[0.798993]  [] __warn+0xce/0xe9
[0.799164]  [] warn_slowpath_null+0x18/0x1a
[0.799343]  [] 
genlmsg_multicast_netns.isra.10.constprop.19+0x34/0x5e
[0.799637]  [] devlink_notify+0x54/0x5b
[0.799813]  [] devlink_register+0x46/0x59
[0.799989]  [] mlx4_init_one+0x104/0x4c7
[0.800163]  [] ? preempt_latency_start+0x21/0x57
[0.800341]  [] ? preempt_count_add+0x70/0x73
[0.800515]  [] ? pci_match_id+0x32/0x57
[0.800689]  [] pci_device_probe+0x6e/0xb9
[0.800867]  [] driver_probe_device+0xf9/0x222
[0.804465]  [] __driver_attach+0x6d/0x8f
[0.804638]  [] ? driver_probe_device+0x222/0x222
[0.804814]  [] bus_for_each_dev+0x6a/0x82
[0.804988]  [] driver_attach+0x19/0x1b
[0.805164]  [] bus_add_driver+0xee/0x1c9
[0.805342]  [] ? do_early_param+0x90/0x90
[0.805519]  [] driver_register+0x83/0xba
[0.805689]  [] ? tg3_driver_init+0x1b/0x1b
[0.805862]  [] __pci_register_driver+0x44/0x46
[0.806034]  [] mlx4_init+0x109/0x12a
[0.806200]  [] ? tg3_driver_init+0x1b/0x1b
[0.806377]  [] do_one_initcall+0x8b/0x10e
[0.806547]  [] ? do_early_param+0x90/0x90
[0.806717]  [] kernel_init_freeable+0x151/0x1dd
[0.806890]  [] ? rest_init+0x7d/0x7d
[0.807061]  [] kernel_init+0x9/0x105
[0.807232]  [] ret_from_fork+0x54/0x60
[0.807415] ---[ end trace 34e550e5d3d83d0a ]---
[0.807582] mlx4_core: Initializing :01:00.0

The NIC seems to work correctly, but the warning appears after each boot.
There is also BCM5722 (tg3) NIC in the system. The only thing that changes
between boots is the CPU reported in the message.

I can try bisecting or testing a more recent kernel, but this will have to
wait until I come back from my Thanksgiving travel.

$ lspci -nn shows:
01:00.0 Ethernet controller [0200]: Mellanox Technologies MT26448 [ConnectX EN 
10GigE, PCIe 2.0 5GT/s] [15b3:6750] (rev b0)

Thanks,
  Krzysztof



"WARNING: CPU: 4 PID: 1 at ./include/net/genetlink.h:294 genlmsg_multicast_netns.isra.10.constprop.19+0x34/0x5e" with mlx4 under 4.9.137

2018-11-18 Thread Krzysztof Olędzki
Hi,

I just upgraded kernel from 4.4.161 to 4.9.137 on one of my servers (Dell 
PowerEdge T110 II) and found this in the dmesg:

[0.796066] mlx4_core: Mellanox ConnectX core driver v2.2-1 (Feb, 2014)
[0.796251] [ cut here ]
[0.796435] WARNING: CPU: 4 PID: 1 at ./include/net/genetlink.h:294 
genlmsg_multicast_netns.isra.10.constprop.19+0x34/0x5e
[0.796724] CPU: 4 PID: 1 Comm: swapper/0 Not tainted 4.9.137-o2 #1
[0.796898] Hardware name: Dell Inc. PowerEdge T110 II/0PM2CW, BIOS 2.10.0 
05/24/2018
[0.797188]  c9013c08 813ea8f6  

[0.797679]  c9013c48 8110df65 01260ec0 
ffea
[0.798157]  88081a32 0003 fff4 
818eb360
[0.798648] Call Trace:
[0.798819]  [] dump_stack+0x4f/0x65
[0.798993]  [] __warn+0xce/0xe9
[0.799164]  [] warn_slowpath_null+0x18/0x1a
[0.799343]  [] 
genlmsg_multicast_netns.isra.10.constprop.19+0x34/0x5e
[0.799637]  [] devlink_notify+0x54/0x5b
[0.799813]  [] devlink_register+0x46/0x59
[0.799989]  [] mlx4_init_one+0x104/0x4c7
[0.800163]  [] ? preempt_latency_start+0x21/0x57
[0.800341]  [] ? preempt_count_add+0x70/0x73
[0.800515]  [] ? pci_match_id+0x32/0x57
[0.800689]  [] pci_device_probe+0x6e/0xb9
[0.800867]  [] driver_probe_device+0xf9/0x222
[0.804465]  [] __driver_attach+0x6d/0x8f
[0.804638]  [] ? driver_probe_device+0x222/0x222
[0.804814]  [] bus_for_each_dev+0x6a/0x82
[0.804988]  [] driver_attach+0x19/0x1b
[0.805164]  [] bus_add_driver+0xee/0x1c9
[0.805342]  [] ? do_early_param+0x90/0x90
[0.805519]  [] driver_register+0x83/0xba
[0.805689]  [] ? tg3_driver_init+0x1b/0x1b
[0.805862]  [] __pci_register_driver+0x44/0x46
[0.806034]  [] mlx4_init+0x109/0x12a
[0.806200]  [] ? tg3_driver_init+0x1b/0x1b
[0.806377]  [] do_one_initcall+0x8b/0x10e
[0.806547]  [] ? do_early_param+0x90/0x90
[0.806717]  [] kernel_init_freeable+0x151/0x1dd
[0.806890]  [] ? rest_init+0x7d/0x7d
[0.807061]  [] kernel_init+0x9/0x105
[0.807232]  [] ret_from_fork+0x54/0x60
[0.807415] ---[ end trace 34e550e5d3d83d0a ]---
[0.807582] mlx4_core: Initializing :01:00.0

The NIC seems to work correctly, but the warning appears after each boot.
There is also BCM5722 (tg3) NIC in the system. The only thing that changes
between boots is the CPU reported in the message.

I can try bisecting or testing a more recent kernel, but this will have to
wait until I come back from my Thanksgiving travel.

$ lspci -nn shows:
01:00.0 Ethernet controller [0200]: Mellanox Technologies MT26448 [ConnectX EN 
10GigE, PCIe 2.0 5GT/s] [15b3:6750] (rev b0)

Thanks,
 Krzysztof


Re: [PATCH] rhashtable: detect when object movement between tables might have invalidated a lookup

2018-11-18 Thread David Miller
From: Herbert Xu 
Date: Mon, 19 Nov 2018 12:06:34 +0800

> On Mon, Nov 19, 2018 at 11:56:35AM +0800, Herbert Xu wrote:
>>
>> I take that back.  Because of your shift which cancels out the
>> shift in NULLS_MARKER, it would appear that this should work just
>> fine with RHT_NULLS_MARKER(0), no? IOW, it would appear that
>> 
>>  RHT_NULLS_MARKER(0) = RHT_NULLS_MARKER(RHT_NULLS_MARKER(0))
> 
> My emails to Neil are bouncing:
> 
>   ne...@suse.com
> host smtp.glb1.softwaregrp.com [15.124.2.87]
> SMTP error from remote mail server after RCPT TO::
> 550 Cannot process address

Yeah this just started happening 2 days ago.


Re: [PATCH net] sctp: not allow to set asoc prsctp_enable by sockopt

2018-11-18 Thread Marcelo Ricardo Leitner
On Sun, Nov 18, 2018 at 04:02:25PM +0900, Xin Long wrote:
> On Sat, Nov 17, 2018 at 12:12 AM Neil Horman  wrote:
> >
> > On Thu, Nov 15, 2018 at 09:41:01PM -0200, Marcelo Ricardo Leitner wrote:
> > > [ re-sending, without html this time ]
> > >
> > > On Thu, Nov 15, 2018, 15:26 Neil Horman  > >
> > > > On Thu, Nov 15, 2018 at 08:25:36PM -0200, Marcelo Ricardo Leitner wrote:
> > > > > On Thu, Nov 15, 2018 at 04:43:10PM -0500, Neil Horman wrote:
> > > > > > On Thu, Nov 15, 2018 at 03:22:21PM -0200, Marcelo Ricardo Leitner
> > > > wrote:
> > > > > > > On Thu, Nov 15, 2018 at 07:14:28PM +0800, Xin Long wrote:
> > > > > > > > As rfc7496#section4.5 says about SCTP_PR_SUPPORTED:
> > > > > > > >
> > > > > > > >This socket option allows the enabling or disabling of the
> > > > > > > >negotiation of PR-SCTP support for future associations.  For
> > > > existing
> > > > > > > >associations, it allows one to query whether or not PR-SCTP
> > > > support
> > > > > > > >was negotiated on a particular association.
> > > > > > > >
> > > > > > > > It means only sctp sock's prsctp_enable can be set.
> > > > > > > >
> > > > > > > > Note that for the limitation of SCTP_{CURRENT|ALL}_ASSOC, we 
> > > > > > > > will
> > > > > > > > add it when introducing SCTP_{FUTURE|CURRENT|ALL}_ASSOC for 
> > > > > > > > linux
> > > > > > > > sctp in another patchset.
> > > > > > > >
> > > > > > > > Fixes: 28aa4c26fce2 ("sctp: add SCTP_PR_SUPPORTED on sctp 
> > > > > > > > sockopt")
> > > > > > > > Reported-by: Ying Xu 
> > > > > > > > Signed-off-by: Xin Long 
> > > > > > > > ---
> > > > > > > >  net/sctp/socket.c | 13 +++--
> > > > > > > >  1 file changed, 3 insertions(+), 10 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > > > > > > > index 739f3e5..e9b8232 100644
> > > > > > > > --- a/net/sctp/socket.c
> > > > > > > > +++ b/net/sctp/socket.c
> > > > > > > > @@ -3940,7 +3940,6 @@ static int
> > > > sctp_setsockopt_pr_supported(struct sock *sk,
> > > > > > > > unsigned int optlen)
> > > > > > > >  {
> > > > > > > > struct sctp_assoc_value params;
> > > > > > > > -   struct sctp_association *asoc;
> > > > > > > > int retval = -EINVAL;
> > > > > > > >
> > > > > > > > if (optlen != sizeof(params))
> > > > > > > > @@ -3951,16 +3950,10 @@ static int
> > > > sctp_setsockopt_pr_supported(struct sock *sk,
> > > > > > > > goto out;
> > > > > > > > }
> > > > > > > >
> > > > > > > > -   asoc = sctp_id2assoc(sk, params.assoc_id);
> > > > > > > > -   if (asoc) {
> > > > > > > > -   asoc->prsctp_enable = !!params.assoc_value;
> > > > > > > > -   } else if (!params.assoc_id) {
> > > > > > > > -   struct sctp_sock *sp = sctp_sk(sk);
> > > > > > > > -
> > > > > > > > -   sp->ep->prsctp_enable = !!params.assoc_value;
> > > > > > > > -   } else {
> > > > > > > > +   if (sctp_style(sk, UDP) && sctp_id2assoc(sk,
> > > > params.assoc_id))
> > > > > > >
> > > > > > > This would allow using a non-existent assoc id on UDP-style 
> > > > > > > sockets
> > > > to
> > > > > > > set it at the socket, which is not expected. It should be more 
> > > > > > > like:
> > > > > > >
> > > > > > > + if (sctp_style(sk, UDP) && params.assoc_id)
> > > > > > How do you see that to be the case? sctp_id2assoc will return NULL 
> > > > > > if
> > > > an
> > > > > > association isn't found, so the use of sctp_id2assoc should work 
> > > > > > just
> > > > fine.
> > > > >
> > > > > Right, it will return NULL, and because of that it won't bail out as
> > > > > it should and will adjust the socket config instead.
> > > > >
> > > >
> > > > Oh, duh, you're absolutely right, NULL will evaluate to false there, 
> > > > and
> > > > skip
> > > > the conditional goto out;
> > > >
> > > > that said, It would make more sense to me to just change the sense of 
> > > > the
> > > > second
> > > > condition to !sctp_id2assoc(sk, params.assoc_id), so that we goto out 
> > > > if no
> > > > association is found.  it still seems a
> > >
> > >
> > > That would break setting it on the socket without an assoc so far.
> > >
> > ok, yes, I see what xin is getting at now.  The RFC indicates that the
> > setsockopt method for this socket option is meant to set the prsctp enabled
> > value on _future_ associations, implying that we should not operate at all 
> > on
> > already existing associations (i.e. we should ignore the assoc_id in the 
> > passed
> > in structure and only operate on the socket).  That said, here's the entire 
> > text
> > of the RFC section:
> >
> > 4.5.  Socket Option for Getting and Setting the PR-SCTP Support
> >   (SCTP_PR_SUPPORTED)
> >
> >This socket option allows the enabling or disabling of the
> >negotiation of PR-SCTP support for future associations.  For existing
> >associations, it allows one to query whether or not PR-SCTP support
> >was n

Re: [PATCH] rhashtable: detect when object movement between tables might have invalidated a lookup

2018-11-18 Thread Herbert Xu
On Mon, Nov 19, 2018 at 11:56:35AM +0800, Herbert Xu wrote:
>
> I take that back.  Because of your shift which cancels out the
> shift in NULLS_MARKER, it would appear that this should work just
> fine with RHT_NULLS_MARKER(0), no? IOW, it would appear that
> 
>   RHT_NULLS_MARKER(0) = RHT_NULLS_MARKER(RHT_NULLS_MARKER(0))

My emails to Neil are bouncing:

ne...@suse.com
  host smtp.glb1.softwaregrp.com [15.124.2.87]
  SMTP error from remote mail server after RCPT TO::
  550 Cannot process address

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [Patch net] net: invert the check of detecting hardware RX checksum fault

2018-11-18 Thread Herbert Xu
On Fri, Nov 16, 2018 at 01:32:50PM -0800, Cong Wang wrote:
>
> This is true only when there is a skb_checksum_init*() or
> skb_checksum_validate*() prior to it, it seems not true for
> nf_ip_checksum() where skb->csum is correctly set to pseudo-header
> checksum but there is no validation of the original skb->csum.
> So this check should be still inverted there??
> 
> Or am I still missing anything here?

What do you mean? My copy of nf_ip_checksum seems to be doing the
right thing as far as verifying CHECKSUM_COMPLETE goes.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] rhashtable: detect when object movement between tables might have invalidated a lookup

2018-11-18 Thread Herbert Xu
On Mon, Nov 19, 2018 at 11:54:15AM +0800, Herbert Xu wrote:
>
> > >> diff --git a/lib/rhashtable.c b/lib/rhashtable.c
> > >> index 30526afa8343..852ffa5160f1 100644
> > >> --- a/lib/rhashtable.c
> > >> +++ b/lib/rhashtable.c
> > >> @@ -1179,8 +1179,7 @@ struct rhash_head __rcu **rht_bucket_nested(const 
> > >> struct bucket_table *tbl,
> > >>  unsigned int hash)
> > >>  {
> > >>  const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
> > >> -static struct rhash_head __rcu *rhnull =
> > >> -(struct rhash_head __rcu *)NULLS_MARKER(0);
> > >> +static struct rhash_head __rcu *rhnull;
> > >
> > > I don't understand why you can't continue to do NULLS_MARKER(0) or
> > > RHT_NULLS_MARKER(0).
> > 
> > Because then the test
> > 
> > +   } while (he != RHT_NULLS_MARKER(head));
> > 
> > in __rhashtable_lookup() would always succeed, and it would loop
> > forever.
> 
> This change is only necessary because of your shifting change
> above, which AFAICS adds no real benefit.

I take that back.  Because of your shift which cancels out the
shift in NULLS_MARKER, it would appear that this should work just
fine with RHT_NULLS_MARKER(0), no? IOW, it would appear that

RHT_NULLS_MARKER(0) = RHT_NULLS_MARKER(RHT_NULLS_MARKER(0))

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] rhashtable: detect when object movement between tables might have invalidated a lookup

2018-11-18 Thread Herbert Xu
On Fri, Nov 16, 2018 at 05:59:19PM +1100, NeilBrown wrote:
>
> NULLS_MARKER assumes a hash value in which the bottom bits are most
> likely to be unique.  To convert this to a pointer which is certainly not
> valid, it shifts left by 1 and sets the lsb.
> We aren't passing a hash value, but are passing an address instead.
> In this case the bottom 2 bits are certain to be 0, and the top bit
> could contain valuable information (on a 32bit system).
> The best way to turn a pointer into a certainly-invalid pointer
> is to just set the lsb.  By shifting right by one, we discard an
> uninteresting bit, preserve all the interesting bits, and effectively
> just set the lsb.
> 
> I could add a comment explaining that if you like.

The top-bit is most likely to be fixed and offer no real value.

> >> diff --git a/lib/rhashtable.c b/lib/rhashtable.c
> >> index 30526afa8343..852ffa5160f1 100644
> >> --- a/lib/rhashtable.c
> >> +++ b/lib/rhashtable.c
> >> @@ -1179,8 +1179,7 @@ struct rhash_head __rcu **rht_bucket_nested(const 
> >> struct bucket_table *tbl,
> >>unsigned int hash)
> >>  {
> >>const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
> >> -  static struct rhash_head __rcu *rhnull =
> >> -  (struct rhash_head __rcu *)NULLS_MARKER(0);
> >> +  static struct rhash_head __rcu *rhnull;
> >
> > I don't understand why you can't continue to do NULLS_MARKER(0) or
> > RHT_NULLS_MARKER(0).
> 
> Because then the test
> 
> + } while (he != RHT_NULLS_MARKER(head));
> 
> in __rhashtable_lookup() would always succeed, and it would loop
> forever.

This change is only necessary because of your shifting change
above, which AFAICS adds no real benefit.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


netns_id in bpf_sk_lookup_{tcp,udp}

2018-11-18 Thread David Ahern
Hi Joe:

The netns_id to the bpf_sk_lookup_{tcp,udp} functions in
net/core/filter.c is a u64, yet the APIs in include/uapi/linux/bpf.h
shows a u32. Is that intentional or an oversight through the iterations?

David


Re: DSA support for Marvell 88e6065 switch

2018-11-18 Thread Andrew Lunn
> If I wanted it to work, what do I need to do? AFAICT phy autoprobing
> should just attach it as soon as it is compiled in?

Nope. It is a switch, not a PHY. Switches are never auto-probed
because they are not guaranteed to have ID registers.

You need to use the legacy device tree binding. Look in
Documentation/devicetree/bindings/net/dsa/dsa.txt, section Deprecated
Binding. You can get more examples if you checkout old kernels. Or
kirkwood-rd88f6281.dtsi, the dsa { } node which is disabled.

Andrew


[GIT] Networking

2018-11-18 Thread David Miller


1) Fix some potentially uninitialized variables and use-after-free in
   kvaser_usb CAN driver, from Jimmy Assarsson.

2) Fix leaks in qed driver, from Denis Bolotin.

3) Socket leak in l2tp, from Xin Long.

4) RSS context allocation fix in bnxt_en from Michael Chan.

5) Fix cxgb4 build errors, from Ganesh Goudar.

6) Route leaks in ipv6 when removing exceptions, from Xin Long.

7) Memory leak in IDR allocation handling of act_pedit, from Davide
   Caratti.

8) Use-after-free of bridge vlan stats, from Nikolay Aleksandrov.

9) When MTU is locked, do not force DF bit on ipv4 tunnels.  From
   Sabrina Dubroca.

10) When NAPI cached skb is reused, we must set it to the proper
initial state which includes skb->pkt_type.  From Eric Dumazet.

11) Lockdep and non-linear SKB handling fix in tipc from Jon Maloy.

12) Set RX queue properly in various tuntap receive paths, from
Matthew Cover.

Please pull, thanks a lot!

The following changes since commit ccda4af0f4b92f7b4c308d3acc262f4a7e3affad:

  Linux 4.20-rc2 (2018-11-11 17:12:31 -0600)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 

for you to fetch changes up to 8ebebcba559a1bfbaec7bbda64feb9870b9c58da:

  tuntap: fix multiqueue rx (2018-11-18 19:05:43 -0800)


Alexander Stein (1):
  can: flexcan: Always use last mailbox for TX

Andrew Morton (1):
  drivers/net/ethernet/qlogic/qed/qed_rdma.h: fix typo

Aya Levin (1):
  net/mlx4: Fix UBSAN warning of signed integer overflow

Brenda J. Butler (1):
  tc-testing: tdc.py: Guard against lack of returncode in executed command

Christophe JAILLET (1):
  net: lantiq: Fix returned value in case of error in 'xrx200_probe()'

David Ahern (1):
  ipv6: Fix PMTU updates for UDP/raw sockets in presence of VRF

David Howells (1):
  rxrpc: Fix life check

David S. Miller (7):
  Merge tag 'linux-can-fixes-for-4.20-20181109' of 
ssh://gitolite.kernel.org/.../mkl/linux-can
  Merge branch 'qed-Miscellaneous-bug-fixes'
  Merge branch 'bnxt_en-Bug-fixes'
  Merge branch 'mlx4-fixes'
  Merge tag 'batadv-net-for-davem-20181114' of 
git://git.open-mesh.org/linux-merge
  Revert "net: phy: mdio-gpio: Fix working over slow can_sleep GPIOs"
  Merge branch 'tdc-fixes'

Davide Caratti (1):
  net/sched: act_pedit: fix memory leak when IDR allocation fails

Denis Bolotin (3):
  qed: Fix PTT leak in qed_drain()
  qed: Fix overriding offload_tc by protocols without APP TLV
  qed: Fix reading wrong value in loop condition

Eric Dumazet (2):
  net_sched: sch_fq: ensure maxrate fq parameter applies to EDT flows
  net-gro: reset skb->pkt_type in napi_reuse_skb()

Eugeniu Rosca (1):
  dt-bindings: can: rcar_can: document r8a77965 support

Fabrizio Castro (2):
  can: rcar_can: Fix erroneous registration
  dt-bindings: can: rcar_can: Add r8a774a1 support

Ganesh Goudar (1):
  cxgb4: fix thermal zone build error

Jack Morgenstein (1):
  net/mlx4_core: Zero out lkey field in SW2HW_MPT fw command

Jimmy Assarsson (2):
  can: kvaser_usb: Fix potential uninitialized variable use
  can: kvaser_usb: Fix accessing freed memory in kvaser_usb_start_xmit()

Jon Maloy (2):
  tipc: fix lockdep warning when reinitilaizing sockets
  tipc: don't assume linear buffer when reading ancillary data

Lucas Bates (1):
  tc-testing: tdc.py: ignore errors when decoding stdout/stderr

Lukas Wunner (1):
  can: hi311x: Use level-triggered interrupt

Marc Kleine-Budde (5):
  can: flexcan: remove not needed struct flexcan_priv::tx_mb and struct 
flexcan_priv::tx_mb_idx
  can: dev: can_get_echo_skb(): factor out non sending code to 
__can_get_echo_skb()
  can: dev: __can_get_echo_skb(): replace struct can_frame by canfd_frame 
to access frame length
  can: dev: __can_get_echo_skb(): Don't crash the kernel if 
can_priv::echo_skb is accessed out of bounds
  can: dev: __can_get_echo_skb(): print error message, if trying to echo 
non existing skb

Martin Schiller (2):
  net: phy: mdio-gpio: Fix working over slow can_sleep GPIOs
  net: phy: mdio-gpio: Fix working over slow can_sleep GPIOs

Matthew Cover (1):
  tuntap: fix multiqueue rx

Maxime Chevallier (1):
  net: mvneta: Don't advertise 2.5G modes

Michael Chan (5):
  bnxt_en: Fix RSS context allocation.
  bnxt_en: Fix rx_l4_csum_errors counter on 57500 devices.
  bnxt_en: Disable RDMA support on the 57500 chips.
  bnxt_en: Workaround occasional TX timeout on 57500 A0.
  bnxt_en: Add software "missed_irqs" counter.

Michal Kalderon (1):
  qed: Fix rdma_info structure allocation

Nikolay Aleksandrov (1):
  net: bridge: fix vlan stats use-after-free on destruction

Oleksij Rempel (4):
  can: rx-offload: introduce can_rx_offload_get_echo_skb() and 
can_rx_offload_queue_sorted() functions
  can: flexcan: handle tx-complete 

Re: [PATCH net] tuntap: fix multiqueue rx

2018-11-18 Thread David Miller
From: Matthew Cover 
Date: Sun, 18 Nov 2018 00:46:00 -0700

> When writing packets to a descriptor associated with a combined queue, the
> packets should end up on that queue.
> 
> Before this change all packets written to any descriptor associated with a
> tap interface end up on rx-0, even when the descriptor is associated with a
> different queue.
> 
> The rx traffic can be generated by either of the following.
>   1. a simple tap program which spins up multiple queues and writes packets
>  to each of the file descriptors
>   2. tx from a qemu vm with a tap multiqueue netdev
> 
> The queue for rx traffic can be observed by either of the following (done
> on the hypervisor in the qemu case).
>   1. a simple netmap program which opens and reads from per-queue
>  descriptors
>   2. configuring RPS and doing per-cpu captures with rxtxcpu
> 
> Alternatively, if you printk() the return value of skb_get_rx_queue() just
> before each instance of netif_receive_skb() in tun.c, you will get 65535
> for every skb.
> 
> Calling skb_record_rx_queue() to set the rx queue to the queue_index fixes
> the association between descriptor and rx queue.
> 
> Signed-off-by: Matthew Cover 

Applied and queued up for -stable, thanks.


Re: [PATCH net] ipv6: Fix PMTU updates for UDP/raw sockets in presence of VRF

2018-11-18 Thread David Miller
From: David Ahern 
Date: Sun, 18 Nov 2018 10:45:30 -0800

> From: David Ahern 
> 
> Preethi reported that PMTU discovery for UDP/raw applications is not
> working in the presence of VRF when the socket is not bound to a device.
> The problem is that ip6_sk_update_pmtu does not consider the L3 domain
> of the skb device if the socket is not bound. Update the function to
> set oif to the L3 master device if relevant.
> 
> Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack")
> Reported-by: Preethi Ramachandra 
> Signed-off-by: David Ahern 

Applied and queued up for -stable.


Re: [PATCH net-next] mlxsw: spectrum: Expose discard counters via ethtool

2018-11-18 Thread David Miller
From: Ido Schimmel 
Date: Sun, 18 Nov 2018 16:43:03 +

> From: Shalom Toledo 
> 
> Expose packets discard counters via ethtool to help with debugging.
> 
> Signed-off-by: Shalom Toledo 
> Reviewed-by: Jiri Pirko 
> Signed-off-by: Ido Schimmel 

Applied, thanks.


Re: [PATCH net-next] tun: use netdev_alloc_frag() in tun_napi_alloc_frags()

2018-11-18 Thread David Miller
From: Eric Dumazet 
Date: Sun, 18 Nov 2018 07:37:33 -0800

> In order to cook skbs in the same way as Ethernet drivers,
> it is probably better to not use GFP_KERNEL, but rather
> use the GFP_ATOMIC and PFMEMALLOC mechanisms provided by
> netdev_alloc_frag().
> 
> This would allow to use tun driver even in memory stress
> situations, especially if swap is used over this tun channel.
> 
> Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver")
> Signed-off-by: Eric Dumazet 

Applied.


[PATCH net-next 12/12] qede: use ethtool_rx_flow_rule() to remove duplicated parser code

2018-11-18 Thread Pablo Neira Ayuso
The qede driver supports both ethtool_rx_flow_spec and flower, and the
two codebases look very similar.

This patch uses the ethtool_rx_flow_rule() infrastructure to remove the
duplicated ethtool_rx_flow_spec parser and consolidate ACL offload
support around the flow_rule infrastructure.

Furthermore, more code can be consolidated by merging
qede_add_cls_rule() and qede_add_tc_flower_fltr(), as these two functions
also look very similar.

This driver currently provides simple ACL support, such as 5-tuple
matching, drop policy and queue to CPU.

Drivers that support more features can benefit from this infrastructure
to eliminate even more redundant code.

Signed-off-by: Pablo Neira Ayuso 
---
Note that, after this patch, qede_add_cls_rule() and
qede_add_tc_flower_fltr() can be also consolidated since their code is
redundant.

 drivers/net/ethernet/qlogic/qede/qede_filter.c | 246 ++---
 1 file changed, 53 insertions(+), 193 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c 
b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index aca302c3261b..f82b26ba8f80 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -1578,30 +1578,6 @@ static void qede_flow_build_ipv6_hdr(struct 
qede_arfs_tuple *t,
ports[1] = t->dst_port;
 }
 
-/* Validate fields which are set and not accepted by the driver */
-static int qede_flow_spec_validate_unused(struct qede_dev *edev,
- struct ethtool_rx_flow_spec *fs)
-{
-   if (fs->flow_type & FLOW_MAC_EXT) {
-   DP_INFO(edev, "Don't support MAC extensions\n");
-   return -EOPNOTSUPP;
-   }
-
-   if ((fs->flow_type & FLOW_EXT) &&
-   (fs->h_ext.vlan_etype || fs->h_ext.vlan_tci)) {
-   DP_INFO(edev, "Don't support vlan-based classification\n");
-   return -EOPNOTSUPP;
-   }
-
-   if ((fs->flow_type & FLOW_EXT) &&
-   (fs->h_ext.data[0] || fs->h_ext.data[1])) {
-   DP_INFO(edev, "Don't support user defined data\n");
-   return -EOPNOTSUPP;
-   }
-
-   return 0;
-}
-
 static int qede_set_v4_tuple_to_profile(struct qede_dev *edev,
struct qede_arfs_tuple *t)
 {
@@ -1665,132 +1641,6 @@ static int qede_set_v6_tuple_to_profile(struct qede_dev 
*edev,
return 0;
 }
 
-static int qede_flow_spec_to_tuple_ipv4_common(struct qede_dev *edev,
-  struct qede_arfs_tuple *t,
-  struct ethtool_rx_flow_spec *fs)
-{
-   if ((fs->h_u.tcp_ip4_spec.ip4src &
-fs->m_u.tcp_ip4_spec.ip4src) != fs->h_u.tcp_ip4_spec.ip4src) {
-   DP_INFO(edev, "Don't support IP-masks\n");
-   return -EOPNOTSUPP;
-   }
-
-   if ((fs->h_u.tcp_ip4_spec.ip4dst &
-fs->m_u.tcp_ip4_spec.ip4dst) != fs->h_u.tcp_ip4_spec.ip4dst) {
-   DP_INFO(edev, "Don't support IP-masks\n");
-   return -EOPNOTSUPP;
-   }
-
-   if ((fs->h_u.tcp_ip4_spec.psrc &
-fs->m_u.tcp_ip4_spec.psrc) != fs->h_u.tcp_ip4_spec.psrc) {
-   DP_INFO(edev, "Don't support port-masks\n");
-   return -EOPNOTSUPP;
-   }
-
-   if ((fs->h_u.tcp_ip4_spec.pdst &
-fs->m_u.tcp_ip4_spec.pdst) != fs->h_u.tcp_ip4_spec.pdst) {
-   DP_INFO(edev, "Don't support port-masks\n");
-   return -EOPNOTSUPP;
-   }
-
-   if (fs->h_u.tcp_ip4_spec.tos) {
-   DP_INFO(edev, "Don't support tos\n");
-   return -EOPNOTSUPP;
-   }
-
-   t->eth_proto = htons(ETH_P_IP);
-   t->src_ipv4 = fs->h_u.tcp_ip4_spec.ip4src;
-   t->dst_ipv4 = fs->h_u.tcp_ip4_spec.ip4dst;
-   t->src_port = fs->h_u.tcp_ip4_spec.psrc;
-   t->dst_port = fs->h_u.tcp_ip4_spec.pdst;
-
-   return qede_set_v4_tuple_to_profile(edev, t);
-}
-
-static int qede_flow_spec_to_tuple_tcpv4(struct qede_dev *edev,
-struct qede_arfs_tuple *t,
-struct ethtool_rx_flow_spec *fs)
-{
-   t->ip_proto = IPPROTO_TCP;
-
-   if (qede_flow_spec_to_tuple_ipv4_common(edev, t, fs))
-   return -EINVAL;
-
-   return 0;
-}
-
-static int qede_flow_spec_to_tuple_udpv4(struct qede_dev *edev,
-struct qede_arfs_tuple *t,
-struct ethtool_rx_flow_spec *fs)
-{
-   t->ip_proto = IPPROTO_UDP;
-
-   if (qede_flow_spec_to_tuple_ipv4_common(edev, t, fs))
-   return -EINVAL;
-
-   return 0;
-}
-
-static int qede_flow_spec_to_tuple_ipv6_common(struct qede_dev *edev,
-  struct qede_arfs_tuple *t,
-  struct ethtool_rx_flow_spec *fs)
-{
-   struct in6_addr zero_addr;
-
-   memset(&zer

[PATCH net-next,v2 06/12] drivers: net: use flow action infrastructure

2018-11-18 Thread Pablo Neira Ayuso
This patch updates drivers to use the new flow action infrastructure.

Signed-off-by: Pablo Neira Ayuso 
---
v2: no changes.

 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c   |  74 +++---
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 250 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c| 266 ++---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c |   2 +-
 .../net/ethernet/mellanox/mlxsw/spectrum_flower.c  |  55 +++--
 drivers/net/ethernet/netronome/nfp/flower/action.c | 185 +++---
 drivers/net/ethernet/qlogic/qede/qede_filter.c |  12 +-
 7 files changed, 417 insertions(+), 427 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 3d71b2530d67..11c5a0b495b6 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -61,9 +61,9 @@ static u16 bnxt_flow_get_dst_fid(struct bnxt *pf_bp, struct 
net_device *dev)
 
 static int bnxt_tc_parse_redir(struct bnxt *bp,
   struct bnxt_tc_actions *actions,
-  const struct tc_action *tc_act)
+  const struct flow_action_key *act)
 {
-   struct net_device *dev = tcf_mirred_dev(tc_act);
+   struct net_device *dev = act->dev;
 
if (!dev) {
netdev_info(bp->dev, "no dev in mirred action");
@@ -77,16 +77,16 @@ static int bnxt_tc_parse_redir(struct bnxt *bp,
 
 static int bnxt_tc_parse_vlan(struct bnxt *bp,
  struct bnxt_tc_actions *actions,
- const struct tc_action *tc_act)
+ const struct flow_action_key *act)
 {
-   switch (tcf_vlan_action(tc_act)) {
-   case TCA_VLAN_ACT_POP:
+   switch (act->id) {
+   case FLOW_ACTION_KEY_VLAN_POP:
actions->flags |= BNXT_TC_ACTION_FLAG_POP_VLAN;
break;
-   case TCA_VLAN_ACT_PUSH:
+   case FLOW_ACTION_KEY_VLAN_PUSH:
actions->flags |= BNXT_TC_ACTION_FLAG_PUSH_VLAN;
-   actions->push_vlan_tci = htons(tcf_vlan_push_vid(tc_act));
-   actions->push_vlan_tpid = tcf_vlan_push_proto(tc_act);
+   actions->push_vlan_tci = htons(act->vlan.vid);
+   actions->push_vlan_tpid = act->vlan.proto;
break;
default:
return -EOPNOTSUPP;
@@ -96,10 +96,10 @@ static int bnxt_tc_parse_vlan(struct bnxt *bp,
 
 static int bnxt_tc_parse_tunnel_set(struct bnxt *bp,
struct bnxt_tc_actions *actions,
-   const struct tc_action *tc_act)
+   const struct flow_action_key *act)
 {
-   struct ip_tunnel_info *tun_info = tcf_tunnel_info(tc_act);
-   struct ip_tunnel_key *tun_key = &tun_info->key;
+   const struct ip_tunnel_info *tun_info = act->tunnel;
+   const struct ip_tunnel_key *tun_key = &tun_info->key;
 
if (ip_tunnel_info_af(tun_info) != AF_INET) {
netdev_info(bp->dev, "only IPv4 tunnel-encap is supported");
@@ -113,51 +113,43 @@ static int bnxt_tc_parse_tunnel_set(struct bnxt *bp,
 
 static int bnxt_tc_parse_actions(struct bnxt *bp,
 struct bnxt_tc_actions *actions,
-struct tcf_exts *tc_exts)
+struct flow_action *flow_action)
 {
-   const struct tc_action *tc_act;
+   struct flow_action_key *act;
int i, rc;
 
-   if (!tcf_exts_has_actions(tc_exts)) {
+   if (!flow_action_has_keys(flow_action)) {
netdev_info(bp->dev, "no actions");
return -EINVAL;
}
 
-   tcf_exts_for_each_action(i, tc_act, tc_exts) {
-   /* Drop action */
-   if (is_tcf_gact_shot(tc_act)) {
+   flow_action_for_each(i, act, flow_action) {
+   switch (act->id) {
+   case FLOW_ACTION_KEY_DROP:
actions->flags |= BNXT_TC_ACTION_FLAG_DROP;
return 0; /* don't bother with other actions */
-   }
-
-   /* Redirect action */
-   if (is_tcf_mirred_egress_redirect(tc_act)) {
-   rc = bnxt_tc_parse_redir(bp, actions, tc_act);
+   case FLOW_ACTION_KEY_REDIRECT:
+   rc = bnxt_tc_parse_redir(bp, actions, act);
if (rc)
return rc;
-   continue;
-   }
-
-   /* Push/pop VLAN */
-   if (is_tcf_vlan(tc_act)) {
-   rc = bnxt_tc_parse_vlan(bp, actions, tc_act);
+   break;
+   case FLOW_ACTION_KEY_VLAN_POP:
+   case FLOW_ACTION_KEY_VLAN_PUSH:
+   case FLOW_ACTION_KEY_VLAN_MANGLE:
+   rc = bnxt_tc_parse_vlan(

[PATCH net-next,v2 09/12] flow_dissector: add basic ethtool_rx_flow_spec to flow_rule structure translator

2018-11-18 Thread Pablo Neira Ayuso
This patch adds a function to translate the ethtool_rx_flow_spec
structure to the flow_rule representation.

This allows us to reuse code from the driver side given that both flower
and ethtool_rx_flow interfaces use the same representation.

Signed-off-by: Pablo Neira Ayuso 
---
v2: no changes.

 include/net/flow_dissector.h |   5 ++
 net/core/flow_dissector.c| 190 +++
 2 files changed, 195 insertions(+)

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 7a4683646d5a..ec9036232538 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -485,4 +485,9 @@ static inline bool flow_rule_match_key(const struct 
flow_rule *rule,
return dissector_uses_key(rule->match.dissector, key);
 }
 
+struct ethtool_rx_flow_spec;
+
+struct flow_rule *ethtool_rx_flow_rule(const struct ethtool_rx_flow_spec *fs);
+void ethtool_rx_flow_rule_free(struct flow_rule *rule);
+
 #endif
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index b9368349f0f7..ef5bdb62620c 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -276,6 +277,195 @@ void flow_action_free(struct flow_action *flow_action)
 }
 EXPORT_SYMBOL(flow_action_free);
 
+struct ethtool_rx_flow_key {
+   struct flow_dissector_key_basic basic;
+   union {
+   struct flow_dissector_key_ipv4_addrsipv4;
+   struct flow_dissector_key_ipv6_addrsipv6;
+   };
+   struct flow_dissector_key_ports tp;
+   struct flow_dissector_key_ipip;
+} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/
+
+struct ethtool_rx_flow_match {
+   struct flow_dissector   dissector;
+   struct ethtool_rx_flow_key  key;
+   struct ethtool_rx_flow_key  mask;
+};
+
+struct flow_rule *ethtool_rx_flow_rule(const struct ethtool_rx_flow_spec *fs)
+{
+   static struct in6_addr zero_addr = {};
+   struct ethtool_rx_flow_match *match;
+   struct flow_action_key *act;
+   struct flow_rule *rule;
+
+   rule = kmalloc(sizeof(struct flow_rule), GFP_KERNEL);
+   if (!rule)
+   return NULL;
+
+   match = kzalloc(sizeof(struct ethtool_rx_flow_match), GFP_KERNEL);
+   if (!match)
+   goto err_match;
+
+   rule->match.dissector   = &match->dissector;
+   rule->match.mask= &match->mask;
+   rule->match.key = &match->key;
+
+   match->mask.basic.n_proto = 0x;
+
+   switch (fs->flow_type & ~FLOW_EXT) {
+   case TCP_V4_FLOW:
+   case UDP_V4_FLOW: {
+   const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec;
+
+   match->key.basic.n_proto = htons(ETH_P_IP);
+
+   v4_spec = &fs->h_u.tcp_ip4_spec;
+   v4_m_spec = &fs->m_u.tcp_ip4_spec;
+
+   if (v4_m_spec->ip4src) {
+   match->key.ipv4.src = v4_spec->ip4src;
+   match->mask.ipv4.src = v4_m_spec->ip4src;
+   }
+   if (v4_m_spec->ip4dst) {
+   match->key.ipv4.dst = v4_spec->ip4dst;
+   match->mask.ipv4.dst = v4_m_spec->ip4dst;
+   }
+   if (v4_m_spec->ip4src ||
+   v4_m_spec->ip4dst) {
+   match->dissector.used_keys |=
+   FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+   match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] =
+   offsetof(struct ethtool_rx_flow_key, ipv4);
+   }
+   if (v4_m_spec->psrc) {
+   match->key.tp.src = v4_spec->psrc;
+   match->mask.tp.src = v4_m_spec->psrc;
+   }
+   if (v4_m_spec->pdst) {
+   match->key.tp.dst = v4_spec->pdst;
+   match->mask.tp.dst = v4_m_spec->pdst;
+   }
+   if (v4_m_spec->psrc ||
+   v4_m_spec->pdst) {
+   match->dissector.used_keys |= FLOW_DISSECTOR_KEY_PORTS;
+   match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] =
+   offsetof(struct ethtool_rx_flow_key, tp);
+   }
+   if (v4_m_spec->tos) {
+   match->key.ip.tos = v4_spec->pdst;
+   match->mask.ip.tos = v4_m_spec->pdst;
+   match->dissector.used_keys |= FLOW_DISSECTOR_KEY_IP;
+   match->dissector.offset[FLOW_DISSECTOR_KEY_IP] =
+   offsetof(struct ethtool_rx_flow_key, ip);
+   }
+   }
+   break;
+   case TCP_V6_FLOW:
+   case UDP_V6_FLOW: {
+   const struct ethtool_tcpip6_spec *v6_spec, *v6_m_sp

[PATCH net-next 11/12] qede: place ethtool_rx_flow_spec after code after TC flower codebase

2018-11-18 Thread Pablo Neira Ayuso
This is a preparation patch to reuse the existing TC flower codebase
from ethtool_rx_flow_spec.

This patch is merely moving the core ethtool_rx_flow_spec parser after
tc flower offload driver code so we can skip a few forward function
declarations in the follow up patch.

Signed-off-by: Pablo Neira Ayuso 
---
 drivers/net/ethernet/qlogic/qede/qede_filter.c | 264 -
 1 file changed, 132 insertions(+), 132 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c 
b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index e71e0ff13452..aca302c3261b 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -1791,72 +1791,6 @@ static int qede_flow_spec_to_tuple_udpv6(struct qede_dev 
*edev,
return 0;
 }
 
-static int qede_flow_spec_to_tuple(struct qede_dev *edev,
-  struct qede_arfs_tuple *t,
-  struct ethtool_rx_flow_spec *fs)
-{
-   memset(t, 0, sizeof(*t));
-
-   if (qede_flow_spec_validate_unused(edev, fs))
-   return -EOPNOTSUPP;
-
-   switch ((fs->flow_type & ~FLOW_EXT)) {
-   case TCP_V4_FLOW:
-   return qede_flow_spec_to_tuple_tcpv4(edev, t, fs);
-   case UDP_V4_FLOW:
-   return qede_flow_spec_to_tuple_udpv4(edev, t, fs);
-   case TCP_V6_FLOW:
-   return qede_flow_spec_to_tuple_tcpv6(edev, t, fs);
-   case UDP_V6_FLOW:
-   return qede_flow_spec_to_tuple_udpv6(edev, t, fs);
-   default:
-   DP_VERBOSE(edev, NETIF_MSG_IFUP,
-  "Can't support flow of type %08x\n", fs->flow_type);
-   return -EOPNOTSUPP;
-   }
-
-   return 0;
-}
-
-static int qede_flow_spec_validate(struct qede_dev *edev,
-  struct ethtool_rx_flow_spec *fs,
-  struct qede_arfs_tuple *t)
-{
-   if (fs->location >= QEDE_RFS_MAX_FLTR) {
-   DP_INFO(edev, "Location out-of-bounds\n");
-   return -EINVAL;
-   }
-
-   /* Check location isn't already in use */
-   if (test_bit(fs->location, edev->arfs->arfs_fltr_bmap)) {
-   DP_INFO(edev, "Location already in use\n");
-   return -EINVAL;
-   }
-
-   /* Check if the filtering-mode could support the filter */
-   if (edev->arfs->filter_count &&
-   edev->arfs->mode != t->mode) {
-   DP_INFO(edev,
-   "flow_spec would require filtering mode %08x, but %08x 
is configured\n",
-   t->mode, edev->arfs->filter_count);
-   return -EINVAL;
-   }
-
-   /* If drop requested then no need to validate other data */
-   if (fs->ring_cookie == RX_CLS_FLOW_DISC)
-   return 0;
-
-   if (ethtool_get_flow_spec_ring_vf(fs->ring_cookie))
-   return 0;
-
-   if (fs->ring_cookie >= QEDE_RSS_COUNT(edev)) {
-   DP_INFO(edev, "Queue out-of-bounds\n");
-   return -EINVAL;
-   }
-
-   return 0;
-}
-
 /* Must be called while qede lock is held */
 static struct qede_arfs_fltr_node *
 qede_flow_find_fltr(struct qede_dev *edev, struct qede_arfs_tuple *t)
@@ -1896,72 +1830,6 @@ static void qede_flow_set_destination(struct qede_dev 
*edev,
   "Configuring N-tuple for VF 0x%02x\n", n->vfid - 1);
 }
 
-int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info)
-{
-   struct ethtool_rx_flow_spec *fsp = &info->fs;
-   struct qede_arfs_fltr_node *n;
-   struct qede_arfs_tuple t;
-   int min_hlen, rc;
-
-   __qede_lock(edev);
-
-   if (!edev->arfs) {
-   rc = -EPERM;
-   goto unlock;
-   }
-
-   /* Translate the flow specification into something fittign our DB */
-   rc = qede_flow_spec_to_tuple(edev, &t, fsp);
-   if (rc)
-   goto unlock;
-
-   /* Make sure location is valid and filter isn't already set */
-   rc = qede_flow_spec_validate(edev, fsp, &t);
-   if (rc)
-   goto unlock;
-
-   if (qede_flow_find_fltr(edev, &t)) {
-   rc = -EINVAL;
-   goto unlock;
-   }
-
-   n = kzalloc(sizeof(*n), GFP_KERNEL);
-   if (!n) {
-   rc = -ENOMEM;
-   goto unlock;
-   }
-
-   min_hlen = qede_flow_get_min_header_size(&t);
-   n->data = kzalloc(min_hlen, GFP_KERNEL);
-   if (!n->data) {
-   kfree(n);
-   rc = -ENOMEM;
-   goto unlock;
-   }
-
-   n->sw_id = fsp->location;
-   set_bit(n->sw_id, edev->arfs->arfs_fltr_bmap);
-   n->buf_len = min_hlen;
-
-   memcpy(&n->tuple, &t, sizeof(n->tuple));
-
-   qede_flow_set_destination(edev, n, fsp);
-
-   /* Build a minimal header according to the flow */
-   n->tuple.build_hdr(&n->tuple, n->data);
-
-   rc = qede_enqueue_fltr_and_c

[PATCH net-next,v2 10/12] dsa: bcm_sf2: use flow_rule infrastructure

2018-11-18 Thread Pablo Neira Ayuso
Update this driver to use the flow_rule infrastructure, hence we can use
the same code to populate hardware IR from ethtool_rx_flow and the
cls_flower interfaces.

Signed-off-by: Pablo Neira Ayuso 
---
v2: remove unused variables, requested by David S. Miller.

 drivers/net/dsa/bcm_sf2_cfp.c | 108 +++---
 1 file changed, 70 insertions(+), 38 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index e14663ab6dbc..8d8f00c7d43f 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -257,7 +257,8 @@ static int bcm_sf2_cfp_act_pol_set(struct bcm_sf2_priv 
*priv,
 }
 
 static void bcm_sf2_cfp_slice_ipv4(struct bcm_sf2_priv *priv,
-  struct ethtool_tcpip4_spec *v4_spec,
+  struct flow_dissector_key_ipv4_addrs *addrs,
+  struct flow_dissector_key_ports *ports,
   unsigned int slice_num,
   bool mask)
 {
@@ -278,7 +279,7 @@ static void bcm_sf2_cfp_slice_ipv4(struct bcm_sf2_priv 
*priv,
 * UDF_n_A6 [23:8]
 * UDF_n_A5 [7:0]
 */
-   reg = be16_to_cpu(v4_spec->pdst) >> 8;
+   reg = be16_to_cpu(ports->dst) >> 8;
if (mask)
offset = CORE_CFP_MASK_PORT(3);
else
@@ -289,9 +290,9 @@ static void bcm_sf2_cfp_slice_ipv4(struct bcm_sf2_priv 
*priv,
 * UDF_n_A4 [23:8]
 * UDF_n_A3 [7:0]
 */
-   reg = (be16_to_cpu(v4_spec->pdst) & 0xff) << 24 |
- (u32)be16_to_cpu(v4_spec->psrc) << 8 |
- (be32_to_cpu(v4_spec->ip4dst) & 0xff00) >> 8;
+   reg = (be16_to_cpu(ports->dst) & 0xff) << 24 |
+ (u32)be16_to_cpu(ports->src) << 8 |
+ (be32_to_cpu(addrs->dst) & 0xff00) >> 8;
if (mask)
offset = CORE_CFP_MASK_PORT(2);
else
@@ -302,9 +303,9 @@ static void bcm_sf2_cfp_slice_ipv4(struct bcm_sf2_priv 
*priv,
 * UDF_n_A2 [23:8]
 * UDF_n_A1 [7:0]
 */
-   reg = (u32)(be32_to_cpu(v4_spec->ip4dst) & 0xff) << 24 |
- (u32)(be32_to_cpu(v4_spec->ip4dst) >> 16) << 8 |
- (be32_to_cpu(v4_spec->ip4src) & 0xff00) >> 8;
+   reg = (u32)(be32_to_cpu(addrs->dst) & 0xff) << 24 |
+ (u32)(be32_to_cpu(addrs->dst) >> 16) << 8 |
+ (be32_to_cpu(addrs->src) & 0xff00) >> 8;
if (mask)
offset = CORE_CFP_MASK_PORT(1);
else
@@ -317,8 +318,8 @@ static void bcm_sf2_cfp_slice_ipv4(struct bcm_sf2_priv 
*priv,
 * Slice ID [3:2]
 * Slice valid  [1:0]
 */
-   reg = (u32)(be32_to_cpu(v4_spec->ip4src) & 0xff) << 24 |
- (u32)(be32_to_cpu(v4_spec->ip4src) >> 16) << 8 |
+   reg = (u32)(be32_to_cpu(addrs->src) & 0xff) << 24 |
+ (u32)(be32_to_cpu(addrs->src) >> 16) << 8 |
  SLICE_NUM(slice_num) | SLICE_VALID;
if (mask)
offset = CORE_CFP_MASK_PORT(0);
@@ -332,9 +333,13 @@ static int bcm_sf2_cfp_ipv4_rule_set(struct bcm_sf2_priv 
*priv, int port,
 unsigned int queue_num,
 struct ethtool_rx_flow_spec *fs)
 {
-   struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec;
const struct cfp_udf_layout *layout;
unsigned int slice_num, rule_index;
+   struct flow_match_ipv4_addrs ipv4;
+   struct flow_match_ports ports;
+   struct flow_match_basic basic;
+   struct flow_rule *flow_rule;
+   struct flow_match_ip ip;
u8 ip_proto, ip_frag;
u8 num_udf;
u32 reg;
@@ -343,13 +348,9 @@ static int bcm_sf2_cfp_ipv4_rule_set(struct bcm_sf2_priv 
*priv, int port,
switch (fs->flow_type & ~FLOW_EXT) {
case TCP_V4_FLOW:
ip_proto = IPPROTO_TCP;
-   v4_spec = &fs->h_u.tcp_ip4_spec;
-   v4_m_spec = &fs->m_u.tcp_ip4_spec;
break;
case UDP_V4_FLOW:
ip_proto = IPPROTO_UDP;
-   v4_spec = &fs->h_u.udp_ip4_spec;
-   v4_m_spec = &fs->m_u.udp_ip4_spec;
break;
default:
return -EINVAL;
@@ -367,11 +368,22 @@ static int bcm_sf2_cfp_ipv4_rule_set(struct bcm_sf2_priv 
*priv, int port,
if (rule_index > bcm_sf2_cfp_rule_size(priv))
return -ENOSPC;
 
+   flow_rule = ethtool_rx_flow_rule(fs);
+   if (!flow_rule)
+   return -ENOMEM;
+
+   flow_rule_match_ipv4_addrs(flow_rule, &ipv4);
+   flow_rule_match_ports(flow_rule, &ports);
+   flow_rule_match_basic(flow_rule, &basic);
+   flow_rule_match_ip(flow_rule, &ip);
+
layout = &udf_tcpip4_layout;
/* We only use one UDF slice for now */
slice_num = bcm_sf2_get_slice_number(layout, 0);
-   if (sl

[PATCH net-next,v2 07/12] cls_flower: don't expose TC actions to drivers anymore

2018-11-18 Thread Pablo Neira Ayuso
Now that drivers have been converted to use the flow action
infrastructure, remove this field from the tc_cls_flower_offload
structure.

Signed-off-by: Pablo Neira Ayuso 
---
v2: no changes.

 include/net/pkt_cls.h  | 1 -
 net/sched/cls_flower.c | 5 -
 2 files changed, 6 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 7f9a8d5ca945..fe64638034f8 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -769,7 +769,6 @@ struct tc_cls_flower_offload {
enum tc_fl_command command;
unsigned long cookie;
struct flow_rule rule;
-   struct tcf_exts *exts;
u32 classid;
struct tc_cls_flower_stats stats;
 };
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index ee67f1ae8786..440d475c55d0 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -389,7 +389,6 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
cls_flower.rule.match.dissector = &f->mask->dissector;
cls_flower.rule.match.mask = &f->mask->key;
cls_flower.rule.match.key = &f->mkey;
-   cls_flower.exts = &f->exts;
cls_flower.classid = f->res.classid;
 
if (tc_setup_flow_action(&f->action, &f->exts) < 0)
@@ -425,7 +424,6 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct 
cls_fl_filter *f)
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
cls_flower.command = TC_CLSFLOWER_STATS;
cls_flower.cookie = (unsigned long) f;
-   cls_flower.exts = &f->exts;
cls_flower.classid = f->res.classid;
 
tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
@@ -1484,7 +1482,6 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, 
tc_setup_cb_t *cb,
cls_flower.rule.match.dissector = &mask->dissector;
cls_flower.rule.match.mask = &mask->key;
cls_flower.rule.match.key = &f->mkey;
-   cls_flower.exts = &f->exts;
cls_flower.rule.action.num_keys = f->action.num_keys;
cls_flower.rule.action.keys = f->action.keys;
cls_flower.classid = f->res.classid;
@@ -1509,7 +1506,6 @@ static void fl_hw_create_tmplt(struct tcf_chain *chain,
 {
struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = chain->block;
-   struct tcf_exts dummy_exts = { 0, };
 
cls_flower.common.chain_index = chain->index;
cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE;
@@ -1517,7 +1513,6 @@ static void fl_hw_create_tmplt(struct tcf_chain *chain,
cls_flower.rule.match.dissector = &tmplt->dissector;
cls_flower.rule.match.mask = &tmplt->mask;
cls_flower.rule.match.key = &tmplt->dummy_key;
-   cls_flower.exts = &dummy_exts;
 
/* We don't care if driver (any of them) fails to handle this
 * call. It serves just as a hint for it.
-- 
2.11.0



[PATCH net-next,v2 08/12] flow_dissector: add wake-up-on-lan and queue to flow_action

2018-11-18 Thread Pablo Neira Ayuso
These actions need to be added to support bcm sf2 features available
through the ethtool_rx_flow interface.

Reviewed-by: Florian Fainelli 
Signed-off-by: Pablo Neira Ayuso 
---
v2: no changes.

 include/net/flow_dissector.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 925c208816f1..7a4683646d5a 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -418,6 +418,8 @@ enum flow_action_key_id {
FLOW_ACTION_KEY_ADD,
FLOW_ACTION_KEY_CSUM,
FLOW_ACTION_KEY_MARK,
+   FLOW_ACTION_KEY_WAKE,
+   FLOW_ACTION_KEY_QUEUE,
 };
 
 /* This is mirroring enum pedit_header_type definition for easy mapping between
@@ -452,6 +454,7 @@ struct flow_action_key {
const struct ip_tunnel_info *tunnel;/* 
FLOW_ACTION_KEY_TUNNEL_ENCAP */
u32 csum_flags; /* FLOW_ACTION_KEY_CSUM 
*/
u32 mark;   /* FLOW_ACTION_KEY_MARK 
*/
+   u32 queue_index;/* 
FLOW_ACTION_KEY_QUEUE */
};
 };
 
-- 
2.11.0



[PATCH net-next,v2 05/12] cls_flower: add statistics retrieval infrastructure and use it

2018-11-18 Thread Pablo Neira Ayuso
This patch provides a tc_cls_flower_stats structure that acts as a
statistics container inside tc_cls_flower_offload, which we can then use
to restore the statistics on the existing TC actions. Hence,
tcf_exts_stats_update() is no longer used from drivers.

Signed-off-by: Pablo Neira Ayuso 
---
v2: no changes.

 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c  |  4 ++--
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c  |  6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   |  2 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c |  2 +-
 drivers/net/ethernet/netronome/nfp/flower/offload.c   |  6 +++---
 include/net/pkt_cls.h | 15 +++
 net/sched/cls_flower.c|  4 
 7 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index b82143d6cdde..3d71b2530d67 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -1366,8 +1366,8 @@ static int bnxt_tc_get_flow_stats(struct bnxt *bp,
lastused = flow->lastused;
spin_unlock(&flow->stats_lock);
 
-   tcf_exts_stats_update(tc_flow_cmd->exts, stats.bytes, stats.packets,
- lastused);
+   tc_cls_flower_stats_update(tc_flow_cmd, stats.bytes, stats.packets,
+  lastused);
return 0;
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index 39c5af5dad3d..2c7d1aebe214 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -807,9 +807,9 @@ int cxgb4_tc_flower_stats(struct net_device *dev,
if (ofld_stats->packet_count != packets) {
if (ofld_stats->prev_packet_count != packets)
ofld_stats->last_used = jiffies;
-   tcf_exts_stats_update(cls->exts, bytes - ofld_stats->byte_count,
- packets - ofld_stats->packet_count,
- ofld_stats->last_used);
+   tc_cls_flower_stats_update(cls, bytes - ofld_stats->byte_count,
+  packets - ofld_stats->packet_count,
+  ofld_stats->last_used);
 
ofld_stats->packet_count = packets;
ofld_stats->byte_count = bytes;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 2645e5d1e790..c5f0b826fa91 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -3224,7 +3224,7 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
 
mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
 
-   tcf_exts_stats_update(f->exts, bytes, packets, lastuse);
+   tc_cls_flower_stats_update(f, bytes, packets, lastuse);
 
return 0;
 }
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index 193a6f9acf79..3398984ffb2a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -460,7 +460,7 @@ int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp,
if (err)
goto err_rule_get_stats;
 
-   tcf_exts_stats_update(f->exts, bytes, packets, lastuse);
+   tc_cls_flower_stats_update(f, bytes, packets, lastuse);
 
mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset);
return 0;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c 
b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 708331234908..bec74d84756c 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -532,9 +532,9 @@ nfp_flower_get_stats(struct nfp_app *app, struct net_device 
*netdev,
ctx_id = be32_to_cpu(nfp_flow->meta.host_ctx_id);
 
spin_lock_bh(&priv->stats_lock);
-   tcf_exts_stats_update(flow->exts, priv->stats[ctx_id].bytes,
- priv->stats[ctx_id].pkts,
- priv->stats[ctx_id].used);
+   tc_cls_flower_stats_update(flow, priv->stats[ctx_id].bytes,
+  priv->stats[ctx_id].pkts,
+  priv->stats[ctx_id].used);
 
priv->stats[ctx_id].pkts = 0;
priv->stats[ctx_id].bytes = 0;
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 7d7aefa5fcd2..7f9a8d5ca945 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -758,6 +758,12 @@ enum tc_fl_command {
TC_CLSFLOWER_TMPLT_DESTROY,
 };
 
+struct tc_cls_flower_stats {
+   u64 pkts;
+   u64 bytes;
+   u64 lastused;
+};
+
 struct tc_cls_flower_offload {

[PATCH net-next,v2 04/12] cls_api: add translator to flow_action representation

2018-11-18 Thread Pablo Neira Ayuso
This patch implements a new function to translate from native TC action
to the new flow_action representation. Moreover, this patch also updates
cls_flower to use this new function.

Signed-off-by: Pablo Neira Ayuso 
---
v2: no changes.

 include/net/pkt_cls.h  |   3 ++
 net/sched/cls_api.c| 113 +
 net/sched/cls_flower.c |  15 ++-
 3 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 8b79a1a3a5c7..7d7aefa5fcd2 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -619,6 +619,9 @@ tcf_match_indev(struct sk_buff *skb, int ifindex)
 }
 #endif /* CONFIG_NET_CLS_IND */
 
+int tc_setup_flow_action(struct flow_action *flow_action,
+const struct tcf_exts *exts);
+
 int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
 enum tc_setup_type type, void *type_data, bool err_stop);
 
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index d92f44ac4c39..6ab44e650f43 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -31,6 +31,14 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 
 extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
 
@@ -2567,6 +2575,111 @@ int tc_setup_cb_call(struct tcf_block *block, struct 
tcf_exts *exts,
 }
 EXPORT_SYMBOL(tc_setup_cb_call);
 
+int tc_setup_flow_action(struct flow_action *flow_action,
+const struct tcf_exts *exts)
+{
+   const struct tc_action *act;
+   int num_acts = 0, i, j, k;
+
+   if (!exts)
+   return 0;
+
+   tcf_exts_for_each_action(i, act, exts) {
+   if (is_tcf_pedit(act))
+   num_acts += tcf_pedit_nkeys(act);
+   else
+   num_acts++;
+   }
+   if (!num_acts)
+   return 0;
+
+   if (flow_action_init(flow_action, num_acts) < 0)
+   return -ENOMEM;
+
+   j = 0;
+   tcf_exts_for_each_action(i, act, exts) {
+   struct flow_action_key *key;
+
+   key = &flow_action->keys[j];
+   if (is_tcf_gact_ok(act)) {
+   key->id = FLOW_ACTION_KEY_ACCEPT;
+   } else if (is_tcf_gact_shot(act)) {
+   key->id = FLOW_ACTION_KEY_DROP;
+   } else if (is_tcf_gact_trap(act)) {
+   key->id = FLOW_ACTION_KEY_TRAP;
+   } else if (is_tcf_gact_goto_chain(act)) {
+   key->id = FLOW_ACTION_KEY_GOTO;
+   key->chain_index = tcf_gact_goto_chain_index(act);
+   } else if (is_tcf_mirred_egress_redirect(act)) {
+   key->id = FLOW_ACTION_KEY_REDIRECT;
+   key->dev = tcf_mirred_dev(act);
+   } else if (is_tcf_mirred_egress_mirror(act)) {
+   key->id = FLOW_ACTION_KEY_MIRRED;
+   key->dev = tcf_mirred_dev(act);
+   } else if (is_tcf_vlan(act)) {
+   switch (tcf_vlan_action(act)) {
+   case TCA_VLAN_ACT_PUSH:
+   key->id = FLOW_ACTION_KEY_VLAN_PUSH;
+   key->vlan.vid = tcf_vlan_push_vid(act);
+   key->vlan.proto = tcf_vlan_push_proto(act);
+   key->vlan.prio = tcf_vlan_push_prio(act);
+   break;
+   case TCA_VLAN_ACT_POP:
+   key->id = FLOW_ACTION_KEY_VLAN_POP;
+   break;
+   case TCA_VLAN_ACT_MODIFY:
+   key->id = FLOW_ACTION_KEY_VLAN_MANGLE;
+   key->vlan.vid = tcf_vlan_push_vid(act);
+   key->vlan.proto = tcf_vlan_push_proto(act);
+   key->vlan.prio = tcf_vlan_push_prio(act);
+   break;
+   default:
+   goto err_out;
+   }
+   } else if (is_tcf_tunnel_set(act)) {
+   key->id = FLOW_ACTION_KEY_TUNNEL_ENCAP;
+   key->tunnel = tcf_tunnel_info(act);
+   } else if (is_tcf_tunnel_release(act)) {
+   key->id = FLOW_ACTION_KEY_TUNNEL_DECAP;
+   key->tunnel = tcf_tunnel_info(act);
+   } else if (is_tcf_pedit(act)) {
+   for (k = 0; k < tcf_pedit_nkeys(act); k++) {
+   switch (tcf_pedit_cmd(act, k)) {
+   case TCA_PEDIT_KEY_EX_CMD_SET:
+   key->id = FLOW_ACTION_KEY_MANGLE;
+   break;
+   case TCA_PEDIT_KEY_EX_CMD_ADD:
+ 

[PATCH net-next,v2 01/12] flow_dissector: add flow_rule and flow_match structures and use them

2018-11-18 Thread Pablo Neira Ayuso
This patch wraps the dissector key and mask - that flower uses to
represent the matching side - around the flow_match structure.

To avoid a follow up patch that would edit the same LoCs in the drivers,
this patch also wraps this new flow match structure around the flow rule
object. This new structure will also contain the flow actions in follow
up patches.

This introduces two new interfaces:

bool flow_rule_match_key(rule, dissector_id)

that returns true if the given matching key is set, and:

flow_rule_match_XYZ(rule, &match);

This fetches the matching side XYZ into the match container structure, so
that the key and the mask can be retrieved with one single call.

Signed-off-by: Pablo Neira Ayuso 
---
v2: Use reverse xmas tree for variable definition, requested by David S. Miller.

 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c   | 174 -
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 194 --
 drivers/net/ethernet/intel/i40e/i40e_main.c| 178 -
 drivers/net/ethernet/intel/iavf/iavf_main.c| 195 --
 drivers/net/ethernet/intel/igb/igb_main.c  |  64 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c| 420 +
 .../net/ethernet/mellanox/mlxsw/spectrum_flower.c  | 202 +-
 drivers/net/ethernet/netronome/nfp/flower/action.c |  11 +-
 drivers/net/ethernet/netronome/nfp/flower/match.c  | 417 ++--
 .../net/ethernet/netronome/nfp/flower/offload.c| 145 +++
 drivers/net/ethernet/qlogic/qede/qede_filter.c |  85 ++---
 include/net/flow_dissector.h   | 107 ++
 include/net/pkt_cls.h  |  10 +-
 net/core/flow_dissector.c  | 133 +++
 net/sched/cls_flower.c |  18 +-
 15 files changed, 1151 insertions(+), 1202 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 749f63beddd8..b82143d6cdde 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -177,18 +177,12 @@ static int bnxt_tc_parse_actions(struct bnxt *bp,
return 0;
 }
 
-#define GET_KEY(flow_cmd, key_type)\
-   skb_flow_dissector_target((flow_cmd)->dissector, key_type,\
- (flow_cmd)->key)
-#define GET_MASK(flow_cmd, key_type)   \
-   skb_flow_dissector_target((flow_cmd)->dissector, key_type,\
- (flow_cmd)->mask)
-
 static int bnxt_tc_parse_flow(struct bnxt *bp,
  struct tc_cls_flower_offload *tc_flow_cmd,
  struct bnxt_tc_flow *flow)
 {
-   struct flow_dissector *dissector = tc_flow_cmd->dissector;
+   struct flow_rule *rule = tc_cls_flower_offload_flow_rule(tc_flow_cmd);
+   struct flow_dissector *dissector = rule->match.dissector;
 
/* KEY_CONTROL and KEY_BASIC are needed for forming a meaningful key */
if ((dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) == 0 ||
@@ -198,140 +192,120 @@ static int bnxt_tc_parse_flow(struct bnxt *bp,
return -EOPNOTSUPP;
}
 
-   if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_BASIC)) {
-   struct flow_dissector_key_basic *key =
-   GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_BASIC);
-   struct flow_dissector_key_basic *mask =
-   GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_BASIC);
+   if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) {
+   struct flow_match_basic match;
 
-   flow->l2_key.ether_type = key->n_proto;
-   flow->l2_mask.ether_type = mask->n_proto;
+   flow_rule_match_basic(rule, &match);
+   flow->l2_key.ether_type = match.key->n_proto;
+   flow->l2_mask.ether_type = match.mask->n_proto;
 
-   if (key->n_proto == htons(ETH_P_IP) ||
-   key->n_proto == htons(ETH_P_IPV6)) {
-   flow->l4_key.ip_proto = key->ip_proto;
-   flow->l4_mask.ip_proto = mask->ip_proto;
+   if (match.key->n_proto == htons(ETH_P_IP) ||
+   match.key->n_proto == htons(ETH_P_IPV6)) {
+   flow->l4_key.ip_proto = match.key->ip_proto;
+   flow->l4_mask.ip_proto = match.mask->ip_proto;
}
}
 
-   if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
-   struct flow_dissector_key_eth_addrs *key =
-   GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_ETH_ADDRS);
-   struct flow_dissector_key_eth_addrs *mask =
-   GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_ETH_ADDRS);
+   if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+

[PATCH net-next,v2 03/12] flow_dissector: add flow action infrastructure

2018-11-18 Thread Pablo Neira Ayuso
This new infrastructure defines the nic actions that you can perform
from existing network drivers. This infrastructure allows us to avoid a
direct dependency with the native software TC action representation.

Signed-off-by: Pablo Neira Ayuso 
---
v2: no changes.

 include/net/flow_dissector.h | 70 
 net/core/flow_dissector.c| 18 
 2 files changed, 88 insertions(+)

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 965a82b8d881..925c208816f1 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -402,8 +402,78 @@ void flow_rule_match_enc_keyid(const struct flow_rule 
*rule,
 void flow_rule_match_enc_opts(const struct flow_rule *rule,
  struct flow_match_enc_opts *out);
 
+enum flow_action_key_id {
+   FLOW_ACTION_KEY_ACCEPT  = 0,
+   FLOW_ACTION_KEY_DROP,
+   FLOW_ACTION_KEY_TRAP,
+   FLOW_ACTION_KEY_GOTO,
+   FLOW_ACTION_KEY_REDIRECT,
+   FLOW_ACTION_KEY_MIRRED,
+   FLOW_ACTION_KEY_VLAN_PUSH,
+   FLOW_ACTION_KEY_VLAN_POP,
+   FLOW_ACTION_KEY_VLAN_MANGLE,
+   FLOW_ACTION_KEY_TUNNEL_ENCAP,
+   FLOW_ACTION_KEY_TUNNEL_DECAP,
+   FLOW_ACTION_KEY_MANGLE,
+   FLOW_ACTION_KEY_ADD,
+   FLOW_ACTION_KEY_CSUM,
+   FLOW_ACTION_KEY_MARK,
+};
+
+/* This is mirroring enum pedit_header_type definition for easy mapping between
+ * tc pedit action. Legacy TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK is mapped to
+ * FLOW_ACT_MANGLE_UNSPEC, which is supported by no driver.
+ */
+enum flow_act_mangle_base {
+   FLOW_ACT_MANGLE_UNSPEC  = 0,
+   FLOW_ACT_MANGLE_HDR_TYPE_ETH,
+   FLOW_ACT_MANGLE_HDR_TYPE_IP4,
+   FLOW_ACT_MANGLE_HDR_TYPE_IP6,
+   FLOW_ACT_MANGLE_HDR_TYPE_TCP,
+   FLOW_ACT_MANGLE_HDR_TYPE_UDP,
+};
+
+struct flow_action_key {
+   enum flow_action_key_id id;
+   union {
+   u32 chain_index;/* FLOW_ACTION_KEY_GOTO 
*/
+   struct net_device   *dev;   /* 
FLOW_ACTION_KEY_REDIRECT */
+   struct {/* FLOW_ACTION_KEY_VLAN 
*/
+   u16 vid;
+   __be16  proto;
+   u8  prio;
+   } vlan;
+   struct {/* 
FLOW_ACTION_KEY_PACKET_EDIT */
+   enum flow_act_mangle_base htype;
+   u32 offset;
+   u32 mask;
+   u32 val;
+   } mangle;
+   const struct ip_tunnel_info *tunnel;/* 
FLOW_ACTION_KEY_TUNNEL_ENCAP */
+   u32 csum_flags; /* FLOW_ACTION_KEY_CSUM 
*/
+   u32 mark;   /* FLOW_ACTION_KEY_MARK 
*/
+   };
+};
+
+struct flow_action {
+   int num_keys;
+   struct flow_action_key  *keys;
+};
+
+int flow_action_init(struct flow_action *flow_action, int num_acts);
+void flow_action_free(struct flow_action *flow_action);
+
+static inline bool flow_action_has_keys(const struct flow_action *action)
+{
+   return action->num_keys;
+}
+
+#define flow_action_for_each(__i, __act, __actions)\
+for (__i = 0, __act = &(__actions)->keys[0]; __i < 
(__actions)->num_keys; __act = &(__actions)->keys[++__i])
+
 struct flow_rule {
struct flow_match   match;
+   struct flow_action  action;
 };
 
 static inline bool flow_rule_match_key(const struct flow_rule *rule,
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 186089b8d852..b9368349f0f7 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -258,6 +258,24 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule,
 }
 EXPORT_SYMBOL(flow_rule_match_enc_opts);
 
+int flow_action_init(struct flow_action *flow_action, int num_acts)
+{
+   flow_action->keys = kmalloc(sizeof(struct flow_action_key) * num_acts,
+   GFP_KERNEL);
+   if (!flow_action->keys)
+   return -ENOMEM;
+
+   flow_action->num_keys = num_acts;
+   return 0;
+}
+EXPORT_SYMBOL(flow_action_init);
+
+void flow_action_free(struct flow_action *flow_action)
+{
+   kfree(flow_action->keys);
+}
+EXPORT_SYMBOL(flow_action_free);
+
 /**
  * __skb_flow_get_ports - extract the upper layer ports and return them
  * @skb: sk_buff to extract the ports from
-- 
2.11.0



[PATCH net-next,v2 02/12] net/mlx5e: support for two independent packet edit actions

2018-11-18 Thread Pablo Neira Ayuso
This patch adds pedit_headers_action structure to store the result of
parsing tc pedit actions. Then, it calls alloc_tc_pedit_action() to
populate the mlx5e hardware intermediate representation once all actions
have been parsed.

This patch comes in preparation for the new flow_action infrastructure,
where each packet mangling comes in a separate action, i.e. not packed
as in tc pedit.

Signed-off-by: Pablo Neira Ayuso 
---
v2: no changes.

 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 81 ++---
 1 file changed, 59 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 6a22f7f22890..2645e5d1e790 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1748,6 +1748,12 @@ struct pedit_headers {
struct udphdr  udp;
 };
 
+struct pedit_headers_action {
+   struct pedit_headersvals;
+   struct pedit_headersmasks;
+   u32 pedits;
+};
+
 static int pedit_header_offsets[] = {
[TCA_PEDIT_KEY_EX_HDR_TYPE_ETH] = offsetof(struct pedit_headers, eth),
[TCA_PEDIT_KEY_EX_HDR_TYPE_IP4] = offsetof(struct pedit_headers, ip4),
@@ -1759,16 +1765,15 @@ static int pedit_header_offsets[] = {
 #define pedit_header(_ph, _htype) ((void *)(_ph) + 
pedit_header_offsets[_htype])
 
 static int set_pedit_val(u8 hdr_type, u32 mask, u32 val, u32 offset,
-struct pedit_headers *masks,
-struct pedit_headers *vals)
+struct pedit_headers_action *hdrs)
 {
u32 *curr_pmask, *curr_pval;
 
if (hdr_type >= __PEDIT_HDR_TYPE_MAX)
goto out_err;
 
-   curr_pmask = (u32 *)(pedit_header(masks, hdr_type) + offset);
-   curr_pval  = (u32 *)(pedit_header(vals, hdr_type) + offset);
+   curr_pmask = (u32 *)(pedit_header(&hdrs->masks, hdr_type) + offset);
+   curr_pval  = (u32 *)(pedit_header(&hdrs->vals, hdr_type) + offset);
 
if (*curr_pmask & mask)  /* disallow acting twice on the same location 
*/
goto out_err;
@@ -1824,8 +1829,7 @@ static struct mlx5_fields fields[] = {
  * max from the SW pedit action. On success, it says how many HW actions were
  * actually parsed.
  */
-static int offload_pedit_fields(struct pedit_headers *masks,
-   struct pedit_headers *vals,
+static int offload_pedit_fields(struct pedit_headers_action *hdrs,
struct mlx5e_tc_flow_parse_attr *parse_attr,
struct netlink_ext_ack *extack)
 {
@@ -1840,10 +1844,10 @@ static int offload_pedit_fields(struct pedit_headers 
*masks,
__be16 mask_be16;
void *action;
 
-   set_masks = &masks[TCA_PEDIT_KEY_EX_CMD_SET];
-   add_masks = &masks[TCA_PEDIT_KEY_EX_CMD_ADD];
-   set_vals = &vals[TCA_PEDIT_KEY_EX_CMD_SET];
-   add_vals = &vals[TCA_PEDIT_KEY_EX_CMD_ADD];
+   set_masks = &hdrs[TCA_PEDIT_KEY_EX_CMD_SET].masks;
+   add_masks = &hdrs[TCA_PEDIT_KEY_EX_CMD_ADD].masks;
+   set_vals = &hdrs[TCA_PEDIT_KEY_EX_CMD_SET].vals;
+   add_vals = &hdrs[TCA_PEDIT_KEY_EX_CMD_ADD].vals;
 
action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto);
action = parse_attr->mod_hdr_actions;
@@ -1939,12 +1943,14 @@ static int offload_pedit_fields(struct pedit_headers 
*masks,
 }
 
 static int alloc_mod_hdr_actions(struct mlx5e_priv *priv,
-const struct tc_action *a, int namespace,
+struct pedit_headers_action *hdrs,
+int namespace,
 struct mlx5e_tc_flow_parse_attr *parse_attr)
 {
int nkeys, action_size, max_actions;
 
-   nkeys = tcf_pedit_nkeys(a);
+   nkeys = hdrs[TCA_PEDIT_KEY_EX_CMD_SET].pedits +
+   hdrs[TCA_PEDIT_KEY_EX_CMD_ADD].pedits;
action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto);
 
if (namespace == MLX5_FLOW_NAMESPACE_FDB) /* FDB offloading */
@@ -1968,18 +1974,15 @@ static const struct pedit_headers zero_masks = {};
 static int parse_tc_pedit_action(struct mlx5e_priv *priv,
 const struct tc_action *a, int namespace,
 struct mlx5e_tc_flow_parse_attr *parse_attr,
+struct pedit_headers_action *hdrs,
 struct netlink_ext_ack *extack)
 {
-   struct pedit_headers masks[__PEDIT_CMD_MAX], vals[__PEDIT_CMD_MAX], 
*cmd_masks;
int nkeys, i, err = -EOPNOTSUPP;
u32 mask, val, offset;
u8 cmd, htype;
 
nkeys = tcf_pedit_nkeys(a);
 
-   memset(masks, 0, sizeof(struct pedit_headers) * __PEDIT_CMD_MAX);
-   memset(vals,  0, sizeof(struct pedit_headers) * __PEDIT_CMD_MAX);
-
for (i = 0; i < nkeys; i++) {
hty

[PATCH 00/12 net-next,v2] add flow_rule infrastructure

2018-11-18 Thread Pablo Neira Ayuso
Hi,

This patchset introduces a kernel intermediate representation (IR) to
express ACL hardware offloads, as already described in previous RFC and
v1 patchset [1] [2]. The idea is to normalize the frontend U/APIs to use
the flow dissectors and the flow actions so drivers can reuse the
existing TC offload driver codebase - that has been converted to use the
flow_rule infrastructure.

After this patch, as Or previously described, there is one extra layer:

kernel frontend U/API X --> kernel parser Y --> IR --> driver --> HW API
kernel frontend U/API Z --> kernel parser W --> IR --> driver --> HW API

However, cost of this layer is very small, adding 1 million rules via
tc -batch, perf shows:

 0.06%  tc   [kernel.vmlinux][k] tc_setup_flow_action

at position 187 in the call graph, far from the top ten. The flow_match
representation uses the flow dissector infrastructure, just like
cls_flower, therefore, there is no need for conversion of the rule match
side.

The flow_action representation is very similar to the TC action; in addition,
it includes wake-up-on-lan and queue to CPU actions that are needed
for the ethtool_rx_flow_spec interface in the bcm_sf2 driver, that is
converted in this patchset to use it. It is now possible to add tc
cls_flower support for bcm_sf2 and reuse the existing parser that was
originally designed for the ethtool_rx_flow_spec interface.

As requested, this new patchset also converts qlogic/qede to use this
new infrastructure (see patch 12/12). This driver currently has two
parsers, one for ethtool_rx_flow_spec and another for tc cls_flower.
This driver supports simple 5-tuple matching, and the available actions
are packet drop and queue. This patch updates the driver code to use one
single parser to populate HW IR.

Thanks.

[1] https://lwn.net/Articles/766695/
[2] https://marc.info/?l=linux-netdev&m=154233253114506&w=2

Pablo Neira Ayuso (12):
  flow_dissector: add flow_rule and flow_match structures and use them
  net/mlx5e: support for two independent packet edit actions
  flow_dissector: add flow action infrastructure
  cls_api: add translator to flow_action representation
  cls_flower: add statistics retrieval infrastructure and use it
  drivers: net: use flow action infrastructure
  cls_flower: don't expose TC actions to drivers anymore
  flow_dissector: add wake-up-on-lan and queue to flow_action
  flow_dissector: add basic ethtool_rx_flow_spec to flow_rule structure
translator
  dsa: bcm_sf2: use flow_rule infrastructure
  qede: place ethtool_rx_flow_spec after code after TC flower codebase
  qede: use ethtool_rx_flow_rule() to remove duplicated parser code

 drivers/net/dsa/bcm_sf2_cfp.c  | 108 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c   | 252 +++
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 450 ++---
 drivers/net/ethernet/intel/i40e/i40e_main.c| 178 ++---
 drivers/net/ethernet/intel/iavf/iavf_main.c| 195 +++---
 drivers/net/ethernet/intel/igb/igb_main.c  |  64 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c| 743 ++---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c |   2 +-
 .../net/ethernet/mellanox/mlxsw/spectrum_flower.c  | 259 ---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 196 +++---
 drivers/net/ethernet/netronome/nfp/flower/match.c  | 417 ++--
 .../net/ethernet/netronome/nfp/flower/offload.c| 151 ++---
 drivers/net/ethernet/qlogic/qede/qede_filter.c | 537 ++-
 include/net/flow_dissector.h   | 185 +
 include/net/pkt_cls.h  |  29 +-
 net/core/flow_dissector.c  | 341 ++
 net/sched/cls_api.c| 113 
 net/sched/cls_flower.c |  42 +-
 18 files changed, 2279 insertions(+), 1983 deletions(-)

-- 
2.11.0



[no subject]

2018-11-18 Thread Major Dennis Hornbeck



I am in the military unit here in Afghanistan, we have some amount of funds 
that we want to move out of the country. My partners and I need a good partner 
someone we can trust. It is risk free and legal. Reply to this email: 
hornbeckmajordennis...@gmail.com
Regards,Major Dennis Hornbeck.


Re: Linux kernel hangs if using RV1108 with MSZ8863 switch with two ports connected

2018-11-18 Thread Heiko Stübner
Hi all,

Am Sonntag, 18. November 2018, 19:12:30 CET schrieb Andrew Lunn:
> > The kernel starts booting normally and then hangs like this when two
> > Ethernet cables are connected to the KSZ8863 switch:
> > http://dark-code.bulix.org/3xexu5-507563
>
> > This has the lock detection, inside kernel hacking, enabled.
> 
> Maybe turn on all the hung-task debug and magic key support.  With
> magic key, you might be able to get a backtrace of where it is
> spinning.
> 
> Maybe also add #define DEBUG at the top of drivers/net/phy/phy.c.
> Does it hang during a PHY state transition?
> 
> Maybe both PHYs are interrupting at the same time, and the interrupt
> code is broken?
> 
> Maybe look at the switch driver and see if there is any code which is
> executed on link up. Put some printk() in there.
> 
> If you PHYs are using interrupt mode, maybe disable that and use
> polling.
> 
> Do you know if this ever worked properly before? If you know when it
> did work, you can do a git bisect to narrow it down to the one patch
> which broke it..
> 
> Basically, at the moment, you just need to try lots of things, to
> narrow it down.

Your hang also seems to happen around the time the kernel disables
unused clocks and regulators.

So you might also try with these functions disabled, as it may be caused
by some clock or regulator handled wrongly there (I think it's called
clk_ignore_unused as kernel commandline but please double-check
and you'll need to check for a regulator equivalent yourself).

And as I think it might be some sort of driver-related issue, you could
also enable debugging in the driver-core [drivers/base/dd.c]
by either #define DEBUG or just redefining dev_dbg temporarily ;-)
#define dev_dbg dev_warn
or so.

That may help finding the culprit of your hang.


Heiko




Re: [PATCH net-next] add part of TCP counts explanations in snmp_counters.rst

2018-11-18 Thread David Miller
From: yupeng 
Date: Fri, 16 Nov 2018 11:17:40 -0800

> Add explanations of some generic TCP counters, fast open
> related counters and TCP abort related counters and several
> examples.
> 
> Signed-off-by: yupeng 

Applied.


[PATCH net] sctp: always set frag_point on pmtu change

2018-11-18 Thread Jakub Audykowicz
Calling send on a connected SCTP socket results in kernel panic if
spp_pathmtu was configured manually before an association is established
and it was not reconfigured to another value once the association is
established.

Steps to reproduce:
1. Set up a listening SCTP server socket.
2. Set up an SCTP client socket.
3. Configure client socket using setsockopt SCTP_PEER_ADDR_PARAMS with
spp_pathmtu set to a legal value (e.g. 1000) and
SPP_PMTUD_DISABLE set in spp_flags.
4. Connect client to server.
5. Send message from client to server.

At this point oom-killer is invoked, which will eventually lead to:
[5.197262] Out of memory and no killable processes...
[5.198107] Kernel panic - not syncing: System is deadlocked on memory

Commit 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
introduces sctp_assoc_update_frag_point, but this function is not called
in this case, causing frag_point to be zero:
 void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
 {
-   if (asoc->pathmtu != pmtu)
+   if (asoc->pathmtu != pmtu) {
asoc->pathmtu = pmtu;
+   sctp_assoc_update_frag_point(asoc);
+   }

In this scenario, on association establishment, asoc->pathmtu is already
1000 and pmtu will be as well. Before this commit the frag_point was being
correctly set in the scenario described. Moving the call outside the if
block fixes the issue.

I will be providing a separate patch to lksctp-tools with a simple test
reproducing this problem ("func_tests: frag_point should never be zero").

I have also taken the liberty to introduce a sanity check in chunk.c to
set the frag_point to a positive value in order to avoid chunking
endlessly (which is the reason for the eventual panic).

Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
Signed-off-by: Jakub Audykowicz 
---
 include/net/sctp/constants.h |  3 +++
 net/sctp/associola.c | 13 +++--
 net/sctp/chunk.c |  6 ++
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 8dadc74c22e7..90316fab6f04 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -293,6 +293,9 @@ enum { SCTP_MAX_GABS = 16 };
 */
 #define SCTP_DEFAULT_MINSEGMENT 512/* MTU size ... if no mtu disc */
 
+/* An association's fragmentation point should never be non-positive */
+#define SCTP_FRAG_POINT_MIN 1
+
 #define SCTP_SECRET_SIZE 32/* Number of octets in a 256 bits. */
 
 #define SCTP_SIGNATURE_SIZE 20 /* size of a SLA-1 signature */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 6a28b96e779e..44d71a1af62e 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1431,13 +1431,14 @@ void sctp_assoc_update_frag_point(struct 
sctp_association *asoc)
 
 void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
 {
-   if (asoc->pathmtu != pmtu) {
-   asoc->pathmtu = pmtu;
-   sctp_assoc_update_frag_point(asoc);
-   }
+   pr_debug("%s: before asoc:%p, pmtu:%d, frag_point:%d\n",
+   __func__, asoc, asoc->pathmtu, asoc->frag_point);
+
+   asoc->pathmtu = pmtu;
+   sctp_assoc_update_frag_point(asoc);
 
-   pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
-asoc->pathmtu, asoc->frag_point);
+   pr_debug("%s: after asoc:%p, pmtu:%d, frag_point:%d\n",
+   __func__, asoc, asoc->pathmtu, asoc->frag_point);
 }
 
 /* Update the association's pmtu and frag_point by going through all the
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index ce8087846f05..9f0e64ddbd9c 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -190,6 +190,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
sctp_association *asoc,
/* This is the biggest possible DATA chunk that can fit into
 * the packet
 */
+   if (asoc->frag_point < SCTP_FRAG_POINT_MIN) {
+   pr_warn("%s: asoc:%p->frag_point is less than allowed (%d<%d)",
+   __func__, asoc, asoc->frag_point, SCTP_FRAG_POINT_MIN);
+   pr_warn("forcing minimum value to avoid chunking endlessly");
+   asoc->frag_point = SCTP_FRAG_POINT_MIN;
+   }
max_data = asoc->frag_point;
 
/* If the the peer requested that we authenticate DATA chunks
-- 
2.17.1



Re: DSA support for Marvell 88e6065 switch

2018-11-18 Thread Pavel Machek
Hi!

> > > > I'm trying to create support for Marvell 88e6065 switch... and it
> > > > seems like drivers/net/dsa supports everything, but this model.
> > > > 
> > > > Did someone work with this hardware before? Any idea if it would be
> > > > more suitable to support by existing 88e6060 code, or if 88e6xxx code
> > > > should serve as a base?
> > > 
> > > Hi Pavel
> > > 
> > > The 88e6xxx should be extended to support this. I think you will find
> > > a lot of the building blocks are already in the driver. Compare the
> > > various implementations of the functions in the mv88e6xxx_ops to what
> > > the datasheet says for the registers, and pick those that match.
> > 
> > Ok, so I played a bit.
> > 
> > It looks like e6065 has different register layout from those supported
> > by 6xxx, and is quite similar to e6060.
> 
> Hi Pavel
> 
> However, if you look in the mv88e6xxx, there are quite a few functions
> called mv88e6065_foo. Marvell keeps changing the register layout. When
> writing code, we try to name the functions based on which family of
> devices introduced those registers. But we don't have the whole
> history, so we probably have some names wrong.

Let me check... I thought there was only one such function. Ok, I see
two such functions:

mv88e6065_phylink_validate
mv88e6065_port_set_speed

I may need to re-check, but it looked to me like even functions and
registers labeled 6xxx are different on 6060 and 6065... which means
changes to the code would not be exactly nice and easy.

> > I understand how 88e6xxx code is supposed to work... but I don't
> > understand how e6060 code is supposed to be probed. It does not seem
> > to have device tree support. It seems to be older code, but is way
> > simpler, and seems to be targetted at similar hardware.
> 
> The e6060 code is really old, pretty much abandoned. To make it
> usable, you are going to have to make a lot of changes. Turn it into
> an mdio driver, add device tree, make it use the new dsa2.c API, etc.

If I wanted it to work, what do I need to do? AFAICT phy autoprobing
should just attach it as soon as it is compiled in?

I tried adding nodes in dts trying to make the driver attach, but no
luck so far.

> I still think you should be looking at the mv88e6xxx driver, but maybe
> taking bits of code from the 6060 driver and adding it to the
> mv88e6xxx driver as needed.

e6060 driver is much simpler, which is somehow nice. Let me look
around some more.

Thanks,
Pavel
-- 
DENX Software Engineering GmbH,  Managing Director: Wolfgang Denk
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany


signature.asc
Description: Digital signature


Re: [iproute2-next PATCH v3 1/2] tc: flower: Classify packets based port ranges

2018-11-18 Thread Jiri Pirko
Fri, Nov 16, 2018 at 01:55:13AM CET, amritha.namb...@intel.com wrote:
>Added support for filtering based on port ranges.
>UAPI changes have been accepted into net-next.
>
>Example:
>1. Match on a port range:
>-
>$ tc filter add dev enp4s0 protocol ip parent :\
>  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
>  action drop
>
>$ tc -s filter show dev enp4s0 parent :
>filter protocol ip pref 1 flower chain 0
>filter protocol ip pref 1 flower chain 0 handle 0x1
>  eth_type ipv4
>  ip_proto tcp
>  dst_port range 20-30
>  skip_hw
>  not_in_hw
>action order 1: gact action drop
> random type none pass val 0
> index 1 ref 1 bind 1 installed 85 sec used 3 sec
>Action statistics:
>Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
>backlog 0b 0p requeues 0
>
>2. Match on IP address and port range:
>--
>$ tc filter add dev enp4s0 protocol ip parent :\
>  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
>  skip_hw action drop
>
>$ tc -s filter show dev enp4s0 parent :
>filter protocol ip pref 1 flower chain 0 handle 0x2
>  eth_type ipv4
>  ip_proto tcp
>  dst_ip 192.168.1.1
>  dst_port range 100-200
>  skip_hw
>  not_in_hw
>action order 1: gact action drop
> random type none pass val 0
> index 2 ref 1 bind 1 installed 58 sec used 2 sec
>Action statistics:
>Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
>backlog 0b 0p requeues 0
>
>v3:
>Modified flower_port_range_attr_type calls.
>
>v2:
>Addressed Jiri's comment to sync output format with input
>
>Signed-off-by: Amritha Nambiar 

Looks ok. But why do you have the man page changes in a separate patch?
I think it should be in this one. Anyway

Acked-by: Jiri Pirko 



[PATCH net] ipv6: Fix PMTU updates for UDP/raw sockets in presence of VRF

2018-11-18 Thread David Ahern
From: David Ahern 

Preethi reported that PMTU discovery for UDP/raw applications is not
working in the presence of VRF when the socket is not bound to a device.
The problem is that ip6_sk_update_pmtu does not consider the L3 domain
of the skb device if the socket is not bound. Update the function to
set oif to the L3 master device if relevant.

Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack")
Reported-by: Preethi Ramachandra 
Signed-off-by: David Ahern 
---
 net/ipv6/route.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 14b422f35504..059f0531f7c1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2359,10 +2359,13 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu);
 
 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
 {
+   int oif = sk->sk_bound_dev_if;
struct dst_entry *dst;
 
-   ip6_update_pmtu(skb, sock_net(sk), mtu,
-   sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
+   if (!oif && skb->dev)
+   oif = l3mdev_master_ifindex(skb->dev);
+
+   ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
 
dst = __sk_dst_get(sk);
if (!dst || !dst->obsolete ||
-- 
2.11.0



Re: DSA support for Marvell 88e6065 switch

2018-11-18 Thread Andrew Lunn
On Sun, Nov 18, 2018 at 07:07:12PM +0100, Pavel Machek wrote:
> On Thu 2018-11-15 21:26:18, Andrew Lunn wrote:
> > On Thu, Nov 15, 2018 at 08:51:11PM +0100, Pavel Machek wrote:
> > > Hi!
> > > 
> > > I'm trying to create support for Marvell 88e6065 switch... and it
> > > seems like drivers/net/dsa supports everything, but this model.
> > > 
> > > Did someone work with this hardware before? Any idea if it would be
> > > more suitable to support by existing 88e6060 code, or if 88e6xxx code
> > > should serve as a base?
> > 
> > Hi Pavel
> > 
> > The 88e6xxx should be extended to support this. I think you will find
> > a lot of the building blocks are already in the driver. Compare the
> > various implementations of the functions in the mv88e6xxx_ops to what
> > the datasheet says for the registers, and pick those that match.
> 
> Ok, so I played a bit.
> 
> It looks like e6065 has different register layout from those supported
> by 6xxx, and is quite similar to e6060.

Hi Pavel

However, if you look in the mv88e6xxx, there are quite a few functions
called mv88e6065_foo. Marvell keeps changing the register layout. When
writing code, we try to name the functions based on which family of
devices introduced those registers. But we don't have the whole
history, so we probably have some names wrong.

> I understand how 88e6xxx code is supposed to work... but I don't
> understand how e6060 code is supposed to be probed. It does not seem
> to have device tree support. It seems to be older code, but is way
> simpler, and seems to be targeted at similar hardware.

The e6060 code is really old, pretty much abandoned. To make it
usable, you are going to have to make a lot of changes. Turn it into
an mdio driver, add device tree, make it use the new dsa2.c API, etc.

I still think you should be looking at the mv88e6xxx driver, but maybe
taking bits of code from the 6060 driver and adding it to the
mv88e6xxx driver as needed.

  Andrew


Re: Linux kernel hangs if using RV1108 with KSZ8863 switch with two ports connected

2018-11-18 Thread Andrew Lunn
> The kernel starts booting normally and then hangs like this when two
> Ethernet cables are connected to the KSZ8863 switch:
> http://dark-code.bulix.org/3xexu5-507563
> 
> This has the lock detection, inside kernel hacking, enabled.

Maybe turn on all the hung-task debug and magic key support.  With
magic key, you might be able to get a backtrace of where it is
spinning.

Maybe also add #define DEBUG at the top of drivers/net/phy/phy.c.
Does it hang during a PHY state transition?

Maybe both PHYs are interrupting at the same time, and the interrupt
code is broken?

Maybe look at the switch driver and see if there is any code which is
executed on link up. Put some printk() in there.

If your PHYs are using interrupt mode, maybe disable that and use
polling.

Do you know if this ever worked properly before? If you know when it
did work, you can do a git bisect to narrow it down to the one patch
which broke it.

Basically, at the moment, you just need to try lots of things, to
narrow it down.

   Andrew


Re: DSA support for Marvell 88e6065 switch

2018-11-18 Thread Pavel Machek
On Thu 2018-11-15 21:26:18, Andrew Lunn wrote:
> On Thu, Nov 15, 2018 at 08:51:11PM +0100, Pavel Machek wrote:
> > Hi!
> > 
> > I'm trying to create support for Marvell 88e6065 switch... and it
> > seems like drivers/net/dsa supports everything, but this model.
> > 
> > Did someone work with this hardware before? Any idea if it would be
> > more suitable to support by existing 88e6060 code, or if 88e6xxx code
> > should serve as a base?
> 
> Hi Pavel
> 
> The 88e6xxx should be extended to support this. I think you will find
> a lot of the building blocks are already in the driver. Compare the
> various implementations of the functions in the mv88e6xxx_ops to what
> the datasheet says for the registers, and pick those that match.

Ok, so I played a bit.

It looks like e6065 has different register layout from those supported
by 6xxx, and is quite similar to e6060.

I understand how 88e6xxx code is supposed to work... but I don't
understand how e6060 code is supposed to be probed. It does not seem
to have device tree support. It seems to be older code, but is way
simpler, and seems to be targeted at similar hardware.

Best regards,

Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature


Re: [PATCH RFC net-next] net: SAIL based FIB lookup for XDP

2018-11-18 Thread Andrew Lunn
> > + * The router can have upto 255 ports. This limitation
> > + * allows us to represent netdev_index as an u8
> > + */
> > +#define NETDEV_COUNT_MAX 255
> 
> ... 255 is high for a physical port count but not a logical device
> count. In the presence of VLANs 255 is nothing and VLANs are an
> essential deployment feature.

I agree with David here. Doing some network simulation work using
namespaces, i've had more than 255 devices in the root namespace.

Andrew


Re: [PATCH RFC net-next] net: SAIL based FIB lookup for XDP

2018-11-18 Thread David Ahern
On 11/11/18 7:25 PM, Md. Islam wrote:
> This patch implements SAIL[1] based routing table lookup for XDP. I
> however made some changes from the original proposal (details are
> described in the patch). These changes decreased the memory consumption
> from 21.94 MB to 4.97 MB for my example routing table with 400K
> routes.
> 
> This patch can perform FIB lookup in 50 CPU cycles for the example
> routing table (with 400K routes) whereas LC-trie takes around 350 CPU
> cycles.
> 
> I tried to follow all the advice I got from my last patch. Looking
> forward to your review.
> 
> 1. Yang, Tong, et al. "Guarantee IP lookup performance with FIB
> explosion." ACM SIGCOMM Computer Communication Review. Vol. 44. No. 4.
> ACM, 2014.

This work you are doing on different FIB algorithms is interesting and
probably has its use cases where it is beneficial, but you are still not
integrating it into the Linux stack correctly.

For starters, it is wrong to have 2 separate FIB data structures for the
same network namespace. If SAIL is good enough for XDP, it should be
good enough for normal routing in the namespace. There is no need to put
the same route data in 2 places. That means the FIB algorithm needs to
be selectable - either trie or sail but not both.

Further, you are not handling unexpected conditions - e.g., multipath or
lwtunnel encaps, cleanup of device references on a FIB entry delete or
device overflows ...

> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 69c91d1..cc275c7 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -197,6 +197,62 @@ struct fib_entry_notifier_info {
>  u32 tb_id;
>  };
> 
> +#if IS_ENABLED(CONFIG_FIB_SAIL_XDP)
> +/*
> + * The router can have upto 255 ports. This limitation
> + * allows us to represent netdev_index as an u8
> + */
> +#define NETDEV_COUNT_MAX 255

... 255 is high for a physical port count but not a logical device
count. In the presence of VLANs 255 is nothing and VLANs are an
essential deployment feature.

> +
> +struct chunk {

chunk is too generic. sail_chunk at least puts it in the sail symbol
namespace.

> +/*256-bit bitmap. Here i-th bit (from LSB) is set to 1 if C24[i] > 0 */
> +u64 bitmap[4];
> +/*
> + * Index to C24 where chunk is started. A chunk corresponds
> + * to 256 elements. Instead of having just one start index for the
> + * whole chunk, we divide the chunk into 4 parts and save start
> + * index for each part.
> + */
> +u64 start_index[4];
> +};
> +
> +struct sail {
> +/*default next-hop (Level 0)*/
> +u8def_nh;
> +
> +/*Level 16*/
> +u8 __rcu *N16;
> +u8 __rcu *P16;
> +u16 __rcu *C16;
> +
> +/*Level 24*/
> +u8 __rcu *N24;
> +u8 __rcu *P24;
> +struct chunk __rcu *CK24;
> +u32 __rcu *C24;
> +u32 cnk24_count;/*Number of chunks in level 24*/
> +
> +/*Level 32*/
> +u8 __rcu *N32;
> +u8 __rcu *P32;
> +u32 cnk32_count;/*Number of chunks in level 32*/
> +
> +/*Index to this array is stored in N16, N24 and N32*/
> +struct net_device*netdevs[NETDEV_COUNT_MAX];
> +u8 netdev_count;/*Number of netdevs*/
> +
> +spinlock_t lock;
> +};
> +
> +int sail_insert(struct sail *s, u32 key,
> +u8 prefix_len, struct net_device *dev);
> +int sail_delete(struct sail *s, u32 key,
> +u8 prefix_len);
> +int sail_flush(struct sail *s);
> +int sail_lookup(const struct sail *s, const __be32 dest,
> +struct net_device **dev);
> +#endif

Put the new FIB algorithm specific defines in a new header.

> +
>  struct fib_nh_notifier_info {
>  struct fib_notifier_info info; /* must be first */
>  struct fib_nh *fib_nh;
> @@ -219,6 +275,10 @@ struct fib_table {
>  inttb_num_default;
>  struct rcu_headrcu;
>  unsigned long *tb_data;
> +#if IS_ENABLED(CONFIG_FIB_SAIL_XDP)
> +/*Each FIB table will have its own SAIL structure.*/
> +struct sailsail;

Per comment above a separate sail entry is unnecessary when this
algorithm is an alternative to trie; It overlaps tb_data - see code
references for it.

> +#endif
>  unsigned long__data[0];
>  };
> 

...

> diff --git a/net/ipv4/fib_sail_xdp.c b/net/ipv4/fib_sail_xdp.c
> new file mode 100644
> index 000..f3f56c5
> --- /dev/null
> +++ b/net/ipv4/fib_sail_xdp.c
> @@ -0,0 +1,939 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2018-19 MD Iftakharul Islam (Tamim) 
> + *
> + *   This program is free software; you can redistribute it and/or
> + *   modify it under the terms of the GNU General Public License
> + *   as published by the Free Software Foundation; either version
> + *   2 of the License, or (at your option) any later version.
> + *
> + *
> + * This is SAIL_L based routing table lookup which was initially proposed in:
> + *
> + * Yang, Tong, Gaogang Xie, YanBiao Li, Qiaobin Fu, Alex X. Liu, Qi Li,
> + * and Laurent Mathy. "Guarantee IP lookup performance with FIB explosion."

[PATCH net-next] mlxsw: spectrum: Expose discard counters via ethtool

2018-11-18 Thread Ido Schimmel
From: Shalom Toledo 

Expose packets discard counters via ethtool to help with debugging.

Signed-off-by: Shalom Toledo 
Reviewed-by: Jiri Pirko 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 142 
 .../net/ethernet/mellanox/mlxsw/spectrum.c| 155 ++
 2 files changed, 297 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h 
b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index d3babcc49fd2..be2ffbd19e3a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -4235,8 +4235,11 @@ MLXSW_ITEM32(reg, ppcnt, pnat, 0x00, 14, 2);
 
 enum mlxsw_reg_ppcnt_grp {
MLXSW_REG_PPCNT_IEEE_8023_CNT = 0x0,
+   MLXSW_REG_PPCNT_RFC_2863_CNT = 0x1,
MLXSW_REG_PPCNT_RFC_2819_CNT = 0x2,
+   MLXSW_REG_PPCNT_RFC_3635_CNT = 0x3,
MLXSW_REG_PPCNT_EXT_CNT = 0x5,
+   MLXSW_REG_PPCNT_DISCARD_CNT = 0x6,
MLXSW_REG_PPCNT_PRIO_CNT = 0x10,
MLXSW_REG_PPCNT_TC_CNT = 0x11,
MLXSW_REG_PPCNT_TC_CONG_TC = 0x13,
@@ -4251,6 +4254,7 @@ enum mlxsw_reg_ppcnt_grp {
  * 0x2: RFC 2819 Counters
  * 0x3: RFC 3635 Counters
  * 0x5: Ethernet Extended Counters
+ * 0x6: Ethernet Discard Counters
  * 0x8: Link Level Retransmission Counters
  * 0x10: Per Priority Counters
  * 0x11: Per Traffic Class Counters
@@ -4394,8 +4398,46 @@ MLXSW_ITEM64(reg, ppcnt, 
a_pause_mac_ctrl_frames_received,
 MLXSW_ITEM64(reg, ppcnt, a_pause_mac_ctrl_frames_transmitted,
 MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x90, 0, 64);
 
+/* Ethernet RFC 2863 Counter Group */
+
+/* reg_ppcnt_if_in_discards
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, if_in_discards,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x10, 0, 64);
+
+/* reg_ppcnt_if_out_discards
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, if_out_discards,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x38, 0, 64);
+
+/* reg_ppcnt_if_out_errors
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, if_out_errors,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x40, 0, 64);
+
 /* Ethernet RFC 2819 Counter Group */
 
+/* reg_ppcnt_ether_stats_undersize_pkts
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_undersize_pkts,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x30, 0, 64);
+
+/* reg_ppcnt_ether_stats_oversize_pkts
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_oversize_pkts,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x38, 0, 64);
+
+/* reg_ppcnt_ether_stats_fragments
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_fragments,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x40, 0, 64);
+
 /* reg_ppcnt_ether_stats_pkts64octets
  * Access: RO
  */
@@ -4456,6 +4498,32 @@ MLXSW_ITEM64(reg, ppcnt, 
ether_stats_pkts4096to8191octets,
 MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts8192to10239octets,
 MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0xA0, 0, 64);
 
+/* Ethernet RFC 3635 Counter Group */
+
+/* reg_ppcnt_dot3stats_fcs_errors
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, dot3stats_fcs_errors,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x08, 0, 64);
+
+/* reg_ppcnt_dot3stats_symbol_errors
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, dot3stats_symbol_errors,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x60, 0, 64);
+
+/* reg_ppcnt_dot3control_in_unknown_opcodes
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, dot3control_in_unknown_opcodes,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x68, 0, 64);
+
+/* reg_ppcnt_dot3in_pause_frames
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, dot3in_pause_frames,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x70, 0, 64);
+
 /* Ethernet Extended Counter Group Counters */
 
 /* reg_ppcnt_ecn_marked
@@ -4464,6 +4532,80 @@ MLXSW_ITEM64(reg, ppcnt, 
ether_stats_pkts8192to10239octets,
 MLXSW_ITEM64(reg, ppcnt, ecn_marked,
 MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x08, 0, 64);
 
+/* Ethernet Discard Counter Group Counters */
+
+/* reg_ppcnt_ingress_general
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ingress_general,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x00, 0, 64);
+
+/* reg_ppcnt_ingress_policy_engine
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ingress_policy_engine,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x08, 0, 64);
+
+/* reg_ppcnt_ingress_vlan_membership
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ingress_vlan_membership,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x10, 0, 64);
+
+/* reg_ppcnt_ingress_tag_frame_type
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ingress_tag_frame_type,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x18, 0, 64);
+
+/* reg_ppcnt_egress_vlan_membership
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, egress_vlan_membership,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x20, 0, 64);
+
+/* reg_ppcnt_loopback_filter
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, loopback_filter,
+MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x28, 0, 64);
+
+/* reg_ppcnt_egress_general
+ * Access: RO
+ */
+MLXSW_ITEM64(reg,

[PATCH net-next] tun: use netdev_alloc_frag() in tun_napi_alloc_frags()

2018-11-18 Thread Eric Dumazet
In order to cook skbs in the same way as Ethernet drivers,
it is probably better to not use GFP_KERNEL, but rather
use the GFP_ATOMIC and PFMEMALLOC mechanisms provided by
netdev_alloc_frag().

This would allow to use tun driver even in memory stress
situations, especially if swap is used over this tun channel.

Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver")
Signed-off-by: Eric Dumazet 
Cc: Petar Penkov 
Cc: Mahesh Bandewar 
Cc: Willem de Bruijn 
---
 drivers/net/tun.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 
36163a147d3950a5d7451abed96809c2af7c322f..1e9da697081d10e086a26deb1ab38e62f77436b5
 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1478,23 +1478,22 @@ static struct sk_buff *tun_napi_alloc_frags(struct 
tun_file *tfile,
skb->truesize += skb->data_len;
 
for (i = 1; i < it->nr_segs; i++) {
-   struct page_frag *pfrag = &current->task_frag;
size_t fragsz = it->iov[i].iov_len;
+   struct page *page;
+   void *frag;
 
if (fragsz == 0 || fragsz > PAGE_SIZE) {
err = -EINVAL;
goto free;
}
-
-   if (!skb_page_frag_refill(fragsz, pfrag, GFP_KERNEL)) {
+   frag = netdev_alloc_frag(fragsz);
+   if (!frag) {
err = -ENOMEM;
goto free;
}
-
-   skb_fill_page_desc(skb, i - 1, pfrag->page,
-  pfrag->offset, fragsz);
-   page_ref_inc(pfrag->page);
-   pfrag->offset += fragsz;
+   page = virt_to_head_page(frag);
+   skb_fill_page_desc(skb, i - 1, page,
+  frag - page_address(page), fragsz);
}
 
return skb;
-- 
2.19.1.1215.g8438c0b245-goog



Re: hw csum failure + conntrack with more debugging information

2018-11-18 Thread Andre Tomt

On 18.11.2018 02:12, Eric Dumazet wrote:



On Sat, Nov 17, 2018 at 3:18 PM Andre Tomt > wrote:


I added Cong Wang's hw csum failure debug patch to my 4.19.2 tree and
got a splat with a bit more information.

 > [47273.905616] p0xe0: hw csum failure
 > [47273.905642] dev features: 0x000860c000114bb3
 > [47273.905663] skb len=44 data_len=0 gso_size=0 gso_type=0
ip_summed=2 csum=0, csum_complete_sw=0, csum_valid=0
 > [47273.905706] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.19.0-1 #1
 > [47273.905707] Hardware name: Supermicro Super
Server/X10SDV-4C-TLN2F, BIOS 2.0 06/13/2018
 > [47273.905707] Call Trace:
 > [47273.905710]  
 > [47273.905717]  dump_stack+0x5c/0x80
 > [47273.905721]  __skb_checksum_complete+0xaf/0xc0
 > [47273.905731]  icmp_error+0x1c8/0x1f0 [nf_conntrack]
 > [47273.905734]  ? skb_copy_bits+0x13d/0x220
 > [47273.905740]  nf_conntrack_in+0xd8/0x390 [nf_conntrack]
 > [47273.905743]  ? ___pskb_trim+0x192/0x330
 > [47273.905746]  nf_hook_slow+0x43/0xc0
 > [47273.905749]  ip_rcv+0x90/0xb0
 > [47273.905752]  ? ip_rcv_finish_core.isra.0+0x310/0x310
 > [47273.905754]  __netif_receive_skb_one_core+0x42/0x50
 > [47273.905756]  netif_receive_skb_internal+0x24/0xb0
 > [47273.905758]  napi_gro_frags+0x177/0x210
 > [47273.905762]  mlx4_en_process_rx_cq+0x8df/0xb50 [mlx4_en]
 > [47273.905773]  ? mlx4_eq_int+0x38f/0xcb0 [mlx4_core]
 > [47273.905776]  mlx4_en_poll_rx_cq+0x55/0xf0 [mlx4_en]
 > [47273.905778]  net_rx_action+0xe1/0x2c0
 > [47273.905781]  __do_softirq+0xe7/0x2d3
 > [47273.905784]  irq_exit+0x96/0xd0
 > [47273.905786]  do_IRQ+0x85/0xd0
 > [47273.905790]  common_interrupt+0xf/0xf
 > [47273.905791]  
 > [47273.905794] RIP: 0010:cpuidle_enter_state+0xb9/0x320
 > [47273.905796] Code: e8 3c 15 bc ff 80 7c 24 0b 00 74 17 9c 58 0f
1f 44 00 00 f6 c4 02 0f 85 3b 02 00 00 31 ff e8 6e fa c0 ff fb 66 0f
1f 44 00 00 <48> b8 ff ff ff ff f3 01 00 00 48 2b 1c 24 ba ff ff ff
7f 48 39 c3
 > [47273.905798] RSP: 0018:b75601943ea8 EFLAGS: 0246
ORIG_RAX: ffdb
 > [47273.905801] RAX: 9d636fa60fc0 RBX: 2afed059e821 RCX:
001f
 > [47273.905802] RDX: 2afed059e821 RSI: 3a2ea91a RDI:

 > [47273.905803] RBP: 9d636fa698c8 R08: 0002 R09:
00020840
 > [47273.905804] R10: 000e97ef158d1e39 R11: 9d636fa601e8 R12:
0001
 > [47273.905805] R13: ab0ac698 R14: 0001 R15:

 > [47273.905808]  ? cpuidle_enter_state+0x94/0x320
 > [47273.905812]  do_idle+0x1e4/0x220
 > [47273.905815]  cpu_startup_entry+0x5f/0x70
 > [47273.905818]  start_secondary+0x185/0x1a0
 > [47273.905821]  secondary_startup_64+0xa4/0xb0

All instances stripped of the identical stack traces:
 > [13778.531040] dev features: 0x000860c000114bb3
 > [13778.531056] skb len=40 data_len=0 gso_size=0 gso_type=0
ip_summed=2 csum=0, csum_complete_sw=0, csum_valid=0
 > [13778.531176] dev features: 0x000860c000114bb3
 > [13778.531204] skb len=40 data_len=0 gso_size=0 gso_type=0
ip_summed=2 csum=0, csum_complete_sw=0, csum_valid=0
 > [13778.531256] dev features: 0x000860c000114bb3
 > [13778.531285] skb len=40 data_len=0 gso_size=0 gso_type=0
ip_summed=2 csum=0, csum_complete_sw=0, csum_valid=0 >
[47273.905642] dev features: 0x000860c000114bb3
 > [47273.905663] skb len=44 data_len=0 gso_size=0 gso_type=0
ip_summed=2 csum=0, csum_complete_sw=0, csum_valid=0

The setup has also further been simplified by also removing vlans and
6to4 tunnels, It's now only conntrack and nat (configured with
nftables)
on bare ethernet netdevs.

offloads, ring sizes etc is left at defaults,
net.ipv4.ip_early_demux is
off, fq_codel as net.core.default_qdisc

Hardware is ConnectX-3 VPI 2xQSFP+ (firmware 2.42.5000) on a quad core
Xeon D-1521, passing traffic from port 1 to port 2 on the same card.
Last switch to touch the packets is an Arista DCS-7050QX-32 running EOS
4.20.2.1F

This kernel build contains some other bits and pieces from net.git
(mostly things queued for stable) and a couple of backports from
net-next (Aaron Lu's pcp page recycling fix, Eric's BQL+mlx4
optimizations), but the stack traces are identical to before so they
don't seem involved in this.

Workload remains nearly exclusively TCP and UDP torrent junk traffic to
two machines behind it.



Please try this patch, we suspect mlx4 support for CHECKSUM_COMPLETE is 
wrong.


(Only IPv4 handled, but I suspect a similar fix is needed for IPv6)


Testing it now. Can sometimes take a few days to hit here so will 
probably have to leave it running for a while.


[PATCH net] sctp: not increase stream's incnt before sending addstrm_in request

2018-11-18 Thread Xin Long
Different from processing the addstrm_out request, the receiver handles
an addstrm_in request by sending back an addstrm_out request to the
sender who will increase its stream's in and incnt later.

Currently stream->incnt is already increased when the addstrm_in
request is sent out in sctp_send_add_streams(). The wrong stream->incnt
can even cause a crash when copying stream info from the old stream's in
to the new one's in sctp_process_strreset_addstrm_out().

This patch is to fix it by simply removing the stream->incnt change
from sctp_send_add_streams().

Fixes: 242bd2d519d7 ("sctp: implement sender-side procedures for Add 
Incoming/Outgoing Streams Request Parameter")
Reported-by: Jianwen Ji 
Signed-off-by: Xin Long 
---
 net/sctp/stream.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index ffb940d..3892e76 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -535,7 +535,6 @@ int sctp_send_add_streams(struct sctp_association *asoc,
goto out;
}
 
-   stream->incnt = incnt;
stream->outcnt = outcnt;
 
asoc->strreset_outstanding = !!out + !!in;
-- 
2.1.0



[no subject]

2018-11-18 Thread Mrs. Maureen Hinckley



I am Maureen Hinckley and my foundation is donating (Five hundred and fifty 
thousand USD) to you. Contact us via my email at (maurhin...@gmail.com) for 
further details.


 Best Regards, Mrs. Maureen Hinckley,  Copyright ©2018 The Maureen 
Hinckley Foundation All Rights Reserved.


Re: [PATCH iproute2-next v3] rdma: Document IB device renaming option

2018-11-18 Thread Leon Romanovsky
On Fri, Nov 16, 2018 at 08:10:35PM +, Ruhl, Michael J wrote:
> >-Original Message-
> >From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
> >ow...@vger.kernel.org] On Behalf Of Leon Romanovsky
> >Sent: Sunday, November 4, 2018 2:11 PM
> >To: David Ahern 
> >Cc: Leon Romanovsky ; netdev
> >; RDMA mailing list ;
> >Stephen Hemminger 
> >Subject: [PATCH iproute2-next v3] rdma: Document IB device renaming
> >option
> >
> >From: Leon Romanovsky 
>
> Hi Leon,
>
> After looking at this and Steve Wise's changes for the ADDLINK/DELLINK,
> it occurred to me that the driver that handed the name to ib_register_device()
> might be interested in knowing that this name change occurred.
>
> Are there plans to include some kind of notify mechanism so drivers can
> find out when things like this occur?

At least for device rename, I don't see any real need for such event,
because drivers are not supposed to rely on names.

I would say that it is probably driver bug to rely on device name
during its execution.

>
> Is this something that should be done?

I think yes, we can extend ib_event to support more events than now,
but should we?

>
> Thanks,
>
> Mike
>
> >[leonro@server /]$ lspci |grep -i Ether
> >00:08.0 Ethernet controller: Red Hat, Inc. Virtio network device
> >00:09.0 Ethernet controller: Mellanox Technologies MT27700 Family
> >[ConnectX-4]
> >[leonro@server /]$ sudo rdma dev
> >1: mlx5_0: node_type ca fw 3.8. node_guid 5254:00c0:fe12:3455
> >sys_image_guid 5254:00c0:fe12:3455
> >[leonro@server /]$ sudo rdma dev set mlx5_0 name hfi1_0
> >[leonro@server /]$ sudo rdma dev
> >1: hfi1_0: node_type ca fw 3.8. node_guid 5254:00c0:fe12:3455
> >sys_image_guid 5254:00c0:fe12:3455
> >
> >Signed-off-by: Leon Romanovsky 
> >---
> >Changelog:
> >v2->v3:
> > * Dropped "to be named" words from example section of man
> >---
> > man/man8/rdma-dev.8 | 15 ++-
> > 1 file changed, 14 insertions(+), 1 deletion(-)
> >
> >diff --git a/man/man8/rdma-dev.8 b/man/man8/rdma-dev.8
> >index 461681b6..7c275180 100644
> >--- a/man/man8/rdma-dev.8
> >+++ b/man/man8/rdma-dev.8
> >@@ -1,6 +1,6 @@
> > .TH RDMA\-DEV 8 "06 Jul 2017" "iproute2" "Linux"
> > .SH NAME
> >-rdmak-dev \- RDMA device configuration
> >+rdma-dev \- RDMA device configuration
> > .SH SYNOPSIS
> > .sp
> > .ad l
> >@@ -22,10 +22,18 @@ rdmak-dev \- RDMA device configuration
> > .B rdma dev show
> > .RI "[ " DEV " ]"
> >
> >+.ti -8
> >+.B rdma dev set
> >+.RI "[ " DEV " ]"
> >+.BR name
> >+.BR NEWNAME
> >+
> > .ti -8
> > .B rdma dev help
> >
> > .SH "DESCRIPTION"
> >+.SS rdma dev set - rename rdma device
> >+
> > .SS rdma dev show - display rdma device attributes
> >
> > .PP
> >@@ -45,6 +53,11 @@ rdma dev show mlx5_3
> > Shows the state of specified RDMA device.
> > .RE
> > .PP
> >+rdma dev set mlx5_3 name rdma_0
> >+.RS 4
> >+Renames the mlx5_3 device to rdma_0.
> >+.RE
> >+.PP
> >
> > .SH SEE ALSO
> > .BR rdma (8),
> >--
> >2.19.1
>


signature.asc
Description: PGP signature


[PATCH net] Revert "sctp: remove sctp_transport_pmtu_check"

2018-11-18 Thread Xin Long
This reverts commit 22d7be267eaa8114dcc28d66c1c347f667d7878a.

The dst's mtu in transport can be updated by a non sctp place like
in xfrm where the MTU information didn't get synced between asoc,
transport and dst, so it is still needed to do the pmtu check
in sctp_packet_config.
---
 include/net/sctp/sctp.h | 12 
 net/sctp/output.c   |  3 +++
 2 files changed, 15 insertions(+)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 8c2caa3..ab9242e 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -608,4 +608,16 @@ static inline __u32 sctp_dst_mtu(const struct dst_entry 
*dst)
 SCTP_DEFAULT_MINSEGMENT));
 }
 
+static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
+{
+   __u32 pmtu = sctp_dst_mtu(t->dst);
+
+   if (t->pathmtu == pmtu)
+   return true;
+
+   t->pathmtu = pmtu;
+
+   return false;
+}
+
 #endif /* __net_sctp_h__ */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 67939ad..0860122 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -118,6 +118,9 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 
vtag,
sctp_transport_route(tp, NULL, sp);
if (asoc->param_flags & SPP_PMTUD_ENABLE)
sctp_assoc_sync_pmtu(asoc);
+   } else if (!sctp_transport_pmtu_check(tp)) {
+   if (asoc->param_flags & SPP_PMTUD_ENABLE)
+   sctp_assoc_sync_pmtu(asoc);
}
 
if (asoc->pmtu_pending) {
-- 
2.1.0



[PATCHv3 net-next 0/4] sctp: add subscribe per asoc and sockopt SCTP_EVENT

2018-11-18 Thread Xin Long
This patchset mainly adds the Event Subscription sockopt described in
rfc6525#section-6.2:

"Subscribing to events as described in [RFC6458] uses a setsockopt()
call with the SCTP_EVENT socket option.  This option takes the
following structure, which specifies the association, the event type
(using the same value found in the event type field), and an on/off
boolean.

  struct sctp_event {
sctp_assoc_t se_assoc_id;
uint16_t se_type;
uint8_t  se_on;
  };

The user fills in the se_type field with the same value found in the
strreset_type field, i.e., SCTP_STREAM_RESET_EVENT.  The user will
also fill in the se_assoc_id field with either the association to set
this event on (this field is ignored for one-to-one style sockets) or
one of the reserved constant values defined in [RFC6458].  Finally,
the se_on field is set with a 1 to enable the event or a 0 to disable
the event."

As for the old SCTP_EVENTS Option with struct sctp_event_subscribe,
it's being DEPRECATED.

v1->v2:
  - fix some key words in changelog that triggered the filters at
vger.kernel.org.
v2->v3:
  - fix an array out of bounds noticed by Neil in patch 1/4.

Xin Long (4):
  sctp: define subscribe in sctp_sock as __u16
  sctp: add subscribe per asoc
  sctp: rename enum sctp_event to sctp_event_type
  sctp: add sockopt SCTP_EVENT

 include/net/sctp/constants.h |   2 +-
 include/net/sctp/sm.h|   4 +-
 include/net/sctp/structs.h   |   4 +-
 include/net/sctp/ulpevent.h  |  39 --
 include/uapi/linux/sctp.h|  13 -
 net/sctp/associola.c |   2 +
 net/sctp/chunk.c |   8 ++-
 net/sctp/primitive.c |   2 +-
 net/sctp/sm_sideeffect.c |  12 ++---
 net/sctp/sm_statetable.c |   2 +-
 net/sctp/socket.c| 125 ---
 net/sctp/stream_interleave.c |  12 +++--
 net/sctp/ulpqueue.c  |   8 +--
 13 files changed, 183 insertions(+), 50 deletions(-)

-- 
2.1.0



[PATCHv3 net-next 2/4] sctp: add subscribe per asoc

2018-11-18 Thread Xin Long
The member subscribe should be per asoc, so that sockopt SCTP_EVENT
in the next patch can subscribe to an event from one asoc only.

Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h   | 2 ++
 net/sctp/associola.c | 2 ++
 net/sctp/chunk.c | 6 ++
 net/sctp/socket.c| 6 +-
 net/sctp/stream_interleave.c | 7 ---
 net/sctp/ulpqueue.c  | 4 ++--
 6 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index bc7808a..7eaa294 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -2077,6 +2077,8 @@ struct sctp_association {
 
int sent_cnt_removable;
 
+   __u16 subscribe;
+
__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
 };
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 6a28b96..685c7ef 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -135,6 +135,8 @@ static struct sctp_association *sctp_association_init(
 */
asoc->max_burst = sp->max_burst;
 
+   asoc->subscribe = sp->subscribe;
+
/* initialize association timers */
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 6c761af..0b203b8 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -86,11 +86,10 @@ void sctp_datamsg_free(struct sctp_datamsg *msg)
 /* Final destructruction of datamsg memory. */
 static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
 {
+   struct sctp_association *asoc = NULL;
struct list_head *pos, *temp;
struct sctp_chunk *chunk;
-   struct sctp_sock *sp;
struct sctp_ulpevent *ev;
-   struct sctp_association *asoc = NULL;
int error = 0, notify;
 
/* If we failed, we may need to notify. */
@@ -108,8 +107,7 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
else
error = asoc->outqueue.error;
 
-   sp = sctp_sk(asoc->base.sk);
-   notify = sctp_ulpevent_type_enabled(sp->subscribe,
+   notify = sctp_ulpevent_type_enabled(asoc->subscribe,
SCTP_SEND_FAILED);
}
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9d75129..c771827 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2307,6 +2307,7 @@ static int sctp_setsockopt_events(struct sock *sk, char 
__user *optval,
struct sctp_event_subscribe subscribe;
__u8 *sn_type = (__u8 *)&subscribe;
struct sctp_sock *sp = sctp_sk(sk);
+   struct sctp_association *asoc;
int i;
 
if (optlen > sizeof(struct sctp_event_subscribe))
@@ -2319,14 +2320,17 @@ static int sctp_setsockopt_events(struct sock *sk, char 
__user *optval,
sctp_ulpevent_type_set(&sp->subscribe, SCTP_SN_TYPE_BASE + i,
   sn_type[i]);
 
+   list_for_each_entry(asoc, &sp->ep->asocs, asocs)
+   asoc->subscribe = sctp_sk(sk)->subscribe;
+
/* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT,
 * if there is no data to be sent or retransmit, the stack will
 * immediately send up this notification.
 */
if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT)) {
-   struct sctp_association *asoc = sctp_id2assoc(sk, 0);
struct sctp_ulpevent *event;
 
+   asoc = sctp_id2assoc(sk, 0);
if (asoc && sctp_outq_is_empty(&asoc->outqueue)) {
event = sctp_ulpevent_make_sender_dry_event(asoc,
GFP_USER | __GFP_NOWARN);
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index ceef5a3..a6bf215 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -503,7 +503,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
sk_incoming_cpu_update(sk);
}
 
-   if (!sctp_ulpevent_is_enabled(event, sp->subscribe))
+   if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe))
goto out_free;
 
if (skb_list)
@@ -992,16 +992,17 @@ static void sctp_intl_stream_abort_pd(struct sctp_ulpq 
*ulpq, __u16 sid,
  __u32 mid, __u16 flags, gfp_t gfp)
 {
struct sock *sk = ulpq->asoc->base.sk;
-   struct sctp_sock *sp = sctp_sk(sk);
struct sctp_ulpevent *ev = NULL;
 
-   if (!sctp_ulpevent_type_enabled(sp->subscribe,
+   if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe,
SCTP_PARTIAL_DELIVERY_EVENT))
return;
 
ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTE

[PATCHv3 net-next 1/4] sctp: define subscribe in sctp_sock as __u16

2018-11-18 Thread Xin Long
The member subscribe in sctp_sock is used to indicate to which of
the events it is subscribed, more like a group of flags. So it's
better to be defined as __u16 (2 bytes), instead of struct
sctp_event_subscribe (13 bytes).

Note that sctp_event_subscribe is a UAPI struct, used on sockopt
calls, and thus it will not be removed. This patch only changes
the internal storage of the flags.

Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h   |  2 +-
 include/net/sctp/ulpevent.h  | 39 ---
 include/uapi/linux/sctp.h|  6 +-
 net/sctp/chunk.c |  4 ++--
 net/sctp/socket.c| 35 ++-
 net/sctp/stream_interleave.c | 11 ++-
 net/sctp/ulpqueue.c  |  8 
 7 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index af9d494..bc7808a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -217,7 +217,7 @@ struct sctp_sock {
 * These two structures must be grouped together for the usercopy
 * whitelist region.
 */
-   struct sctp_event_subscribe subscribe;
+   __u16 subscribe;
struct sctp_initmsg initmsg;
 
int user_frag;
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 51b4e06..bd922a0 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -164,30 +164,39 @@ void sctp_ulpevent_read_nxtinfo(const struct 
sctp_ulpevent *event,
 
 __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event);
 
+static inline void sctp_ulpevent_type_set(__u16 *subscribe,
+ __u16 sn_type, __u8 on)
+{
+   if (sn_type > SCTP_SN_TYPE_MAX)
+   return;
+
+   if (on)
+   *subscribe |=  (1 << (sn_type - SCTP_SN_TYPE_BASE));
+   else
+   *subscribe &= ~(1 << (sn_type - SCTP_SN_TYPE_BASE));
+}
+
 /* Is this event type enabled? */
-static inline int sctp_ulpevent_type_enabled(__u16 sn_type,
-struct sctp_event_subscribe *mask)
+static inline bool sctp_ulpevent_type_enabled(__u16 subscribe, __u16 sn_type)
 {
-   int offset = sn_type - SCTP_SN_TYPE_BASE;
-   char *amask = (char *) mask;
+   if (sn_type > SCTP_SN_TYPE_MAX)
+   return false;
 
-   if (offset >= sizeof(struct sctp_event_subscribe))
-   return 0;
-   return amask[offset];
+   return subscribe & (1 << (sn_type - SCTP_SN_TYPE_BASE));
 }
 
 /* Given an event subscription, is this event enabled? */
-static inline int sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event,
-  struct sctp_event_subscribe *mask)
+static inline bool sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event,
+   __u16 subscribe)
 {
__u16 sn_type;
-   int enabled = 1;
 
-   if (sctp_ulpevent_is_notification(event)) {
-   sn_type = sctp_ulpevent_get_notification_type(event);
-   enabled = sctp_ulpevent_type_enabled(sn_type, mask);
-   }
-   return enabled;
+   if (!sctp_ulpevent_is_notification(event))
+   return true;
+
+   sn_type = sctp_ulpevent_get_notification_type(event);
+
+   return sctp_ulpevent_type_enabled(subscribe, sn_type);
 }
 
 #endif /* __sctp_ulpevent_h__ */
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index c81feb3..66afa5b 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -632,7 +632,9 @@ union sctp_notification {
  */
 
 enum sctp_sn_type {
-   SCTP_SN_TYPE_BASE = (1<<15),
+   SCTP_SN_TYPE_BASE   = (1<<15),
+   SCTP_DATA_IO_EVENT  = SCTP_SN_TYPE_BASE,
+#define SCTP_DATA_IO_EVENT SCTP_DATA_IO_EVENT
SCTP_ASSOC_CHANGE,
 #define SCTP_ASSOC_CHANGE  SCTP_ASSOC_CHANGE
SCTP_PEER_ADDR_CHANGE,
@@ -657,6 +659,8 @@ enum sctp_sn_type {
 #define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT
SCTP_STREAM_CHANGE_EVENT,
 #define SCTP_STREAM_CHANGE_EVENT   SCTP_STREAM_CHANGE_EVENT
+   SCTP_SN_TYPE_MAX= SCTP_STREAM_CHANGE_EVENT,
+#define SCTP_SN_TYPE_MAX   SCTP_SN_TYPE_MAX
 };
 
 /* Notification error codes used to fill up the error fields in some
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index ce80878..6c761af 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -109,8 +109,8 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
error = asoc->outqueue.error;
 
sp = sctp_sk(asoc->base.sk);
-   notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED,
-   &sp->subscribe);
+   notify = sctp_ulpevent_type_enabled(sp->subscribe,
+   

[PATCHv3 net-next 3/4] sctp: rename enum sctp_event to sctp_event_type

2018-11-18 Thread Xin Long
sctp_event is a structure name defined in the RFC for sockopt
SCTP_EVENT. To avoid the conflict, rename the existing enum
sctp_event to sctp_event_type.

Signed-off-by: Xin Long 
---
 include/net/sctp/constants.h |  2 +-
 include/net/sctp/sm.h|  4 ++--
 net/sctp/primitive.c |  2 +-
 net/sctp/sm_sideeffect.c | 12 ++--
 net/sctp/sm_statetable.c |  2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 8dadc74..4588bdc 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -71,7 +71,7 @@ enum { SCTP_DEFAULT_INSTREAMS = SCTP_MAX_STREAM };
 SCTP_NUM_AUTH_CHUNK_TYPES)
 
 /* These are the different flavours of event.  */
-enum sctp_event {
+enum sctp_event_type {
SCTP_EVENT_T_CHUNK = 1,
SCTP_EVENT_T_TIMEOUT,
SCTP_EVENT_T_OTHER,
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 9e3d327..24825a8 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -173,7 +173,7 @@ sctp_state_fn_t sctp_sf_autoclose_timer_expire;
 __u8 sctp_get_chunk_type(struct sctp_chunk *chunk);
 const struct sctp_sm_table_entry *sctp_sm_lookup_event(
struct net *net,
-   enum sctp_event event_type,
+   enum sctp_event_type event_type,
enum sctp_state state,
union sctp_subtype event_subtype);
 int sctp_chunk_iif(const struct sctp_chunk *);
@@ -313,7 +313,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
 
 /* Prototypes for statetable processing. */
 
-int sctp_do_sm(struct net *net, enum sctp_event event_type,
+int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
   union sctp_subtype subtype, enum sctp_state state,
   struct sctp_endpoint *ep, struct sctp_association *asoc,
   void *event_arg, gfp_t gfp);
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index c0817f7a..a8c4c33 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -53,7 +53,7 @@
 int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \
void *arg) { \
int error = 0; \
-   enum sctp_event event_type; union sctp_subtype subtype; \
+   enum sctp_event_type event_type; union sctp_subtype subtype; \
enum sctp_state state; \
struct sctp_endpoint *ep; \
\
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 85d3930..1d143bc 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -52,7 +52,7 @@
 #include 
 #include 
 
-static int sctp_cmd_interpreter(enum sctp_event event_type,
+static int sctp_cmd_interpreter(enum sctp_event_type event_type,
union sctp_subtype subtype,
enum sctp_state state,
struct sctp_endpoint *ep,
@@ -61,7 +61,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
enum sctp_disposition status,
struct sctp_cmd_seq *commands,
gfp_t gfp);
-static int sctp_side_effects(enum sctp_event event_type,
+static int sctp_side_effects(enum sctp_event_type event_type,
 union sctp_subtype subtype,
 enum sctp_state state,
 struct sctp_endpoint *ep,
@@ -623,7 +623,7 @@ static void sctp_cmd_init_failed(struct sctp_cmd_seq 
*commands,
 /* Worker routine to handle SCTP_CMD_ASSOC_FAILED.  */
 static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands,
  struct sctp_association *asoc,
- enum sctp_event event_type,
+ enum sctp_event_type event_type,
  union sctp_subtype subtype,
  struct sctp_chunk *chunk,
  unsigned int error)
@@ -1162,7 +1162,7 @@ static void sctp_cmd_send_asconf(struct sctp_association 
*asoc)
  * If you want to understand all of lksctp, this is a
  * good place to start.
  */
-int sctp_do_sm(struct net *net, enum sctp_event event_type,
+int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
   union sctp_subtype subtype, enum sctp_state state,
   struct sctp_endpoint *ep, struct sctp_association *asoc,
   void *event_arg, gfp_t gfp)
@@ -1199,7 +1199,7 @@ int sctp_do_sm(struct net *net, enum sctp_event 
event_type,
 /*
  * This the master state function side effect processing function.
  */
-static int sctp_side_effects(enum sctp_event event_type,
+static int sctp_side_eff

[PATCHv3 net-next 4/4] sctp: add sockopt SCTP_EVENT

2018-11-18 Thread Xin Long
This patch adds sockopt SCTP_EVENT described in rfc6525#section-6.2.
With this sockopt users can subscribe to an event from a specified
asoc.

Signed-off-by: Xin Long 
---
 include/uapi/linux/sctp.h |  7 
 net/sctp/socket.c | 88 +++
 2 files changed, 95 insertions(+)

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 66afa5b..d584073 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -129,6 +129,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_STREAM_SCHEDULER_VALUE124
 #define SCTP_INTERLEAVING_SUPPORTED125
 #define SCTP_SENDMSG_CONNECT   126
+#define SCTP_EVENT 127
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE  0x
@@ -1154,6 +1155,12 @@ struct sctp_add_streams {
uint16_t sas_outstrms;
 };
 
+struct sctp_event {
+   sctp_assoc_t se_assoc_id;
+   uint16_t se_type;
+   uint8_t se_on;
+};
+
 /* SCTP Stream schedulers */
 enum sctp_sched_type {
SCTP_SS_FCFS,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c771827..e16c090 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4288,6 +4288,57 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, 
char __user *optval,
return 0;
 }
 
+static int sctp_setsockopt_event(struct sock *sk, char __user *optval,
+unsigned int optlen)
+{
+   struct sctp_association *asoc;
+   struct sctp_ulpevent *event;
+   struct sctp_event param;
+   int retval = 0;
+
+   if (optlen < sizeof(param)) {
+   retval = -EINVAL;
+   goto out;
+   }
+
+   optlen = sizeof(param);
+   if (copy_from_user(&param, optval, optlen)) {
+   retval = -EFAULT;
+   goto out;
+   }
+
+   if (param.se_type < SCTP_SN_TYPE_BASE ||
+   param.se_type > SCTP_SN_TYPE_MAX) {
+   retval = -EINVAL;
+   goto out;
+   }
+
+   asoc = sctp_id2assoc(sk, param.se_assoc_id);
+   if (!asoc) {
+   sctp_ulpevent_type_set(&sctp_sk(sk)->subscribe,
+  param.se_type, param.se_on);
+   goto out;
+   }
+
+   sctp_ulpevent_type_set(&asoc->subscribe, param.se_type, param.se_on);
+
+   if (param.se_type == SCTP_SENDER_DRY_EVENT && param.se_on) {
+   if (sctp_outq_is_empty(&asoc->outqueue)) {
+   event = sctp_ulpevent_make_sender_dry_event(asoc,
+   GFP_USER | __GFP_NOWARN);
+   if (!event) {
+   retval = -ENOMEM;
+   goto out;
+   }
+
+   asoc->stream.si->enqueue_event(&asoc->ulpq, event);
+   }
+   }
+
+out:
+   return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4485,6 +4536,9 @@ static int sctp_setsockopt(struct sock *sk, int level, 
int optname,
case SCTP_REUSE_PORT:
retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
break;
+   case SCTP_EVENT:
+   retval = sctp_setsockopt_event(sk, optval, optlen);
+   break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7430,6 +7484,37 @@ static int sctp_getsockopt_reuse_port(struct sock *sk, 
int len,
return 0;
 }
 
+static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval,
+int __user *optlen)
+{
+   struct sctp_association *asoc;
+   struct sctp_event param;
+   __u16 subscribe;
+
+   if (len < sizeof(param))
+   return -EINVAL;
+
+   len = sizeof(param);
+   if (copy_from_user(&param, optval, len))
+   return -EFAULT;
+
+   if (param.se_type < SCTP_SN_TYPE_BASE ||
+   param.se_type > SCTP_SN_TYPE_MAX)
+   return -EINVAL;
+
+   asoc = sctp_id2assoc(sk, param.se_assoc_id);
+   subscribe = asoc ? asoc->subscribe : sctp_sk(sk)->subscribe;
+   param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type);
+
+   if (put_user(len, optlen))
+   return -EFAULT;
+
+   if (copy_to_user(optval, &param, len))
+   return -EFAULT;
+
+   return 0;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
   char __user *optval, int __user *optlen)
 {
@@ -7628,6 +7713,9 @@ static int sctp_getsockopt(struct sock *sk, int level, 
int optname,
case SCTP_REUSE_PORT:
retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
break;
+   case SCTP_EVENT:
+   retval = sctp_getsockopt_event(sk, len, optval, optlen);
+   break;
default:
retval = -ENOPROTOOPT;
break;
-- 
2.1.0