[PATCH] xfrm: Add ESN support for IPSec HW offload
From: Yossef Efraim This patch adds ESN support to IPsec device offload. Adding new xfrm device operation to synchronize device ESN. Signed-off-by: Yossef Efraim Signed-off-by: Shannon Nelson Signed-off-by: Steffen Klassert --- Documentation/networking/xfrm_device.txt | 3 +++ include/linux/netdevice.h| 1 + include/net/xfrm.h | 12 net/xfrm/xfrm_device.c | 11 +-- net/xfrm/xfrm_replay.c | 2 ++ 5 files changed, 27 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/xfrm_device.txt b/Documentation/networking/xfrm_device.txt index 2d9d588cd34b..50c34ca65efe 100644 --- a/Documentation/networking/xfrm_device.txt +++ b/Documentation/networking/xfrm_device.txt @@ -41,6 +41,7 @@ struct xfrmdev_ops { void(*xdo_dev_state_free) (struct xfrm_state *x); bool(*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); + void(*xdo_dev_state_advance_esn) (struct xfrm_state *x); }; The NIC driver offering ipsec offload will need to implement these @@ -117,6 +118,8 @@ the stack in xfrm_input(). hand the packet to napi_gro_receive() as usual +In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn(). +Driver will check packet seq number and update HW ESN state machine if needed. When the SA is removed by the user, the driver's xdo_dev_state_delete() is asked to disable the offload. 
Later, xdo_dev_state_free() is called diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ed0799a12bf2..540151875444 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -851,6 +851,7 @@ struct xfrmdev_ops { void(*xdo_dev_state_free) (struct xfrm_state *x); bool(*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); + void(*xdo_dev_state_advance_esn) (struct xfrm_state *x); }; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2e6d4fe6b0ba..7d2077665c0b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1904,6 +1904,14 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); +static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) +{ + struct xfrm_state_offload *xso = &x->xso; + + if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn) + xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); +} + static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { struct xfrm_state *x = dst->xfrm; @@ -1974,6 +1982,10 @@ static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x return false; } +static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) +{ +} + static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { return false; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 75982506617b..93520106731f 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -147,8 +147,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (!x->type_offload) return -EINVAL; - /* We don't yet support UDP encapsulation, TFC padding and ESN. */ - if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN)) + /* We don't yet support UDP encapsulation and TFC padding. 
*/ + if (x->encap || x->tfcpad) return -EINVAL; dev = dev_get_by_index(net, xuo->ifindex); @@ -178,6 +178,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return 0; } + if (x->props.flags & XFRM_STATE_ESN && + !dev->xfrmdev_ops->xdo_dev_state_advance_esn) { + xso->dev = NULL; + dev_put(dev); + return -EINVAL; + } + xso->dev = dev; xso->num_exthdrs = 1; xso->flags = xuo->flags; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 02501817227b..1d38c6acf8af 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -551,6 +551,8 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) bitnr = replay_esn->replay_window - (diff - pos); } + xfrm_dev_state_advance_esn(x); + nr = bitnr >> 5; bitnr = bitnr & 0x1F; replay_esn->bmp[nr] |= (1U << bitnr); -- 2.14.1
pull request (net-next): ipsec-next 2018-01-26
One last patch for this development cycle: 1) Add ESN support for IPSec HW offload. From Yossef Efraim. Please pull or let me know if there are problems. Thanks! The following changes since commit 4f7d58517f461aa6e7b7509668f04021e089323d: Merge tag 'linux-can-next-for-4.16-20180116' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next (2018-01-17 16:08:25 -0500) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git master for you to fetch changes up to 50bd870a9e5cca9fcf5fb4c130c373643d7d9906: xfrm: Add ESN support for IPSec HW offload (2018-01-18 10:42:59 +0100) Yossef Efraim (1): xfrm: Add ESN support for IPSec HW offload Documentation/networking/xfrm_device.txt | 3 +++ include/linux/netdevice.h| 1 + include/net/xfrm.h | 12 net/xfrm/xfrm_device.c | 11 +-- net/xfrm/xfrm_replay.c | 2 ++ 5 files changed, 27 insertions(+), 2 deletions(-)
Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
On Tue, Jan 23, 2018 at 2:58 PM, Michael S. Tsirkin wrote: > On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote: >> On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin wrote: >> > On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote: >> >> First off, as mentioned in another thread, the model of stacking up >> >> virt-bond functionality over virtio seems a wrong direction to me. >> >> Essentially the migration process would need to carry over all guest >> >> side configurations previously done on the VF/PT and get them moved to >> >> the new device being it virtio or VF/PT. >> > >> > I might be wrong but I don't see why we should worry about this usecase. >> > Whoever has a bond configured already has working config for migration. >> > We are trying to help people who don't, not convert existig users. >> >> That has been placed in the view of cloud providers that the imported >> images from the store must be able to run unmodified thus no >> additional setup script is allowed (just as Stephen mentioned in >> another mail). Cloud users don't care about live migration themselves >> but the providers are required to implement such automation mechanism >> to make this process transparent if at all possible. The user does not >> care about the device underneath being VF or not, but they do care >> about consistency all across and the resulting performance >> acceleration in making VF the prefered datapath. It is not quite >> peculiar user cases but IMHO *any* approach proposed for live >> migration should be able to persist the state including network config >> e.g. as simple as MTU. Actually this requirement has nothing to do >> with virtio but our target users are live migration agnostic, being it >> tracking DMA through dirty pages, using virtio as the helper, or >> whatsoever, the goal of persisting configs across remains same. 
> > So the patching being discussed here will mostly do exactly that if your > original config was simply a single virtio net device. > True, but I don't see the patch being discussed starts with good foundation of supporting the same for VF/PT device. That is the core of the issue. > > What kind of configs do your users have right now? Any configs be it generic or driver specific that the VF/PT device supports and have been enabled/configured. General network configs (MAC, IP address, VLAN, MTU, iptables rules), ethtool settings (hardware offload, # of queues and ring entris, RSC options, rss rxfh-indir table, rx-flow-hash, et al) , bpf/XDP program being run, tc flower offload, just to name a few. As cloud providers we don't limit users from applying driver specific tuning to the NIC/VF, and sometimes this is essential to achieving best performance for their workload. We've seen cases like tuning coalescing parameters for getting low latency, changing rx-flow-hash function for better VXLAN throughput, or even adopting quite advanced NIC features such as flow director or cloud filter. We don't expect users to compromise even a little bit on these. That is once we turn on live migration for the VF or pass through devices in the VM, it all takes place under the hood, users (guest admins, applications) don't have to react upon it or even notice the change. I should note that the majority of live migrations take place between machines with completely identical hardware, it's more critical than necessary to keep the config as-is across the move, stealth while quiet. As you see generic bond or bridge cannot suffice the need. That's why we need a new customized virt bond driver, and tailor it for VM live migration specifically. Leveraging para-virtual e.g. virtio net device as the backup path is one thing, tracking through driver config changes in order to re-config as necessary is another. 
I would think without considering the latter, the proposal being discussed is rather incomplete, and remote to be useful in production. > > >> > >> >> Without the help of a new >> >> upper layer bond driver that enslaves virtio and VF/PT devices >> >> underneath, virtio will be overloaded with too much specifics being a >> >> VF/PT backup in the future. >> > >> > So this paragraph already includes at least two conflicting >> > proposals. On the one hand you want a separate device for >> > the virtual bond, on the other you are saying a separate >> > driver. >> >> Just to be crystal clear: separate virtual bond device (netdev ops, >> not necessarily bus device) for VM migration specifically with a >> separate driver. > > Okay, but note that any config someone had on a virtio device won't > propagate to that bond. > >> > >> > Further, the reason to have a separate *driver* was that >> > some people wanted to share code with netvsc - and that >> > one does not create a separate device, which you can't >> > change without breaking existing configs. >> >> I'm not sure I understand this statement. netvsc is already another >> netdev being created than the enslaved VF netdev, why it bothers? >
[PATCH] atm: he: Replace GFP_ATOMIC with GFP_KERNEL in he_open
After checking all possible call chains to he_open() here, my tool finds that he_open() is never called in atomic context. And this function is assigned to a function pointer "dev->ops->open", which is only called by __vcc_connect() (net/atm/common.c) through dev->ops->send(), and __vcc_connect() is only called by vcc_connect(), which calls mutex_lock(), so it indicates that he_open() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. Signed-off-by: Jia-Ju Bai --- drivers/atm/he.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/atm/he.c b/drivers/atm/he.c index e58538c..fea5bf0 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -2135,7 +2135,7 @@ static int he_start(struct atm_dev *dev) cid = he_mkcid(he_dev, vpi, vci); - he_vcc = kmalloc(sizeof(struct he_vcc), GFP_ATOMIC); + he_vcc = kmalloc(sizeof(struct he_vcc), GFP_KERNEL); if (he_vcc == NULL) { hprintk("unable to allocate he_vcc during open\n"); return -ENOMEM; -- 1.7.9.5
[PATCH net-next 5/5] net/smc: return booleans instead of integers
From: Gustavo A. R. Silva Return statements in functions returning bool should use true/false instead of 1/0. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Ursula Braun --- net/smc/smc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index bfbe20234105..9518986c97b1 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -252,12 +252,12 @@ static inline int smc_uncompress_bufsize(u8 compressed) static inline bool using_ipsec(struct smc_sock *smc) { return (smc->clcsock->sk->sk_policy[0] || - smc->clcsock->sk->sk_policy[1]) ? 1 : 0; + smc->clcsock->sk->sk_policy[1]) ? true : false; } #else static inline bool using_ipsec(struct smc_sock *smc) { - return 0; + return false; } #endif -- 2.13.5
[PATCH net-next 4/5] net/smc: release clcsock from tcp_listen_worker
Closing a listen socket may hit the warning WARN_ON(sock_owned_by_user(sk)) of tcp_close(), if the wake up of the smc_tcp_listen_worker has not yet finished. This patch introduces smc_close_wait_listen_clcsock() making sure the listening internal clcsock has been closed in smc_tcp_listen_work(), before the listening external SMC socket finishes closing. Signed-off-by: Ursula Braun --- net/smc/af_smc.c| 13 - net/smc/smc_close.c | 33 - 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 732a37ddbc21..267e68379110 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -670,6 +670,10 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { + if (isk->clcsock) { + sock_release(isk->clcsock); + isk->clcsock = NULL; + } new_sk->sk_prot->unhash(new_sk); sock_put(new_sk); /* final */ continue; @@ -969,8 +973,15 @@ static void smc_tcp_listen_work(struct work_struct *work) } out: + if (lsmc->clcsock) { + sock_release(lsmc->clcsock); + lsmc->clcsock = NULL; + } release_sock(lsk); - lsk->sk_data_ready(lsk); /* no more listening, wake accept */ + /* no more listening, wake up smc_close_wait_listen_clcsock and +* accept +*/ + lsk->sk_state_change(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_listen */ } diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 4339852a8910..e339c0186dcf 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -19,6 +19,8 @@ #include "smc_cdc.h" #include "smc_close.h" +#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ) + static void smc_close_cleanup_listen(struct sock *parent) { struct sock *sk; @@ -28,6 +30,27 @@ static void smc_close_cleanup_listen(struct sock *parent) smc_close_non_accepted(sk); } +static void smc_close_wait_listen_clcsock(struct smc_sock *smc) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct sock *sk = &smc->sk; + signed long timeout; + + timeout = SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME; + 
add_wait_queue(sk_sleep(sk), &wait); + do { + release_sock(sk); + if (smc->clcsock) + timeout = wait_woken(&wait, TASK_UNINTERRUPTIBLE, +timeout); + sched_annotate_sleep(); + lock_sock(sk); + if (!smc->clcsock) + break; + } while (timeout); + remove_wait_queue(sk_sleep(sk), &wait); +} + /* wait for sndbuf data being transmitted */ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) { @@ -114,7 +137,6 @@ static void smc_close_active_abort(struct smc_sock *smc) break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: - sock_release(smc->clcsock); if (!smc_cdc_rxed_any_close(&smc->conn)) sk->sk_state = SMC_PEERABORTWAIT; else @@ -128,7 +150,6 @@ static void smc_close_active_abort(struct smc_sock *smc) if (!txflags->peer_conn_closed) { /* just SHUTDOWN_SEND done */ sk->sk_state = SMC_PEERABORTWAIT; - sock_release(smc->clcsock); } else { sk->sk_state = SMC_CLOSED; } @@ -136,8 +157,6 @@ static void smc_close_active_abort(struct smc_sock *smc) break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: - if (!txflags->peer_conn_closed) - sock_release(smc->clcsock); sk->sk_state = SMC_CLOSED; break; case SMC_PEERFINCLOSEWAIT: @@ -177,8 +196,6 @@ int smc_close_active(struct smc_sock *smc) switch (sk->sk_state) { case SMC_INIT: sk->sk_state = SMC_CLOSED; - if (smc->smc_listen_work.func) - cancel_work_sync(&smc->smc_listen_work); break; case SMC_LISTEN: sk->sk_state = SMC_CLOSED; @@ -187,11 +204,9 @@ int smc_close_active(struct smc_sock *smc) rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); /* wake up kernel_accept of smc_tcp_listen_worker */ smc->clcsock->sk->sk_data_ready(smc->clcsock->sk); + smc_close_wait_listen_clcsock(smc); } - release_sock(sk); smc_close_cleanup_listen(sk); - cancel_work_sync(&smc->smc_listen_work); - lo
[PATCH net-next 0/5] net/smc: fixes 2018-01-26
Dave, here are some more smc patches. The first 4 patches take care about different aspects of smc socket closing, the 5th patch improves coding style. Thanks, Ursula Gustavo A. R. Silva (1): smc: return booleans instead of integers Ursula Braun (4): net/smc: handle device, port, and QP error events net/smc: smc_poll improvements net/smc: replace sock_put worker by socket refcounting net/smc: release clcsock from tcp_listen_worker net/smc/af_smc.c| 152 ++-- net/smc/smc.h | 5 +- net/smc/smc_cdc.c | 20 +++ net/smc/smc_close.c | 96 + net/smc/smc_close.h | 1 - net/smc/smc_core.c | 6 +-- net/smc/smc_ib.c| 38 - 7 files changed, 191 insertions(+), 127 deletions(-) -- 2.13.5
[PATCH net-next 3/5] net/smc: replace sock_put worker by socket refcounting
Proper socket refcounting makes the sock_put worker obsolete. Signed-off-by: Ursula Braun --- net/smc/af_smc.c| 65 + net/smc/smc.h | 1 - net/smc/smc_cdc.c | 20 + net/smc/smc_close.c | 63 ++- net/smc/smc_close.h | 1 - net/smc/smc_core.c | 6 ++--- 6 files changed, 88 insertions(+), 68 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 90c22a854f28..732a37ddbc21 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -115,7 +115,6 @@ static int smc_release(struct socket *sock) goto out; smc = smc_sk(sk); - sock_hold(sk); if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires * sock lock for child sockets again @@ -124,10 +123,7 @@ static int smc_release(struct socket *sock) else lock_sock(sk); - if (smc->use_fallback) { - sk->sk_state = SMC_CLOSED; - sk->sk_state_change(sk); - } else { + if (!smc->use_fallback) { rc = smc_close_active(smc); sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; @@ -136,20 +132,21 @@ static int smc_release(struct socket *sock) sock_release(smc->clcsock); smc->clcsock = NULL; } + if (smc->use_fallback) { + sock_put(sk); /* passive closing */ + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); + } /* detach socket */ sock_orphan(sk); sock->sk = NULL; - if (smc->use_fallback) { - schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); - } else if (sk->sk_state == SMC_CLOSED) { + if (!smc->use_fallback && sk->sk_state == SMC_CLOSED) smc_conn_free(&smc->conn); - schedule_delayed_work(&smc->sock_put_work, - SMC_CLOSE_SOCK_PUT_DELAY); - } release_sock(sk); - sock_put(sk); + sk->sk_prot->unhash(sk); + sock_put(sk); /* final sock_put */ out: return rc; } @@ -181,7 +178,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); - INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); 
@@ -399,6 +395,8 @@ static int smc_connect_rdma(struct smc_sock *smc) int rc = 0; u8 ibport; + sock_hold(&smc->sk); /* sock put in passive closing */ + if (!tcp_sk(smc->clcsock->sk)->syn_smc) { /* peer has not signalled SMC-capability */ smc->use_fallback = true; @@ -542,6 +540,8 @@ static int smc_connect_rdma(struct smc_sock *smc) mutex_unlock(&smc_create_lgr_pending); smc_conn_free(&smc->conn); out_err: + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ return rc; } @@ -620,7 +620,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); new_sk->sk_prot->unhash(new_sk); - sock_put(new_sk); + sock_put(new_sk); /* final */ *new_smc = NULL; goto out; } @@ -637,7 +637,7 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) { struct smc_sock *par = smc_sk(parent); - sock_hold(sk); + sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); spin_unlock(&par->accept_q_lock); @@ -653,7 +653,7 @@ static void smc_accept_unlink(struct sock *sk) list_del_init(&smc_sk(sk)->accept_q); spin_unlock(&par->accept_q_lock); sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); - sock_put(sk); + sock_put(sk); /* sock_hold in smc_accept_enqueue */ } /* remove a sock from the accept queue to bind it to a new socket created @@ -671,7 +671,7 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); - sock_put(new_sk); + sock_put(new_sk); /* final */ continue; } if (new_sock) @@ -686,14 +686,11 @@ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); - sock_hold(sk); lock_sock(sk); if (!sk->sk_lingertime)
[PATCH net-next 2/5] net/smc: smc_poll improvements
Increase the socket refcount during poll wait. Take the socket lock before checking socket state. For a listening socket return a mask independent of state SMC_ACTIVE and cover errors or closed state as well. Get rid of the accept_q loop in smc_accept_poll(). Signed-off-by: Ursula Braun --- net/smc/af_smc.c | 74 ++-- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cf0e11978b66..90c22a854f28 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1122,21 +1122,15 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, static unsigned int smc_accept_poll(struct sock *parent) { - struct smc_sock *isk; - struct sock *sk; - - lock_sock(parent); - list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) { - sk = (struct sock *)isk; + struct smc_sock *isk = smc_sk(parent); + int mask = 0; - if (sk->sk_state == SMC_ACTIVE) { - release_sock(parent); - return POLLIN | POLLRDNORM; - } - } - release_sock(parent); + spin_lock(&isk->accept_q_lock); + if (!list_empty(&isk->accept_q)) + mask = POLLIN | POLLRDNORM; + spin_unlock(&isk->accept_q_lock); - return 0; + return mask; } static unsigned int smc_poll(struct file *file, struct socket *sock, @@ -1147,9 +1141,15 @@ static unsigned int smc_poll(struct file *file, struct socket *sock, struct smc_sock *smc; int rc; + if (!sk) + return POLLNVAL; + smc = smc_sk(sock->sk); + sock_hold(sk); + lock_sock(sk); if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ + release_sock(sk); mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); /* if non-blocking connect finished ... 
*/ lock_sock(sk); @@ -1161,37 +1161,43 @@ static unsigned int smc_poll(struct file *file, struct socket *sock, rc = smc_connect_rdma(smc); if (rc < 0) mask |= POLLERR; - else - /* success cases including fallback */ - mask |= POLLOUT | POLLWRNORM; + /* success cases including fallback */ + mask |= POLLOUT | POLLWRNORM; } } - release_sock(sk); } else { - sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == SMC_LISTEN) - /* woken up by sk_data_ready in smc_listen_work() */ - mask |= smc_accept_poll(sk); + if (sk->sk_state != SMC_CLOSED) { + release_sock(sk); + sock_poll_wait(file, sk_sleep(sk), wait); + lock_sock(sk); + } if (sk->sk_err) mask |= POLLERR; - if (atomic_read(&smc->conn.sndbuf_space) || - (sk->sk_shutdown & SEND_SHUTDOWN)) { - mask |= POLLOUT | POLLWRNORM; - } else { - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - } - if (atomic_read(&smc->conn.bytes_to_rcv)) - mask |= POLLIN | POLLRDNORM; if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) mask |= POLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLIN | POLLRDNORM | POLLRDHUP; - if (sk->sk_state == SMC_APPCLOSEWAIT1) - mask |= POLLIN; + if (sk->sk_state == SMC_LISTEN) { + /* woken up by sk_data_ready in smc_listen_work() */ + mask = smc_accept_poll(sk); + } else { + if (atomic_read(&smc->conn.sndbuf_space) || + sk->sk_shutdown & SEND_SHUTDOWN) { + mask |= POLLOUT | POLLWRNORM; + } else { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + if (atomic_read(&smc->conn.bytes_to_rcv)) + mask |= POLLIN | POLLRDNORM; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM | POLLRDHUP; + if (sk->sk_state == SMC_APPCLOSEWAIT1
[PATCH net-next 1/5] net/smc: handle device, port, and QP error events
RoCE device changes cause an IB event, processed in the global event handler for the ROCE device. Problems for a certain Queue Pair cause a QP event, processed in the QP event handler for this QP. Among those events are port errors and other fatal device errors. All link groups using such a port or device must be terminated in those cases. Signed-off-by: Ursula Braun --- net/smc/smc_ib.c | 38 +- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 90f1a7f9085c..2a8957bd6d38 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -141,6 +141,17 @@ int smc_ib_ready_link(struct smc_link *lnk) return rc; } +static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *l; + + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) + smc_lgr_terminate(lgr); + } +} + /* process context wrapper for might_sleep smc_ib_remember_port_attr */ static void smc_ib_port_event_work(struct work_struct *work) { @@ -151,6 +162,8 @@ static void smc_ib_port_event_work(struct work_struct *work) for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); + if (!smc_ib_port_active(smcibdev, port_idx + 1)) + smc_ib_port_terminate(smcibdev, port_idx + 1); } } @@ -165,15 +178,7 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, switch (ibevent->event) { case IB_EVENT_PORT_ERR: - port_idx = ibevent->element.port_num - 1; - set_bit(port_idx, &smcibdev->port_event_mask); - schedule_work(&smcibdev->port_event_work); - /* fall through */ case IB_EVENT_DEVICE_FATAL: - /* tbd in follow-on patch: -* abnormal close of corresponding connections -*/ - break; case IB_EVENT_PORT_ACTIVE: port_idx = ibevent->element.port_num - 1; set_bit(port_idx, 
&smcibdev->port_event_mask); @@ -186,7 +191,8 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, void smc_ib_dealloc_protection_domain(struct smc_link *lnk) { - ib_dealloc_pd(lnk->roce_pd); + if (lnk->roce_pd) + ib_dealloc_pd(lnk->roce_pd); lnk->roce_pd = NULL; } @@ -203,14 +209,18 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { + struct smc_ib_device *smcibdev = + (struct smc_ib_device *)ibevent->device; + u8 port_idx; + switch (ibevent->event) { case IB_EVENT_DEVICE_FATAL: case IB_EVENT_GID_CHANGE: case IB_EVENT_PORT_ERR: case IB_EVENT_QP_ACCESS_ERR: - /* tbd in follow-on patch: -* abnormal close of corresponding connections -*/ + port_idx = ibevent->element.port_num - 1; + set_bit(port_idx, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); break; default: break; @@ -219,7 +229,8 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - ib_destroy_qp(lnk->roce_qp); + if (lnk->roce_qp) + ib_destroy_qp(lnk->roce_qp); lnk->roce_qp = NULL; } @@ -462,6 +473,7 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) { if (!smcibdev->initialized) return; + smcibdev->initialized = 0; smc_wr_remove_dev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); ib_destroy_cq(smcibdev->roce_cq_recv); -- 2.13.5
Re: [PATCH net-next v1] samples/bpf: Partially fixes the bpf.o build
On 26/01/2018 03:16, Alexei Starovoitov wrote: > On Fri, Jan 26, 2018 at 01:39:30AM +0100, Mickaël Salaün wrote: >> Do not build lib/bpf/bpf.o with this Makefile but use the one from the >> library directory. This avoid making a buggy bpf.o file (e.g. missing >> symbols). > > could you provide an example? > What symbols will be missing? > I don't think there is an issue with existing Makefile. You can run these commands: make -C samples/bpf; nm tools/lib/bpf/bpf.o > a; make -C tools/lib/bpf; nm tools/lib/bpf/bpf.o > b; diff -u a b Symbols like bzero and sys_bpf are missing with the samples/bpf Makefile, which makes the bpf.o shrink from 25K to 7K. > >> This patch is useful if some code (e.g. Landlock tests) needs both the >> bpf.o (from tools/lib/bpf) and the bpf_load.o (from samples/bpf). > > is that some future patches? Yes, I'll send them next week. > > we're trying to move everything form samples/bpf/ into selftests/bpf/ > and convert to use libbpf.a instead of obsolete bpf_load.c > Please use this approach for landlock as well. Ok, it should be better with this lib. > >> Signed-off-by: Mickaël Salaün >> Cc: Alexei Starovoitov >> Cc: Daniel Borkmann >> --- >> >> This is not a complet fix because the call to multi_depend with >> $(host-cmulti) from scripts/Makefile.host force the build of bpf.o >> anyway. I'm not sure how to completely avoid this automatic build >> though. 
>> --- >> samples/bpf/Makefile | 5 - >> 1 file changed, 4 insertions(+), 1 deletion(-) >> >> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile >> index 7f61a3d57fa7..64335bb94f9f 100644 >> --- a/samples/bpf/Makefile >> +++ b/samples/bpf/Makefile >> @@ -201,13 +201,16 @@ CLANG_ARCH_ARGS = -target $(ARCH) >> endif >> >> # Trick to allow make to be run from this directory >> -all: >> +all: $(LIBBPF) >> $(MAKE) -C ../../ $(CURDIR)/ >> >> clean: >> $(MAKE) -C ../../ M=$(CURDIR) clean >> @rm -f *~ >> >> +$(LIBBPF): FORCE >> +$(MAKE) -C $(dir $@) $(notdir $@) >> + >> $(obj)/syscall_nrs.s: $(src)/syscall_nrs.c >> $(call if_changed_dep,cc_s_c) >> >> -- >> 2.15.1 >> > signature.asc Description: OpenPGP digital signature
Re: [PATCH net-next 0/3 V1] rtnetlink: enable IFLA_IF_NETNSID for RTM_{DEL,SET}LINK
On Fri, 26 Jan 2018 00:34:51 +0100, Nicolas Dichtel wrote: > Why meaningful? The user knows that the answer is like if if was done in > another > netns. It enables to have only one netlink socket instead of one per netns. > But > the code using it will be the same. Because you can't use it to query the linked interface. You can't even use it as an opaque value to track interfaces (netnsid+ifindex) because netnsids are not unique across net name spaces. You can easily have two interfaces that have all the ifindex, ifname, netnsid (and basically everything else) identical but being completely different interfaces. That's really not helpful. > I fear that with your approach, it will results to a lot of complexity in the > kernel. The complexity is (at least partly) already there. It's an inevitable result of the design decision to have relative identifiers. I agree that we should think about how to make this easy to implement. I like your idea of doing this somehow generically. Perhaps it's possible to do while keeping the netnsids valid in the caller's netns? > What is really missing for me, is a way to get a fd from an nsid. The user > should be able to call RTM_GETNSID with an fd and a nsid and the kernel > performs > the needed operations so that the fd points to the corresponding netns. That's what I was missing, too. I even looked into implementing it. But opening a fd on behalf of the process and returning it over netlink is a wrong thing to do. Netlink messages can get lost. Then you have a fd leak you can do nothing about. Given that we have netnsids used for so much stuff already (like NETLINK_LISTEN_ALL_NSID) you need to track them anyway. And if you need to track them, why bother with another identifier? It would be better if netnsid can be used universally for anything. Then there will be no need for the conversion. Jiri
[PATCH] tcp: release sk_frag.page in tcp_disconnect
socket can be disconnected and gets transformed back to a listening socket, if sk_frag.page is not released, which will be cloned into a new socket by sk_clone_lock, but the reference count of this page is increased, leading to a use after free or double free issue Signed-off-by: Li RongQing Cc: Eric Dumazet --- net/ipv4/tcp.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f08eebe60446..73f068406519 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2431,6 +2431,12 @@ int tcp_disconnect(struct sock *sk, int flags) WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; + } + sk->sk_error_report(sk); return err; } -- 2.11.0
[PATCH net-next] sfc: add suffix to large constant in ptp
Fixes: 1280c0f8aafc ("sfc: support second + quarter ns time format for receive datapath") Reported-by: kbuild test robot Signed-off-by: Bert Kenward --- drivers/net/ethernet/sfc/ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 433d29d6bc95..f1cc2ed76029 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -643,7 +643,7 @@ static int efx_ptp_get_attributes(struct efx_nic *efx) case MC_CMD_PTP_OUT_GET_ATTRIBUTES_SECONDS_QTR_NANOSECONDS: ptp->ns_to_nic_time = efx_ptp_ns_to_s_qns; ptp->nic_to_kernel_time = efx_ptp_s_qns_to_ktime_correction; - ptp->nic_time.minor_max = 40; + ptp->nic_time.minor_max = 40UL; ptp->nic_time.sync_event_minor_shift = 24; break; default: -- 2.13.6
[PATCH 2/3] Revert "e1000e: Separate signaling for link check/link up"
This reverts commit 19110cfbb34d4af0cdfe14cd243f3b09dc95b013. This reverts commit 4110e02eb45ea447ec6f5459c9934de0a273fb91. ... because they cause an extra 2s delay for the link to come up when autoneg is off. After reverting, the race condition described in the log of commit 19110cfbb34d ("e1000e: Separate signaling for link check/link up") is reintroduced. It may still be triggered by LSC events but this should not result in link flap. It may no longer be triggered by RXO events because commit 4aea7a5c5e94 ("e1000e: Avoid receiver overrun interrupt bursts") restored reading icr in the Other handler. As discussed, the driver should be in "maintenance mode". In the interest of stability, revert to the original code as much as possible instead of a half-baked solution. Link: https://www.spinics.net/lists/netdev/msg479923.html Signed-off-by: Benjamin Poirier --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 11 +++ drivers/net/ethernet/intel/e1000e/mac.c | 11 +++ drivers/net/ethernet/intel/e1000e/netdev.c | 2 +- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 31277d3bb7dc..d6d4ed7acf03 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1367,9 +1367,6 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) * Checks to see of the link status of the hardware has changed. If a * change in link status has been detected, then we read the PHY registers * to get the current speed/duplex if link exists. - * - * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link - * up). **/ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) { @@ -1385,7 +1382,7 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) * Change or Rx Sequence Error interrupt. 
*/ if (!mac->get_link_status) - return 1; + return 0; /* First we want to see if the MII Status Register reports * link. If so, then we want to get the current speed/duplex @@ -1616,12 +1613,10 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) * different link partner. */ ret_val = e1000e_config_fc_after_link_up(hw); - if (ret_val) { + if (ret_val) e_dbg("Error configuring flow control\n"); - return ret_val; - } - return 1; + return ret_val; } static s32 e1000_get_variants_ich8lan(struct e1000_adapter *adapter) diff --git a/drivers/net/ethernet/intel/e1000e/mac.c b/drivers/net/ethernet/intel/e1000e/mac.c index f457c5703d0c..b322011ec282 100644 --- a/drivers/net/ethernet/intel/e1000e/mac.c +++ b/drivers/net/ethernet/intel/e1000e/mac.c @@ -410,9 +410,6 @@ void e1000e_clear_hw_cntrs_base(struct e1000_hw *hw) * Checks to see of the link status of the hardware has changed. If a * change in link status has been detected, then we read the PHY registers * to get the current speed/duplex if link exists. - * - * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link - * up). **/ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) { @@ -426,7 +423,7 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) * Change or Rx Sequence Error interrupt. */ if (!mac->get_link_status) - return 1; + return 0; /* First we want to see if the MII Status Register reports * link. If so, then we want to get the current speed/duplex @@ -464,12 +461,10 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) * different link partner. 
*/ ret_val = e1000e_config_fc_after_link_up(hw); - if (ret_val) { + if (ret_val) e_dbg("Error configuring flow control\n"); - return ret_val; - } - return 1; + return ret_val; } /** diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 398e940436f8..ed103b9a8d3a 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5091,7 +5091,7 @@ static bool e1000e_has_link(struct e1000_adapter *adapter) case e1000_media_type_copper: if (hw->mac.get_link_status) { ret_val = hw->mac.ops.check_for_link(hw); - link_active = ret_val > 0; + link_active = !hw->mac.get_link_status; } else { link_active = true; } -- 2.15.1
[PATCH 1/3] Partial revert "e1000e: Avoid receiver overrun interrupt bursts"
This reverts commit 4aea7a5c5e940c1723add439f4088844cd26196d. We keep the fix for the first part of the problem (1) described in the log of that commit however we use the implementation of e1000_msix_other() from before commit 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt"). We remove the fix for the second part of the problem (2). Bursts of "Other" interrupts may once again occur during rxo (receive overflow) traffic conditions. This is deemed acceptable in the interest of reverting driver code back to its previous state. The e1000e driver should be in "maintenance" mode and we want to avoid unforeseen fallout from changes that are not strictly necessary. Link: https://www.spinics.net/lists/netdev/msg480675.html Signed-off-by: Benjamin Poirier --- drivers/net/ethernet/intel/e1000e/defines.h | 1 - drivers/net/ethernet/intel/e1000e/netdev.c | 32 +++-- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index afb7ebe20b24..0641c0098738 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -398,7 +398,6 @@ #define E1000_ICR_LSC 0x0004 /* Link Status Change */ #define E1000_ICR_RXSEQ 0x0008 /* Rx sequence error */ #define E1000_ICR_RXDMT00x0010 /* Rx desc min. 
threshold (0) */ -#define E1000_ICR_RXO 0x0040 /* Receiver Overrun */ #define E1000_ICR_RXT0 0x0080 /* Rx timer intr (ring 0) */ #define E1000_ICR_ECCER 0x0040 /* Uncorrectable ECC Error */ /* If this bit asserted, the driver should claim the interrupt */ diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 9f18d39bdc8f..398e940436f8 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -1914,30 +1914,23 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data) struct net_device *netdev = data; struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; - u32 icr; - bool enable = true; - - icr = er32(ICR); - if (icr & E1000_ICR_RXO) { - ew32(ICR, E1000_ICR_RXO); - enable = false; - /* napi poll will re-enable Other, make sure it runs */ - if (napi_schedule_prep(&adapter->napi)) { - adapter->total_rx_bytes = 0; - adapter->total_rx_packets = 0; - __napi_schedule(&adapter->napi); - } - } - if (icr & E1000_ICR_LSC) { - ew32(ICR, E1000_ICR_LSC); + u32 icr = er32(ICR); + + if (icr & adapter->eiac_mask) + ew32(ICS, (icr & adapter->eiac_mask)); + + if (icr & E1000_ICR_OTHER) { + if (!(icr & E1000_ICR_LSC)) + goto no_link_interrupt; hw->mac.get_link_status = true; /* guard against interrupt when we're going down */ if (!test_bit(__E1000_DOWN, &adapter->state)) mod_timer(&adapter->watchdog_timer, jiffies + 1); } - if (enable && !test_bit(__E1000_DOWN, &adapter->state)) - ew32(IMS, E1000_IMS_OTHER); +no_link_interrupt: + if (!test_bit(__E1000_DOWN, &adapter->state)) + ew32(IMS, E1000_IMS_LSC | E1000_IMS_OTHER); return IRQ_HANDLED; } @@ -2707,8 +2700,7 @@ static int e1000e_poll(struct napi_struct *napi, int weight) napi_complete_done(napi, work_done); if (!test_bit(__E1000_DOWN, &adapter->state)) { if (adapter->msix_entries) - ew32(IMS, adapter->rx_ring->ims_val | -E1000_IMS_OTHER); + ew32(IMS, adapter->rx_ring->ims_val); 
else e1000_irq_enable(adapter); } -- 2.15.1
[PATCH 3/3] Revert "e1000e: Do not read ICR in Other interrupt"
This reverts commit 16ecba59bc333d6282ee057fb02339f77a880beb. It was reported that emulated e1000e devices in vmware esxi 6.5 Build 7526125 do not link up after commit 4aea7a5c5e94 ("e1000e: Avoid receiver overrun interrupt bursts"). Some tracing shows that after e1000e_trigger_lsc() is called, ICR reads out as 0x0 in e1000_msix_other() on emulated e1000e devices. In comparison, on real e1000e 82574 hardware, icr=0x8004 (_INT_ASSERTED | _LSC) in the same situation. Some experimentation showed that this flaw in vmware e1000e emulation can be worked around by not setting Other in EIAC. This is how it was before commit 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt"). Since the ICR read in the Other interrupt handler has already been restored, this patch effectively reverts the remainder of commit 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt"). Fixes: 4aea7a5c5e94 ("e1000e: Avoid receiver overrun interrupt bursts") Signed-off-by: Benjamin Poirier --- drivers/net/ethernet/intel/e1000e/netdev.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index ed103b9a8d3a..fffc1f0e3895 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -1916,6 +1916,13 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data) struct e1000_hw *hw = &adapter->hw; u32 icr = er32(ICR); + /* Certain events (such as RXO) which trigger Other do not set +* INT_ASSERTED. In that case, read to clear of icr does not take +* place. 
+*/ + if (!(icr & E1000_ICR_INT_ASSERTED)) + ew32(ICR, E1000_ICR_OTHER); + if (icr & adapter->eiac_mask) ew32(ICS, (icr & adapter->eiac_mask)); @@ -2033,7 +2040,6 @@ static void e1000_configure_msix(struct e1000_adapter *adapter) hw->hw_addr + E1000_EITR_82574(vector)); else writel(1, hw->hw_addr + E1000_EITR_82574(vector)); - adapter->eiac_mask |= E1000_IMS_OTHER; /* Cause Tx interrupts on every write back */ ivar |= BIT(31); @@ -2258,7 +2264,7 @@ static void e1000_irq_enable(struct e1000_adapter *adapter) if (adapter->msix_entries) { ew32(EIAC_82574, adapter->eiac_mask & E1000_EIAC_MASK_82574); - ew32(IMS, adapter->eiac_mask | E1000_IMS_LSC); + ew32(IMS, adapter->eiac_mask | E1000_IMS_OTHER | E1000_IMS_LSC); } else if (hw->mac.type >= e1000_pch_lpt) { ew32(IMS, IMS_ENABLE_MASK | E1000_IMS_ECCER); } else { -- 2.15.1
[PATCH 0/3] e1000e: Revert interrupt handling changes
As discussed in the thread "[RFC PATCH] e1000e: Remove Other from EIAC.", https://www.spinics.net/lists/netdev/msg479311.html The following list of commits was considered: 4d432f67ff00 e1000e: Remove unreachable code (v4.5-rc1) 16ecba59bc33 e1000e: Do not read ICR in Other interrupt (v4.5-rc1) a61cfe4ffad7 e1000e: Do not write lsc to ics in msi-x mode (v4.5-rc1) 0a8047ac68e5 e1000e: Fix msi-x interrupt automask (v4.5-rc1) 19110cfbb34d e1000e: Separate signaling for link check/link up (v4.15-rc1) 4aea7a5c5e94 e1000e: Avoid receiver overrun interrupt bursts (v4.15-rc1) 4110e02eb45e e1000e: Fix e1000_check_for_copper_link_ich8lan return value. (v4.15-rc8) There has been a slew of regressions due to unforeseen consequences (receive overflow triggers Other, vmware's emulated e1000e) and programming mistakes (4110e02eb45e). Since the e1000e driver is supposed to be in maintenance mode, this patch series revisits the above changes to prune them down. After this series, the remaining differences related to how interrupts were handled at commit 4d432f67ff00 ("e1000e: Remove unreachable code", v4.5-rc1) are: * the changes in commit 0a8047ac68e5 ("e1000e: Fix msi-x interrupt automask", v4.5-rc1) are preserved. * we manually clear Other from icr in e1000_msix_other(). We try to go back to a long lost time when things were simple and drivers ran smoothly. Benjamin Poirier (3): Partial revert "e1000e: Avoid receiver overrun interrupt bursts" Revert "e1000e: Separate signaling for link check/link up" Revert "e1000e: Do not read ICR in Other interrupt" drivers/net/ethernet/intel/e1000e/defines.h | 1 - drivers/net/ethernet/intel/e1000e/ich8lan.c | 11 ++-- drivers/net/ethernet/intel/e1000e/mac.c | 11 ++-- drivers/net/ethernet/intel/e1000e/netdev.c | 44 ++--- 4 files changed, 27 insertions(+), 40 deletions(-) -- 2.15.1
Re: [PATCH net-next 0/3 V1] rtnetlink: enable IFLA_IF_NETNSID for RTM_{DEL,SET}LINK
Le 26/01/2018 à 09:36, Jiri Benc a écrit : > On Fri, 26 Jan 2018 00:34:51 +0100, Nicolas Dichtel wrote: >> Why meaningful? The user knows that the answer is like if if was done in >> another >> netns. It enables to have only one netlink socket instead of one per netns. >> But >> the code using it will be the same. > > Because you can't use it to query the linked interface. You can't even > use it as an opaque value to track interfaces (netnsid+ifindex) because > netnsids are not unique across net name spaces. You can easily have two > interfaces that have all the ifindex, ifname, netnsid (and basically > everything else) identical but being completely different interfaces. Yes, the user have to map those info correctly. And this complexifies the (user) code a lot. > That's really not helpful. > >> I fear that with your approach, it will results to a lot of complexity in the >> kernel. > > The complexity is (at least partly) already there. It's an inevitable > result of the design decision to have relative identifiers. Yes, you're right. My approach moves the complexity to the user, which make this feature hard to use. > > I agree that we should think about how to make this easy to implement. > I like your idea of doing this somehow generically. Perhaps it's > possible to do while keeping the netnsids valid in the caller's netns? Yes. I agree that it will be a lot easier to use if the conversion is done in the kernel. And having a generic mechanism will also help a lot to use it. > >> What is really missing for me, is a way to get a fd from an nsid. The user >> should be able to call RTM_GETNSID with an fd and a nsid and the kernel >> performs >> the needed operations so that the fd points to the corresponding netns. > > That's what I was missing, too. I even looked into implementing it. But > opening a fd on behalf of the process and returning it over netlink is a > wrong thing to do. Netlink messages can get lost. 
Then you have a fd > leak you can do nothing about. Yes, I also looked at this ;-) > > Given that we have netnsids used for so much stuff already (like > NETLINK_LISTEN_ALL_NSID) you need to track them anyway. And if you need > to track them, why bother with another identifier? It would be better > if netnsid can be used universally for anything. Then there will be no > need for the conversion. I like this idea a lot. So the missing part is a setns() using the nsid ;-) Regards, Nicolas
pull-request: can-next 2018-01-26
Hello David, this is a pull request for net-next/master consisting of 3 patches. The first two patches target the CAN documentation. The first is by me and fixes the pointer to the location of the fsl,mpc5200-mscan node in the mpc5200 documentation. The second patch is by Robert Schwebel and it converts the plain ASCII documentation to restructured text. The third patch is by Fabrizio Castro and adds the r8a774[35] support to the rcar_can dt-bindings documentation. regards, Marc --- The following changes since commit 9515a2e082f91457db0ecff4b65371d0fb5d9aad: net/ipv4: Allow send to local broadcast from a socket bound to a VRF (2018-01-25 21:51:31 -0500) are available in the Git repository at: ssh://g...@gitolite.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next.git tags/linux-can-next-for-4.16-20180126 for you to fetch changes up to 216bf2f490c214b8a0702b52cc957138ba24bc3f: dt-bindings: can: rcar_can: document r8a774[35] can support (2018-01-26 10:47:17 +0100) linux-can-next-for-4.16-20180126 Fabrizio Castro (1): dt-bindings: can: rcar_can: document r8a774[35] can support Marc Kleine-Budde (1): Documentation/devicetree: mpc5200.txt: fix pointer to location of fsl,mpc5200-mscan node Robert Schwebel (1): can: migrate documentation to restructured text .../devicetree/bindings/net/can/rcar_can.txt |7 +- .../devicetree/bindings/powerpc/fsl/mpc5200.txt|2 +- Documentation/networking/00-INDEX |2 - Documentation/networking/can.rst | 1437 Documentation/networking/can.txt | 1308 -- Documentation/networking/index.rst |1 + MAINTAINERS|2 +- drivers/net/can/dev.c |2 +- drivers/net/can/vcan.c |2 +- net/can/Kconfig|2 +- 10 files changed, 1448 insertions(+), 1317 deletions(-) create mode 100644 Documentation/networking/can.rst delete mode 100644 Documentation/networking/can.txt -- Pengutronix e.K. 
| Marc Kleine-Budde | Industrial Linux Solutions| Phone: +49-231-2826-924 | Vertretung West/Dortmund | Fax: +49-5121-206917- | Amtsgericht Hildesheim, HRA 2686 | http://www.pengutronix.de | signature.asc Description: OpenPGP digital signature
Re: [PATCH 1/8] dt-bindings: can: rcar_can: document r8a774[35] can support
On 01/25/2018 09:53 AM, Marc Kleine-Budde wrote: > On 01/24/2018 06:22 PM, Fabrizio Castro wrote: >> thank you for Acking the patch, just wondering if this patch has any >> chance to end up in v4.16? > > I can take this via the linux-can tree, if no one else takes it. Done - included in the latest pull request. Marc -- Pengutronix e.K. | Marc Kleine-Budde | Industrial Linux Solutions| Phone: +49-231-2826-924 | Vertretung West/Dortmund | Fax: +49-5121-206917- | Amtsgericht Hildesheim, HRA 2686 | http://www.pengutronix.de | signature.asc Description: OpenPGP digital signature
[PATCH] e1000e: allocate ring descriptors with dma_zalloc_coherent
Descriptor rings were not initialized at zero when allocated When area contained garbage data, it caused skb_over_panic in e1000_clean_rx_irq (if data had E1000_RXD_STAT_DD bit set) This patch makes use of dma_zalloc_coherent to make sure the ring is memset at 0 to prevent the area from containing garbage. Following is the signature of the panic: IODDR0@0.0: skbuff: skb_over_panic: text:80407b20 len:64010 put:64010 head:ab46d800 data:ab46d842 tail:0xab47d24c end:0xab46df40 dev:eth0 IODDR0@0.0: BUG: failure at net/core/skbuff.c:105/skb_panic()! IODDR0@0.0: Kernel panic - not syncing: BUG! IODDR0@0.0: IODDR0@0.0: Process swapper/0 (pid: 0, threadinfo=81728000, task=8173cc00 ,cpu: 0) IODDR0@0.0: SP = <815a1c0c> IODDR0@0.0: Stack: 0001 IODDR0@0.0: b2d89800 815e33ac IODDR0@0.0: ea73c040 0001 IODDR0@0.0: 60040003 fa0a IODDR0@0.0: 0002 IODDR0@0.0: IODDR0@0.0: 804540c0 815a1c70 IODDR0@0.0: b2744000 602ac070 IODDR0@0.0: 815a1c44 b2d89800 IODDR0@0.0: 8173cc00 815a1c08 IODDR0@0.0: IODDR0@0.0: 0006 IODDR0@0.0: 815a1b50 IODDR0@0.0: 80079434 0001 IODDR0@0.0: ab46df40 b2744000 IODDR0@0.0: b2d89800 IODDR0@0.0: IODDR0@0.0: fa0a 8045745c IODDR0@0.0: 815a1c88 fa0a IODDR0@0.0: 80407b20 b2789f80 IODDR0@0.0: 0005 80407b20 IODDR0@0.0: IODDR0@0.0: IODDR0@0.0: Call Trace: IODDR0@0.0: [<804540bc>] skb_panic+0xa4/0xa8 IODDR0@0.0: [<80079430>] console_unlock+0x2f8/0x6d0 IODDR0@0.0: [<80457458>] skb_put+0xa0/0xc0 IODDR0@0.0: [<80407b1c>] e1000_clean_rx_irq+0x2dc/0x3e8 IODDR0@0.0: [<80407b1c>] e1000_clean_rx_irq+0x2dc/0x3e8 IODDR0@0.0: [<804079c8>] e1000_clean_rx_irq+0x188/0x3e8 IODDR0@0.0: [<80407b1c>] e1000_clean_rx_irq+0x2dc/0x3e8 IODDR0@0.0: [<80468b48>] __dev_kfree_skb_any+0x88/0xa8 IODDR0@0.0: [<804101ac>] e1000e_poll+0x94/0x288 IODDR0@0.0: [<8046e9d4>] net_rx_action+0x19c/0x4e8 IODDR0@0.0: ... IODDR0@0.0: Maximum depth to print reached. Use kstack= To specify a custom value (where 0 means to display the full backtrace) IODDR0@0.0: ---[ end Kernel panic - not syncing: BUG! 
Signed-off-by: Pierre-Yves Kerbrat Signed-off-by: Marius Gligor --- drivers/net/ethernet/intel/e1000e/netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 1298b69..26121ed 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -2333,7 +2333,7 @@ static int e1000_alloc_ring_dma(struct e1000_adapter *adapter, { struct pci_dev *pdev = adapter->pdev; - ring->desc = dma_alloc_coherent(&pdev->dev, ring->size, &ring->dma, + ring->desc = dma_zalloc_coherent(&pdev->dev, ring->size, &ring->dma, GFP_KERNEL); if (!ring->desc) return -ENOMEM; -- 1.8.3.1
[PATCH v2] net: macb: Handle HRESP error
From: Harini Katakam Handle HRESP error by doing a SW reset of RX and TX and re-initializing the descriptors, RX and TX queue pointers. Signed-off-by: Harini Katakam Signed-off-by: Michal Simek --- v2: Rebased on top of latest net-next and reinitialized all rx queues. drivers/net/ethernet/cadence/macb.h | 3 ++ drivers/net/ethernet/cadence/macb_main.c | 59 +--- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index c50c5ec..8665982 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -13,6 +13,7 @@ #include #include #include +#include #if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) || defined(CONFIG_MACB_USE_HWSTAMP) #define MACB_EXT_DESC @@ -1200,6 +1201,8 @@ struct macb { struct ethtool_rx_fs_list rx_fs_list; spinlock_t rx_fs_lock; unsigned int max_tuples; + + struct tasklet_struct hresp_err_tasklet; }; #ifdef CONFIG_MACB_USE_HWSTAMP diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 234667e..e84afcf 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1258,6 +1258,57 @@ static int macb_poll(struct napi_struct *napi, int budget) return work_done; } +static void macb_hresp_error_task(unsigned long data) +{ + struct macb *bp = (struct macb *)data; + struct net_device *dev = bp->dev; + struct macb_queue *queue = bp->queues; + unsigned int q; + u32 ctrl; + + for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { + queue_writel(queue, IDR, MACB_RX_INT_FLAGS | +MACB_TX_INT_FLAGS | +MACB_BIT(HRESP)); + } + ctrl = macb_readl(bp, NCR); + ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE)); + macb_writel(bp, NCR, ctrl); + + netif_tx_stop_all_queues(dev); + netif_carrier_off(dev); + + bp->macbgem_ops.mog_init_rings(bp); + + /* Initialize TX and RX buffers */ + for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { + queue_writel(queue, 
RBQP, lower_32_bits(queue->rx_ring_dma)); +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT + if (bp->hw_dma_cap & HW_DMA_CAP_64B) + queue_writel(queue, RBQPH, +upper_32_bits(queue->rx_ring_dma)); +#endif + queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma)); +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT + if (bp->hw_dma_cap & HW_DMA_CAP_64B) + queue_writel(queue, TBQPH, +upper_32_bits(queue->tx_ring_dma)); +#endif + + /* Enable interrupts */ + queue_writel(queue, IER, +MACB_RX_INT_FLAGS | +MACB_TX_INT_FLAGS | +MACB_BIT(HRESP)); + } + + ctrl |= MACB_BIT(RE) | MACB_BIT(TE); + macb_writel(bp, NCR, ctrl); + + netif_carrier_on(dev); + netif_tx_start_all_queues(dev); +} + static irqreturn_t macb_interrupt(int irq, void *dev_id) { struct macb_queue *queue = dev_id; @@ -1347,10 +1398,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id) } if (status & MACB_BIT(HRESP)) { - /* TODO: Reset the hardware, and maybe move the -* netdev_err to a lower-priority context as well -* (work queue?) -*/ + tasklet_schedule(&bp->hresp_err_tasklet); netdev_err(dev, "DMA bus error: HRESP not OK\n"); if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) @@ -3937,6 +3985,9 @@ static int macb_probe(struct platform_device *pdev) goto err_out_unregister_mdio; } + tasklet_init(&bp->hresp_err_tasklet, macb_hresp_error_task, +(unsigned long)bp); + phy_attached_info(phydev); netdev_info(dev, "Cadence %s rev 0x%08x at 0x%08lx irq %d (%pM)\n", -- 2.7.4 This email and any attachments are intended for the sole use of the named recipient(s) and contain(s) confidential information that may be proprietary, privileged or copyrighted under applicable law. If you are not the intended recipient, do not read, copy, or forward this email message or any attachments. Delete this email message and any attachments immediately.
Re: [regresssion 4.15] Userspace compilation broken by uapi/linux/if_ether.h update
On Thu, Jan 25, 2018 at 11:21:34PM +0100, Hauke Mehrtens wrote: > On 01/25/2018 03:58 PM, Guillaume Nault wrote: > > Hi, > > > > Commit 6926e041a892 ("uapi/if_ether.h: prevent redefinition of struct > > ethhdr"), > > can break compilation of userspace programs (in my case, accel-ppp > > (https://accel-ppp.org)). > > > > This happens for userspace programs that end up including > > linux/if_ether.h, netinet/in.h and linux/in.h in this order: > > > > # cat test_ifether.c > > #include > > #include > > #include > > > > int main(void) > > { > > return 0; > > } > > > > # gcc test_ifether.c > > In file included from test_ifether.c:2:0: > > /usr/include/linux/in.h:29:3: error: redeclaration of enumerator > > ‘IPPROTO_IP’ > >IPPROTO_IP = 0, /* Dummy protocol for TCP */ > >^ > > /usr/include/netinet/in.h:42:5: note: previous definition of ‘IPPROTO_IP’ > > was here > > IPPROTO_IP = 0,/* Dummy protocol for TCP. */ > > ^~ > > > > > > Now that linux/libc-compat.h is included in linux/if_ether.h, it is > > processed before netinet/in.h. Therefore, it sets the relevant > > __UAPI_DEF_* guards to 1 (as _NETINET_IN_H isn't yet defined). > > Then netinet/in.h is included, followed by linux/in.h. The later > > doesn't realise that what it defines has already been included by > > netinet/in.h because the __UAPI_DEF_* guards were set too early. > > > > Of course the situation is a bit more complicated on real projects, as > > these files aren't included directly. For example, in accel-ppp, the > > PPPoE module (accel-ppp/accel-pppd/ctrl/pppoe/pppoe.c) uses > > #include/* includes linux/if_ether.h */ > > #include /* includes netinet/in.h */ > > #include /* (through pppoe.h), includes linux/in.h */ > > > > > > I don't have a satisfying solution for now, but I'd really like it if > > we could avoid shipping a kernel which forces userspace to play with > > include files ordering to keep compiling. 
> > > Hi, > This is about this commit: > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6926e041a8920c8ec27e4e155efa760aa01551fd > > One option would be to move this into include/uapi/linux/if_ether.h and > remove the include for libc-compat.h: > #ifndef __UAPI_DEF_ETHHDR > #define __UAPI_DEF_ETHHDR 1 > #endif > > This will only work if netinet/if_ether.h is included before > linux/if_ether.h, but I think this is very likely. > I don't see what makes it likely. That's not directly related to your point, but for example, glibc guarantees the opposite as it includes linux/if_ether.h at the beginning of netinet/if_ether.h. > I think we can do this because we do not need some special libc handling > like it is done for other symbols as __UAPI_DEF_ETHHDR is currently only > needed by musl and not by glibc. > That's ok for me as long as existing projects keep compiling. But all __UAPI_DEF_* are currently centralised in libc-compat.h. Adding __UAPI_DEF_ETHHDR in if_ether.h looks like defeating the purpose of libc-compat.h and I wonder if that'd be accepted. Maybe with a different name. In any case, we're really late in the release cycle. If more discussion is needed, it's probably better to revert and take time to work on a solution for the next release.
[PATCH net-next 0/2] net: hns3: add support ethtool_ops.{set|get}_coalesce for VF
This patch-set adds ethtool_ops.{get|set}_coalesce to VF and fix one related bug. HNS3 PF and VF driver use the common enet layer, as the ethtool_ops.{get|set}_coalesce to PF have upstreamed, just need add the ops to hns3vf_ethtool_ops. [Patch 1/2] fix a related bug for the VF ethtool_ops.{set| get}_coalesce. Fuyun Liang (2): net: hns3: add get/set_coalesce support to VF net: hns3: add int_gl_idx setup for VF drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c| 2 ++ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c| 8 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 5 + 3 files changed, 15 insertions(+) -- 1.9.1
[PATCH net-next 1/2] net: hns3: add get/set_coalesce support to VF
From: Fuyun Liang This patch adds ethtool_ops.get/set_coalesce support to VF. Since PF and VF share the same get/set_coalesce interface, we only need to set hns3_get/set_coalesce to the ethtool_ops when supporting get/set_coalesce for VF. Signed-off-by: Fuyun Liang Signed-off-by: Peng Li --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 7410205..b034c7f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1109,6 +1109,8 @@ static int hns3_set_phys_id(struct net_device *netdev, .set_rxfh = hns3_set_rss, .get_link_ksettings = hns3_get_link_ksettings, .get_channels = hns3_get_channels, + .get_coalesce = hns3_get_coalesce, + .set_coalesce = hns3_set_coalesce, }; static const struct ethtool_ops hns3_ethtool_ops = { -- 1.9.1
[PATCH net-next 2/2] net: hns3: add int_gl_idx setup for VF
From: Fuyun Liang Just like PF, if the int_gl_idx of VF does not be set, the default interrupt coalesce index of VF is 0. But it should be GL1 for TX queues and GL0 for RX queues. This patch adds the int_gl_idx setup for VF. Fixes: 200ecda42598 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility Layer) Support") Signed-off-by: Fuyun Liang Signed-off-by: Peng Li --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c| 8 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 5 + 2 files changed, 13 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 96f453f..f38fc5c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -116,6 +116,9 @@ static int hclge_get_ring_chain_from_mbx( hnae_set_bit(ring_chain->flag, HNAE3_RING_TYPE_B, req->msg[3]); ring_chain->tqp_index = hclge_get_queue_id(vport->nic.kinfo.tqp[req->msg[4]]); + hnae_set_field(ring_chain->int_gl_idx, HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + req->msg[5]); cur_chain = ring_chain; @@ -133,6 +136,11 @@ static int hclge_get_ring_chain_from_mbx( [req->msg[HCLGE_RING_NODE_VARIABLE_NUM * i + HCLGE_RING_MAP_MBX_BASIC_MSG_NUM + 1]]); + hnae_set_field(new_chain->int_gl_idx, HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + req->msg[HCLGE_RING_NODE_VARIABLE_NUM * i + + HCLGE_RING_MAP_MBX_BASIC_MSG_NUM + 2]); + cur_chain->next = new_chain; cur_chain = new_chain; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 3d2bc9a..0d89965 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -565,6 +565,11 @@ static int hclgevf_bind_ring_to_vector(struct hnae3_handle *handle, bool en, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B); req->msg[HCLGEVF_RING_NODE_VARIABLE_NUM * i + 1] = 
node->tqp_index; + req->msg[HCLGEVF_RING_NODE_VARIABLE_NUM * i + 2] = + hnae_get_field(node->int_gl_idx, + HNAE3_RING_GL_IDX_M, + HNAE3_RING_GL_IDX_S); + if (i == (HCLGE_MBX_VF_MSG_DATA_NUM - HCLGEVF_RING_MAP_MBX_BASIC_MSG_NUM) / HCLGEVF_RING_NODE_VARIABLE_NUM) { -- 1.9.1
Re: [PATCH net-next 0/3 V1] rtnetlink: enable IFLA_IF_NETNSID for RTM_{DEL,SET}LINK
On Thu, Jan 25, 2018 at 01:59:06PM +0100, Christian Brauner wrote: > On Wed, Jan 24, 2018 at 03:26:31PM +0100, Christian Brauner wrote: > > Hi, > > > > Based on the previous discussion this enables passing a IFLA_IF_NETNSID > > property along with RTM_SETLINK and RTM_DELLINK requests. The patch for > > RTM_NEWLINK will be sent out in a separate patch since there are more > > corner-cases to think about. > > > > Best, > > Christian > > > > Changelog 2018-01-24: > > * Preserve old behavior and report -ENODEV when either ifindex or ifname is > > provided and IFLA_GROUP is set. Spotted by Wolfgang Bumiller. > > > > Christian Brauner (3): > > rtnetlink: enable IFLA_IF_NETNSID in do_setlink() > > rtnetlink: enable IFLA_IF_NETNSID for RTM_SETLINK > > rtnetlink: enable IFLA_IF_NETNSID for RTM_DELLINK > > > > net/core/rtnetlink.c | 96 > > > > 1 file changed, 75 insertions(+), 21 deletions(-) > > > > -- > > 2.14.1 > > > > I've done more testing around enabling IFLA_IF_NETNSID for RTM_NEWLINK > as well and I think that the change is actually trivial. However, I > would like to wait before sending this patch out until we have agreed on > the patches for RTM_SETLINK and RTM_DELLINK in this series. Once we have > merged those I can just send another small commit. Or - if changes to > this patch series are required - I can just add it in a v2. Jiri, do you want that patch resent with the small commit for RTM_NEWLINK included or are you fine with sending it after this series? Thanks! Christian
[PATCH net-next 2/3] cxgb4: fix incorrect condition for using firmware LDST commands
Only contact firmware if it's alive _AND_ if use_bd (use backdoor access) is not set when issuing FW_LDST_CMD. Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index be795d0b0b7e..047609ef0515 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -5090,7 +5090,7 @@ int t4_read_rss(struct adapter *adapter, u16 *map) static unsigned int t4_use_ldst(struct adapter *adap) { - return (adap->flags & FW_OK) || !adap->use_bd; + return (adap->flags & FW_OK) && !adap->use_bd; } /** -- 2.14.1
[PATCH net-next 3/3] cxgb4: use backdoor access to collect dumps when firmware crashed
Fallback to backdoor register access to collect dumps if firmware is crashed. Fixes TID, SGE Queue Context, and MPS TCAM dump collection. Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar --- drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 51 +++--- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c index 8b95117c2923..557fd8bfd54e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c @@ -1567,6 +1567,12 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init, tid1->ver_hdr.size = sizeof(struct cudbg_tid_info_region_rev1) - sizeof(struct cudbg_ver_hdr); + /* If firmware is not attached/alive, use backdoor register +* access to collect dump. +*/ + if (!is_fw_attached(pdbg_init)) + goto fill_tid; + #define FW_PARAM_PFVF_A(param) \ (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) | \ FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_##param) | \ @@ -1604,6 +1610,9 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init, tid->nhpftids = val[1] - val[0] + 1; } +#undef FW_PARAM_PFVF_A + +fill_tid: tid->ntids = padap->tids.ntids; tid->nstids = padap->tids.nstids; tid->stid_base = padap->tids.stid_base; @@ -1623,8 +1632,6 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init, tid->ip_users = t4_read_reg(padap, LE_DB_ACT_CNT_IPV4_A); tid->ipv6_users = t4_read_reg(padap, LE_DB_ACT_CNT_IPV6_A); -#undef FW_PARAM_PFVF_A - return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); } @@ -1866,11 +1873,18 @@ int cudbg_collect_dump_context(struct cudbg_init *pdbg_init, max_ctx_size = region_info[i].end - region_info[i].start + 1; max_ctx_qid = max_ctx_size / SGE_CTXT_SIZE; - t4_sge_ctxt_flush(padap, padap->mbox, i); - rc = t4_memory_rw(padap, MEMWIN_NIC, mem_type[i], - region_info[i].start, max_ctx_size, - (__be32 *)ctx_buf, 1); - if (rc) { + /* If firmware is not attached/alive, use backdoor register +* access to 
collect dump. +*/ + if (is_fw_attached(pdbg_init)) { + t4_sge_ctxt_flush(padap, padap->mbox, i); + + rc = t4_memory_rw(padap, MEMWIN_NIC, mem_type[i], + region_info[i].start, max_ctx_size, + (__be32 *)ctx_buf, 1); + } + + if (rc || !is_fw_attached(pdbg_init)) { max_ctx_qid = CUDBG_LOWMEM_MAX_CTXT_QIDS; cudbg_get_sge_ctxt_fw(pdbg_init, max_ctx_qid, i, &buff); @@ -1946,9 +1960,10 @@ static void cudbg_mps_rpl_backdoor(struct adapter *padap, mps_rplc->rplc31_0 = htonl(t4_read_reg(padap, MPS_VF_RPLCT_MAP0_A)); } -static int cudbg_collect_tcam_index(struct adapter *padap, +static int cudbg_collect_tcam_index(struct cudbg_init *pdbg_init, struct cudbg_mps_tcam *tcam, u32 idx) { + struct adapter *padap = pdbg_init->adap; u64 tcamy, tcamx, val; u32 ctl, data2; int rc = 0; @@ -2033,12 +2048,22 @@ static int cudbg_collect_tcam_index(struct adapter *padap, htons(FW_LDST_CMD_FID_V(FW_LDST_MPS_RPLC) | FW_LDST_CMD_IDX_V(idx)); - rc = t4_wr_mbox(padap, padap->mbox, &ldst_cmd, sizeof(ldst_cmd), - &ldst_cmd); - if (rc) + /* If firmware is not attached/alive, use backdoor register +* access to collect dump. +*/ + if (is_fw_attached(pdbg_init)) + rc = t4_wr_mbox(padap, padap->mbox, &ldst_cmd, + sizeof(ldst_cmd), &ldst_cmd); + + if (rc || !is_fw_attached(pdbg_init)) { cudbg_mps_rpl_backdoor(padap, &mps_rplc); - else + /* Ignore error since we collected directly from +* reading registers. +*/ + rc = 0; + } else { mps_rplc = ldst_cmd.u.mps.rplc; + } tcam->rplc[0] = ntohl(mps_rplc.rplc31_0); tcam->rplc[1] = ntohl(mps_rplc.rplc63_32); @@ -2075,7 +2100,7 @@ int cudbg_collect_mps_tcam(struct cudbg_init *pdbg_init, tcam = (struct cudbg_mps_tcam *)temp_buff.data; for (i = 0; i < n; i++) { - rc = cudbg_collect_tcam_index(padap, tcam, i); + rc = cudbg_collect_tc
[PATCH net-next 0/3] cxgb4: fix dump collection when firmware crashed
Patch 1 resets FW_OK flag, if firmware reports error. Patch 2 fixes incorrect condition for using firmware LDST commands. Patch 3 fixes dump collection logic to use backdoor register access to collect dumps when firmware is crashed. Thanks, Rahul Rahul Lakkireddy (3): cxgb4: reset FW_OK flag on firmware crash cxgb4: fix incorrect condition for using firmware LDST commands cxgb4: use backdoor access to collect dumps when firmware crashed drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 51 +++--- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 6 ++- 2 files changed, 42 insertions(+), 15 deletions(-) -- 2.14.1
[PATCH net-next 1/3] cxgb4: reset FW_OK flag on firmware crash
If firmware reports error, reset FW_OK flag. Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index af27d2b0f79f..be795d0b0b7e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -195,9 +195,11 @@ static void t4_report_fw_error(struct adapter *adap) u32 pcie_fw; pcie_fw = t4_read_reg(adap, PCIE_FW_A); - if (pcie_fw & PCIE_FW_ERR_F) + if (pcie_fw & PCIE_FW_ERR_F) { dev_err(adap->pdev_dev, "Firmware reports adapter error: %s\n", reason[PCIE_FW_EVAL_G(pcie_fw)]); + adap->flags &= ~FW_OK; + } } /* -- 2.14.1
Re: [PATCH net] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
On 01/25/2018 09:03 PM, Eric Dumazet wrote: > On Thu, 2018-01-25 at 20:43 +0300, Alexey Kodanev wrote: >> ccid2_hc_tx_rto_expire() timer callback always restarts the timer >> again and can run indefinitely (unless it is stopped outside), and >> after commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at >> dismantle time"), which moved sk_stop_timer() to sk_destruct(), >> this started to happen quite often. The timer prevents releasing >> the socket, as a result, sk_destruct() won't be called. >> >> Found with LTP/dccp_ipsec tests running on the bonding device, >> which later couldn't be unloaded after the tests were completed: >> >> unregister_netdevice: waiting for bond0 to become free. Usage count = 148 >> >> Fixes: 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time") >> Signed-off-by: Alexey Kodanev >> --- > > I understand your fix, but not why commit 120e9dabaf55 is bug origin. > > Looks like this always had been buggy : Timer logic should have checked > socket state from day 0. Hi Eric, Agree, I'll change to the initial commit id. I've added commit 120e9dabaf55 because ccid_hc_tx_delete() (and sk_stop_timer()) moved from dccp_destroy_sock() to sk_destruct(), and only after this change the chances that the timer won't stop increased significantly. > > I did not move sk_stop_timer() to sk_destruct() > ccid_hc_tx_delete() includes sk_stop_timer(): ccid_hc_tx_delete() ccid2_hc_tx_exit(struct sock *sk) sk_stop_timer(sk, &hc->tx_rtotimer); ccid3_hc_tx_exit(struct sock *sk) sk_stop_timer(sk, &hc->tx_no_feedback_timer); Thanks, Alexey
[PATCH] VSOCK: set POLLOUT | POLLWRNORM for TCP_CLOSING
select(2) with wfds but no rfds must return when the socket is shut down by the peer. This way userspace notices socket activity and gets -EPIPE from the next write(2). Currently select(2) does not return for virtio-vsock when a SEND+RCV shutdown packet is received. This is because vsock_poll() only sets POLLOUT | POLLWRNORM for TCP_CLOSE, not the TCP_CLOSING state that the socket is in when the shutdown is received. Signed-off-by: Stefan Hajnoczi --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 5d28abf87fbf..c9473d698525 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -951,7 +951,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, * POLLOUT|POLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ - if (sk->sk_state == TCP_CLOSE) { + if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= POLLOUT | POLLWRNORM; -- 2.14.3
[PATCH net v2] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
ccid2_hc_tx_rto_expire() timer callback always restarts the timer again and can run indefinitely (unless it is stopped outside), and after commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from dccp_destroy_sock() to sk_destruct(), this started to happen quite often. The timer prevents releasing the socket, as a result, sk_destruct() won't be called. Found with LTP/dccp_ipsec tests running on the bonding device, which later couldn't be unloaded after the tests were completed: unregister_netdevice: waiting for bond0 to become free. Usage count = 148 Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") Signed-off-by: Alexey Kodanev --- v2: * corrected bug origin commit id * clarified commit message about sk_stop_timer() net/dccp/ccids/ccid2.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 1c75cd1..92d016e 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -140,6 +140,9 @@ static void ccid2_hc_tx_rto_expire(struct timer_list *t) ccid2_pr_debug("RTO_EXPIRE\n"); + if (sk->sk_state == DCCP_CLOSED) + goto out; + /* back-off timer */ hc->tx_rto <<= 1; if (hc->tx_rto > DCCP_RTO_MAX) -- 1.7.1
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: > After checking all possible call chains to fs_send() here, > my tool finds that fs_send() is never called in atomic context. > And this function is assigned to a function pointer "dev->ops->send", > which is only called by vcc_sendmsg() (net/atm/common.c) > through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), > it indicates that fs_send() can call functions which may sleep. > Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. > > This is found by a static analysis tool named DCNS written by myself. The trouble is, places like net/atm/raw.c:65: vcc->send = atm_send_aal0; net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) ? DROP_PACKET : 1; bh_unlock_sock(sk_atm(vcc)); in pppoatm_send() definitely is called under a spinlock. Looking through the driver (in advanced bitrot, as usual for drivers/atm), I'd say that submit_queue() is fucked in head in the "queue full" case. And judging by the history, had been thus since the original merge...
Re: aio poll, io_pgetevents and a new in-kernel poll API V4
On Thu, Jan 25, 2018 at 03:10:25PM -0500, Benjamin LaHaise wrote: > I implemented something similar back in December, but did so without > changing the in-kernel poll API. See below for the patch that implements > it. Is changing the in-kernel poll API really desirable given how many > drivers that will touch? I had various previous versions that did not touch the driver API, but there are a couple issues with that: (1) you cannot make the API race free. With the existing convoluted poll_table_struct-based API you can't check for pending items before adding yourself to the waitqueue in a race free manner. (2) you cannot make the submit non-blocking without deferring to a workqueue or similar and thus incurring another context switch (3) you cannot deliver events from the wakeup callback, incurring another context switch, this time in the wakeup path that actually matters for some applications (4) the in-kernel poll API really is broken to start with and needs to be fixed anyway. I'd rather rely on that instead of working around decades old cruft that has no reason to exist.
Re: [PATCH net-next 03/12] ptr_ring: READ/WRITE_ONCE for __ptr_ring_empty
On Fri, Jan 26, 2018 at 11:19:58AM +0800, Jason Wang wrote: > > > On 2018年01月26日 10:44, Michael S. Tsirkin wrote: > > On Fri, Jan 26, 2018 at 10:37:58AM +0800, Jason Wang wrote: > > > > > > On 2018年01月26日 07:36, Michael S. Tsirkin wrote: > > > > Lockless __ptr_ring_empty requires that consumer head is read and > > > > written at once, atomically. Annotate accordingly to make sure compiler > > > > does it correctly. Switch locked callers to __ptr_ring_peek which does > > > > not support the lockless operation. > > > > > > > > Signed-off-by: Michael S. Tsirkin > > > > --- > > > >include/linux/ptr_ring.h | 11 --- > > > >1 file changed, 8 insertions(+), 3 deletions(-) > > > > > > > > diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h > > > > index 8594c7b..9a72d8f 100644 > > > > --- a/include/linux/ptr_ring.h > > > > +++ b/include/linux/ptr_ring.h > > > > @@ -196,7 +196,9 @@ static inline void *__ptr_ring_peek(struct ptr_ring > > > > *r) > > > > */ > > > >static inline bool __ptr_ring_empty(struct ptr_ring *r) > > > >{ > > > > - return !__ptr_ring_peek(r); > > > > + if (likely(r->size)) > > > > + return !r->queue[READ_ONCE(r->consumer_head)]; > > > > + return true; > > > >} > > > So after patch 8, __ptr_ring_peek() did: > > > > > > static inline void *__ptr_ring_peek(struct ptr_ring *r) > > > { > > > if (likely(r->size)) > > > return READ_ONCE(r->queue[r->consumer_head]); > > > return NULL; > > > } > > > > > > Looks like a duplication. > > > > > > Thanks > > Nope - they are different. > > > > The reason is that __ptr_ring_peek does not need to read the consumer_head > > once > > since callers have a lock, > > I get this. > > > and __ptr_ring_empty does not need to read > > the queue once since it merely compares it to 0. > > > > Do this still work if it was called inside a loop? > > Thanks Sure because compiler does not know head didn't change. -- MST
Re: [PATCH net-next 12/12] tools/virtio: fix smp_mb on x86
On Fri, Jan 26, 2018 at 11:56:14AM +0800, Jason Wang wrote: > > > On 2018年01月26日 07:36, Michael S. Tsirkin wrote: > > Offset 128 overlaps the last word of the redzone. > > Use 132 which is always beyond that. > > > > Signed-off-by: Michael S. Tsirkin > > --- > > tools/virtio/ringtest/main.h | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/tools/virtio/ringtest/main.h b/tools/virtio/ringtest/main.h > > index 593a328..301d59b 100644 > > --- a/tools/virtio/ringtest/main.h > > +++ b/tools/virtio/ringtest/main.h > > @@ -111,7 +111,7 @@ static inline void busy_wait(void) > > } > > #if defined(__x86_64__) || defined(__i386__) > > -#define smp_mb() asm volatile("lock; addl $0,-128(%%rsp)" ::: > > "memory", "cc") > > Just wonder did "rsp" work for __i386__ ? > > Thanks Oh you are right of course. Probably no one ever run this one on i386 :) I'll add a patch on top as this is not a new bug. > > +#define smp_mb() asm volatile("lock; addl $0,-132(%%rsp)" ::: > > "memory", "cc") > > #else > > /* > >* Not using __ATOMIC_SEQ_CST since gcc docs say they are only > > synchronized
Re: [Patch net-next v3 3/3] net_sched: implement ->change_tx_queue_len() for pfifo_fast
On Fri, Jan 26, 2018 at 11:57:59AM +0800, Jason Wang wrote: > > > On 2018年01月26日 10:26, Cong Wang wrote: > > pfifo_fast used to drop based on qdisc_dev(qdisc)->tx_queue_len, > > so we have to resize skb array when we change tx_queue_len. > > > > Other qdiscs which read tx_queue_len are fine because they > > all save it to sch->limit or somewhere else in qdisc during init. > > They don't have to implement this, it is nicer if they do so > > that users don't have to re-configure qdisc after changing > > tx_queue_len. > > > > Cc: John Fastabend > > Signed-off-by: Cong Wang > > --- > > net/sched/sch_generic.c | 18 ++ > > 1 file changed, 18 insertions(+) > > > > diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c > > index 08f9fa27e06e..190570f21b20 100644 > > --- a/net/sched/sch_generic.c > > +++ b/net/sched/sch_generic.c > > @@ -763,6 +763,23 @@ static void pfifo_fast_destroy(struct Qdisc *sch) > > } > > } > > +static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, > > + unsigned int new_len) > > +{ > > + struct pfifo_fast_priv *priv = qdisc_priv(sch); > > + struct skb_array *bands[PFIFO_FAST_BANDS]; > > + int prio; > > + > > + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { > > + struct skb_array *q = band2list(priv, prio); > > + > > + bands[prio] = q; > > + } > > + > > + return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, > > +GFP_KERNEL); > > +} > > + > > struct Qdisc_ops pfifo_fast_ops __read_mostly = { > > .id = "pfifo_fast", > > .priv_size = sizeof(struct pfifo_fast_priv), > > @@ -773,6 +790,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { > > .destroy= pfifo_fast_destroy, > > .reset = pfifo_fast_reset, > > .dump = pfifo_fast_dump, > > + .change_tx_queue_len = pfifo_fast_change_tx_queue_len, > > .owner = THIS_MODULE, > > .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, > > }; > > Is __skb_array_empty() in pfifo_fast_dequeue() still safe after this change? > > Thanks I think it isn't. 
If you want to use resize *and* use unlocked variants, you must lock all producers and consumers when resizing yourself. -- MST
Re: [PATCH net-next] ptr_ring: fix integer overflow
On Fri, Jan 26, 2018 at 11:44:22AM +0800, Jason Wang wrote: > > > On 2018年01月26日 01:31, Michael S. Tsirkin wrote: > > On Thu, Jan 25, 2018 at 10:17:38PM +0800, Jason Wang wrote: > > > > > > On 2018年01月25日 21:45, Michael S. Tsirkin wrote: > > > > On Thu, Jan 25, 2018 at 03:31:42PM +0800, Jason Wang wrote: > > > > > We try to allocate one more entry for lockless peeking. The adding > > > > > operation may overflow which causes zero to be passed to kmalloc(). > > > > > In this case, it returns ZERO_SIZE_PTR without any notice by ptr > > > > > ring. Try to do producing or consuming on such ring will lead NULL > > > > > dereference. Fix this detect and fail early. > > > > > > > > > > Fixes: bcecb4bbf88a ("net: ptr_ring: otherwise safe empty checks can > > > > > overrun array bounds") > > > > > Reported-by:syzbot+87678bcf753b44c39...@syzkaller.appspotmail.com > > > > > Cc: John Fastabend > > > > > Signed-off-by: Jason Wang > > > > Ugh that's just way too ugly. > > > > I'll work on dropping the extra + 1 - but calling this > > > > function with -1 size is the real source of the bug. > > > > Do you know how come we do that? > > > > > > > It looks e.g try to change tx_queue_len to UINT_MAX. And we probably can't > > > prevent user form trying to do this? > > > > > > Thanks > > Right. BTW why net-next? Isn't the crash exploitable in net? > > > > Commit bcecb4bbf88a exists only in net-next. Right you are. > And in net we check r->size > before trying to dereference the queue. > > Thanks I was wondering what it's about btw. Does anyone really create 0 size rings? -- MST
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
On 2018/1/26 20:05, Al Viro wrote: On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: After checking all possible call chains to fs_send() here, my tool finds that fs_send() is never called in atomic context. And this function is assigned to a function pointer "dev->ops->send", which is only called by vcc_sendmsg() (net/atm/common.c) through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), it indicates that fs_send() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. The trouble is, places like net/atm/raw.c:65: vcc->send = atm_send_aal0; net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) ? DROP_PACKET : 1; bh_unlock_sock(sk_atm(vcc)); in pppoatm_send() definitely is called under a spinlock. Looking through the driver (in advanced bitrot, as usual for drivers/atm), I'd say that submit_queue() is fucked in head in the "queue full" case. And judging by the history, had been thus since the original merge... Thanks for reply :) I am sorry for this false positive. I think other ATM related patches that I submitted are also false positives, sorry. My tool did not handle this situation of passing function pointer, and I will improve the tool... Thanks, Jia-Ju Bai
Re: [Patch net-next v3 3/3] net_sched: implement ->change_tx_queue_len() for pfifo_fast
On Thu, Jan 25, 2018 at 08:01:42PM -0800, Cong Wang wrote: > On Thu, Jan 25, 2018 at 7:57 PM, Jason Wang wrote: > > > > > > On 2018年01月26日 10:26, Cong Wang wrote: > >> > >> pfifo_fast used to drop based on qdisc_dev(qdisc)->tx_queue_len, > >> so we have to resize skb array when we change tx_queue_len. > >> > >> Other qdiscs which read tx_queue_len are fine because they > >> all save it to sch->limit or somewhere else in qdisc during init. > >> They don't have to implement this, it is nicer if they do so > >> that users don't have to re-configure qdisc after changing > >> tx_queue_len. > >> > >> Cc: John Fastabend > >> Signed-off-by: Cong Wang > >> --- > >> net/sched/sch_generic.c | 18 ++ > >> 1 file changed, 18 insertions(+) > >> > >> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c > >> index 08f9fa27e06e..190570f21b20 100644 > >> --- a/net/sched/sch_generic.c > >> +++ b/net/sched/sch_generic.c > >> @@ -763,6 +763,23 @@ static void pfifo_fast_destroy(struct Qdisc *sch) > >> } > >> } > >> +static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, > >> + unsigned int new_len) > >> +{ > >> + struct pfifo_fast_priv *priv = qdisc_priv(sch); > >> + struct skb_array *bands[PFIFO_FAST_BANDS]; > >> + int prio; > >> + > >> + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { > >> + struct skb_array *q = band2list(priv, prio); > >> + > >> + bands[prio] = q; > >> + } > >> + > >> + return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, > >> +GFP_KERNEL); > >> +} > >> + > >> struct Qdisc_ops pfifo_fast_ops __read_mostly = { > >> .id = "pfifo_fast", > >> .priv_size = sizeof(struct pfifo_fast_priv), > >> @@ -773,6 +790,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { > >> .destroy= pfifo_fast_destroy, > >> .reset = pfifo_fast_reset, > >> .dump = pfifo_fast_dump, > >> + .change_tx_queue_len = pfifo_fast_change_tx_queue_len, > >> .owner = THIS_MODULE, > >> .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, > >> }; > > > > > > Is __skb_array_empty() in 
pfifo_fast_dequeue() still safe after this change? > > Yes, we sync with dequeue path before calling ->change_tx_queue_len(). > I already mentioned this in patch 2/3. This part? + bool up = dev->flags & IFF_UP; + unsigned int i; + int ret = 0; + + if (up) + dev_deactivate(dev); + + for (i = 0; i < dev->num_tx_queues; i++) { + ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); + + /* TODO: revert changes on a partial failure */ + if (ret) + break; + } + + if (up) + dev_activate(dev); I wonder whether it really is safe to read dev->flags like that without any locks. -- MST
Re: [Patch net-next v3 2/3] net_sched: plug in qdisc ops change_tx_queue_len
On Thu, Jan 25, 2018 at 06:26:23PM -0800, Cong Wang wrote: > Introduce a new qdisc ops ->change_tx_queue_len() so that > each qdisc could decide how to implement this if it wants. > Previously we simply read dev->tx_queue_len, after pfifo_fast > switches to skb array, we need this API to resize the skb array > when we change dev->tx_queue_len. > > To avoid handling race conditions with TX BH, we need to > deactivate all TX queues before change the value and bring them > back after we are done, this also makes implementation easier. > > Cc: John Fastabend > Signed-off-by: Cong Wang > --- > include/net/sch_generic.h | 2 ++ > net/core/dev.c| 1 + > net/sched/sch_generic.c | 33 + > 3 files changed, 36 insertions(+) > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h > index eac43e8ca96d..e2ab13687fb9 100644 > --- a/include/net/sch_generic.h > +++ b/include/net/sch_generic.h > @@ -200,6 +200,7 @@ struct Qdisc_ops { > struct nlattr *arg, > struct netlink_ext_ack *extack); > void(*attach)(struct Qdisc *sch); > + int (*change_tx_queue_len)(struct Qdisc *, unsigned > int); > > int (*dump)(struct Qdisc *, struct sk_buff *); > int (*dump_stats)(struct Qdisc *, struct gnet_dump > *); > @@ -489,6 +490,7 @@ void qdisc_class_hash_remove(struct Qdisc_class_hash *, > void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); > void qdisc_class_hash_destroy(struct Qdisc_class_hash *); > > +int dev_qdisc_change_tx_queue_len(struct net_device *dev); > void dev_init_scheduler(struct net_device *dev); > void dev_shutdown(struct net_device *dev); > void dev_activate(struct net_device *dev); > diff --git a/net/core/dev.c b/net/core/dev.c > index e0b0c2784070..c8443cfaa17a 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -7070,6 +7070,7 @@ int dev_change_tx_queue_len(struct net_device *dev, > unsigned long new_len) > dev->tx_queue_len = orig_len; > return res; > } > + return dev_qdisc_change_tx_queue_len(dev); > } > > return 0; > diff --git 
a/net/sched/sch_generic.c b/net/sched/sch_generic.c > index 1816bde47256..08f9fa27e06e 100644 > --- a/net/sched/sch_generic.c > +++ b/net/sched/sch_generic.c > @@ -1178,6 +1178,39 @@ void dev_deactivate(struct net_device *dev) > } > EXPORT_SYMBOL(dev_deactivate); > > +static int qdisc_change_tx_queue_len(struct net_device *dev, > + struct netdev_queue *dev_queue) > +{ > + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; > + const struct Qdisc_ops *ops = qdisc->ops; > + > + if (ops->change_tx_queue_len) > + return ops->change_tx_queue_len(qdisc, dev->tx_queue_len); > + return 0; > +} > + > +int dev_qdisc_change_tx_queue_len(struct net_device *dev) > +{ > + bool up = dev->flags & IFF_UP; > + unsigned int i; > + int ret = 0; > + > + if (up) > + dev_deactivate(dev); This drops all packets in the queue. I don't think tweaking the queue length did this previously - did it? If not this change might surprise some people. > + > + for (i = 0; i < dev->num_tx_queues; i++) { > + ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); > + > + /* TODO: revert changes on a partial failure */ > + if (ret) > + break; > + } > + > + if (up) > + dev_activate(dev); > + return ret; > +} > + > static void dev_init_scheduler_queue(struct net_device *dev, >struct netdev_queue *dev_queue, >void *_qdisc) > -- > 2.13.0
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
On 2018/1/26 21:56, Jia-Ju Bai wrote: On 2018/1/26 20:05, Al Viro wrote: On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: After checking all possible call chains to fs_send() here, my tool finds that fs_send() is never called in atomic context. And this function is assigned to a function pointer "dev->ops->send", which is only called by vcc_sendmsg() (net/atm/common.c) through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), it indicates that fs_send() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. The trouble is, places like net/atm/raw.c:65: vcc->send = atm_send_aal0; net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) ? DROP_PACKET : 1; bh_unlock_sock(sk_atm(vcc)); in pppoatm_send() definitely is called under a spinlock. Looking through the driver (in advanced bitrot, as usual for drivers/atm), I'd say that submit_queue() is fucked in head in the "queue full" case. And judging by the history, had been thus since the original merge... Thanks for reply :) I am sorry for this false positive. I think other ATM related patches that I submitted are also false positives, sorry. My tool did not handle this situation of passing function pointer, and I will improve the tool... Thanks, Jia-Ju Bai I checked the code again, and confirm only my patches about "send" are false positives. I think my other patches that are about "open" do not have this problem: https://marc.info/?l=linux-kernel&m=151693791432626&w=2 https://marc.info/?l=linux-kernel&m=151695475503314&w=2 https://marc.info/?l=linux-kernel&m=151693150131512&w=2 I hope you can have a check :) Thanks, Jia-Ju Bai
Re: dvb usb issues since kernel 4.9
Hi Alan, Em Mon, 8 Jan 2018 14:15:35 -0500 (EST) Alan Stern escreveu: > On Mon, 8 Jan 2018, Linus Torvalds wrote: > > > Can somebody tell which softirq it is that dvb/usb cares about? > > I don't know about the DVB part. The USB part is a little difficult to > analyze, mostly because the bug reports I've seen are mostly from > people running non-vanilla kernels. I suspect that the main reason for people not using non-vanilla Kernels is that, among other bugs, the dwc2 upstream driver has serious troubles handling ISOCH traffic. Using Kernel 4.15-rc7 from this git tree: https://git.linuxtv.org/mchehab/experimental.git/log/?h=softirq_fixup (e. g. with the softirq bug partially reverted with Linux patch, and the DWC2 deferred probe fixed) With a PCTV 461e device, with uses em28xx driver + Montage frontend (with is the same used on dvbsky hardware - except for em28xx). This device doesn't support bulk for DVB, just ISOCH. The drivers work fine on x86. Using a test signal at the bit rate of 56698,4 Kbits/s, that's what happens, when capturing less than one second of data: $ dvbv5-zap -c ~/dvb_channel.conf "tv brasil" -l universal -X 100 -m -t2dvbv5-zap -c ~/dvb_channel.conf "tv brasil" -l universal -X 100 -m -t2 Using LNBf UNIVERSAL Universal, Europe Freqs : 10800 to 11800 MHz, LO: 9750 MHz Freqs : 11600 to 12700 MHz, LO: 10600 MHz using demux 'dvb0.demux0' reading channels from file '/home/mchehab/dvb_channel.conf' tuning to 11468000 Hz (0x00) Signal= -33.90dBm Lock (0x1f) Signal= -33.90dBm C/N= 30.28dB postBER= 2.33x10^-6 dvb_dev_set_bufsize: buffer set to 6160384 dvb_set_pesfilter to 0x2000 354.08s: Starting capture 354.73s: only read 59220 bytes 354.73s: Stopping capture [ 354.000827] dwc2 3f98.usb: DWC OTG HCD EP DISABLE: bEndpointAddress=0x84, ep->hcpriv=116f41b2 [ 354.000859] dwc2 3f98.usb: DWC OTG HCD EP RESET: bEndpointAddress=0x84 [ 354.010744] dwc2 3f98.usb: --Host Channel 5 Interrupt: Frame Overrun-- ... 
(hundreds of thousands of Frame Overrun messages) [ 354.660857] dwc2 3f98.usb: --Host Channel 5 Interrupt: Frame Overrun-- [ 354.660935] dwc2 3f98.usb: DWC OTG HCD URB Dequeue [ 354.660959] dwc2 3f98.usb: Called usb_hcd_giveback_urb() [ 354.660966] dwc2 3f98.usb: urb->status = 0 [ 354.660992] dwc2 3f98.usb: DWC OTG HCD URB Dequeue [ 354.661001] dwc2 3f98.usb: Called usb_hcd_giveback_urb() [ 354.661008] dwc2 3f98.usb: urb->status = 0 [ 354.661054] dwc2 3f98.usb: DWC OTG HCD URB Dequeue [ 354.661065] dwc2 3f98.usb: Called usb_hcd_giveback_urb() [ 354.661072] dwc2 3f98.usb: urb->status = 0 [ 354.661107] dwc2 3f98.usb: DWC OTG HCD URB Dequeue [ 354.661120] dwc2 3f98.usb: Called usb_hcd_giveback_urb() [ 354.661127] dwc2 3f98.usb: urb->status = 0 [ 354.661146] dwc2 3f98.usb: DWC OTG HCD URB Dequeue [ 354.661158] dwc2 3f98.usb: Called usb_hcd_giveback_urb() [ 354.661165] dwc2 3f98.usb: urb->status = 0 Kernel was compiled with: CONFIG_USB_DWC2=y CONFIG_USB_DWC2_HOST=y # CONFIG_USB_DWC2_PERIPHERAL is not set # CONFIG_USB_DWC2_DUAL_ROLE is not set # CONFIG_USB_DWC2_PCI is not set CONFIG_USB_DWC2_DEBUG=y # CONFIG_USB_DWC2_VERBOSE is not set # CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set CONFIG_USB_DWC2_DEBUG_PERIODIC=y As reference, that's the output of lsusb for the PCTV usb hardware: $ lsusb -v -d 2013:0258 Bus 001 Device 005: ID 2013:0258 PCTV Systems Couldn't open device, some information will be missing Device Descriptor: bLength18 bDescriptorType 1 bcdUSB 2.00 bDeviceClass0 (Defined at Interface level) bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize064 idVendor 0x2013 PCTV Systems idProduct 0x0258 bcdDevice1.00 iManufacturer 3 iProduct1 iSerial 2 bNumConfigurations 1 Configuration Descriptor: bLength 9 bDescriptorType 2 wTotalLength 41 bNumInterfaces 1 bConfigurationValue 1 iConfiguration 0 bmAttributes 0x80 (Bus Powered) MaxPower 500mA Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber0 bAlternateSetting 0 bNumEndpoints 1 bInterfaceClass 255 Vendor 
Specific Class bInterfaceSubClass 0 bInterfaceProtocol 0 iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x84 EP 4 IN bmAttributes1 Transfer TypeIsochronous Synch Type None Usage Type Data wMaxPacketSize 0x 1x 0 bytes bInterva
Re: [RFC PATCH net-next] sfc: efx_default_channel_want_txqs() can be static
On 26/01/18 01:03, kbuild test robot wrote: > Fixes: 2935e3c38228 ("sfc: on 8000 series use TX queues for TX timestamps") > Signed-off-by: Fengguang Wu Acked-by: Edward Cree Dave, can you take this directly or do you need it reposted without RFC tags? I'm not sure what the procedure is for robopatches. > --- > efx.c |2 +- > ptp.c |4 ++-- > 2 files changed, 3 insertions(+), 3 deletions(-) > > diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c > index 456866b0..16757cf 100644 > --- a/drivers/net/ethernet/sfc/efx.c > +++ b/drivers/net/ethernet/sfc/efx.c > @@ -896,7 +896,7 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue) > mod_timer(&rx_queue->slow_fill, jiffies + msecs_to_jiffies(100)); > } > > -bool efx_default_channel_want_txqs(struct efx_channel *channel) > +static bool efx_default_channel_want_txqs(struct efx_channel *channel) > { > return channel->channel - channel->efx->tx_channel_offset < > channel->efx->n_tx_channels; > diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c > index 433d29d..3e2c5b1 100644 > --- a/drivers/net/ethernet/sfc/ptp.c > +++ b/drivers/net/ethernet/sfc/ptp.c > @@ -366,7 +366,7 @@ bool efx_ptp_use_mac_tx_timestamps(struct efx_nic *efx) > /* PTP 'extra' channel is still a traffic channel, but we only create TX > queues > * if PTP uses MAC TX timestamps, not if PTP uses the MC directly to > transmit. > */ > -bool efx_ptp_want_txqs(struct efx_channel *channel) > +static bool efx_ptp_want_txqs(struct efx_channel *channel) > { > return efx_ptp_use_mac_tx_timestamps(channel->efx); > } > @@ -2146,7 +2146,7 @@ static int efx_phc_enable(struct ptp_clock_info *ptp, > return 0; > } > > -const struct efx_channel_type efx_ptp_channel_type = { > +static const struct efx_channel_type efx_ptp_channel_type = { > .handle_no_channel = efx_ptp_handle_no_channel, > .pre_probe = efx_ptp_probe_channel, > .post_remove= efx_ptp_remove_channel,
Re: [PATCH v2 net-next 0/3] net/ipv6: Add support for ONLINK flag
From: David Ahern Date: Thu, 25 Jan 2018 16:55:06 -0800 > Add support for RTNH_F_ONLINK with ipv6 routes. > > First patch moves existing gateway validation into helper. The onlink > flag requires a different set of checks and the existing validation > makes ip6_route_info_create long enough. > > Second patch makes the table id and lookup flag an option to > ip6_nh_lookup_table. onlink check needs to verify the gateway without > the RT6_LOOKUP_F_IFACE flag and PBR with VRF means the table id can > vary between the table the route is inserted and the VRF the egress > device is enslaved to. > > Third patch adds support for RTNH_F_ONLINK. > > I have a set of test cases in a format based on the framework Ido and > Jiri are working on. Once that goes in I will adapt the script and > submit. > > v2 > - removed table id check. Too constraining for PBR with VRF use cases Series applied, thanks David.
Re: pull request (net-next): ipsec-next 2018-01-26
From: Steffen Klassert Date: Fri, 26 Jan 2018 08:59:54 +0100 > One last patch for this development cycle: > > 1) Add ESN support for IPSec HW offload. >From Yossef Efraim. > > Please pull or let me know if there are problems. Pulled, thanks Steffen.
[PATCH net-next] bnxt_en: cleanup DIM work on device shutdown
From: Andy Gospodarek Make sure to cancel any pending work that might update driver coalesce settings when taking down an interface. Fixes: 6a8788f25625 ("bnxt_en: add support for software dynamic interrupt moderation") Signed-off-by: Andy Gospodarek Cc: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4b001d2..1500243 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6082,8 +6082,14 @@ static void bnxt_disable_napi(struct bnxt *bp) if (!bp->bnapi) return; - for (i = 0; i < bp->cp_nr_rings; i++) + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring; + + if (bp->bnapi[i]->rx_ring) + cancel_work_sync(&cpr->dim.work); + napi_disable(&bp->bnapi[i]->napi); + } } static void bnxt_enable_napi(struct bnxt *bp) -- 2.7.4
Re: [PATCH net-next 0/5] net/smc: fixes 2018-01-26
From: Ursula Braun Date: Fri, 26 Jan 2018 09:28:45 +0100 > here are some more smc patches. The first 4 patches take care about > different aspects of smc socket closing, the 5th patch improves > coding style. Series applied, thank you.
Re: [PATCH net-next] sfc: add suffix to large constant in ptp
From: Bert Kenward Date: Fri, 26 Jan 2018 08:51:47 + > Fixes: 1280c0f8aafc ("sfc: support second + quarter ns time format for > receive datapath") > Reported-by: kbuild test robot > Signed-off-by: Bert Kenward Applied.
Re: [bpf PATCH 1/3] net: add a UID to use for ULP socket assignment
On 01/25/18 04:27 PM, John Fastabend wrote: > I did not however retest TLS with the small change to ULP layer. > Mostly because I don't have a TLS setup. I plan/hope to get around > to writing either a sample or preferably a selftest for this case > as well (assuming I didn't miss one). > @Dave Watson can you take a quick look to verify the changes are > good on TLS ULP side. Looks reasonable, and passes my test suite. One comment below Tested-by: Dave Watson > Signed-off-by: John Fastabend > --- > include/net/tcp.h |6 ++ > net/ipv4/tcp_ulp.c | 53 > +++- > net/tls/tls_main.c |2 ++ > 3 files changed, 56 insertions(+), 5 deletions(-) > > diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c > index 6bb9e14..8ef437d 100644 > --- a/net/ipv4/tcp_ulp.c > +++ b/net/ipv4/tcp_ulp.c > @@ -133,3 +157,22 @@ int tcp_set_ulp(struct sock *sk, const char *name) > icsk->icsk_ulp_ops = ulp_ops; > return 0; > } > + > +int tcp_set_ulp_id(struct sock *sk, int ulp) > +{ > + struct inet_connection_sock *icsk = inet_csk(sk); > + const struct tcp_ulp_ops *ulp_ops; > + int err; > + > + if (icsk->icsk_ulp_ops) > + return -EEXIST; > + > + ulp_ops = __tcp_ulp_lookup(ulp); > + if (!ulp_ops) > + return -ENOENT; > + > + err = ulp_ops->init(sk); > + if (!err) > + icsk->icsk_ulp_ops = ulp_ops; Does this need module_put on error, similar to tcp_set_ulp? > + return err; > +}
Re: pull-request: can-next 2018-01-26,pull-request: can-next 2018-01-26
From: Marc Kleine-Budde Date: Fri, 26 Jan 2018 11:01:41 +0100 > this is a pull request for net-next/master consisting of 3 patches. > > The first two patches target the CAN documentation. The first is by me > and fixes pointer to location of fsl,mpc5200-mscan node in the mpc5200 > documentation. The second patch is by Robert Schwebel and it converts > the plain ASCII documentation to restructured text. > > The third patch is by Fabrizio Castro add the r8a774[35] support to the > rcar_can dt-bindings documentation. Pulled, thanks Marc.
Re: [PATCH v2] net: macb: Handle HRESP error
From: Harini Katakam Date: Fri, 26 Jan 2018 16:12:11 +0530 > From: Harini Katakam > > Handle HRESP error by doing a SW reset of RX and TX and > re-initializing the descriptors, RX and TX queue pointers. > > Signed-off-by: Harini Katakam > Signed-off-by: Michal Simek > --- > v2: > Rebased on top of latest net-next and reinitialized > all rx queues. Your patch was corrupted by your email client, it changed TAB characters into sequences of SPACES amongst other things. We cannot integrate your changes until you fix this. > This email and any attachments are intended for the sole use of the named > recipient(s) and contain(s) confidential information that may be proprietary, > privileged or copyrighted under applicable law. If you are not the intended > recipient, do not read, copy, or forward this email message or any > attachments. Delete this email message and any attachments immediately. This is also completely inappropriate for a public development list and you must eliminate this signature on future postings or we will have to ignore them. Thank you.
Re: [PATCH net v2] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
On Fri, 2018-01-26 at 15:14 +0300, Alexey Kodanev wrote: > ccid2_hc_tx_rto_expire() timer callback always restarts the timer > again and can run indefinitely (unless it is stopped outside), and after > commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), > which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from > dccp_destroy_sock() to sk_destruct(), this started to happen quite often. > The timer prevents releasing the socket, as a result, sk_destruct() won't > be called. > > Found with LTP/dccp_ipsec tests running on the bonding device, > which later couldn't be unloaded after the tests were completed: > > unregister_netdevice: waiting for bond0 to become free. Usage count = 148 > > Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") > Signed-off-by: Alexey Kodanev > --- Reviewed-by: Eric Dumazet Thanks Alexey.
Re: [PATCH net-next 5/5] net/smc: return booleans instead of integers
On Fri, 2018-01-26 at 09:28 +0100, Ursula Braun wrote: > From: Gustavo A. R. Silva > > Return statements in functions returning bool should use > true/false instead of 1/0. > > This issue was detected with the help of Coccinelle. > > Signed-off-by: Gustavo A. R. Silva > Signed-off-by: Ursula Braun > --- > net/smc/smc.h | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/net/smc/smc.h b/net/smc/smc.h > index bfbe20234105..9518986c97b1 100644 > --- a/net/smc/smc.h > +++ b/net/smc/smc.h > @@ -252,12 +252,12 @@ static inline int smc_uncompress_bufsize(u8 compressed) > static inline bool using_ipsec(struct smc_sock *smc) > { > return (smc->clcsock->sk->sk_policy[0] || > - smc->clcsock->sk->sk_policy[1]) ? 1 : 0; > + smc->clcsock->sk->sk_policy[1]) ? true : false; > } Hello Ursula, If you ever have to touch this code again, please follow the style of other kernel code and leave out the "? true : false" part and also the parentheses. Both are superfluous. Thanks, Bart.
Re: [PATCH net-next 0/2] net: hns3: add support ethtool_ops.{set|get}_coalesce for VF
From: Peng Li Date: Fri, 26 Jan 2018 19:31:23 +0800 > This patch-set adds ethtool_ops.{get|set}_coalesce to VF and > fix one related bug. > > HNS3 PF and VF driver use the common enet layer, as the > ethtool_ops.{get|set}_coalesce to PF have upstreamed, just > need add the ops to hns3vf_ethtool_ops. > > [Patch 1/2] fix a related bug for the VF ethtool_ops.{set| > get}_coalesce. Series applied, thank you.
Re: [PATCH net-next 0/3] cxgb4: fix dump collection when firmware crashed
From: Rahul Lakkireddy Date: Fri, 26 Jan 2018 17:05:53 +0530 > Patch 1 resets FW_OK flag, if firmware reports error. > > Patch 2 fixes incorrect condition for using firmware LDST commands. > > Patch 3 fixes dump collection logic to use backdoor register > access to collect dumps when firmware is crashed. Series applied, thank you.
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
From: Al Viro Date: Fri, 26 Jan 2018 12:05:22 + > On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: >> After checking all possible call chains to fs_send() here, >> my tool finds that fs_send() is never called in atomic context. >> And this function is assigned to a function pointer "dev->ops->send", >> which is only called by vcc_sendmsg() (net/atm/common.c) >> through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), >> it indicates that fs_send() can call functions which may sleep. >> Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. >> >> This is found by a static analysis tool named DCNS written by myself. > > The trouble is, places like > net/atm/raw.c:65: vcc->send = atm_send_aal0; > net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; > net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; > mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. > ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) > ? DROP_PACKET : 1; > bh_unlock_sock(sk_atm(vcc)); > in pppoatm_send() definitely is called under a spinlock. > > Looking through the driver (in advanced bitrot, as usual for drivers/atm), > I'd say that submit_queue() is fucked in head in the "queue full" case. > And judging by the history, had been thus since the original merge... Jia-Ju, I'm probably not going to apply any of your GFP_KERNEL conversions. Al's analysis above is similar to how things looked for other patches you submited of this nature. So because of the lack of care and serious auditing you put into these conversions, I really have no choice but to drop them from my queue because on the whole they are adding bugs rather than improving the code. Thank you.
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
From: Jia-Ju Bai Date: Fri, 26 Jan 2018 21:56:32 +0800 > My tool did not handle this situation of passing function pointer, and > I will improve the tool... Never automate these kinds of changes. Actually audit the change fully _yourself_ as a human after the tool indicates a possibility to you. This is why most of your changes have bugs, you rely on the tool too much.
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
From: Jia-Ju Bai Date: Fri, 26 Jan 2018 22:17:08 +0800 > > > On 2018/1/26 21:56, Jia-Ju Bai wrote: >> >> >> On 2018/1/26 20:05, Al Viro wrote: >>> On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: After checking all possible call chains to fs_send() here, my tool finds that fs_send() is never called in atomic context. And this function is assigned to a function pointer "dev->ops->send", which is only called by vcc_sendmsg() (net/atm/common.c) through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), it indicates that fs_send() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. >>> The trouble is, places like >>> net/atm/raw.c:65: vcc->send = atm_send_aal0; >>> net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; >>> net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; >>> mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. >>> ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) >>> ? DROP_PACKET : 1; >>> bh_unlock_sock(sk_atm(vcc)); >>> in pppoatm_send() definitely is called under a spinlock. >>> >>> Looking through the driver (in advanced bitrot, as usual for >>> drivers/atm), >>> I'd say that submit_queue() is fucked in head in the "queue full" >>> case. >>> And judging by the history, had been thus since the original merge... >> >> Thanks for reply :) >> >> I am sorry for this false positive. >> I think other ATM related patches that I submitted are also false >> positives, sorry. >> My tool did not handle this situation of passing function pointer, and >> I will improve the tool... >> >> >> Thanks, >> Jia-Ju Bai > > I check the code again, and confirm only my patches about "send" are > false positives. 
> I think my other patches that are about "open" do not have this > problem: > https://marc.info/?l=linux-kernel&m=151693791432626&w=2 > https://marc.info/?l=linux-kernel&m=151695475503314&w=2 > https://marc.info/?l=linux-kernel&m=151693150131512&w=2 > > I hope you can have a check :) No, _you_ have a check. All of these patches will be dropped, sorry.
Re: [RFC PATCH net-next] sfc: efx_default_channel_want_txqs() can be static
From: Edward Cree Date: Fri, 26 Jan 2018 15:13:05 + > On 26/01/18 01:03, kbuild test robot wrote: >> Fixes: 2935e3c38228 ("sfc: on 8000 series use TX queues for TX timestamps") >> Signed-off-by: Fengguang Wu > Acked-by: Edward Cree > > Dave, can you take this directly or do you need it reposted without RFC tags? > I'm not sure what the procedure is for robopatches. No I cannot. Don't you even notice that the subject line and commit message are totally inaccurate? They say that one function is being marked static. But the patch actually marks two different functions static, as well as a structure which is also completely not mentioned in the commit message nor subject line.
Re: [PATCH net v2] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
From: Alexey Kodanev Date: Fri, 26 Jan 2018 15:14:16 +0300 > ccid2_hc_tx_rto_expire() timer callback always restarts the timer > again and can run indefinitely (unless it is stopped outside), and after > commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), > which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from > dccp_destroy_sock() to sk_destruct(), this started to happen quite often. > The timer prevents releasing the socket, as a result, sk_destruct() won't > be called. > > Found with LTP/dccp_ipsec tests running on the bonding device, > which later couldn't be unloaded after the tests were completed: > > unregister_netdevice: waiting for bond0 to become free. Usage count = 148 > > Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") > Signed-off-by: Alexey Kodanev > --- > > v2: * corrected bug origin commit id > * clarified commit message about sk_stop_timer() Applied and queued up for -stable.
Re: [PATCH] VSOCK: set POLLOUT | POLLWRNORM for TCP_CLOSING
From: Stefan Hajnoczi Date: Fri, 26 Jan 2018 11:48:25 + > select(2) with wfds but no rfds must return when the socket is shut down > by the peer. This way userspace notices socket activity and gets -EPIPE > from the next write(2). > > Currently select(2) does not return for virtio-vsock when a SEND+RCV > shutdown packet is received. This is because vsock_poll() only sets > POLLOUT | POLLWRNORM for TCP_CLOSE, not the TCP_CLOSING state that the > socket is in when the shutdown is received. > > Signed-off-by: Stefan Hajnoczi Applied and queued up for -stable, thank you.
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
On 2018/1/27 0:07, David Miller wrote: From: Al Viro Date: Fri, 26 Jan 2018 12:05:22 + On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: After checking all possible call chains to fs_send() here, my tool finds that fs_send() is never called in atomic context. And this function is assigned to a function pointer "dev->ops->send", which is only called by vcc_sendmsg() (net/atm/common.c) through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), it indicates that fs_send() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. The trouble is, places like net/atm/raw.c:65: vcc->send = atm_send_aal0; net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) ? DROP_PACKET : 1; bh_unlock_sock(sk_atm(vcc)); in pppoatm_send() definitely is called under a spinlock. Looking through the driver (in advanced bitrot, as usual for drivers/atm), I'd say that submit_queue() is fucked in head in the "queue full" case. And judging by the history, had been thus since the original merge... Jia-Ju, I'm probably not going to apply any of your GFP_KERNEL conversions. Al's analysis above is similar to how things looked for other patches you submited of this nature. So because of the lack of care and serious auditing you put into these conversions, I really have no choice but to drop them from my queue because on the whole they are adding bugs rather than improving the code. Thank you. Okay, I can understand that careless GFP_KERNEL conversions will introduce bugs. Your concern is right. Sorry for my previous incorrect patches... I will manually carefully check and audit my patches before I submit them. Thanks, Jia-Ju Bai
Re: [bpf PATCH 1/3] net: add a UID to use for ULP socket assignment
On 01/26/2018 07:52 AM, Dave Watson wrote: > On 01/25/18 04:27 PM, John Fastabend wrote: >> I did not however retest TLS with the small change to ULP layer. >> Mostly because I don't have a TLS setup. I plan/hope to get around >> to writing either a sample or preferably a selftest for this case >> as well (assuming I didn't miss one). > >> @Dave Watson can you take a quick look to verify the changes are >> good on TLS ULP side. > > Looks reasonable, and passes my test suite. One comment below > > Tested-by: Dave Watson > >> Signed-off-by: John Fastabend >> --- >> include/net/tcp.h |6 ++ >> net/ipv4/tcp_ulp.c | 53 >> +++- >> net/tls/tls_main.c |2 ++ >> 3 files changed, 56 insertions(+), 5 deletions(-) >> >> diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c >> index 6bb9e14..8ef437d 100644 >> --- a/net/ipv4/tcp_ulp.c >> +++ b/net/ipv4/tcp_ulp.c >> @@ -133,3 +157,22 @@ int tcp_set_ulp(struct sock *sk, const char *name) >> icsk->icsk_ulp_ops = ulp_ops; >> return 0; >> } >> + >> +int tcp_set_ulp_id(struct sock *sk, int ulp) >> +{ >> +struct inet_connection_sock *icsk = inet_csk(sk); >> +const struct tcp_ulp_ops *ulp_ops; >> +int err; >> + >> +if (icsk->icsk_ulp_ops) >> +return -EEXIST; >> + >> +ulp_ops = __tcp_ulp_lookup(ulp); >> +if (!ulp_ops) >> +return -ENOENT; >> + >> +err = ulp_ops->init(sk); >> +if (!err) >> +icsk->icsk_ulp_ops = ulp_ops; > > Does this need module_put on error, similar to tcp_set_ulp? Not needed in current use because its only being used with an in-kernel ULP. However, best to fix it so that we don't have a (hard to detect) bug first time someone uses this with a module ULP. Thanks I'll spin a v2 this morning. > >> +return err; >> +}
pull-request: wireless-drivers 2018-01-26
Hi Dave, this is a pull request to the net tree for 4.15. I hate to do late pull requests like this but today Sven Joachim found a serious regression in one of ssb patches, I hope there's still enough time to get this to 4.15. But if it's too late, just let me know and I'll queue this for 4.16. Kalle The following changes since commit 1e19c4d689dc1e95bafd23ef68fbc0c6b9e05180: net: vrf: Add support for sends to local broadcast address (2018-01-25 21:51:03 -0500) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers.git tags/wireless-drivers-for-davem-2018-01-26 for you to fetch changes up to a9e6d44ddeccd3522670e641f1ed9b068e746ff7: ssb: Do not disable PCI host on non-Mips (2018-01-26 16:55:34 +0200) wireless-drivers fixes for 4.15 ssb * last minute patch to fix b43 and b44 drivers on non-mips platforms, a regression from v4.15-rc9 Sven Joachim (1): ssb: Do not disable PCI host on non-Mips drivers/ssb/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
[GIT] Networking
1) The per-network-namespace loopback device, and thus it's namespace, can have it's teardown deferred for a long time if a kernel created TCP socket closes and the namespace is exiting meanwhile. The kernel keeps trying to finish the close sequence until it times out (which takes quite some time). Fix this by forcing the socket closed in this situation, from Dan Streetman. 2) Fix regression where we're trying to invoke the update_pmtu method on route types (in this case metadata tunnel routes) that don't implement the dst_ops method. Fix from Nicolas Dichtel. 3) Fix long standing memory corruption issues in r8169 driver by performing the chip statistics DMA programming more correctly. From Francois Romieu. 4) Handle local broadcast sends over VRF routes properly, from David Ahern. 5) Don't refire the DCCP CCID2 timer endlessly, otherwise the socket can never be released. From Alexey Kodanev. 6) Set poll flags properly in VSOCK protocol layer, from Stefan Hajnoczi. Please pull, thanks a lot! The following changes since commit 5b7d27967dabfb17c21b0d98b29153b9e3ee71e5: Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2018-01-24 17:24:30 -0800) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git for you to fetch changes up to ba3169fc7548759be986b168d662e0ba64c2fd88: VSOCK: set POLLOUT | POLLWRNORM for TCP_CLOSING (2018-01-26 11:16:27 -0500) Alexey Kodanev (1): dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state Dan Streetman (1): net: tcp: close sock if net namespace is exiting David Ahern (1): net: vrf: Add support for sends to local broadcast address Francois Romieu (1): r8169: fix memory corruption on retrieval of hardware statistics. 
Nicolas Dichtel (1): net: don't call update_pmtu unconditionally Stefan Hajnoczi (1): VSOCK: set POLLOUT | POLLWRNORM for TCP_CLOSING drivers/infiniband/ulp/ipoib/ipoib_cm.c | 3 +-- drivers/net/ethernet/realtek/r8169.c| 9 ++--- drivers/net/geneve.c| 4 ++-- drivers/net/vrf.c | 5 +++-- drivers/net/vxlan.c | 6 ++ include/net/dst.h | 8 include/net/net_namespace.h | 10 ++ net/dccp/ccids/ccid2.c | 3 +++ net/ipv4/ip_tunnel.c| 3 +-- net/ipv4/ip_vti.c | 2 +- net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_timer.c| 15 +++ net/ipv6/ip6_tunnel.c | 6 ++ net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 4 ++-- net/vmw_vsock/af_vsock.c| 2 +- 16 files changed, 57 insertions(+), 28 deletions(-)
Re: pull-request: wireless-drivers 2018-01-26
From: Kalle Valo Date: Fri, 26 Jan 2018 18:33:33 +0200 > this is a pull request to the net tree for 4.15. I hate to do late pull > requests like this but today Sven Joachim found a serious regression in > one of ssb patches, I hope there's still enough time to get this to > 4.15. > > But if it's too late, just let me know and I'll queue this for 4.16. I think this is going to have to wait for 4.16, I just sent what I hope is my last pull to Linus for this cycle. Thanks.
Re: pull-request: wireless-drivers 2018-01-26
David Miller writes: > From: Kalle Valo > Date: Fri, 26 Jan 2018 18:33:33 +0200 > >> this is a pull request to the net tree for 4.15. I hate to do late pull >> requests like this but today Sven Joachim found a serious regression in >> one of ssb patches, I hope there's still enough time to get this to >> 4.15. >> >> But if it's too late, just let me know and I'll queue this for 4.16. > > I think this is going to have to wait for 4.16, I just sent what I hope > is my last pull to Linus for this cycle. Yeah, I guessed that when I noticed you sent your pull request just 3 minutes after mine :) I'll send this to you next week in another pull request, please drop this one. -- Kalle Valo
[PATCH net-next,v2 0/2] net: sched: introduce em_ipt ematch
From: Eyal Birger The following patchset introduces a new tc ematch for matching using netfilter matches. This allows early classification as well as mirroring/redirecting traffic based on logic implemented in netfilter extensions. Example use case is classification based on the incoming IPSec policy used during decapsulation using the 'policy' iptables extension (xt_policy). This patchset is an enhancement of a former series ([1]) which allowed only policy matching following a suggestion by Pablo Neira Ayuso ([2]). [1] https://patchwork.ozlabs.org/cover/859887/ [2] https://patchwork.ozlabs.org/patch/859888/ v2: Remove skb push/pull and limit functionality to ingress Eyal Birger (2): net: sched: ematch: pass protocol to ematch 'change()' handlers net: sched: add em_ipt ematch for calling xtables matches include/net/pkt_cls.h| 2 +- include/uapi/linux/pkt_cls.h | 3 +- include/uapi/linux/tc_ematch/tc_em_ipt.h | 19 +++ net/sched/Kconfig| 10 ++ net/sched/Makefile | 1 + net/sched/em_canid.c | 4 +- net/sched/em_ipset.c | 4 +- net/sched/em_ipt.c | 244 +++ net/sched/em_meta.c | 2 +- net/sched/em_nbyte.c | 4 +- net/sched/em_text.c | 2 +- net/sched/ematch.c | 3 +- 12 files changed, 287 insertions(+), 11 deletions(-) create mode 100644 include/uapi/linux/tc_ematch/tc_em_ipt.h create mode 100644 net/sched/em_ipt.c -- 2.7.4
[PATCH net-next,v2 2/2] net: sched: add em_ipt ematch for calling xtables matches
From: Eyal Birger This module allows performing tc classification based on data structures and implementations provided by netfilter extensions. Example use case is classification based on the incoming IPSec policy used during decpsulation using the 'policy' iptables extension (xt_policy). Only tc ingress is supported in order to avoid modifying the skb at egress to suit xtable matches skb->data expectations. Signed-off-by: Eyal Birger Acked-by: Pablo Neira Ayuso --- This ematch tries its best to provide matches with a similar environment to the one they are used to; Due to the wide range of criteria supported by matches, I am not able to test every combination of match and traffic. --- include/uapi/linux/pkt_cls.h | 3 +- include/uapi/linux/tc_ematch/tc_em_ipt.h | 19 +++ net/sched/Kconfig| 10 ++ net/sched/Makefile | 1 + net/sched/em_ipt.c | 244 +++ 5 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/tc_ematch/tc_em_ipt.h create mode 100644 net/sched/em_ipt.c diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 46c5066..7cafb26d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -555,7 +555,8 @@ enum { #defineTCF_EM_VLAN 6 #defineTCF_EM_CANID7 #defineTCF_EM_IPSET8 -#defineTCF_EM_MAX 8 +#defineTCF_EM_IPT 9 +#defineTCF_EM_MAX 9 enum { TCF_EM_PROG_TC diff --git a/include/uapi/linux/tc_ematch/tc_em_ipt.h b/include/uapi/linux/tc_ematch/tc_em_ipt.h new file mode 100644 index 000..aecd252 --- /dev/null +++ b/include/uapi/linux/tc_ematch/tc_em_ipt.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_TC_EM_IPT_H +#define __LINUX_TC_EM_IPT_H + +#include +#include + +enum { + TCA_EM_IPT_UNSPEC, + TCA_EM_IPT_HOOK, + TCA_EM_IPT_MATCH_NAME, + TCA_EM_IPT_MATCH_REVISION, + TCA_EM_IPT_MATCH_DATA, + __TCA_EM_IPT_MAX +}; + +#define TCA_EM_IPT_MAX (__TCA_EM_IPT_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 
c03d86a..203e7f7 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -658,6 +658,16 @@ config NET_EMATCH_IPSET To compile this code as a module, choose M here: the module will be called em_ipset. +config NET_EMATCH_IPT + tristate "IPtables Matches" + depends on NET_EMATCH && NETFILTER && NETFILTER_XTABLES + ---help--- + Say Y here to be able to classify packets based on iptables + matches. + + To compile this code as a module, choose M here: the + module will be called em_ipt. + config NET_CLS_ACT bool "Actions" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 5b63544..8811d38 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -75,3 +75,4 @@ obj-$(CONFIG_NET_EMATCH_META) += em_meta.o obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o +obj-$(CONFIG_NET_EMATCH_IPT) += em_ipt.o diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c new file mode 100644 index 000..2103b30 --- /dev/null +++ b/net/sched/em_ipt.c @@ -0,0 +1,244 @@ +/* + * net/sched/em_ipt.c IPtables matches Ematch + * + * (c) 2018 Eyal Birger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct em_ipt_match { + const struct xt_match *match; + u32 hook; + u8 nfproto; + u8 match_data[0] __aligned(8); +}; + +static int check_match(struct net *net, struct em_ipt_match *im, int mdata_len) +{ + struct xt_mtchk_param mtpar = {}; + union { + struct ipt_entry e4; + struct ip6t_entry e6; + } e = {}; + + mtpar.net = net; + mtpar.table = "filter"; + mtpar.hook_mask = 1 << im->hook; + mtpar.family= im->nfproto; + mtpar.match = im->match; + mtpar.entryinfo = &e; + mtpar.matchinfo = (void *)im->match_data; + return xt_check_match(&mtpar, mdata_len, 0, 0); +} + +static const struct nla_policy em_ipt_policy[TCA_EM_IPT_MAX + 1] = { + [TCA_EM_IPT_HOOK] = { .type = NLA_U32 }, + [TCA_EM_IPT_MATCH_NAME] = { .type = NLA_STRING, + .len = XT_EXTENSION_MAXNAMELEN }, + [TCA_EM_IPT_MATCH_REVISION] = { .type = NLA_U8 }
[PATCH net-next,v2 1/2] net: sched: ematch: pass protocol to ematch 'change()' handlers
From: Eyal Birger In order to allow ematches to create their internal state based on the L3 protocol specified when creating the filter. Signed-off-by: Eyal Birger --- include/net/pkt_cls.h | 2 +- net/sched/em_canid.c | 4 ++-- net/sched/em_ipset.c | 4 ++-- net/sched/em_meta.c | 2 +- net/sched/em_nbyte.c | 4 ++-- net/sched/em_text.c | 2 +- net/sched/ematch.c| 3 ++- 7 files changed, 11 insertions(+), 10 deletions(-) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 8740625..ff5fcb1 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -474,7 +474,7 @@ struct tcf_ematch_tree { struct tcf_ematch_ops { int kind; int datalen; - int (*change)(struct net *net, void *, + int (*change)(struct net *net, __be16, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index ddd883c..445c10d 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -120,8 +120,8 @@ static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, return match; } -static int em_canid_change(struct net *net, void *data, int len, - struct tcf_ematch *m) +static int em_canid_change(struct net *net, __be16 protocol, void *data, + int len, struct tcf_ematch *m) { struct can_filter *conf = data; /* Array with rules */ struct canid_match *cm; diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index c1b23e3..50f7282 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -19,8 +19,8 @@ #include #include -static int em_ipset_change(struct net *net, void *data, int data_len, - struct tcf_ematch *em) +static int em_ipset_change(struct net *net, __be16 protocol, void *data, + int data_len, struct tcf_ematch *em) { struct xt_set_info *set = data; ip_set_id_t index; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index d6e9711..6892efc 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -904,7 +904,7 @@ static const struct nla_policy 
meta_policy[TCA_EM_META_MAX + 1] = { [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, }; -static int em_meta_change(struct net *net, void *data, int len, +static int em_meta_change(struct net *net, __be16 protocol, void *data, int len, struct tcf_ematch *m) { int err; diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 07c10ba..62a7a2e 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -23,8 +23,8 @@ struct nbyte_data { charpattern[0]; }; -static int em_nbyte_change(struct net *net, void *data, int data_len, - struct tcf_ematch *em) +static int em_nbyte_change(struct net *net, __be16 protocol, void *data, + int data_len, struct tcf_ematch *em) { struct tcf_em_nbyte *nbyte = data; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 73e2ed5..b5d9e21 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -44,7 +44,7 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, return skb_find_text(skb, from, to, tm->config) != UINT_MAX; } -static int em_text_change(struct net *net, void *data, int len, +static int em_text_change(struct net *net, __be16 protocol, void *data, int len, struct tcf_ematch *m) { struct text_match *tm; diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 1331a4c..a69abd8 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -242,7 +242,8 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; if (em->ops->change) { - err = em->ops->change(net, data, data_len, em); + err = em->ops->change(net, tp->protocol, data, data_len, + em); if (err < 0) goto errout; } else if (data_len > 0) { -- 2.7.4
Re: [PATCH 1/3] Partial revert "e1000e: Avoid receiver overrun interrupt bursts"
On Fri, Jan 26, 2018 at 1:12 AM, Benjamin Poirier wrote: > This reverts commit 4aea7a5c5e940c1723add439f4088844cd26196d. > > We keep the fix for the first part of the problem (1) described in the log > of that commit however we use the implementation of e1000_msix_other() from > before commit 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt"). > We remove the fix for the second part of the problem (2). > > Bursts of "Other" interrupts may once again occur during rxo (receive > overflow) traffic conditions. This is deemed acceptable in the interest of > reverting driver code back to its previous state. The e1000e driver should > be in "maintenance" mode and we want to avoid unforeseen fallout from > changes that are not strictly necessary. > > Link: https://www.spinics.net/lists/netdev/msg480675.html > Signed-off-by: Benjamin Poirier Thanks for doing this. Only a few minor tweaks I would recommend. Otherwise it looks good. > --- > drivers/net/ethernet/intel/e1000e/defines.h | 1 - > drivers/net/ethernet/intel/e1000e/netdev.c | 32 > +++-- > 2 files changed, 12 insertions(+), 21 deletions(-) > > diff --git a/drivers/net/ethernet/intel/e1000e/defines.h > b/drivers/net/ethernet/intel/e1000e/defines.h > index afb7ebe20b24..0641c0098738 100644 > --- a/drivers/net/ethernet/intel/e1000e/defines.h > +++ b/drivers/net/ethernet/intel/e1000e/defines.h > @@ -398,7 +398,6 @@ > #define E1000_ICR_LSC 0x0004 /* Link Status Change */ > #define E1000_ICR_RXSEQ 0x0008 /* Rx sequence error */ > #define E1000_ICR_RXDMT00x0010 /* Rx desc min. 
threshold (0) */ > -#define E1000_ICR_RXO 0x0040 /* Receiver Overrun */ > #define E1000_ICR_RXT0 0x0080 /* Rx timer intr (ring 0) */ > #define E1000_ICR_ECCER 0x0040 /* Uncorrectable ECC Error */ > /* If this bit asserted, the driver should claim the interrupt */ > diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c > b/drivers/net/ethernet/intel/e1000e/netdev.c > index 9f18d39bdc8f..398e940436f8 100644 > --- a/drivers/net/ethernet/intel/e1000e/netdev.c > +++ b/drivers/net/ethernet/intel/e1000e/netdev.c > @@ -1914,30 +1914,23 @@ static irqreturn_t e1000_msix_other(int > __always_unused irq, void *data) > struct net_device *netdev = data; > struct e1000_adapter *adapter = netdev_priv(netdev); > struct e1000_hw *hw = &adapter->hw; > - u32 icr; > - bool enable = true; > - > - icr = er32(ICR); > - if (icr & E1000_ICR_RXO) { > - ew32(ICR, E1000_ICR_RXO); > - enable = false; > - /* napi poll will re-enable Other, make sure it runs */ > - if (napi_schedule_prep(&adapter->napi)) { > - adapter->total_rx_bytes = 0; > - adapter->total_rx_packets = 0; > - __napi_schedule(&adapter->napi); > - } > - } > - if (icr & E1000_ICR_LSC) { > - ew32(ICR, E1000_ICR_LSC); > + u32 icr = er32(ICR); > + > + if (icr & adapter->eiac_mask) > + ew32(ICS, (icr & adapter->eiac_mask)); I'm not sure about this bit as it includes queue interrupts if I am not mistaken. What we should be focusing on are the bits that are not auto-cleared so this should probably be icr & ~adapter->eiac_mask. Actually what you might do is something like: icr &= ~adapter->eiac_mask; if (icr) ew32(ICS, icr); Also a comment explaining that we have to clear the bits that are not automatically cleared by other interrupt causes might be useful. > + if (icr & E1000_ICR_OTHER) { > + if (!(icr & E1000_ICR_LSC)) > + goto no_link_interrupt; I wouldn't bother with the jump tag. Let's just make this one if statement checking both OTHER && LSC. 
> hw->mac.get_link_status = true; > /* guard against interrupt when we're going down */ > if (!test_bit(__E1000_DOWN, &adapter->state)) > mod_timer(&adapter->watchdog_timer, jiffies + 1); > } > > - if (enable && !test_bit(__E1000_DOWN, &adapter->state)) > - ew32(IMS, E1000_IMS_OTHER); > +no_link_interrupt: If you just combine the if statement logic the code naturally flows to this point anyway without the jump label being needed. > + if (!test_bit(__E1000_DOWN, &adapter->state)) > + ew32(IMS, E1000_IMS_LSC | E1000_IMS_OTHER); > > return IRQ_HANDLED; > } > @@ -2707,8 +2700,7 @@ static int e1000e_poll(struct napi_struct *napi, int > weight) > napi_complete_done(napi, work_done); > if (!test_bit(__E1000_DOWN, &adapter->state)) { > if (adapter->msix_entries) > - ew32(IMS, adapter->rx_ring->ims_val | > -E1000_IMS_OTHER); > +
Re: [PATCH bpf-next 1/2] bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map
On 1/25/18 8:47 PM, Eric Dumazet wrote: On Thu, 2018-01-18 at 15:08 -0800, Yonghong Song wrote: +find_leftmost: + /* Find the leftmost non-intermediate node, all intermediate nodes +* have exact two children, so this function will never return NULL. +*/ syzbot [1] disagrees violently with this comment. + for (node = rcu_dereference(*root); node;) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + next_node = node; + node = rcu_dereference(node->child[0]); + } +do_copy: + next_key->prefixlen = next_node->prefixlen; + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + next_node->data, trie->data_size); Thanks for reporting the issue. I looked at the update and delete code in lpm_trie.c, it looks the comment that each intermediate node has two children still holds. But I do find a problem: 616 /* Empty trie */ 617 if (!rcu_dereference(trie->root)) 618 return -ENOENT; 619 620 /* For invalid key, find the leftmost node in the trie */ 621 if (!key || key->prefixlen > trie->max_prefixlen) { 622 root = &trie->root; 623 goto find_leftmost; 624 } .. 672 find_leftmost: 673 /* Find the leftmost non-intermediate node, all intermediate nodes 674 * have exact two children, so this function will never return NULL. 675 */ 676 for (node = rcu_dereference(*root); node;) { 677 if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) 678 next_node = node; 679 node = rcu_dereference(node->child[0]); 680 } It is possible that at line 617, trie->root is not NULL, but later at line 676 is NULL. This will lead to next_node is NULL. Will write a test to demonstrate this and examine intermediate node property and propose a fix soon. 
[1] syzbot hit the following crash on e9dcd80b9d77a92bfae6ce42a451f5c5fd318832 git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master config: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller-2Dbuganizer.googleplex.com_text-3Ftag-3DConfig-26id-3Db2216f04db2aa337e2bbf5ebd233919c3e2aa05f&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=aHeuptD-ytpgZCGZK_koCI1pUdJBACYveZ3RmN35g90&e= compiler: gcc (GCC) 7.1.1 20170620 Unfortunately, I don't have any reproducer for this bug yet. raw crash log: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller-2Dbuganizer.googleplex.com_text-3Ftag-3DCrashLog-26id-3D4f78be02e2cd37040b8796322e02b147caae6024&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=RS0jaMd3n5ipeDtuFCL8B0152mZoHBx0b8xNq9AjDes&e= dashboard link: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_bug-3Fextid-3D148b56534d9269ab7433&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=6L_1a0P1zZJmhAN49_cgfs8_LjnBW7h4L-SXEFF4orc&e= See https://urldefense.proofpoint.com/v2/url?u=http-3A__go_syzbot&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=iajwDJDDZzMqC0CxPLANoybk0lVIpuh2r7pz0rcD8kI&e= for details on how to handle this bug. 
kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 8033 Comm: syz-executor3 Not tainted 4.15.0-rc8+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682 RSP: 0018:8801aa44f628 EFLAGS: 00010202 RAX: dc00 RBX: RCX: 81829a9d RDX: 0004 RSI: c90003b7b000 RDI: 0020 RBP: 8801aa44f8b0 R08: 817e8f95 R09: 0002 R10: 8801aa44f790 R11: R12: R13: 110035489f01 R14: fff4 R15: FS: 7fbb3b39b700() GS:8801db30() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 2057a000 CR3: 0001c26e4005 CR4: 001606e0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 Call Trace: map_get_next_key kernel/bpf/syscall.c:842 [inline] SYSC_bpf kernel/bpf/syscall.c:1881 [inline] SyS_bpf+0x11b4/0x4860 kernel/bpf/syscall.c:1846 entry_SYSCALL_64_fastpath+0x29/0xa0 RIP: 0033:0x452f19 RSP: 002b:7fbb3b39ac58 EFLAGS: 0212 ORIG_RAX: 0141 RAX: ffda RBX: 0071bea0 RCX: 00452f19 RDX: 0018
Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
On 1/26/2018 12:14 AM, Siwei Liu wrote: On Tue, Jan 23, 2018 at 2:58 PM, Michael S. Tsirkin wrote: On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote: On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin wrote: On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote: First off, as mentioned in another thread, the model of stacking up virt-bond functionality over virtio seems a wrong direction to me. Essentially the migration process would need to carry over all guest side configurations previously done on the VF/PT and get them moved to the new device being it virtio or VF/PT. I might be wrong but I don't see why we should worry about this usecase. Whoever has a bond configured already has working config for migration. We are trying to help people who don't, not convert existig users. That has been placed in the view of cloud providers that the imported images from the store must be able to run unmodified thus no additional setup script is allowed (just as Stephen mentioned in another mail). Cloud users don't care about live migration themselves but the providers are required to implement such automation mechanism to make this process transparent if at all possible. The user does not care about the device underneath being VF or not, but they do care about consistency all across and the resulting performance acceleration in making VF the prefered datapath. It is not quite peculiar user cases but IMHO *any* approach proposed for live migration should be able to persist the state including network config e.g. as simple as MTU. Actually this requirement has nothing to do with virtio but our target users are live migration agnostic, being it tracking DMA through dirty pages, using virtio as the helper, or whatsoever, the goal of persisting configs across remains same. So the patching being discussed here will mostly do exactly that if your original config was simply a single virtio net device. 
True, but I don't see the patch being discussed starts with good foundation of supporting the same for VF/PT device. That is the core of the issue. What kind of configs do your users have right now? Any configs be it generic or driver specific that the VF/PT device supports and have been enabled/configured. General network configs (MAC, IP address, VLAN, MTU, iptables rules), ethtool settings (hardware offload, # of queues and ring entries, RSC options, rss rxfh-indir table, rx-flow-hash, et al) , bpf/XDP program being run, tc flower offload, just to name a few. As cloud providers we don't limit users from applying driver specific tuning to the NIC/VF, and sometimes this is essential to achieving best performance for their workload. We've seen cases like tuning coalescing parameters for getting low latency, changing rx-flow-hash function for better VXLAN throughput, or even adopting quite advanced NIC features such as flow director or cloud filter. We don't expect users to compromise even a little bit on these. That is once we turn on live migration for the VF or pass through devices in the VM, it all takes place under the hood, users (guest admins, applications) don't have to react upon it or even notice the change. I should note that the majority of live migrations take place between machines with completely identical hardware, it's more critical than necessary to keep the config as-is across the move, stealth while quiet. This usecase is much more complicated and different than what this patch is trying to address. Also your usecase seems to be assuming that source and destination hosts are identical and have the same HW. It makes it pretty hard to transparently migrate all these settings with live migration when we are looking at a solution that unplugs the VF interface from the host and the VF driver is unloaded. As you see generic bond or bridge cannot suffice the need. 
That's why we need a new customized virt bond driver, and tailor it for VM live migration specifically. Leveraging para-virtual e.g. virtio net device as the backup path is one thing, tracking through driver config changes in order to re-config as necessary is another. I would think without considering the latter, the proposal being discussed is rather incomplete, and remote to be useful in production. Without the help of a new upper layer bond driver that enslaves virtio and VF/PT devices underneath, virtio will be overloaded with too much specifics being a VF/PT backup in the future. So this paragraph already includes at least two conflicting proposals. On the one hand you want a separate device for the virtual bond, on the other you are saying a separate driver. Just to be crystal clear: separate virtual bond device (netdev ops, not necessarily bus device) for VM migration specifically with a separate driver. Okay, but note that any config someone had on a virtio device won't propagate to that bond. Further, the reason to have a separate *driver* was that some people wanted to share code with netvsc -
[net] i40e/i40evf: Update DESC_NEEDED value to reflect larger value
From: Alexander Duyck When compared to ixgbe and other previous Intel drivers the i40e and i40evf drivers actually reserve 2 additional descriptors in maybe_stop_tx for cache line alignment. We need to update DESC_NEEDED to reflect this as otherwise we are more likely to return TX_BUSY which will cause issues with things like xmit_more. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 2 +- drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index fbae1182e2ea..675923f47281 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -277,7 +277,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size) } /* Tx Descriptors needed, worst case */ -#define DESC_NEEDED (MAX_SKB_FRAGS + 4) +#define DESC_NEEDED (MAX_SKB_FRAGS + 6) #define I40E_MIN_DESC_PENDING 4 #define I40E_TX_FLAGS_HW_VLAN BIT(1) diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h index 8d26c85d12e1..1f2959e66c34 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h @@ -260,7 +260,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size) } /* Tx Descriptors needed, worst case */ -#define DESC_NEEDED (MAX_SKB_FRAGS + 4) +#define DESC_NEEDED (MAX_SKB_FRAGS + 6) #define I40E_MIN_DESC_PENDING 4 #define I40E_TX_FLAGS_HW_VLAN BIT(1) -- 2.14.3
Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
On Thu, Jan 11, 2018 at 09:58:39PM -0800, Sridhar Samudrala wrote: > @@ -2859,6 +3123,42 @@ static struct virtio_driver virtio_net_driver = { > #endif > }; > > +static int virtio_netdev_event(struct notifier_block *this, > +unsigned long event, void *ptr) > +{ > + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); > + > + /* Skip our own events */ > + if (event_dev->netdev_ops == &virtnet_netdev) > + return NOTIFY_DONE; > + > + /* Avoid non-Ethernet type devices */ > + if (event_dev->type != ARPHRD_ETHER) > + return NOTIFY_DONE; > + > + /* Avoid Vlan dev with same MAC registering as VF */ > + if (is_vlan_dev(event_dev)) > + return NOTIFY_DONE; > + > + /* Avoid Bonding master dev with same MAC registering as VF */ > + if ((event_dev->priv_flags & IFF_BONDING) && > + (event_dev->flags & IFF_MASTER)) > + return NOTIFY_DONE; > + > + switch (event) { > + case NETDEV_REGISTER: > + return virtnet_register_vf(event_dev); > + case NETDEV_UNREGISTER: > + return virtnet_unregister_vf(event_dev); > + default: > + return NOTIFY_DONE; > + } > +} > + > +static struct notifier_block virtio_netdev_notifier = { > + .notifier_call = virtio_netdev_event, > +}; > + > static __init int virtio_net_driver_init(void) > { > int ret; > @@ -2877,6 +3177,8 @@ static __init int virtio_net_driver_init(void) > ret = register_virtio_driver(&virtio_net_driver); > if (ret) > goto err_virtio; > + > + register_netdevice_notifier(&virtio_netdev_notifier); > return 0; > err_virtio: > cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); > @@ -2889,6 +3191,7 @@ module_init(virtio_net_driver_init); > > static __exit void virtio_net_driver_exit(void) > { > + unregister_netdevice_notifier(&virtio_netdev_notifier); > unregister_virtio_driver(&virtio_net_driver); > cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); > cpuhp_remove_multi_state(virtionet_online); I have a question here: what if PT device driver module loads and creates a device before virtio? > -- > 2.14.3
[PATCH net-next] sfc: mark some unexported symbols as static
From: kbuild test robot efx_default_channel_want_txqs() is only used in efx.c, while efx_ptp_want_txqs() and efx_ptp_channel_type (a struct) are only used in ptp.c. In all cases these symbols should be static. Fixes: 2935e3c38228 ("sfc: on 8000 series use TX queues for TX timestamps") Signed-off-by: Fengguang Wu [ec...@solarflare.com: rewrote commit message] Signed-off-by: Edward Cree --- drivers/net/ethernet/sfc/efx.c | 2 +- drivers/net/ethernet/sfc/ptp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 456866b05641..16757cfc5b29 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -896,7 +896,7 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue) mod_timer(&rx_queue->slow_fill, jiffies + msecs_to_jiffies(100)); } -bool efx_default_channel_want_txqs(struct efx_channel *channel) +static bool efx_default_channel_want_txqs(struct efx_channel *channel) { return channel->channel - channel->efx->tx_channel_offset < channel->efx->n_tx_channels; diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 433d29d6bc95..3e2c5b11b5ef 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -366,7 +366,7 @@ bool efx_ptp_use_mac_tx_timestamps(struct efx_nic *efx) /* PTP 'extra' channel is still a traffic channel, but we only create TX queues * if PTP uses MAC TX timestamps, not if PTP uses the MC directly to transmit. 
*/ -bool efx_ptp_want_txqs(struct efx_channel *channel) +static bool efx_ptp_want_txqs(struct efx_channel *channel) { return efx_ptp_use_mac_tx_timestamps(channel->efx); } @@ -2146,7 +2146,7 @@ static int efx_phc_enable(struct ptp_clock_info *ptp, return 0; } -const struct efx_channel_type efx_ptp_channel_type = { +static const struct efx_channel_type efx_ptp_channel_type = { .handle_no_channel = efx_ptp_handle_no_channel, .pre_probe = efx_ptp_probe_channel, .post_remove= efx_ptp_remove_channel,
Re: [PATCH 2/3] Revert "e1000e: Separate signaling for link check/link up"
On Fri, Jan 26, 2018 at 1:12 AM, Benjamin Poirier wrote: > This reverts commit 19110cfbb34d4af0cdfe14cd243f3b09dc95b013. > This reverts commit 4110e02eb45ea447ec6f5459c9934de0a273fb91. > > ... because they cause an extra 2s delay for the link to come up when > autoneg is off. > > After reverting, the race condition described in the log of commit > 19110cfbb34d ("e1000e: Separate signaling for link check/link up") is > reintroduced. It may still be triggered by LSC events but this should not > result in link flap. It may no longer be triggered by RXO events because > commit 4aea7a5c5e94 ("e1000e: Avoid receiver overrun interrupt bursts") > restored reading icr in the Other handler. With the RXO events removed the only cause for us to transition the bit should be LSC. I'm not sure if the race condition in that state is a valid concern or not as the LSC should only get triggered if the link state toggled, even briefly. The bigger concern I would have would be the opposite of the original race that was pointed out: \ e1000_watchdog_task \ e1000e_has_link \ hw->mac.ops.check_for_link() === e1000e_check_for_copper_link /* link is up */ mac->get_link_status = false; /* interrupt */ \ e1000_msix_other hw->mac.get_link_status = true; link_active = !hw->mac.get_link_status /* link_active is false, wrongly */ So the question I would have is what if we see the LSC for a link down just after the check_for_copper_link call completes? It may not be anything seen in the real world since I don't know if we have any link flapping issues on e1000e or not without this patch. It is something to keep in mind for the future though. > As discussed, the driver should be in "maintenance mode". In the interest > of stability, revert to the original code as much as possible instead of a > half-baked solution. If nothing else we may want to do a follow-up on this patch as we probably shouldn't be returning the error values to trigger link up. There are definitely issues to be found here. 
If nothing else we may want to explore just returning 1 if auto-neg is disabled instead of returning an error code. > Link: https://www.spinics.net/lists/netdev/msg479923.html > Signed-off-by: Benjamin Poirier > --- > drivers/net/ethernet/intel/e1000e/ich8lan.c | 11 +++ > drivers/net/ethernet/intel/e1000e/mac.c | 11 +++ > drivers/net/ethernet/intel/e1000e/netdev.c | 2 +- > 3 files changed, 7 insertions(+), 17 deletions(-) > > diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c > b/drivers/net/ethernet/intel/e1000e/ich8lan.c > index 31277d3bb7dc..d6d4ed7acf03 100644 > --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c > +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c > @@ -1367,9 +1367,6 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw > *hw, bool force) > * Checks to see of the link status of the hardware has changed. If a > * change in link status has been detected, then we read the PHY registers > * to get the current speed/duplex if link exists. > - * > - * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link > - * up). > **/ > static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) > { > @@ -1385,7 +1382,7 @@ static s32 e1000_check_for_copper_link_ich8lan(struct > e1000_hw *hw) > * Change or Rx Sequence Error interrupt. > */ > if (!mac->get_link_status) > - return 1; > + return 0; > > /* First we want to see if the MII Status Register reports > * link. If so, then we want to get the current speed/duplex > @@ -1616,12 +1613,10 @@ static s32 e1000_check_for_copper_link_ich8lan(struct > e1000_hw *hw) > * different link partner. 
> */ > ret_val = e1000e_config_fc_after_link_up(hw); > - if (ret_val) { > + if (ret_val) > e_dbg("Error configuring flow control\n"); > - return ret_val; > - } > > - return 1; > + return ret_val; > } > > static s32 e1000_get_variants_ich8lan(struct e1000_adapter *adapter) > diff --git a/drivers/net/ethernet/intel/e1000e/mac.c > b/drivers/net/ethernet/intel/e1000e/mac.c > index f457c5703d0c..b322011ec282 100644 > --- a/drivers/net/ethernet/intel/e1000e/mac.c > +++ b/drivers/net/ethernet/intel/e1000e/mac.c > @@ -410,9 +410,6 @@ void e1000e_clear_hw_cntrs_base(struct e1000_hw *hw) > * Checks to see of the link status of the hardware has changed. If a > * change in link status has been detected, then we read the PHY registers > * to get the current speed/duplex if link exists. > - * > - * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link > - * up). > **/ > s32 e1000e_check_for_copper_link(struct e1000_hw *hw) > { > @@ -426,7 +423,7 @@ s32 e1000e_check_for_cop
pull-request: wireless-drivers-next 2018-01-26
Hi Dave, this is a pull request to net-next for 4.16, more info in the signed tag below. Please let me know if you have any problems. Kalle The following changes since commit ebdd7b491b8a65d65936e07004caabca4a3c94a0: Merge branch 'mlxsw-Add-support-for-mirror-action-with-flower' (2018-01-21 18:21:31 -0500) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers-next.git tags/wireless-drivers-next-for-davem-2018-01-26 for you to fetch changes up to 30ce7f4456ae40e970d9e82fe63c5e55147af0c0: mt76: validate rx CCMP PN (2018-01-26 11:20:52 +0200) wireless-drivers-next patches for 4.16 Major changes: wil6210 * add PCI device id for Talyn * support flashless device ath9k * improve RSSI/signal accuracy on AR9003 series mt76 * validate CCMP PN from received frames to avoid replay attacks qtnfmac * support 64-bit network stats * report more hardware information to kernel log and some via ethtool Arend Van Spriel (2): brcmfmac: assure bcdc dcmd api does not return value > 0 brcmfmac: separate firmware errors from i/o errors Dedy Lansky (1): wil6210: support flashless device Felix Fietkau (10): mt76: fix transmission of encrypted management frames ath9k: discard undersized packets mt76: retry rx polling as long as there is budget left mt76: fix TSF value in probe responses mt76: add an intermediate struct for rx status information mt76: get station pointer by wcid and pass it to mac80211 mt76: implement A-MPDU rx reordering in the driver code mt76: split mt76_rx_complete mt76: pass the per-vif wcid to the core for multicast rx mt76: validate rx CCMP PN Igor Mitsyanko (2): qtnfmac: do not use mutexes in timer context qtnfmac: do not use bus mutex for events processing Kalle Valo (1): Merge ath-next from git://git.kernel.org/.../kvalo/ath.git Lior David (2): wil6210: fix random failure to bring network interface up wil6210: enlarge FW mac_rgf_ext section for Sparrow D0 Lorenzo Bianconi (5): mt76x2: fix WMM parameter 
configuration mt76x2: dfs: avoid tasklet scheduling during mt76x2_dfs_init_params() mt76x2: dfs: add set_domain handler mt76x2: dfs: take into account dfs region in mt76x2_dfs_init_params() mt76x2: init: disable all pending tasklets during device removal Luis de Bethencourt (1): rtl8xxxu: Fix trailing semicolon Maya Erez (5): wil6210: add Talyn PCIe device ID wil6210: recognize Talyn JTAG ID wil6210: add support for Talyn AHB address map wil6210: configure OTP HW vectors in SW reset flow wil6210: support parsing brd file address from fw file Ping-Ke Shih (10): rtlwifi: btcoex: extend get_wifi_bw to support bandwidth 80M rtlwifi: btcoex: Add switch band notify for btc rtlwifi: btcoex: Add variable ant_div_cfg to support antenna diversity rtlwifi: btcoex: add scan_notify within ips_notify if RFON rtlwifi: btcoex: Add wifi_only series ops to control solo card rtlwifi: btcoex: add boolean variables dbg_mode rtlwifi: 8822be has to report vht capability to mac80211 rtlwifi: Add ratr_table for newer IC rtlwifi: Add spec_ver to check whether use new rate-id or not rtlwifi: btcoex: Fix some static warnings from Sparse Ramon Fried (1): wcn36xx: release DMA memory in case of error Sergey Matyukevich (4): qtnfmac: modify supported interface combinations qtnfmac: validate interface combinations on changes qtnfmac: fix STA disconnect procedure qtnfmac: remove redundant 'unlikely' checks Vasily Ulyanov (5): qtnfmac: remove struct qlink_cmd_set_mac_acl qtnfmac: fix warnings when mBSS setup is stopped qtnfmac: support 64-bit network interface stats qtnfmac: get more hardware info from card qtnfmac: report hardware/firmware information via ethtool Wojciech Dubowik (4): ath9k: Alternative EEPROM size for AR9003 ath9k: Read noise floor calibration data from eeprom ath9k: Use calibrated noise floor value when available ath9k: Display calibration data piers in debugfs drivers/net/wireless/ath/ath9k/ar9003_eeprom.c | 133 ++- drivers/net/wireless/ath/ath9k/ar9003_eeprom.h | 10 + 
drivers/net/wireless/ath/ath9k/calib.c | 38 +-- drivers/net/wireless/ath/ath9k/hw.h| 2 + drivers/net/wireless/ath/ath9k/recv.c | 4 +- drivers/net/wireless/ath/wcn36xx/dxe.c | 46 +++- drivers/net/wireless/ath/wil6210/boot_loader.h | 9 +- drivers/net/wireless/ath/wil6210/fw.h | 18 +- drivers/net/wireless/ath/wil6210/fw_inc.c | 167 - drivers/net/wireless/ath/wil6210/interrupt.c | 6 +- drivers/
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
On Fri, Jan 26, 2018 at 11:07:39AM -0500, David Miller wrote: > >> This is found by a static analysis tool named DCNS written by myself. > > > > The trouble is, places like > > net/atm/raw.c:65: vcc->send = atm_send_aal0; > > net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; > > net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; > > mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. > > ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) > > ? DROP_PACKET : 1; > > bh_unlock_sock(sk_atm(vcc)); > > in pppoatm_send() definitely is called under a spinlock. > > > > Looking through the driver (in advanced bitrot, as usual for drivers/atm), > > I'd say that submit_queue() is fucked in head in the "queue full" case. > > And judging by the history, had been thus since the original merge... > > Jia-Ju, I'm probably not going to apply any of your GFP_KERNEL > conversions. > > Al's analysis above is similar to how things looked for other patches > you submited of this nature. > > So because of the lack of care and serious auditing you put into these > conversions, I really have no choice but to drop them from my queue > because on the whole they are adding bugs rather than improving the > code. FWIW, the tool *does* promise to be useful - as far as I understand it looks for places where GFP_ATOMIC allocation goes with blocking operations near every callchain leading there. And that indicates something fishy going on - either pointless GFP_ATOMIC (in benign case), or something much nastier: a callchain that would require GFP_ATOMIC. In that case whatever blocking operation found along the way is a bug. This time it has, AFAICS, caught a genuine bug in drivers/atm/firestream.c - static void submit_qentry (struct fs_dev *dev, struct queue *q, struct FS_QENTRY *qe) { u32 wp; struct FS_QENTRY *cqe; /* XXX Sanity check: the write pointer can be checked to be still the same as the value passed as qe... 
-- REW */ /* udelay (5); */ while ((wp = read_fs (dev, Q_WP (q->offset))) & Q_FULL) { fs_dprintk (FS_DEBUG_TXQ, "Found queue at %x full. Waiting.\n", q->offset); schedule (); } ... static void submit_queue (struct fs_dev *dev, struct queue *q, u32 cmd, u32 p1, u32 p2, u32 p3) { struct FS_QENTRY *qe; qe = get_qentry (dev, q); qe->cmd = cmd; qe->p0 = p1; qe->p1 = p2; qe->p2 = p3; submit_qentry (dev, q, qe); ... static int fs_send (struct atm_vcc *atm_vcc, struct sk_buff *skb) { ... td = kmalloc (sizeof (struct FS_BPENTRY), GFP_ATOMIC); ... submit_queue (dev, &dev->hp_txq, QE_TRANSMIT_DE | vcc->channo, virt_to_bus (td), 0, virt_to_bus (td)); ... Either all callchains leading to fs_send() are in non-atomic contexts (in which case that GFP_ATOMIC would be pointless) or there's one where we cannot block. Any such would be a potential deadlock. And the latter appears to be the case - fs_send() is atmdev_ops ->send(), which can end up in atm_vcc ->send(), which can be called from under ->sk_lock.slock. What would be really useful: * list of "here's a list of locations where we do something blocking; each callchain to this GFP_ATOMIC allocation passes either upstream of one of those without leaving atomic context in between or downstream without entering one". * after that - backtracking these callchains further, watching for ones in atomic contexts. Can be done manually, but if that tool can assist in doing so, all the better. If we do find one, we have found a deadlock - just take the blocking operation reported for that callchain and that's it. If it's not an obvious false positive (e.g. if (!foo) spin_lock(&splat); . if (foo) schedule(); ), it's worth reporting to maintainers of the code in question. * if all callchains reach obviously non-atomic contexts (syscall entry, for example, or a kernel thread payload, or a function documented to require a mutex held by caller, etc.) - then a switch to GFP_KERNEL might be appropriate. 
With analysis of callchains posted as you are posting that. * either way, having the tool print the callchains out would be a good idea - for examining them, for writing reports, etc.
Re: [regression 4.15] Userspace compilation broken by uapi/linux/if_ether.h update
On Fri, Jan 26, 2018 at 11:51:38AM +0100, Guillaume Nault wrote: > On Thu, Jan 25, 2018 at 11:21:34PM +0100, Hauke Mehrtens wrote: > > On 01/25/2018 03:58 PM, Guillaume Nault wrote: > > > Now that linux/libc-compat.h is included in linux/if_ether.h, it is > > > processed before netinet/in.h. Therefore, it sets the relevant > > > __UAPI_DEF_* guards to 1 (as _NETINET_IN_H isn't yet defined). > > > Then netinet/in.h is included, followed by linux/in.h. The later > > > doesn't realise that what it defines has already been included by > > > netinet/in.h because the __UAPI_DEF_* guards were set too early. > > > > > This is about this commit: > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6926e041a8920c8ec27e4e155efa760aa01551fd > > > > On option would be to move this into include/uapi/linux/if_ether.h and > > remove the include for libc-compat.h: > > #ifndef __UAPI_DEF_ETHHDR > > #define __UAPI_DEF_ETHHDR 1 > > #endif > > > > This will only work if netinet/if_ether.h is included before > > linux/if_ether.h, but I think this is very likely. > > > I don't see what makes its likely. That's not directly related to your > point, but for example, glibc guarantees the opposite as it includes > linux/if_ether.h at the beginning of netinet/if_ether.h. > > > I think we can do this because we do not need some special libc handling > > like it is done for other symbols as __UAPI_DEF_ETHHDR is currently only > > needed by musl and not by glibc. > > > That's ok for me as long as existing projects keep compiling. But all > __UAPI_DEF_* are currently centralised in libc-compat.h. Adding > __UAPI_DEF_ETHHDR in if_ether.h looks like defeating the purpose of > libc-compat.h and I wonder if that'd be accepted. Maybe with a > different name. > > In any case, we're really late in the release cycle. If more discussion > is needed, it's probably better to revert and take time to work on a > solution for the next release. 
> Hi David, I just realised you've sent your last pull request for this release. I was waiting for feedback in order to avoid a revert. Should I send a revert now or do you prefer to sort this out later and backport a fix in 4.15.1?
[bpf PATCH v2 0/3] Series short description
The following series implements... --- John Fastabend (3): net: add a UID to use for ULP socket assignment bpf: sockmap, add sock close() hook to remove socks bpf: sockmap, fix leaking maps with attached but not detached progs include/net/tcp.h|8 ++ kernel/bpf/sockmap.c | 165 -- net/ipv4/tcp_ulp.c | 57 - net/tls/tls_main.c |2 + 4 files changed, 155 insertions(+), 77 deletions(-) -- Signature
Re: [bpf PATCH v2 0/3] Series short description
On 01/26/2018 09:43 AM, John Fastabend wrote: > The following series implements... > > --- > Sorry random noise on my side fat-fingered some keys. Please just ignore it. :/
Re: [RFC PATCH 0/2] hv_netvsc: Fix shutdown regression on Win2012 hosts
On Tue, 23 Jan 2018 10:34:03 +0100 Mohammed Gamal wrote: > Commit 0cf737808ae7 ("hv_netvsc: netvsc_teardown_gpadl() split") introduced > a regression that caused VMs not to shutdown after netvsc_device_remove() is > called. This is caused by GPADL teardown sequence change, and while that was > necessary to fix issues with Win2016 hosts, it did introduce a regression for > earlier versions. > > Prior to commit 0cf737808 the call sequence in netvsc_device_remove() was as > follows (as implemented in netvsc_destroy_buf()): > 1- Send NVSP_MSG1_TYPE_REVOKE_RECV_BUF message > 2- Teardown receive buffer GPADL > 3- Send NVSP_MSG1_TYPE_REVOKE_SEND_BUF message > 4- Teardown send buffer GPADL > 5- Close vmbus > > This didn't work for WS2016 hosts. Commit 0cf737808 split netvsc_destroy_buf() > into two functions and rearranged the order as follows > 1- Send NVSP_MSG1_TYPE_REVOKE_RECV_BUF message > 2- Send NVSP_MSG1_TYPE_REVOKE_SEND_BUF message > 3- Close vmbus > 4- Teardown receive buffer GPADL > 5- Teardown send buffer GPADL > > That worked well for WS2016 hosts, but for WS2012 hosts it prevented VMs from > shutting down. > > This patch series works around this problem. The first patch splits > netvsc_revoke_buf() and netvsc_teardown_gpadl() into two finer grained > functions for tearing down send and receive buffers individally. The second > patch > uses the finer grained functions to implement the teardown sequence according > to > the host's version. We keep the behavior introduced in 0cf737808ae7 for > Windows > 2016 hosts, while we re-introduce the old sequence for earlier verions. > > Mohammed Gamal (2): > hv_netvsc: Split netvsc_revoke_buf() and netvsc_teardown_gpadl() > hv_netvsc: Change GPADL teardown order according to Hyper-V version > > drivers/net/hyperv/netvsc.c | 50 > + > 1 file changed, 42 insertions(+), 8 deletions(-) > What I am experimenting with is sending an NDIS_RESET (instead of setting packet filter) as part of the close processing. 
This seems more like what the description of what Windows driver does and matches my reading of the public RNDIS specification.
[bpf-next PATCH v2 0/3] bpf: sockmap fixes
A set of fixes for sockmap to resolve map/prog not being cleaned up when maps are not cleaned up from the program side. For this we pull in the ULP infrastructure to hook into the close() hook of the sock layer. This seemed natural because we have additional sockmap features (to add support for TX hooks) that will also use the ULP infrastructure. This allows us to cleanup entries in the map when socks are closed() and avoid trying to get the sk_state_change() hook to fire in all cases. The second issue resolved here occurs when users don't detach programs. The gist is a refcnt issue resolved by implementing the release callback. See patch for details. For testing I ran both sample/sockmap and selftests bpf/test_maps.c. Dave Watson ran TLS test suite on v1 version of the patches without the put_module error path change. v2 changes rebased onto bpf-next with the following small updates Patch 1/3 Dave Watson pointed out that we should do a module put in case future users use this with another ULP other than sockmap. And module_put is a nop when owner = NULL (sockmap case) so no harm here and code is more robust. Patch 2/3 Be explicit and set user_visible to false just to avoid any reader confusion. --- John Fastabend (3): net: add a UID to use for ULP socket assignment bpf: sockmap, add sock close() hook to remove socks bpf: sockmap, fix leaking maps with attached but not detached progs include/net/tcp.h|8 ++ kernel/bpf/sockmap.c | 165 -- net/ipv4/tcp_ulp.c | 57 - net/tls/tls_main.c |2 + 4 files changed, 155 insertions(+), 77 deletions(-) -- Signature
[bpf-next PATCH v2 1/3] net: add a UID to use for ULP socket assignment
Create a UID field and enum that can be used to assign ULPs to sockets. This saves a set of string comparisons if the ULP id is known. For sockmap, which is added in the next patches, a ULP is used to hook into TCP sockets close state. In this case the ULP being added is done at map insert time and the ULP is known and done on the kernel side. In this case the named lookup is not needed. Because we don't want to expose psock internals to user space socket options a user visible flag is also added. For TLS this is set for BPF it will be cleared. Alos remove pr_notice, user gets an error code back and should check that rather than rely on logs. Signed-off-by: John Fastabend --- include/net/tcp.h |6 + net/ipv4/tcp_ulp.c | 57 +++- net/tls/tls_main.c |2 ++ 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 093e967..ba10ca7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1983,6 +1983,10 @@ static inline void tcp_listendrop(const struct sock *sk) #define TCP_ULP_MAX128 #define TCP_ULP_BUF_MAX(TCP_ULP_NAME_MAX*TCP_ULP_MAX) +enum { + TCP_ULP_TLS, +}; + struct tcp_ulp_ops { struct list_headlist; @@ -1991,7 +1995,9 @@ struct tcp_ulp_ops { /* cleanup ulp */ void (*release)(struct sock *sk); + int uid; charname[TCP_ULP_NAME_MAX]; + booluser_visible; struct module *owner; }; int tcp_register_ulp(struct tcp_ulp_ops *type); diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 6bb9e14..6d87e7a 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -29,6 +29,18 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name) return NULL; } +static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp) +{ + struct tcp_ulp_ops *e; + + list_for_each_entry_rcu(e, &tcp_ulp_list, list) { + if (e->uid == ulp) + return e; + } + + return NULL; +} + static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) { const struct tcp_ulp_ops *ulp = NULL; @@ -51,6 +63,18 @@ static const struct tcp_ulp_ops 
*__tcp_ulp_find_autoload(const char *name) return ulp; } +static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid) +{ + const struct tcp_ulp_ops *ulp = NULL; + + rcu_read_lock(); + ulp = tcp_ulp_find_id(uid); + if (!ulp || !try_module_get(ulp->owner)) + ulp = NULL; + rcu_read_unlock(); + return ulp; +} + /* Attach new upper layer protocol to the list * of available protocols. */ @@ -59,13 +83,10 @@ int tcp_register_ulp(struct tcp_ulp_ops *ulp) int ret = 0; spin_lock(&tcp_ulp_list_lock); - if (tcp_ulp_find(ulp->name)) { - pr_notice("%s already registered or non-unique name\n", - ulp->name); + if (tcp_ulp_find(ulp->name)) ret = -EEXIST; - } else { + else list_add_tail_rcu(&ulp->list, &tcp_ulp_list); - } spin_unlock(&tcp_ulp_list_lock); return ret; @@ -124,6 +145,32 @@ int tcp_set_ulp(struct sock *sk, const char *name) if (!ulp_ops) return -ENOENT; + if (!ulp_ops->user_visible) + return -ENOENT; + + err = ulp_ops->init(sk); + if (err) { + module_put(ulp_ops->owner); + return err; + } + + icsk->icsk_ulp_ops = ulp_ops; + return 0; +} + +int tcp_set_ulp_id(struct sock *sk, int ulp) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_ulp_ops *ulp_ops; + int err; + + if (icsk->icsk_ulp_ops) + return -EEXIST; + + ulp_ops = __tcp_ulp_lookup(ulp); + if (!ulp_ops) + return -ENOENT; + err = ulp_ops->init(sk); if (err) { module_put(ulp_ops->owner); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 736719c..b0d5fce 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -484,6 +484,8 @@ static int tls_init(struct sock *sk) static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", + .uid= TCP_ULP_TLS, + .user_visible = true, .owner = THIS_MODULE, .init = tls_init, };
[bpf-next PATCH v2 3/3] bpf: sockmap, fix leaking maps with attached but not detached progs
When a program is attached to a map we increment the program refcnt to ensure that the program is not removed while it is potentially being referenced from sockmap side. However, if this same program also references the map (this is a reasonably common pattern in my programs) then the verifier will also increment the maps refcnt from the verifier. This is to ensure the map doesn't get garbage collected while the program has a reference to it. So we are left in a state where the map holds the refcnt on the program stopping it from being removed and releasing the map refcnt. And vice versa the program holds a refcnt on the map stopping it from releasing the refcnt on the prog. All this is fine as long as users detach the program while the map fd is still around. But, if the user omits this detach command we are left with a dangling map we can no longer release. To resolve this when the map fd is released decrement the program references and remove any reference from the map to the program. This fixes the issue with possibly dangling map and creates a user side API constraint. That is, the map fd must be held open for programs to be attached to a map. 
Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: John Fastabend --- kernel/bpf/sockmap.c | 19 ++- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 4e7ba37..a30449d 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -598,11 +598,6 @@ static void sock_map_free(struct bpf_map *map) } rcu_read_unlock(); - if (stab->bpf_verdict) - bpf_prog_put(stab->bpf_verdict); - if (stab->bpf_parse) - bpf_prog_put(stab->bpf_parse); - sock_map_remove_complete(stab); } @@ -878,6 +873,19 @@ static int sock_map_update_elem(struct bpf_map *map, return err; } +static void sock_map_release(struct bpf_map *map, struct file *map_file) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *orig; + + orig = xchg(&stab->bpf_parse, NULL); + if (orig) + bpf_prog_put(orig); + orig = xchg(&stab->bpf_verdict, NULL); + if (orig) + bpf_prog_put(orig); +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -885,6 +893,7 @@ static int sock_map_update_elem(struct bpf_map *map, .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, + .map_release = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
[bpf-next PATCH v2 2/3] bpf: sockmap, add sock close() hook to remove socks
The selftests test_maps program was leaving dangling BPF sockmap programs around because not all psock elements were removed from the map. The elements in turn hold a reference on the BPF program they are attached to causing BPF programs to stay open even after test_maps has completed. The original intent was that sk_state_change() would be called when TCP socks went through TCP_CLOSE state. However, because socks may be in SOCK_DEAD state or the sock may be a listening socket the event is not always triggered. To resolve this use the ULP infrastructure and register our own proto close() handler. This fixes the above case. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: Prashant Bhole Signed-off-by: John Fastabend --- include/net/tcp.h|2 + kernel/bpf/sockmap.c | 146 +++--- 2 files changed, 81 insertions(+), 67 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index ba10ca7..e082101 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1985,6 +1985,7 @@ static inline void tcp_listendrop(const struct sock *sk) enum { TCP_ULP_TLS, + TCP_ULP_BPF, }; struct tcp_ulp_ops { @@ -2003,6 +2004,7 @@ struct tcp_ulp_ops { int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); +int tcp_set_ulp_id(struct sock *sk, const int ulp); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0314d17..4e7ba37 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -86,9 +86,10 @@ struct smap_psock { struct work_struct tx_work; struct work_struct gc_work; + struct proto *sk_proto; + void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); - void (*save_state_change)(struct sock *sk); }; static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -96,12 
+97,80 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +struct proto tcp_bpf_proto; +static int bpf_tcp_init(struct sock *sk) +{ + struct smap_psock *psock; + + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + return -EINVAL; + + if (unlikely(psock->sk_proto)) + return -EBUSY; + + psock->save_close = sk->sk_prot->close; + psock->sk_proto = sk->sk_prot; + sk->sk_prot = &tcp_bpf_proto; + return 0; +} + +static void bpf_tcp_release(struct sock *sk) +{ + struct smap_psock *psock = smap_psock_sk(sk); + + if (likely(psock)) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } +} + +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); + +void bpf_tcp_close(struct sock *sk, long timeout) +{ + struct smap_psock_map_entry *e, *tmp; + struct smap_psock *psock; + struct sock *osk; + + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + return sk->sk_prot->close(sk, timeout); + + write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } + write_unlock_bh(&sk->sk_callback_lock); + return psock->save_close(sk, timeout); +} + enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, }; +static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { + .name = "bpf_tcp", + .uid= TCP_ULP_BPF, + .user_visible = false, + .owner = NULL, + .init = bpf_tcp_init, + .release= bpf_tcp_release, +}; + +static int bpf_tcp_ulp_register(void) +{ + tcp_bpf_proto = tcp_prot; + tcp_bpf_proto.close = bpf_tcp_close; + return tcp_register_ulp(&bpf_tcp_ulp_ops); +} + static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -166,68 +235,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err) sk->sk_error_report(sk); } -static void 
smap_release_sock(struct smap_psock *psock, struct sock *sock); - -/* Called with lock_sock(sk) held */ -static void smap_state_change(struct sock *sk) -{ - struct smap_psock_map_entry *e, *tmp; - struct smap_psock *psock; - struct socket_wq *wq; - struct sock *osk; - - rcu_read_lock(); - - /* Allowing transitions into an established syn_recv states allows -* for early binding sockets
Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
On 1/26/2018 8:58 AM, Michael S. Tsirkin wrote: On Thu, Jan 11, 2018 at 09:58:39PM -0800, Sridhar Samudrala wrote: @@ -2859,6 +3123,42 @@ static struct virtio_driver virtio_net_driver = { #endif }; +static int virtio_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + /* Skip our own events */ + if (event_dev->netdev_ops == &virtnet_netdev) + return NOTIFY_DONE; + + /* Avoid non-Ethernet type devices */ + if (event_dev->type != ARPHRD_ETHER) + return NOTIFY_DONE; + + /* Avoid Vlan dev with same MAC registering as VF */ + if (is_vlan_dev(event_dev)) + return NOTIFY_DONE; + + /* Avoid Bonding master dev with same MAC registering as VF */ + if ((event_dev->priv_flags & IFF_BONDING) && + (event_dev->flags & IFF_MASTER)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + return virtnet_register_vf(event_dev); + case NETDEV_UNREGISTER: + return virtnet_unregister_vf(event_dev); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block virtio_netdev_notifier = { + .notifier_call = virtio_netdev_event, +}; + static __init int virtio_net_driver_init(void) { int ret; @@ -2877,6 +3177,8 @@ static __init int virtio_net_driver_init(void) ret = register_virtio_driver(&virtio_net_driver); if (ret) goto err_virtio; + + register_netdevice_notifier(&virtio_netdev_notifier); return 0; err_virtio: cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); @@ -2889,6 +3191,7 @@ module_init(virtio_net_driver_init); static __exit void virtio_net_driver_exit(void) { + unregister_netdevice_notifier(&virtio_netdev_notifier); unregister_virtio_driver(&virtio_net_driver); cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); cpuhp_remove_multi_state(virtionet_online); I have a question here: what if PT device driver module loads and creates a device before virtio? Initially i also had this question if we get NETDEV_REGISTER events for netdevs that are already present. 
But it looks like register_netdevice_notifier() will cause replay of all the registration and up events of existing devices. /** * register_netdevice_notifier - register a network notifier block * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ Thanks Sridhar
[net-next 15/15] ixgbe: don't set RXDCTL.RLPML for 82599
From: Emil Tantilov commit 2de6aa3a666e ("ixgbe: Add support for padding packet") Uses RXDCTL.RLPML to limit the maximum frame size on Rx when using build_skb. Unfortunately that register does not work on 82599. Added an explicit check to avoid setting this register on 82599 MAC. Extended the comment related to the setting of RXDCTL.RLPML to better explain its purpose. Signed-off-by: Emil Tantilov Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 643c7288ea0f..0da5aa2c8aba 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -4133,11 +4133,15 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter, rxdctl &= ~0x3F; rxdctl |= 0x080420; #if (PAGE_SIZE < 8192) - } else { + /* RXDCTL.RLPML does not work on 82599 */ + } else if (hw->mac.type != ixgbe_mac_82599EB) { rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK | IXGBE_RXDCTL_RLPML_EN); - /* Limit the maximum frame size so we don't overrun the skb */ + /* Limit the maximum frame size so we don't overrun the skb. +* This can happen in SRIOV mode when the MTU of the VF is +* higher than the MTU of the PF. +*/ if (ring_uses_build_skb(ring) && !test_bit(__IXGBE_RX_3K_BUFFER, &ring->state)) rxdctl |= IXGBE_MAX_2K_FRAME_BUILD_SKB | -- 2.14.3
[net-next 10/15] ixgbevf: use ARRAY_SIZE for various array sizing calculations
From: Colin Ian King Use the ARRAY_SIZE macro on various arrays to determine size of the arrays. Improvement suggested by coccinelle. Signed-off-by: Colin Ian King Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/vf.c | 17 +++-- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c index 64c93e8becc6..38d3a327c1bc 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.c +++ b/drivers/net/ethernet/intel/ixgbevf/vf.c @@ -286,7 +286,7 @@ static s32 ixgbevf_set_uc_addr_vf(struct ixgbe_hw *hw, u32 index, u8 *addr) ether_addr_copy(msg_addr, addr); ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, -sizeof(msgbuf) / sizeof(u32)); +ARRAY_SIZE(msgbuf)); if (!ret_val) { msgbuf[0] &= ~IXGBE_VT_MSGTYPE_CTS; @@ -456,8 +456,7 @@ static s32 ixgbevf_set_rar_vf(struct ixgbe_hw *hw, u32 index, u8 *addr, ether_addr_copy(msg_addr, addr); ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, -sizeof(msgbuf) / sizeof(u32)); - +ARRAY_SIZE(msgbuf)); msgbuf[0] &= ~IXGBE_VT_MSGTYPE_CTS; /* if nacked the address was rejected, use "perm_addr" */ @@ -574,7 +573,7 @@ static s32 ixgbevf_update_xcast_mode(struct ixgbe_hw *hw, int xcast_mode) msgbuf[1] = xcast_mode; err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, -sizeof(msgbuf) / sizeof(u32)); +ARRAY_SIZE(msgbuf)); if (err) return err; @@ -614,7 +613,7 @@ static s32 ixgbevf_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, msgbuf[0] |= vlan_on << IXGBE_VT_MSGINFO_SHIFT; err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, -sizeof(msgbuf) / sizeof(u32)); +ARRAY_SIZE(msgbuf)); if (err) goto mbx_err; @@ -826,7 +825,7 @@ static s32 ixgbevf_set_rlpml_vf(struct ixgbe_hw *hw, u16 max_size) msgbuf[1] = max_size; ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, -sizeof(msgbuf) / sizeof(u32)); +ARRAY_SIZE(msgbuf)); if (ret_val) return ret_val; if ((msgbuf[0] & IXGBE_VF_SET_LPE) && @@ -872,8 +871,7 @@ 
static int ixgbevf_negotiate_api_version_vf(struct ixgbe_hw *hw, int api) msg[1] = api; msg[2] = 0; - err = ixgbevf_write_msg_read_ack(hw, msg, msg, -sizeof(msg) / sizeof(u32)); + err = ixgbevf_write_msg_read_ack(hw, msg, msg, ARRAY_SIZE(msg)); if (!err) { msg[0] &= ~IXGBE_VT_MSGTYPE_CTS; @@ -924,8 +922,7 @@ int ixgbevf_get_queues(struct ixgbe_hw *hw, unsigned int *num_tcs, msg[0] = IXGBE_VF_GET_QUEUE; msg[1] = msg[2] = msg[3] = msg[4] = 0; - err = ixgbevf_write_msg_read_ack(hw, msg, msg, -sizeof(msg) / sizeof(u32)); + err = ixgbevf_write_msg_read_ack(hw, msg, msg, ARRAY_SIZE(msg)); if (!err) { msg[0] &= ~IXGBE_VT_MSGTYPE_CTS; -- 2.14.3
[net-next 14/15] ixgbe: Fix && vs || typo
From: Dan Carpenter "offset" can't be both 0x0 and 0xFFFF so presumably || was intended instead of &&. That matches with how this check is done in other functions. Fixes: 73834aec7199 ("ixgbe: extend firmware version support") Signed-off-by: Dan Carpenter Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 7ac7ef9b37ff..61188f343955 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -4087,7 +4087,7 @@ void ixgbe_get_oem_prod_version(struct ixgbe_hw *hw, hw->eeprom.ops.read(hw, NVM_OEM_PROD_VER_PTR, &offset); /* Return is offset to OEM Product Version block is invalid */ - if (offset == 0x0 && offset == NVM_INVALID_PTR) + if (offset == 0x0 || offset == NVM_INVALID_PTR) return; /* Read product version block */ -- 2.14.3
[net-next 07/15] ixgbevf: clear rx_buffer_info in configure instead of clean
From: Emil Tantilov Based on commit d2bead576e67 ("igb: Clear Rx buffer_info in configure instead of clean") This change makes it so that instead of going through the entire ring on Rx cleanup we only go through the region that was designated to be cleaned up and stop when we reach the region where new allocations should start. In addition we can avoid having to perform a memset on the Rx buffer_info structures until we are about to start using the ring again. Signed-off-by: Emil Tantilov Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 26 +++ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 350afec3dde8..a793f9ea05e7 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1773,6 +1773,10 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, IXGBE_WRITE_REG(hw, IXGBE_VFRDT(reg_idx), 0); ring->tail = adapter->io_addr + IXGBE_VFRDT(reg_idx); + /* initialize rx_buffer_info */ + memset(ring->rx_buffer_info, 0, + sizeof(struct ixgbevf_rx_buffer) * ring->count); + /* initialize Rx descriptor 0 */ rx_desc = IXGBEVF_RX_DESC(ring, 0); rx_desc->wb.upper.length = 0; @@ -2131,8 +2135,7 @@ void ixgbevf_up(struct ixgbevf_adapter *adapter) **/ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) { - unsigned long size; - unsigned int i; + u16 i = rx_ring->next_to_clean; /* Free Rx ring sk_buff */ if (rx_ring->skb) { @@ -2140,17 +2143,11 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) rx_ring->skb = NULL; } - /* ring already cleared, nothing to do */ - if (!rx_ring->rx_buffer_info) - return; - /* Free all the Rx ring pages */ - for (i = 0; i < rx_ring->count; i++) { + while (i != rx_ring->next_to_alloc) { struct ixgbevf_rx_buffer *rx_buffer; rx_buffer = &rx_ring->rx_buffer_info[i]; - 
if (!rx_buffer->page) - continue; /* Invalidate cache lines that may have been written to by * device so that we avoid corrupting memory. @@ -2171,11 +2168,14 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) __page_frag_cache_drain(rx_buffer->page, rx_buffer->pagecnt_bias); - rx_buffer->page = NULL; + i++; + if (i == rx_ring->count) + i = 0; } - size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count; - memset(rx_ring->rx_buffer_info, 0, size); + rx_ring->next_to_alloc = 0; + rx_ring->next_to_clean = 0; + rx_ring->next_to_use = 0; } /** @@ -3090,7 +3090,7 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_ring *rx_ring) int size; size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count; - rx_ring->rx_buffer_info = vzalloc(size); + rx_ring->rx_buffer_info = vmalloc(size); if (!rx_ring->rx_buffer_info) goto err; -- 2.14.3
[net-next 13/15] ixgbe: add support for reporting 5G link speed
From: Paul Greenwalt Since 5G link speed is supported by some devices, add reporting of 5G link speed. Signed-off-by: Paul Greenwalt Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index bbb622f15a77..643c7288ea0f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7259,6 +7259,9 @@ static void ixgbe_watchdog_link_is_up(struct ixgbe_adapter *adapter) case IXGBE_LINK_SPEED_10GB_FULL: speed_str = "10 Gbps"; break; + case IXGBE_LINK_SPEED_5GB_FULL: + speed_str = "5 Gbps"; + break; case IXGBE_LINK_SPEED_2_5GB_FULL: speed_str = "2.5 Gbps"; break; -- 2.14.3
[net-next 04/15] ixgbevf: add support for DMA_ATTR_SKIP_CPU_SYNC/WEAK_ORDERING
From: Emil Tantilov Based on commit 5be5955425c2 ("igb: update driver to make use of DMA_ATTR_SKIP_CPU_SYNC") and commit 7bd175928280 ("igb: Add support for DMA_ATTR_WEAK_ORDERING") Convert the calls to dma_map/unmap_page() to the attributes version and add DMA_ATTR_SKIP_CPU_SYNC/WEAK_ORDERING which should help improve performance on some platforms. Move sync_for_cpu call before we perform a prefetch to avoid invalidating the first 128 bytes of the packet on architectures where that call may invalidate the cache. Signed-off-by: Emil Tantilov Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 3 ++ drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 57 ++- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 581f44bbd7b3..b1da9f41c1dc 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -260,6 +260,9 @@ static inline void ixgbevf_write_tail(struct ixgbevf_ring *ring, u32 value) #define MIN_MSIX_Q_VECTORS 1 #define MIN_MSIX_COUNT (MIN_MSIX_Q_VECTORS + NON_Q_VECTORS) +#define IXGBEVF_RX_DMA_ATTR \ + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + /* board specific private data structure */ struct ixgbevf_adapter { /* this field must be first, see ixgbevf_process_skb_fields */ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 725fe2dca868..fbd493efd14e 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -595,8 +595,8 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring, } /* map page for use */ - dma = dma_map_page(rx_ring->dev, page, 0, - PAGE_SIZE, DMA_FROM_DEVICE); + dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, +DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR); /* if mapping failed free 
memory back to system since * there isn't much point in holding memory we can't use @@ -639,6 +639,12 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, if (!ixgbevf_alloc_mapped_page(rx_ring, bi)) break; + /* sync the buffer for use by the device */ + dma_sync_single_range_for_device(rx_ring->dev, bi->dma, +bi->page_offset, +IXGBEVF_RX_BUFSZ, +DMA_FROM_DEVICE); + /* Refresh the desc even if pkt_addr didn't change * because each write-back erases this info. */ @@ -741,12 +747,6 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring, new_buff->page = old_buff->page; new_buff->dma = old_buff->dma; new_buff->page_offset = old_buff->page_offset; - - /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma, -new_buff->page_offset, -IXGBEVF_RX_BUFSZ, -DMA_FROM_DEVICE); } static inline bool ixgbevf_page_is_reserved(struct page *page) @@ -862,6 +862,13 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring, page = rx_buffer->page; prefetchw(page); + /* we are reusing so sync this buffer for CPU use */ + dma_sync_single_range_for_cpu(rx_ring->dev, + rx_buffer->dma, + rx_buffer->page_offset, + size, + DMA_FROM_DEVICE); + if (likely(!skb)) { void *page_addr = page_address(page) + rx_buffer->page_offset; @@ -887,21 +894,15 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring, prefetchw(skb->data); } - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_buffer->dma, - rx_buffer->page_offset, - size, - DMA_FROM_DEVICE); - /* pull page into skb */ if (ixgbevf_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) { /* hand second half of page back to the ring */ ixgbevf_reuse_rx_page(rx_ring, rx_buffer); } else { /* we are not reusing the buffer so unmap it */ - dma_unmap_page(rx_ring->dev, rx_buf
[net-next 05/15] ixgbevf: update code to better handle incrementing page count
From: Emil Tantilov Based on commit bd4171a5d4c2 ("igb: update code to better handle incrementing page count") Update the driver code so that we do bulk updates of the page reference count instead of just incrementing it by one reference at a time. The advantage to doing this is that we cut down on atomic operations and this in turn should give us a slight improvement in cycles per packet. In addition if we eventually move this over to using build_skb the gains will be more noticeable. Signed-off-by: Emil Tantilov Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 7 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 30 +-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index b1da9f41c1dc..c70a789035ae 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -62,7 +62,12 @@ struct ixgbevf_tx_buffer { struct ixgbevf_rx_buffer { dma_addr_t dma; struct page *page; - unsigned int page_offset; +#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) + __u32 page_offset; +#else + __u16 page_offset; +#endif + __u16 pagecnt_bias; }; struct ixgbevf_stats { diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index fbd493efd14e..ae2402ddd9fb 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -611,6 +611,7 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring, bi->dma = dma; bi->page = page; bi->page_offset = 0; + bi->pagecnt_bias = 1; return true; } @@ -747,6 +748,7 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring, new_buff->page = old_buff->page; new_buff->dma = old_buff->dma; new_buff->page_offset = old_buff->page_offset; + new_buff->pagecnt_bias = old_buff->pagecnt_bias; } static inline bool 
ixgbevf_page_is_reserved(struct page *page) @@ -758,13 +760,15 @@ static bool ixgbevf_can_reuse_rx_page(struct ixgbevf_rx_buffer *rx_buffer, struct page *page, const unsigned int truesize) { + unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--; + /* avoid re-using remote pages */ if (unlikely(ixgbevf_page_is_reserved(page))) return false; #if (PAGE_SIZE < 8192) /* if we are only owner of page we can reuse it */ - if (unlikely(page_count(page) != 1)) + if (unlikely(page_ref_count(page) != pagecnt_bias)) return false; /* flip page offset to other buffer */ @@ -778,10 +782,15 @@ static bool ixgbevf_can_reuse_rx_page(struct ixgbevf_rx_buffer *rx_buffer, return false; #endif - /* Even if we own the page, we are not allowed to use atomic_set() -* This would break get_page_unless_zero() users. + + /* If we have drained the page fragment pool we need to update +* the pagecnt_bias and page count so that we fully restock the +* number of references the driver holds. */ - page_ref_inc(page); + if (unlikely(pagecnt_bias == 1)) { + page_ref_add(page, USHRT_MAX); + rx_buffer->pagecnt_bias = USHRT_MAX; + } return true; } @@ -827,7 +836,6 @@ static bool ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring, return true; /* this page cannot be reused so discard it */ - put_page(page); return false; } @@ -899,10 +907,13 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring, /* hand second half of page back to the ring */ ixgbevf_reuse_rx_page(rx_ring, rx_buffer); } else { - /* we are not reusing the buffer so unmap it */ + /* We are not reusing the buffer so unmap it and free +* any references we are holding to it +*/ dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE, DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR); + __page_frag_cache_drain(page, rx_buffer->pagecnt_bias); } /* clear contents of buffer_info */ @@ -2135,6 +2146,8 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) struct ixgbevf_rx_buffer *rx_buffer; rx_buffer = 
&rx_ring->rx_buffer_info[i]; + if (!rx_buffer->page) + continue; /* Invalidate cache lines that may have been written to by * device so that we avoid corruptin
[net-next 08/15] ixgbevf: improve performance and reduce size of ixgbevf_tx_map()
From: Emil Tantilov Based on commit ec718254cbfe ("ixgbe: Improve performance and reduce size of ixgbe_tx_map") This change is meant to both improve the performance and reduce the size of ixgbevf_tx_map(). Expand the work done in the main loop by pushing first into tx_buffer. This allows us to pull in the dma_mapping_error check, the tx_buffer value assignment, and the initial DMA value assignment to the Tx descriptor. Signed-off-by: Emil Tantilov Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 45 ++- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index a793f9ea05e7..d3415ee38597 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -3532,34 +3532,37 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring, struct ixgbevf_tx_buffer *first, const u8 hdr_len) { - dma_addr_t dma; struct sk_buff *skb = first->skb; struct ixgbevf_tx_buffer *tx_buffer; union ixgbe_adv_tx_desc *tx_desc; - struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0]; - unsigned int data_len = skb->data_len; - unsigned int size = skb_headlen(skb); - unsigned int paylen = skb->len - hdr_len; + struct skb_frag_struct *frag; + dma_addr_t dma; + unsigned int data_len, size; u32 tx_flags = first->tx_flags; - __le32 cmd_type; + __le32 cmd_type = ixgbevf_tx_cmd_type(tx_flags); u16 i = tx_ring->next_to_use; tx_desc = IXGBEVF_TX_DESC(tx_ring, i); - ixgbevf_tx_olinfo_status(tx_desc, tx_flags, paylen); - cmd_type = ixgbevf_tx_cmd_type(tx_flags); + ixgbevf_tx_olinfo_status(tx_desc, tx_flags, skb->len - hdr_len); + + size = skb_headlen(skb); + data_len = skb->data_len; dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE); - if (dma_mapping_error(tx_ring->dev, dma)) - goto dma_error; - /* record length, and DMA address */ - 
dma_unmap_len_set(first, len, size); - dma_unmap_addr_set(first, dma, dma); + tx_buffer = first; - tx_desc->read.buffer_addr = cpu_to_le64(dma); + for (frag = &skb_shinfo(skb)->frags[0];; frag++) { + if (dma_mapping_error(tx_ring->dev, dma)) + goto dma_error; + + /* record length, and DMA address */ + dma_unmap_len_set(tx_buffer, len, size); + dma_unmap_addr_set(tx_buffer, dma, dma); + + tx_desc->read.buffer_addr = cpu_to_le64(dma); - for (;;) { while (unlikely(size > IXGBE_MAX_DATA_PER_TXD)) { tx_desc->read.cmd_type_len = cmd_type | cpu_to_le32(IXGBE_MAX_DATA_PER_TXD); @@ -3570,12 +3573,12 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring, tx_desc = IXGBEVF_TX_DESC(tx_ring, 0); i = 0; } + tx_desc->read.olinfo_status = 0; dma += IXGBE_MAX_DATA_PER_TXD; size -= IXGBE_MAX_DATA_PER_TXD; tx_desc->read.buffer_addr = cpu_to_le64(dma); - tx_desc->read.olinfo_status = 0; } if (likely(!data_len)) @@ -3589,23 +3592,15 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring, tx_desc = IXGBEVF_TX_DESC(tx_ring, 0); i = 0; } + tx_desc->read.olinfo_status = 0; size = skb_frag_size(frag); data_len -= size; dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size, DMA_TO_DEVICE); - if (dma_mapping_error(tx_ring->dev, dma)) - goto dma_error; tx_buffer = &tx_ring->tx_buffer_info[i]; - dma_unmap_len_set(tx_buffer, len, size); - dma_unmap_addr_set(tx_buffer, dma, dma); - - tx_desc->read.buffer_addr = cpu_to_le64(dma); - tx_desc->read.olinfo_status = 0; - - frag++; } /* write last descriptor with RS and EOP bits */ -- 2.14.3