[PATCH] xfrm: Add ESN support for IPSec HW offload
From: Yossef Efraim This patch adds ESN support to IPsec device offload. Adding new xfrm device operation to synchronize device ESN. Signed-off-by: Yossef Efraim Signed-off-by: Shannon Nelson Signed-off-by: Steffen Klassert --- Documentation/networking/xfrm_device.txt | 3 +++ include/linux/netdevice.h| 1 + include/net/xfrm.h | 12 net/xfrm/xfrm_device.c | 11 +-- net/xfrm/xfrm_replay.c | 2 ++ 5 files changed, 27 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/xfrm_device.txt b/Documentation/networking/xfrm_device.txt index 2d9d588cd34b..50c34ca65efe 100644 --- a/Documentation/networking/xfrm_device.txt +++ b/Documentation/networking/xfrm_device.txt @@ -41,6 +41,7 @@ struct xfrmdev_ops { void(*xdo_dev_state_free) (struct xfrm_state *x); bool(*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); + void(*xdo_dev_state_advance_esn) (struct xfrm_state *x); }; The NIC driver offering ipsec offload will need to implement these @@ -117,6 +118,8 @@ the stack in xfrm_input(). hand the packet to napi_gro_receive() as usual +In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn(). +Driver will check packet seq number and update HW ESN state machine if needed. When the SA is removed by the user, the driver's xdo_dev_state_delete() is asked to disable the offload. 
Later, xdo_dev_state_free() is called diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ed0799a12bf2..540151875444 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -851,6 +851,7 @@ struct xfrmdev_ops { void(*xdo_dev_state_free) (struct xfrm_state *x); bool(*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); + void(*xdo_dev_state_advance_esn) (struct xfrm_state *x); }; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2e6d4fe6b0ba..7d2077665c0b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1904,6 +1904,14 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); +static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) +{ + struct xfrm_state_offload *xso = &x->xso; + + if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn) + xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); +} + static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { struct xfrm_state *x = dst->xfrm; @@ -1974,6 +1982,10 @@ static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x return false; } +static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) +{ +} + static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { return false; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 75982506617b..93520106731f 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -147,8 +147,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (!x->type_offload) return -EINVAL; - /* We don't yet support UDP encapsulation, TFC padding and ESN. */ - if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN)) + /* We don't yet support UDP encapsulation and TFC padding. 
*/ + if (x->encap || x->tfcpad) return -EINVAL; dev = dev_get_by_index(net, xuo->ifindex); @@ -178,6 +178,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return 0; } + if (x->props.flags & XFRM_STATE_ESN && + !dev->xfrmdev_ops->xdo_dev_state_advance_esn) { + xso->dev = NULL; + dev_put(dev); + return -EINVAL; + } + xso->dev = dev; xso->num_exthdrs = 1; xso->flags = xuo->flags; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 02501817227b..1d38c6acf8af 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -551,6 +551,8 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) bitnr = replay_esn->replay_window - (diff - pos); } + xfrm_dev_state_advance_esn(x); + nr = bitnr >> 5; bitnr = bitnr & 0x1F; replay_esn->bmp[nr] |= (1U << bitnr); -- 2.14.1
pull request (net-next): ipsec-next 2018-01-26
One last patch for this development cycle: 1) Add ESN support for IPSec HW offload. From Yossef Efraim. Please pull or let me know if there are problems. Thanks! The following changes since commit 4f7d58517f461aa6e7b7509668f04021e089323d: Merge tag 'linux-can-next-for-4.16-20180116' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next (2018-01-17 16:08:25 -0500) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git master for you to fetch changes up to 50bd870a9e5cca9fcf5fb4c130c373643d7d9906: xfrm: Add ESN support for IPSec HW offload (2018-01-18 10:42:59 +0100) Yossef Efraim (1): xfrm: Add ESN support for IPSec HW offload Documentation/networking/xfrm_device.txt | 3 +++ include/linux/netdevice.h| 1 + include/net/xfrm.h | 12 net/xfrm/xfrm_device.c | 11 +-- net/xfrm/xfrm_replay.c | 2 ++ 5 files changed, 27 insertions(+), 2 deletions(-)
Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
On Tue, Jan 23, 2018 at 2:58 PM, Michael S. Tsirkin wrote: > On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote: >> On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin wrote: >> > On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote: >> >> First off, as mentioned in another thread, the model of stacking up >> >> virt-bond functionality over virtio seems a wrong direction to me. >> >> Essentially the migration process would need to carry over all guest >> >> side configurations previously done on the VF/PT and get them moved to >> >> the new device being it virtio or VF/PT. >> > >> > I might be wrong but I don't see why we should worry about this usecase. >> > Whoever has a bond configured already has working config for migration. >> > We are trying to help people who don't, not convert existig users. >> >> That has been placed in the view of cloud providers that the imported >> images from the store must be able to run unmodified thus no >> additional setup script is allowed (just as Stephen mentioned in >> another mail). Cloud users don't care about live migration themselves >> but the providers are required to implement such automation mechanism >> to make this process transparent if at all possible. The user does not >> care about the device underneath being VF or not, but they do care >> about consistency all across and the resulting performance >> acceleration in making VF the prefered datapath. It is not quite >> peculiar user cases but IMHO *any* approach proposed for live >> migration should be able to persist the state including network config >> e.g. as simple as MTU. Actually this requirement has nothing to do >> with virtio but our target users are live migration agnostic, being it >> tracking DMA through dirty pages, using virtio as the helper, or >> whatsoever, the goal of persisting configs across remains same. 
> > So the patching being discussed here will mostly do exactly that if your > original config was simply a single virtio net device. > True, but I don't see the patch being discussed starts with good foundation of supporting the same for VF/PT device. That is the core of the issue. > > What kind of configs do your users have right now? Any configs be it generic or driver specific that the VF/PT device supports and have been enabled/configured. General network configs (MAC, IP address, VLAN, MTU, iptables rules), ethtool settings (hardware offload, # of queues and ring entris, RSC options, rss rxfh-indir table, rx-flow-hash, et al) , bpf/XDP program being run, tc flower offload, just to name a few. As cloud providers we don't limit users from applying driver specific tuning to the NIC/VF, and sometimes this is essential to achieving best performance for their workload. We've seen cases like tuning coalescing parameters for getting low latency, changing rx-flow-hash function for better VXLAN throughput, or even adopting quite advanced NIC features such as flow director or cloud filter. We don't expect users to compromise even a little bit on these. That is once we turn on live migration for the VF or pass through devices in the VM, it all takes place under the hood, users (guest admins, applications) don't have to react upon it or even notice the change. I should note that the majority of live migrations take place between machines with completely identical hardware, it's more critical than necessary to keep the config as-is across the move, stealth while quiet. As you see generic bond or bridge cannot suffice the need. That's why we need a new customized virt bond driver, and tailor it for VM live migration specifically. Leveraging para-virtual e.g. virtio net device as the backup path is one thing, tracking through driver config changes in order to re-config as necessary is another. 
I would think without considering the latter, the proposal being discussed is rather incomplete, and remote to be useful in production. > > >> > >> >> Without the help of a new >> >> upper layer bond driver that enslaves virtio and VF/PT devices >> >> underneath, virtio will be overloaded with too much specifics being a >> >> VF/PT backup in the future. >> > >> > So this paragraph already includes at least two conflicting >> > proposals. On the one hand you want a separate device for >> > the virtual bond, on the other you are saying a separate >> > driver. >> >> Just to be crystal clear: separate virtual bond device (netdev ops, >> not necessarily bus device) for VM migration specifically with a >> separate driver. > > Okay, but note that any config someone had on a virtio device won't > propagate to that bond. > >> > >> > Further, the reason to have a separate *driver* was that >> > some people wanted to share code with netvsc - and that >> > one does not create a separate device, which you can't >> > change without breaking existing configs. >> >> I'm not sure I understand this statement. netvsc is already another >> netdev being created than the enslaved VF netdev, why it bothers? >
[PATCH] atm: he: Replace GFP_ATOMIC with GFP_KERNEL in he_open
After checking all possible call chains to he_open() here, my tool finds that he_open() is never called in atomic context. And this function is assigned to a function pointer "dev->ops->open", which is only called by __vcc_connect() (net/atm/common.c) through dev->ops->send(), and __vcc_connect() is only called by vcc_connect(), which calls mutex_lock(), so it indicates that he_open() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. Signed-off-by: Jia-Ju Bai --- drivers/atm/he.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/atm/he.c b/drivers/atm/he.c index e58538c..fea5bf0 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -2135,7 +2135,7 @@ static int he_start(struct atm_dev *dev) cid = he_mkcid(he_dev, vpi, vci); - he_vcc = kmalloc(sizeof(struct he_vcc), GFP_ATOMIC); + he_vcc = kmalloc(sizeof(struct he_vcc), GFP_KERNEL); if (he_vcc == NULL) { hprintk("unable to allocate he_vcc during open\n"); return -ENOMEM; -- 1.7.9.5
[PATCH net-next 5/5] net/smc: return booleans instead of integers
From: Gustavo A. R. Silva Return statements in functions returning bool should use true/false instead of 1/0. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Ursula Braun --- net/smc/smc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index bfbe20234105..9518986c97b1 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -252,12 +252,12 @@ static inline int smc_uncompress_bufsize(u8 compressed) static inline bool using_ipsec(struct smc_sock *smc) { return (smc->clcsock->sk->sk_policy[0] || - smc->clcsock->sk->sk_policy[1]) ? 1 : 0; + smc->clcsock->sk->sk_policy[1]) ? true : false; } #else static inline bool using_ipsec(struct smc_sock *smc) { - return 0; + return false; } #endif -- 2.13.5
[PATCH net-next 4/5] net/smc: release clcsock from tcp_listen_worker
Closing a listen socket may hit the warning WARN_ON(sock_owned_by_user(sk)) of tcp_close(), if the wake up of the smc_tcp_listen_worker has not yet finished. This patch introduces smc_close_wait_listen_clcsock() making sure the listening internal clcsock has been closed in smc_tcp_listen_work(), before the listening external SMC socket finishes closing. Signed-off-by: Ursula Braun --- net/smc/af_smc.c| 13 - net/smc/smc_close.c | 33 - 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 732a37ddbc21..267e68379110 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -670,6 +670,10 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { + if (isk->clcsock) { + sock_release(isk->clcsock); + isk->clcsock = NULL; + } new_sk->sk_prot->unhash(new_sk); sock_put(new_sk); /* final */ continue; @@ -969,8 +973,15 @@ static void smc_tcp_listen_work(struct work_struct *work) } out: + if (lsmc->clcsock) { + sock_release(lsmc->clcsock); + lsmc->clcsock = NULL; + } release_sock(lsk); - lsk->sk_data_ready(lsk); /* no more listening, wake accept */ + /* no more listening, wake up smc_close_wait_listen_clcsock and +* accept +*/ + lsk->sk_state_change(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_listen */ } diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 4339852a8910..e339c0186dcf 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -19,6 +19,8 @@ #include "smc_cdc.h" #include "smc_close.h" +#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ) + static void smc_close_cleanup_listen(struct sock *parent) { struct sock *sk; @@ -28,6 +30,27 @@ static void smc_close_cleanup_listen(struct sock *parent) smc_close_non_accepted(sk); } +static void smc_close_wait_listen_clcsock(struct smc_sock *smc) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct sock *sk = &smc->sk; + signed long timeout; + + timeout = SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME; + 
add_wait_queue(sk_sleep(sk), &wait); + do { + release_sock(sk); + if (smc->clcsock) + timeout = wait_woken(&wait, TASK_UNINTERRUPTIBLE, +timeout); + sched_annotate_sleep(); + lock_sock(sk); + if (!smc->clcsock) + break; + } while (timeout); + remove_wait_queue(sk_sleep(sk), &wait); +} + /* wait for sndbuf data being transmitted */ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) { @@ -114,7 +137,6 @@ static void smc_close_active_abort(struct smc_sock *smc) break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: - sock_release(smc->clcsock); if (!smc_cdc_rxed_any_close(&smc->conn)) sk->sk_state = SMC_PEERABORTWAIT; else @@ -128,7 +150,6 @@ static void smc_close_active_abort(struct smc_sock *smc) if (!txflags->peer_conn_closed) { /* just SHUTDOWN_SEND done */ sk->sk_state = SMC_PEERABORTWAIT; - sock_release(smc->clcsock); } else { sk->sk_state = SMC_CLOSED; } @@ -136,8 +157,6 @@ static void smc_close_active_abort(struct smc_sock *smc) break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: - if (!txflags->peer_conn_closed) - sock_release(smc->clcsock); sk->sk_state = SMC_CLOSED; break; case SMC_PEERFINCLOSEWAIT: @@ -177,8 +196,6 @@ int smc_close_active(struct smc_sock *smc) switch (sk->sk_state) { case SMC_INIT: sk->sk_state = SMC_CLOSED; - if (smc->smc_listen_work.func) - cancel_work_sync(&smc->smc_listen_work); break; case SMC_LISTEN: sk->sk_state = SMC_CLOSED; @@ -187,11 +204,9 @@ int smc_close_active(struct smc_sock *smc) rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); /* wake up kernel_accept of smc_tcp_listen_worker */ smc->clcsock->sk->sk_data_ready(smc->clcsock->sk); + smc_close_wait_listen_clcsock(smc); } - release_sock(sk); smc_close_cleanup_listen(sk); - cancel_work_sync(&smc->smc_listen_work); - lo
[PATCH net-next 0/5] net/smc: fixes 2018-01-26
Dave, here are some more smc patches. The first 4 patches take care about different aspects of smc socket closing, the 5th patch improves coding style. Thanks, Ursula Gustavo A. R. Silva (1): smc: return booleans instead of integers Ursula Braun (4): net/smc: handle device, port, and QP error events net/smc: smc_poll improvements net/smc: replace sock_put worker by socket refcounting net/smc: release clcsock from tcp_listen_worker net/smc/af_smc.c| 152 ++-- net/smc/smc.h | 5 +- net/smc/smc_cdc.c | 20 +++ net/smc/smc_close.c | 96 + net/smc/smc_close.h | 1 - net/smc/smc_core.c | 6 +-- net/smc/smc_ib.c| 38 - 7 files changed, 191 insertions(+), 127 deletions(-) -- 2.13.5
[PATCH net-next 3/5] net/smc: replace sock_put worker by socket refcounting
Proper socket refcounting makes the sock_put worker obsolete. Signed-off-by: Ursula Braun --- net/smc/af_smc.c| 65 + net/smc/smc.h | 1 - net/smc/smc_cdc.c | 20 + net/smc/smc_close.c | 63 ++- net/smc/smc_close.h | 1 - net/smc/smc_core.c | 6 ++--- 6 files changed, 88 insertions(+), 68 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 90c22a854f28..732a37ddbc21 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -115,7 +115,6 @@ static int smc_release(struct socket *sock) goto out; smc = smc_sk(sk); - sock_hold(sk); if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires * sock lock for child sockets again @@ -124,10 +123,7 @@ static int smc_release(struct socket *sock) else lock_sock(sk); - if (smc->use_fallback) { - sk->sk_state = SMC_CLOSED; - sk->sk_state_change(sk); - } else { + if (!smc->use_fallback) { rc = smc_close_active(smc); sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; @@ -136,20 +132,21 @@ static int smc_release(struct socket *sock) sock_release(smc->clcsock); smc->clcsock = NULL; } + if (smc->use_fallback) { + sock_put(sk); /* passive closing */ + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); + } /* detach socket */ sock_orphan(sk); sock->sk = NULL; - if (smc->use_fallback) { - schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); - } else if (sk->sk_state == SMC_CLOSED) { + if (!smc->use_fallback && sk->sk_state == SMC_CLOSED) smc_conn_free(&smc->conn); - schedule_delayed_work(&smc->sock_put_work, - SMC_CLOSE_SOCK_PUT_DELAY); - } release_sock(sk); - sock_put(sk); + sk->sk_prot->unhash(sk); + sock_put(sk); /* final sock_put */ out: return rc; } @@ -181,7 +178,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); - INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); 
@@ -399,6 +395,8 @@ static int smc_connect_rdma(struct smc_sock *smc) int rc = 0; u8 ibport; + sock_hold(&smc->sk); /* sock put in passive closing */ + if (!tcp_sk(smc->clcsock->sk)->syn_smc) { /* peer has not signalled SMC-capability */ smc->use_fallback = true; @@ -542,6 +540,8 @@ static int smc_connect_rdma(struct smc_sock *smc) mutex_unlock(&smc_create_lgr_pending); smc_conn_free(&smc->conn); out_err: + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ return rc; } @@ -620,7 +620,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); new_sk->sk_prot->unhash(new_sk); - sock_put(new_sk); + sock_put(new_sk); /* final */ *new_smc = NULL; goto out; } @@ -637,7 +637,7 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) { struct smc_sock *par = smc_sk(parent); - sock_hold(sk); + sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); spin_unlock(&par->accept_q_lock); @@ -653,7 +653,7 @@ static void smc_accept_unlink(struct sock *sk) list_del_init(&smc_sk(sk)->accept_q); spin_unlock(&par->accept_q_lock); sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); - sock_put(sk); + sock_put(sk); /* sock_hold in smc_accept_enqueue */ } /* remove a sock from the accept queue to bind it to a new socket created @@ -671,7 +671,7 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); - sock_put(new_sk); + sock_put(new_sk); /* final */ continue; } if (new_sock) @@ -686,14 +686,11 @@ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); - sock_hold(sk); lock_sock(sk); if (!sk->sk_lingertime)
[PATCH net-next 2/5] net/smc: smc_poll improvements
Increase the socket refcount during poll wait. Take the socket lock before checking socket state. For a listening socket return a mask independent of state SMC_ACTIVE and cover errors or closed state as well. Get rid of the accept_q loop in smc_accept_poll(). Signed-off-by: Ursula Braun --- net/smc/af_smc.c | 74 ++-- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cf0e11978b66..90c22a854f28 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1122,21 +1122,15 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, static unsigned int smc_accept_poll(struct sock *parent) { - struct smc_sock *isk; - struct sock *sk; - - lock_sock(parent); - list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) { - sk = (struct sock *)isk; + struct smc_sock *isk = smc_sk(parent); + int mask = 0; - if (sk->sk_state == SMC_ACTIVE) { - release_sock(parent); - return POLLIN | POLLRDNORM; - } - } - release_sock(parent); + spin_lock(&isk->accept_q_lock); + if (!list_empty(&isk->accept_q)) + mask = POLLIN | POLLRDNORM; + spin_unlock(&isk->accept_q_lock); - return 0; + return mask; } static unsigned int smc_poll(struct file *file, struct socket *sock, @@ -1147,9 +1141,15 @@ static unsigned int smc_poll(struct file *file, struct socket *sock, struct smc_sock *smc; int rc; + if (!sk) + return POLLNVAL; + smc = smc_sk(sock->sk); + sock_hold(sk); + lock_sock(sk); if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ + release_sock(sk); mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); /* if non-blocking connect finished ... 
*/ lock_sock(sk); @@ -1161,37 +1161,43 @@ static unsigned int smc_poll(struct file *file, struct socket *sock, rc = smc_connect_rdma(smc); if (rc < 0) mask |= POLLERR; - else - /* success cases including fallback */ - mask |= POLLOUT | POLLWRNORM; + /* success cases including fallback */ + mask |= POLLOUT | POLLWRNORM; } } - release_sock(sk); } else { - sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == SMC_LISTEN) - /* woken up by sk_data_ready in smc_listen_work() */ - mask |= smc_accept_poll(sk); + if (sk->sk_state != SMC_CLOSED) { + release_sock(sk); + sock_poll_wait(file, sk_sleep(sk), wait); + lock_sock(sk); + } if (sk->sk_err) mask |= POLLERR; - if (atomic_read(&smc->conn.sndbuf_space) || - (sk->sk_shutdown & SEND_SHUTDOWN)) { - mask |= POLLOUT | POLLWRNORM; - } else { - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - } - if (atomic_read(&smc->conn.bytes_to_rcv)) - mask |= POLLIN | POLLRDNORM; if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) mask |= POLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLIN | POLLRDNORM | POLLRDHUP; - if (sk->sk_state == SMC_APPCLOSEWAIT1) - mask |= POLLIN; + if (sk->sk_state == SMC_LISTEN) { + /* woken up by sk_data_ready in smc_listen_work() */ + mask = smc_accept_poll(sk); + } else { + if (atomic_read(&smc->conn.sndbuf_space) || + sk->sk_shutdown & SEND_SHUTDOWN) { + mask |= POLLOUT | POLLWRNORM; + } else { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + if (atomic_read(&smc->conn.bytes_to_rcv)) + mask |= POLLIN | POLLRDNORM; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM | POLLRDHUP; + if (sk->sk_state == SMC_APPCLOSEWAIT1
[PATCH net-next 1/5] net/smc: handle device, port, and QP error events
RoCE device changes cause an IB event, processed in the global event handler for the ROCE device. Problems for a certain Queue Pair cause a QP event, processed in the QP event handler for this QP. Among those events are port errors and other fatal device errors. All link groups using such a port or device must be terminated in those cases. Signed-off-by: Ursula Braun --- net/smc/smc_ib.c | 38 +- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 90f1a7f9085c..2a8957bd6d38 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -141,6 +141,17 @@ int smc_ib_ready_link(struct smc_link *lnk) return rc; } +static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *l; + + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) + smc_lgr_terminate(lgr); + } +} + /* process context wrapper for might_sleep smc_ib_remember_port_attr */ static void smc_ib_port_event_work(struct work_struct *work) { @@ -151,6 +162,8 @@ static void smc_ib_port_event_work(struct work_struct *work) for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); + if (!smc_ib_port_active(smcibdev, port_idx + 1)) + smc_ib_port_terminate(smcibdev, port_idx + 1); } } @@ -165,15 +178,7 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, switch (ibevent->event) { case IB_EVENT_PORT_ERR: - port_idx = ibevent->element.port_num - 1; - set_bit(port_idx, &smcibdev->port_event_mask); - schedule_work(&smcibdev->port_event_work); - /* fall through */ case IB_EVENT_DEVICE_FATAL: - /* tbd in follow-on patch: -* abnormal close of corresponding connections -*/ - break; case IB_EVENT_PORT_ACTIVE: port_idx = ibevent->element.port_num - 1; set_bit(port_idx, 
&smcibdev->port_event_mask); @@ -186,7 +191,8 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, void smc_ib_dealloc_protection_domain(struct smc_link *lnk) { - ib_dealloc_pd(lnk->roce_pd); + if (lnk->roce_pd) + ib_dealloc_pd(lnk->roce_pd); lnk->roce_pd = NULL; } @@ -203,14 +209,18 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { + struct smc_ib_device *smcibdev = + (struct smc_ib_device *)ibevent->device; + u8 port_idx; + switch (ibevent->event) { case IB_EVENT_DEVICE_FATAL: case IB_EVENT_GID_CHANGE: case IB_EVENT_PORT_ERR: case IB_EVENT_QP_ACCESS_ERR: - /* tbd in follow-on patch: -* abnormal close of corresponding connections -*/ + port_idx = ibevent->element.port_num - 1; + set_bit(port_idx, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); break; default: break; @@ -219,7 +229,8 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - ib_destroy_qp(lnk->roce_qp); + if (lnk->roce_qp) + ib_destroy_qp(lnk->roce_qp); lnk->roce_qp = NULL; } @@ -462,6 +473,7 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) { if (!smcibdev->initialized) return; + smcibdev->initialized = 0; smc_wr_remove_dev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); ib_destroy_cq(smcibdev->roce_cq_recv); -- 2.13.5
Re: [PATCH net-next v1] samples/bpf: Partially fixes the bpf.o build
On 26/01/2018 03:16, Alexei Starovoitov wrote: > On Fri, Jan 26, 2018 at 01:39:30AM +0100, Mickaël Salaün wrote: >> Do not build lib/bpf/bpf.o with this Makefile but use the one from the >> library directory. This avoid making a buggy bpf.o file (e.g. missing >> symbols). > > could you provide an example? > What symbols will be missing? > I don't think there is an issue with existing Makefile. You can run these commands: make -C samples/bpf; nm tools/lib/bpf/bpf.o > a; make -C tools/lib/bpf; nm tools/lib/bpf/bpf.o > b; diff -u a b Symbols like bzero and sys_bpf are missing with the samples/bpf Makefile, which makes the bpf.o shrink from 25K to 7K. > >> This patch is useful if some code (e.g. Landlock tests) needs both the >> bpf.o (from tools/lib/bpf) and the bpf_load.o (from samples/bpf). > > is that some future patches? Yes, I'll send them next week. > > we're trying to move everything form samples/bpf/ into selftests/bpf/ > and convert to use libbpf.a instead of obsolete bpf_load.c > Please use this approach for landlock as well. Ok, it should be better with this lib. > >> Signed-off-by: Mickaël Salaün >> Cc: Alexei Starovoitov >> Cc: Daniel Borkmann >> --- >> >> This is not a complet fix because the call to multi_depend with >> $(host-cmulti) from scripts/Makefile.host force the build of bpf.o >> anyway. I'm not sure how to completely avoid this automatic build >> though. 
>> --- >> samples/bpf/Makefile | 5 - >> 1 file changed, 4 insertions(+), 1 deletion(-) >> >> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile >> index 7f61a3d57fa7..64335bb94f9f 100644 >> --- a/samples/bpf/Makefile >> +++ b/samples/bpf/Makefile >> @@ -201,13 +201,16 @@ CLANG_ARCH_ARGS = -target $(ARCH) >> endif >> >> # Trick to allow make to be run from this directory >> -all: >> +all: $(LIBBPF) >> $(MAKE) -C ../../ $(CURDIR)/ >> >> clean: >> $(MAKE) -C ../../ M=$(CURDIR) clean >> @rm -f *~ >> >> +$(LIBBPF): FORCE >> +$(MAKE) -C $(dir $@) $(notdir $@) >> + >> $(obj)/syscall_nrs.s: $(src)/syscall_nrs.c >> $(call if_changed_dep,cc_s_c) >> >> -- >> 2.15.1 >> > signature.asc Description: OpenPGP digital signature
Re: [PATCH net-next 0/3 V1] rtnetlink: enable IFLA_IF_NETNSID for RTM_{DEL,SET}LINK
On Fri, 26 Jan 2018 00:34:51 +0100, Nicolas Dichtel wrote: > Why meaningful? The user knows that the answer is like if if was done in > another > netns. It enables to have only one netlink socket instead of one per netns. > But > the code using it will be the same. Because you can't use it to query the linked interface. You can't even use it as an opaque value to track interfaces (netnsid+ifindex) because netnsids are not unique across net name spaces. You can easily have two interfaces that have all the ifindex, ifname, netnsid (and basically everything else) identical but being completely different interfaces. That's really not helpful. > I fear that with your approach, it will results to a lot of complexity in the > kernel. The complexity is (at least partly) already there. It's an inevitable result of the design decision to have relative identifiers. I agree that we should think about how to make this easy to implement. I like your idea of doing this somehow generically. Perhaps it's possible to do while keeping the netnsids valid in the caller's netns? > What is really missing for me, is a way to get a fd from an nsid. The user > should be able to call RTM_GETNSID with an fd and a nsid and the kernel > performs > the needed operations so that the fd points to the corresponding netns. That's what I was missing, too. I even looked into implementing it. But opening a fd on behalf of the process and returning it over netlink is a wrong thing to do. Netlink messages can get lost. Then you have a fd leak you can do nothing about. Given that we have netnsids used for so much stuff already (like NETLINK_LISTEN_ALL_NSID) you need to track them anyway. And if you need to track them, why bother with another identifier? It would be better if netnsid can be used universally for anything. Then there will be no need for the conversion. Jiri
[PATCH] tcp: release sk_frag.page in tcp_disconnect
socket can be disconnected and gets transformed back to a listening socket, if sk_frag.page is not released, which will be cloned into a new socket by sk_clone_lock, but the reference count of this page is increased, leading to a use after free or double free issue Signed-off-by: Li RongQing Cc: Eric Dumazet --- net/ipv4/tcp.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f08eebe60446..73f068406519 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2431,6 +2431,12 @@ int tcp_disconnect(struct sock *sk, int flags) WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; + } + sk->sk_error_report(sk); return err; } -- 2.11.0
[PATCH net-next] sfc: add suffix to large constant in ptp
Fixes: 1280c0f8aafc ("sfc: support second + quarter ns time format for receive datapath") Reported-by: kbuild test robot Signed-off-by: Bert Kenward --- drivers/net/ethernet/sfc/ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 433d29d6bc95..f1cc2ed76029 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -643,7 +643,7 @@ static int efx_ptp_get_attributes(struct efx_nic *efx) case MC_CMD_PTP_OUT_GET_ATTRIBUTES_SECONDS_QTR_NANOSECONDS: ptp->ns_to_nic_time = efx_ptp_ns_to_s_qns; ptp->nic_to_kernel_time = efx_ptp_s_qns_to_ktime_correction; - ptp->nic_time.minor_max = 40; + ptp->nic_time.minor_max = 40UL; ptp->nic_time.sync_event_minor_shift = 24; break; default: -- 2.13.6
[PATCH 2/3] Revert "e1000e: Separate signaling for link check/link up"
This reverts commit 19110cfbb34d4af0cdfe14cd243f3b09dc95b013. This reverts commit 4110e02eb45ea447ec6f5459c9934de0a273fb91. ... because they cause an extra 2s delay for the link to come up when autoneg is off. After reverting, the race condition described in the log of commit 19110cfbb34d ("e1000e: Separate signaling for link check/link up") is reintroduced. It may still be triggered by LSC events but this should not result in link flap. It may no longer be triggered by RXO events because commit 4aea7a5c5e94 ("e1000e: Avoid receiver overrun interrupt bursts") restored reading icr in the Other handler. As discussed, the driver should be in "maintenance mode". In the interest of stability, revert to the original code as much as possible instead of a half-baked solution. Link: https://www.spinics.net/lists/netdev/msg479923.html Signed-off-by: Benjamin Poirier --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 11 +++ drivers/net/ethernet/intel/e1000e/mac.c | 11 +++ drivers/net/ethernet/intel/e1000e/netdev.c | 2 +- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 31277d3bb7dc..d6d4ed7acf03 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1367,9 +1367,6 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) * Checks to see of the link status of the hardware has changed. If a * change in link status has been detected, then we read the PHY registers * to get the current speed/duplex if link exists. - * - * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link - * up). **/ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) { @@ -1385,7 +1382,7 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) * Change or Rx Sequence Error interrupt. 
*/ if (!mac->get_link_status) - return 1; + return 0; /* First we want to see if the MII Status Register reports * link. If so, then we want to get the current speed/duplex @@ -1616,12 +1613,10 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) * different link partner. */ ret_val = e1000e_config_fc_after_link_up(hw); - if (ret_val) { + if (ret_val) e_dbg("Error configuring flow control\n"); - return ret_val; - } - return 1; + return ret_val; } static s32 e1000_get_variants_ich8lan(struct e1000_adapter *adapter) diff --git a/drivers/net/ethernet/intel/e1000e/mac.c b/drivers/net/ethernet/intel/e1000e/mac.c index f457c5703d0c..b322011ec282 100644 --- a/drivers/net/ethernet/intel/e1000e/mac.c +++ b/drivers/net/ethernet/intel/e1000e/mac.c @@ -410,9 +410,6 @@ void e1000e_clear_hw_cntrs_base(struct e1000_hw *hw) * Checks to see of the link status of the hardware has changed. If a * change in link status has been detected, then we read the PHY registers * to get the current speed/duplex if link exists. - * - * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link - * up). **/ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) { @@ -426,7 +423,7 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) * Change or Rx Sequence Error interrupt. */ if (!mac->get_link_status) - return 1; + return 0; /* First we want to see if the MII Status Register reports * link. If so, then we want to get the current speed/duplex @@ -464,12 +461,10 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) * different link partner. 
*/ ret_val = e1000e_config_fc_after_link_up(hw); - if (ret_val) { + if (ret_val) e_dbg("Error configuring flow control\n"); - return ret_val; - } - return 1; + return ret_val; } /** diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 398e940436f8..ed103b9a8d3a 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5091,7 +5091,7 @@ static bool e1000e_has_link(struct e1000_adapter *adapter) case e1000_media_type_copper: if (hw->mac.get_link_status) { ret_val = hw->mac.ops.check_for_link(hw); - link_active = ret_val > 0; + link_active = !hw->mac.get_link_status; } else { link_active = true; } -- 2.15.1
[PATCH 1/3] Partial revert "e1000e: Avoid receiver overrun interrupt bursts"
This reverts commit 4aea7a5c5e940c1723add439f4088844cd26196d. We keep the fix for the first part of the problem (1) described in the log of that commit however we use the implementation of e1000_msix_other() from before commit 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt"). We remove the fix for the second part of the problem (2). Bursts of "Other" interrupts may once again occur during rxo (receive overflow) traffic conditions. This is deemed acceptable in the interest of reverting driver code back to its previous state. The e1000e driver should be in "maintenance" mode and we want to avoid unforeseen fallout from changes that are not strictly necessary. Link: https://www.spinics.net/lists/netdev/msg480675.html Signed-off-by: Benjamin Poirier --- drivers/net/ethernet/intel/e1000e/defines.h | 1 - drivers/net/ethernet/intel/e1000e/netdev.c | 32 +++-- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index afb7ebe20b24..0641c0098738 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -398,7 +398,6 @@ #define E1000_ICR_LSC 0x0004 /* Link Status Change */ #define E1000_ICR_RXSEQ 0x0008 /* Rx sequence error */ #define E1000_ICR_RXDMT00x0010 /* Rx desc min. 
threshold (0) */ -#define E1000_ICR_RXO 0x0040 /* Receiver Overrun */ #define E1000_ICR_RXT0 0x0080 /* Rx timer intr (ring 0) */ #define E1000_ICR_ECCER 0x0040 /* Uncorrectable ECC Error */ /* If this bit asserted, the driver should claim the interrupt */ diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 9f18d39bdc8f..398e940436f8 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -1914,30 +1914,23 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data) struct net_device *netdev = data; struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; - u32 icr; - bool enable = true; - - icr = er32(ICR); - if (icr & E1000_ICR_RXO) { - ew32(ICR, E1000_ICR_RXO); - enable = false; - /* napi poll will re-enable Other, make sure it runs */ - if (napi_schedule_prep(&adapter->napi)) { - adapter->total_rx_bytes = 0; - adapter->total_rx_packets = 0; - __napi_schedule(&adapter->napi); - } - } - if (icr & E1000_ICR_LSC) { - ew32(ICR, E1000_ICR_LSC); + u32 icr = er32(ICR); + + if (icr & adapter->eiac_mask) + ew32(ICS, (icr & adapter->eiac_mask)); + + if (icr & E1000_ICR_OTHER) { + if (!(icr & E1000_ICR_LSC)) + goto no_link_interrupt; hw->mac.get_link_status = true; /* guard against interrupt when we're going down */ if (!test_bit(__E1000_DOWN, &adapter->state)) mod_timer(&adapter->watchdog_timer, jiffies + 1); } - if (enable && !test_bit(__E1000_DOWN, &adapter->state)) - ew32(IMS, E1000_IMS_OTHER); +no_link_interrupt: + if (!test_bit(__E1000_DOWN, &adapter->state)) + ew32(IMS, E1000_IMS_LSC | E1000_IMS_OTHER); return IRQ_HANDLED; } @@ -2707,8 +2700,7 @@ static int e1000e_poll(struct napi_struct *napi, int weight) napi_complete_done(napi, work_done); if (!test_bit(__E1000_DOWN, &adapter->state)) { if (adapter->msix_entries) - ew32(IMS, adapter->rx_ring->ims_val | -E1000_IMS_OTHER); + ew32(IMS, adapter->rx_ring->ims_val); 
else e1000_irq_enable(adapter); } -- 2.15.1
[PATCH 3/3] Revert "e1000e: Do not read ICR in Other interrupt"
This reverts commit 16ecba59bc333d6282ee057fb02339f77a880beb. It was reported that emulated e1000e devices in vmware esxi 6.5 Build 7526125 do not link up after commit 4aea7a5c5e94 ("e1000e: Avoid receiver overrun interrupt bursts"). Some tracing shows that after e1000e_trigger_lsc() is called, ICR reads out as 0x0 in e1000_msix_other() on emulated e1000e devices. In comparison, on real e1000e 82574 hardware, icr=0x8004 (_INT_ASSERTED | _LSC) in the same situation. Some experimentation showed that this flaw in vmware e1000e emulation can be worked around by not setting Other in EIAC. This is how it was before commit 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt"). Since the ICR read in the Other interrupt handler has already been restored, this patch effectively reverts the remainder of commit 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt"). Fixes: 4aea7a5c5e94 ("e1000e: Avoid receiver overrun interrupt bursts") Signed-off-by: Benjamin Poirier --- drivers/net/ethernet/intel/e1000e/netdev.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index ed103b9a8d3a..fffc1f0e3895 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -1916,6 +1916,13 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data) struct e1000_hw *hw = &adapter->hw; u32 icr = er32(ICR); + /* Certain events (such as RXO) which trigger Other do not set +* INT_ASSERTED. In that case, read to clear of icr does not take +* place. 
+*/ + if (!(icr & E1000_ICR_INT_ASSERTED)) + ew32(ICR, E1000_ICR_OTHER); + if (icr & adapter->eiac_mask) ew32(ICS, (icr & adapter->eiac_mask)); @@ -2033,7 +2040,6 @@ static void e1000_configure_msix(struct e1000_adapter *adapter) hw->hw_addr + E1000_EITR_82574(vector)); else writel(1, hw->hw_addr + E1000_EITR_82574(vector)); - adapter->eiac_mask |= E1000_IMS_OTHER; /* Cause Tx interrupts on every write back */ ivar |= BIT(31); @@ -2258,7 +2264,7 @@ static void e1000_irq_enable(struct e1000_adapter *adapter) if (adapter->msix_entries) { ew32(EIAC_82574, adapter->eiac_mask & E1000_EIAC_MASK_82574); - ew32(IMS, adapter->eiac_mask | E1000_IMS_LSC); + ew32(IMS, adapter->eiac_mask | E1000_IMS_OTHER | E1000_IMS_LSC); } else if (hw->mac.type >= e1000_pch_lpt) { ew32(IMS, IMS_ENABLE_MASK | E1000_IMS_ECCER); } else { -- 2.15.1
[PATCH 0/3] e1000e: Revert interrupt handling changes
As discussed in the thread "[RFC PATCH] e1000e: Remove Other from EIAC.", https://www.spinics.net/lists/netdev/msg479311.html The following list of commits was considered: 4d432f67ff00 e1000e: Remove unreachable code (v4.5-rc1) 16ecba59bc33 e1000e: Do not read ICR in Other interrupt (v4.5-rc1) a61cfe4ffad7 e1000e: Do not write lsc to ics in msi-x mode (v4.5-rc1) 0a8047ac68e5 e1000e: Fix msi-x interrupt automask (v4.5-rc1) 19110cfbb34d e1000e: Separate signaling for link check/link up (v4.15-rc1) 4aea7a5c5e94 e1000e: Avoid receiver overrun interrupt bursts (v4.15-rc1) 4110e02eb45e e1000e: Fix e1000_check_for_copper_link_ich8lan return value. (v4.15-rc8) There has been a slew of regressions due to unforeseen consequences (receive overflow triggers Other, vmware's emulated e1000e) and programming mistakes (4110e02eb45e). Since the e1000e driver is supposed to be in maintenance mode, this patch series revisits the above changes to prune them down. After this series, the remaining differences related to how interrupts were handled at commit 4d432f67ff00 ("e1000e: Remove unreachable code", v4.5-rc1) are: * the changes in commit 0a8047ac68e5 ("e1000e: Fix msi-x interrupt automask", v4.5-rc1) are preserved. * we manually clear Other from icr in e1000_msix_other(). We try to go back to a long lost time when things were simple and drivers ran smoothly. Benjamin Poirier (3): Partial revert "e1000e: Avoid receiver overrun interrupt bursts" Revert "e1000e: Separate signaling for link check/link up" Revert "e1000e: Do not read ICR in Other interrupt" drivers/net/ethernet/intel/e1000e/defines.h | 1 - drivers/net/ethernet/intel/e1000e/ich8lan.c | 11 ++-- drivers/net/ethernet/intel/e1000e/mac.c | 11 ++-- drivers/net/ethernet/intel/e1000e/netdev.c | 44 ++--- 4 files changed, 27 insertions(+), 40 deletions(-) -- 2.15.1
Re: [PATCH net-next 0/3 V1] rtnetlink: enable IFLA_IF_NETNSID for RTM_{DEL,SET}LINK
Le 26/01/2018 à 09:36, Jiri Benc a écrit : > On Fri, 26 Jan 2018 00:34:51 +0100, Nicolas Dichtel wrote: >> Why meaningful? The user knows that the answer is like if if was done in >> another >> netns. It enables to have only one netlink socket instead of one per netns. >> But >> the code using it will be the same. > > Because you can't use it to query the linked interface. You can't even > use it as an opaque value to track interfaces (netnsid+ifindex) because > netnsids are not unique across net name spaces. You can easily have two > interfaces that have all the ifindex, ifname, netnsid (and basically > everything else) identical but being completely different interfaces. Yes, the user have to map those info correctly. And this complexifies the (user) code a lot. > That's really not helpful. > >> I fear that with your approach, it will results to a lot of complexity in the >> kernel. > > The complexity is (at least partly) already there. It's an inevitable > result of the design decision to have relative identifiers. Yes, you're right. My approach moves the complexity to the user, which make this feature hard to use. > > I agree that we should think about how to make this easy to implement. > I like your idea of doing this somehow generically. Perhaps it's > possible to do while keeping the netnsids valid in the caller's netns? Yes. I agree that it will be a lot easier to use if the conversion is done in the kernel. And having a generic mechanism will also help a lot to use it. > >> What is really missing for me, is a way to get a fd from an nsid. The user >> should be able to call RTM_GETNSID with an fd and a nsid and the kernel >> performs >> the needed operations so that the fd points to the corresponding netns. > > That's what I was missing, too. I even looked into implementing it. But > opening a fd on behalf of the process and returning it over netlink is a > wrong thing to do. Netlink messages can get lost. 
Then you have a fd > leak you can do nothing about. Yes, I also looked at this ;-) > > Given that we have netnsids used for so much stuff already (like > NETLINK_LISTEN_ALL_NSID) you need to track them anyway. And if you need > to track them, why bother with another identifier? It would be better > if netnsid can be used universally for anything. Then there will be no > need for the conversion. I like this idea a lot. So the missing part is a setns() using the nsid ;-) Regards, Nicolas
pull-request: can-next 2018-01-26
Hello David, this is a pull request for net-next/master consisting of 3 patches. The first two patches target the CAN documentation. The first is by me and fixes the pointer to the location of the fsl,mpc5200-mscan node in the mpc5200 documentation. The second patch is by Robert Schwebel and it converts the plain ASCII documentation to restructured text. The third patch is by Fabrizio Castro and adds the r8a774[35] support to the rcar_can dt-bindings documentation. regards, Marc --- The following changes since commit 9515a2e082f91457db0ecff4b65371d0fb5d9aad: net/ipv4: Allow send to local broadcast from a socket bound to a VRF (2018-01-25 21:51:31 -0500) are available in the Git repository at: ssh://g...@gitolite.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next.git tags/linux-can-next-for-4.16-20180126 for you to fetch changes up to 216bf2f490c214b8a0702b52cc957138ba24bc3f: dt-bindings: can: rcar_can: document r8a774[35] can support (2018-01-26 10:47:17 +0100) linux-can-next-for-4.16-20180126 Fabrizio Castro (1): dt-bindings: can: rcar_can: document r8a774[35] can support Marc Kleine-Budde (1): Documentation/devicetree: mpc5200.txt: fix pointer to location of fsl,mpc5200-mscan node Robert Schwebel (1): can: migrate documentation to restructured text .../devicetree/bindings/net/can/rcar_can.txt |7 +- .../devicetree/bindings/powerpc/fsl/mpc5200.txt|2 +- Documentation/networking/00-INDEX |2 - Documentation/networking/can.rst | 1437 Documentation/networking/can.txt | 1308 -- Documentation/networking/index.rst |1 + MAINTAINERS|2 +- drivers/net/can/dev.c |2 +- drivers/net/can/vcan.c |2 +- net/can/Kconfig|2 +- 10 files changed, 1448 insertions(+), 1317 deletions(-) create mode 100644 Documentation/networking/can.rst delete mode 100644 Documentation/networking/can.txt -- Pengutronix e.K. 
| Marc Kleine-Budde | Industrial Linux Solutions| Phone: +49-231-2826-924 | Vertretung West/Dortmund | Fax: +49-5121-206917- | Amtsgericht Hildesheim, HRA 2686 | http://www.pengutronix.de | signature.asc Description: OpenPGP digital signature
Re: [PATCH 1/8] dt-bindings: can: rcar_can: document r8a774[35] can support
On 01/25/2018 09:53 AM, Marc Kleine-Budde wrote: > On 01/24/2018 06:22 PM, Fabrizio Castro wrote: >> thank you for Acking the patch, just wondering if this patch has any >> chance to end up in v4.16? > > I can take this via the linux-can tree, if no one else takes it. Done - included in the latest pull request. Marc -- Pengutronix e.K. | Marc Kleine-Budde | Industrial Linux Solutions| Phone: +49-231-2826-924 | Vertretung West/Dortmund | Fax: +49-5121-206917- | Amtsgericht Hildesheim, HRA 2686 | http://www.pengutronix.de | signature.asc Description: OpenPGP digital signature
[PATCH] e1000e: allocate ring descriptors with dma_zalloc_coherent
Descriptor rings were not initialized at zero when allocated When area contained garbage data, it caused skb_over_panic in e1000_clean_rx_irq (if data had E1000_RXD_STAT_DD bit set) This patch makes use of dma_zalloc_coherent to make sure the ring is memset at 0 to prevent the area from containing garbage. Following is the signature of the panic: IODDR0@0.0: skbuff: skb_over_panic: text:80407b20 len:64010 put:64010 head:ab46d800 data:ab46d842 tail:0xab47d24c end:0xab46df40 dev:eth0 IODDR0@0.0: BUG: failure at net/core/skbuff.c:105/skb_panic()! IODDR0@0.0: Kernel panic - not syncing: BUG! IODDR0@0.0: IODDR0@0.0: Process swapper/0 (pid: 0, threadinfo=81728000, task=8173cc00 ,cpu: 0) IODDR0@0.0: SP = <815a1c0c> IODDR0@0.0: Stack: 0001 IODDR0@0.0: b2d89800 815e33ac IODDR0@0.0: ea73c040 0001 IODDR0@0.0: 60040003 fa0a IODDR0@0.0: 0002 IODDR0@0.0: IODDR0@0.0: 804540c0 815a1c70 IODDR0@0.0: b2744000 602ac070 IODDR0@0.0: 815a1c44 b2d89800 IODDR0@0.0: 8173cc00 815a1c08 IODDR0@0.0: IODDR0@0.0: 0006 IODDR0@0.0: 815a1b50 IODDR0@0.0: 80079434 0001 IODDR0@0.0: ab46df40 b2744000 IODDR0@0.0: b2d89800 IODDR0@0.0: IODDR0@0.0: fa0a 8045745c IODDR0@0.0: 815a1c88 fa0a IODDR0@0.0: 80407b20 b2789f80 IODDR0@0.0: 0005 80407b20 IODDR0@0.0: IODDR0@0.0: IODDR0@0.0: Call Trace: IODDR0@0.0: [<804540bc>] skb_panic+0xa4/0xa8 IODDR0@0.0: [<80079430>] console_unlock+0x2f8/0x6d0 IODDR0@0.0: [<80457458>] skb_put+0xa0/0xc0 IODDR0@0.0: [<80407b1c>] e1000_clean_rx_irq+0x2dc/0x3e8 IODDR0@0.0: [<80407b1c>] e1000_clean_rx_irq+0x2dc/0x3e8 IODDR0@0.0: [<804079c8>] e1000_clean_rx_irq+0x188/0x3e8 IODDR0@0.0: [<80407b1c>] e1000_clean_rx_irq+0x2dc/0x3e8 IODDR0@0.0: [<80468b48>] __dev_kfree_skb_any+0x88/0xa8 IODDR0@0.0: [<804101ac>] e1000e_poll+0x94/0x288 IODDR0@0.0: [<8046e9d4>] net_rx_action+0x19c/0x4e8 IODDR0@0.0: ... IODDR0@0.0: Maximum depth to print reached. Use kstack= To specify a custom value (where 0 means to display the full backtrace) IODDR0@0.0: ---[ end Kernel panic - not syncing: BUG! 
Signed-off-by: Pierre-Yves Kerbrat Signed-off-by: Marius Gligor --- drivers/net/ethernet/intel/e1000e/netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 1298b69..26121ed 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -2333,7 +2333,7 @@ static int e1000_alloc_ring_dma(struct e1000_adapter *adapter, { struct pci_dev *pdev = adapter->pdev; - ring->desc = dma_alloc_coherent(&pdev->dev, ring->size, &ring->dma, + ring->desc = dma_zalloc_coherent(&pdev->dev, ring->size, &ring->dma, GFP_KERNEL); if (!ring->desc) return -ENOMEM; -- 1.8.3.1
[PATCH v2] net: macb: Handle HRESP error
From: Harini Katakam Handle HRESP error by doing a SW reset of RX and TX and re-initializing the descriptors, RX and TX queue pointers. Signed-off-by: Harini Katakam Signed-off-by: Michal Simek --- v2: Rebased on top of latest net-next and reinitialized all rx queues. drivers/net/ethernet/cadence/macb.h | 3 ++ drivers/net/ethernet/cadence/macb_main.c | 59 +--- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index c50c5ec..8665982 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -13,6 +13,7 @@ #include #include #include +#include #if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) || defined(CONFIG_MACB_USE_HWSTAMP) #define MACB_EXT_DESC @@ -1200,6 +1201,8 @@ struct macb { struct ethtool_rx_fs_list rx_fs_list; spinlock_t rx_fs_lock; unsigned int max_tuples; + + struct tasklet_struct hresp_err_tasklet; }; #ifdef CONFIG_MACB_USE_HWSTAMP diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 234667e..e84afcf 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1258,6 +1258,57 @@ static int macb_poll(struct napi_struct *napi, int budget) return work_done; } +static void macb_hresp_error_task(unsigned long data) +{ + struct macb *bp = (struct macb *)data; + struct net_device *dev = bp->dev; + struct macb_queue *queue = bp->queues; + unsigned int q; + u32 ctrl; + + for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { + queue_writel(queue, IDR, MACB_RX_INT_FLAGS | +MACB_TX_INT_FLAGS | +MACB_BIT(HRESP)); + } + ctrl = macb_readl(bp, NCR); + ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE)); + macb_writel(bp, NCR, ctrl); + + netif_tx_stop_all_queues(dev); + netif_carrier_off(dev); + + bp->macbgem_ops.mog_init_rings(bp); + + /* Initialize TX and RX buffers */ + for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { + queue_writel(queue, 
RBQP, lower_32_bits(queue->rx_ring_dma)); +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT + if (bp->hw_dma_cap & HW_DMA_CAP_64B) + queue_writel(queue, RBQPH, +upper_32_bits(queue->rx_ring_dma)); +#endif + queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma)); +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT + if (bp->hw_dma_cap & HW_DMA_CAP_64B) + queue_writel(queue, TBQPH, +upper_32_bits(queue->tx_ring_dma)); +#endif + + /* Enable interrupts */ + queue_writel(queue, IER, +MACB_RX_INT_FLAGS | +MACB_TX_INT_FLAGS | +MACB_BIT(HRESP)); + } + + ctrl |= MACB_BIT(RE) | MACB_BIT(TE); + macb_writel(bp, NCR, ctrl); + + netif_carrier_on(dev); + netif_tx_start_all_queues(dev); +} + static irqreturn_t macb_interrupt(int irq, void *dev_id) { struct macb_queue *queue = dev_id; @@ -1347,10 +1398,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id) } if (status & MACB_BIT(HRESP)) { - /* TODO: Reset the hardware, and maybe move the -* netdev_err to a lower-priority context as well -* (work queue?) -*/ + tasklet_schedule(&bp->hresp_err_tasklet); netdev_err(dev, "DMA bus error: HRESP not OK\n"); if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) @@ -3937,6 +3985,9 @@ static int macb_probe(struct platform_device *pdev) goto err_out_unregister_mdio; } + tasklet_init(&bp->hresp_err_tasklet, macb_hresp_error_task, +(unsigned long)bp); + phy_attached_info(phydev); netdev_info(dev, "Cadence %s rev 0x%08x at 0x%08lx irq %d (%pM)\n", -- 2.7.4 This email and any attachments are intended for the sole use of the named recipient(s) and contain(s) confidential information that may be proprietary, privileged or copyrighted under applicable law. If you are not the intended recipient, do not read, copy, or forward this email message or any attachments. Delete this email message and any attachments immediately.
Re: [regresssion 4.15] Userspace compilation broken by uapi/linux/if_ether.h update
On Thu, Jan 25, 2018 at 11:21:34PM +0100, Hauke Mehrtens wrote: > On 01/25/2018 03:58 PM, Guillaume Nault wrote: > > Hi, > > > > Commit 6926e041a892 ("uapi/if_ether.h: prevent redefinition of struct > > ethhdr"), > > can break compilation of userspace programs (in my case, accel-ppp > > (https://accel-ppp.org)). > > > > This happens for userspace programs that end up including > > linux/if_ether.h, netinet/in.h and linux/in.h in this order: > > > > # cat test_ifether.c > > #include > > #include > > #include > > > > int main(void) > > { > > return 0; > > } > > > > # gcc test_ifether.c > > In file included from test_ifether.c:2:0: > > /usr/include/linux/in.h:29:3: error: redeclaration of enumerator > > ‘IPPROTO_IP’ > >IPPROTO_IP = 0, /* Dummy protocol for TCP */ > >^ > > /usr/include/netinet/in.h:42:5: note: previous definition of ‘IPPROTO_IP’ > > was here > > IPPROTO_IP = 0,/* Dummy protocol for TCP. */ > > ^~ > > > > > > Now that linux/libc-compat.h is included in linux/if_ether.h, it is > > processed before netinet/in.h. Therefore, it sets the relevant > > __UAPI_DEF_* guards to 1 (as _NETINET_IN_H isn't yet defined). > > Then netinet/in.h is included, followed by linux/in.h. The later > > doesn't realise that what it defines has already been included by > > netinet/in.h because the __UAPI_DEF_* guards were set too early. > > > > Of course the situation is a bit more complicated on real projects, as > > these files aren't included directly. For example, in accel-ppp, the > > PPPoE module (accel-ppp/accel-pppd/ctrl/pppoe/pppoe.c) uses > > #include/* includes linux/if_ether.h */ > > #include /* includes netinet/in.h */ > > #include /* (through pppoe.h), includes linux/in.h */ > > > > > > I don't have a satisfying solution for now, but I'd really like it if > > we could avoid shipping a kernel which forces userspace to play with > > include files ordering to keep compiling. 
> > > Hi, > This is about this commit: > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6926e041a8920c8ec27e4e155efa760aa01551fd > > One option would be to move this into include/uapi/linux/if_ether.h and > remove the include for libc-compat.h: > #ifndef __UAPI_DEF_ETHHDR > #define __UAPI_DEF_ETHHDR 1 > #endif > > This will only work if netinet/if_ether.h is included before > linux/if_ether.h, but I think this is very likely. > I don't see what makes it likely. That's not directly related to your point, but for example, glibc guarantees the opposite as it includes linux/if_ether.h at the beginning of netinet/if_ether.h. > I think we can do this because we do not need some special libc handling > like it is done for other symbols as __UAPI_DEF_ETHHDR is currently only > needed by musl and not by glibc. > That's ok for me as long as existing projects keep compiling. But all __UAPI_DEF_* are currently centralised in libc-compat.h. Adding __UAPI_DEF_ETHHDR in if_ether.h looks like defeating the purpose of libc-compat.h and I wonder if that'd be accepted. Maybe with a different name. In any case, we're really late in the release cycle. If more discussion is needed, it's probably better to revert and take time to work on a solution for the next release.
[PATCH net-next 0/2] net: hns3: add support ethtool_ops.{set|get}_coalesce for VF
This patch-set adds ethtool_ops.{get|set}_coalesce to VF and fix one related bug. HNS3 PF and VF driver use the common enet layer, as the ethtool_ops.{get|set}_coalesce to PF have upstreamed, just need add the ops to hns3vf_ethtool_ops. [Patch 1/2] fix a related bug for the VF ethtool_ops.{set| get}_coalesce. Fuyun Liang (2): net: hns3: add get/set_coalesce support to VF net: hns3: add int_gl_idx setup for VF drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c| 2 ++ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c| 8 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 5 + 3 files changed, 15 insertions(+) -- 1.9.1
[PATCH net-next 1/2] net: hns3: add get/set_coalesce support to VF
From: Fuyun Liang This patch adds ethtool_ops.get/set_coalesce support to VF. Since PF and VF share the same get/set_coalesce interface, we only need to set hns3_get/set_coalesce to the ethtool_ops when supporting get/set_coalesce for VF. Signed-off-by: Fuyun Liang Signed-off-by: Peng Li --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 7410205..b034c7f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1109,6 +1109,8 @@ static int hns3_set_phys_id(struct net_device *netdev, .set_rxfh = hns3_set_rss, .get_link_ksettings = hns3_get_link_ksettings, .get_channels = hns3_get_channels, + .get_coalesce = hns3_get_coalesce, + .set_coalesce = hns3_set_coalesce, }; static const struct ethtool_ops hns3_ethtool_ops = { -- 1.9.1
[PATCH net-next 2/2] net: hns3: add int_gl_idx setup for VF
From: Fuyun Liang Just like PF, if the int_gl_idx of VF does not be set, the default interrupt coalesce index of VF is 0. But it should be GL1 for TX queues and GL0 for RX queues. This patch adds the int_gl_idx setup for VF. Fixes: 200ecda42598 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility Layer) Support") Signed-off-by: Fuyun Liang Signed-off-by: Peng Li --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c| 8 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 5 + 2 files changed, 13 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 96f453f..f38fc5c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -116,6 +116,9 @@ static int hclge_get_ring_chain_from_mbx( hnae_set_bit(ring_chain->flag, HNAE3_RING_TYPE_B, req->msg[3]); ring_chain->tqp_index = hclge_get_queue_id(vport->nic.kinfo.tqp[req->msg[4]]); + hnae_set_field(ring_chain->int_gl_idx, HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + req->msg[5]); cur_chain = ring_chain; @@ -133,6 +136,11 @@ static int hclge_get_ring_chain_from_mbx( [req->msg[HCLGE_RING_NODE_VARIABLE_NUM * i + HCLGE_RING_MAP_MBX_BASIC_MSG_NUM + 1]]); + hnae_set_field(new_chain->int_gl_idx, HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + req->msg[HCLGE_RING_NODE_VARIABLE_NUM * i + + HCLGE_RING_MAP_MBX_BASIC_MSG_NUM + 2]); + cur_chain->next = new_chain; cur_chain = new_chain; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 3d2bc9a..0d89965 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -565,6 +565,11 @@ static int hclgevf_bind_ring_to_vector(struct hnae3_handle *handle, bool en, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B); req->msg[HCLGEVF_RING_NODE_VARIABLE_NUM * i + 1] = 
node->tqp_index; + req->msg[HCLGEVF_RING_NODE_VARIABLE_NUM * i + 2] = + hnae_get_field(node->int_gl_idx, + HNAE3_RING_GL_IDX_M, + HNAE3_RING_GL_IDX_S); + if (i == (HCLGE_MBX_VF_MSG_DATA_NUM - HCLGEVF_RING_MAP_MBX_BASIC_MSG_NUM) / HCLGEVF_RING_NODE_VARIABLE_NUM) { -- 1.9.1
Re: [PATCH net-next 0/3 V1] rtnetlink: enable IFLA_IF_NETNSID for RTM_{DEL,SET}LINK
On Thu, Jan 25, 2018 at 01:59:06PM +0100, Christian Brauner wrote: > On Wed, Jan 24, 2018 at 03:26:31PM +0100, Christian Brauner wrote: > > Hi, > > > > Based on the previous discussion this enables passing a IFLA_IF_NETNSID > > property along with RTM_SETLINK and RTM_DELLINK requests. The patch for > > RTM_NEWLINK will be sent out in a separate patch since there are more > > corner-cases to think about. > > > > Best, > > Christian > > > > Changelog 2018-01-24: > > * Preserve old behavior and report -ENODEV when either ifindex or ifname is > > provided and IFLA_GROUP is set. Spotted by Wolfgang Bumiller. > > > > Christian Brauner (3): > > rtnetlink: enable IFLA_IF_NETNSID in do_setlink() > > rtnetlink: enable IFLA_IF_NETNSID for RTM_SETLINK > > rtnetlink: enable IFLA_IF_NETNSID for RTM_DELLINK > > > > net/core/rtnetlink.c | 96 > > > > 1 file changed, 75 insertions(+), 21 deletions(-) > > > > -- > > 2.14.1 > > > > I've done more testing around enabling IFLA_IF_NETNSID for RTM_NEWLINK > as well and I think that the change is actually trivial. However, I > would like to wait before sending this patch out until we have agreed on > the patches for RTM_SETLINK and RTM_DELLINK in this series. Once we have > merged those I can just send another small commit. Or - if changes to > this patch series are required - I can just add it in a v2. Jiri, do you want that patch resent with the small commit for RTM_NEWLINK included or are you fine with sending it after this series? Thanks! Christian
[PATCH net-next 2/3] cxgb4: fix incorrect condition for using firmware LDST commands
Only contact firmware if it's alive _AND_ if use_bd (use backdoor access) is not set when issuing FW_LDST_CMD. Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index be795d0b0b7e..047609ef0515 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -5090,7 +5090,7 @@ int t4_read_rss(struct adapter *adapter, u16 *map) static unsigned int t4_use_ldst(struct adapter *adap) { - return (adap->flags & FW_OK) || !adap->use_bd; + return (adap->flags & FW_OK) && !adap->use_bd; } /** -- 2.14.1
[PATCH net-next 3/3] cxgb4: use backdoor access to collect dumps when firmware crashed
Fallback to backdoor register access to collect dumps if firmware is crashed. Fixes TID, SGE Queue Context, and MPS TCAM dump collection. Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar --- drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 51 +++--- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c index 8b95117c2923..557fd8bfd54e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c @@ -1567,6 +1567,12 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init, tid1->ver_hdr.size = sizeof(struct cudbg_tid_info_region_rev1) - sizeof(struct cudbg_ver_hdr); + /* If firmware is not attached/alive, use backdoor register +* access to collect dump. +*/ + if (!is_fw_attached(pdbg_init)) + goto fill_tid; + #define FW_PARAM_PFVF_A(param) \ (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) | \ FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_##param) | \ @@ -1604,6 +1610,9 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init, tid->nhpftids = val[1] - val[0] + 1; } +#undef FW_PARAM_PFVF_A + +fill_tid: tid->ntids = padap->tids.ntids; tid->nstids = padap->tids.nstids; tid->stid_base = padap->tids.stid_base; @@ -1623,8 +1632,6 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init, tid->ip_users = t4_read_reg(padap, LE_DB_ACT_CNT_IPV4_A); tid->ipv6_users = t4_read_reg(padap, LE_DB_ACT_CNT_IPV6_A); -#undef FW_PARAM_PFVF_A - return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); } @@ -1866,11 +1873,18 @@ int cudbg_collect_dump_context(struct cudbg_init *pdbg_init, max_ctx_size = region_info[i].end - region_info[i].start + 1; max_ctx_qid = max_ctx_size / SGE_CTXT_SIZE; - t4_sge_ctxt_flush(padap, padap->mbox, i); - rc = t4_memory_rw(padap, MEMWIN_NIC, mem_type[i], - region_info[i].start, max_ctx_size, - (__be32 *)ctx_buf, 1); - if (rc) { + /* If firmware is not attached/alive, use backdoor register +* access to 
collect dump. +*/ + if (is_fw_attached(pdbg_init)) { + t4_sge_ctxt_flush(padap, padap->mbox, i); + + rc = t4_memory_rw(padap, MEMWIN_NIC, mem_type[i], + region_info[i].start, max_ctx_size, + (__be32 *)ctx_buf, 1); + } + + if (rc || !is_fw_attached(pdbg_init)) { max_ctx_qid = CUDBG_LOWMEM_MAX_CTXT_QIDS; cudbg_get_sge_ctxt_fw(pdbg_init, max_ctx_qid, i, &buff); @@ -1946,9 +1960,10 @@ static void cudbg_mps_rpl_backdoor(struct adapter *padap, mps_rplc->rplc31_0 = htonl(t4_read_reg(padap, MPS_VF_RPLCT_MAP0_A)); } -static int cudbg_collect_tcam_index(struct adapter *padap, +static int cudbg_collect_tcam_index(struct cudbg_init *pdbg_init, struct cudbg_mps_tcam *tcam, u32 idx) { + struct adapter *padap = pdbg_init->adap; u64 tcamy, tcamx, val; u32 ctl, data2; int rc = 0; @@ -2033,12 +2048,22 @@ static int cudbg_collect_tcam_index(struct adapter *padap, htons(FW_LDST_CMD_FID_V(FW_LDST_MPS_RPLC) | FW_LDST_CMD_IDX_V(idx)); - rc = t4_wr_mbox(padap, padap->mbox, &ldst_cmd, sizeof(ldst_cmd), - &ldst_cmd); - if (rc) + /* If firmware is not attached/alive, use backdoor register +* access to collect dump. +*/ + if (is_fw_attached(pdbg_init)) + rc = t4_wr_mbox(padap, padap->mbox, &ldst_cmd, + sizeof(ldst_cmd), &ldst_cmd); + + if (rc || !is_fw_attached(pdbg_init)) { cudbg_mps_rpl_backdoor(padap, &mps_rplc); - else + /* Ignore error since we collected directly from +* reading registers. +*/ + rc = 0; + } else { mps_rplc = ldst_cmd.u.mps.rplc; + } tcam->rplc[0] = ntohl(mps_rplc.rplc31_0); tcam->rplc[1] = ntohl(mps_rplc.rplc63_32); @@ -2075,7 +2100,7 @@ int cudbg_collect_mps_tcam(struct cudbg_init *pdbg_init, tcam = (struct cudbg_mps_tcam *)temp_buff.data; for (i = 0; i < n; i++) { - rc = cudbg_collect_tcam_index(padap, tcam, i); + rc = cudbg_collect_tc
[PATCH net-next 0/3] cxgb4: fix dump collection when firmware crashed
Patch 1 resets FW_OK flag, if firmware reports error. Patch 2 fixes incorrect condition for using firmware LDST commands. Patch 3 fixes dump collection logic to use backdoor register access to collect dumps when firmware is crashed. Thanks, Rahul Rahul Lakkireddy (3): cxgb4: reset FW_OK flag on firmware crash cxgb4: fix incorrect condition for using firmware LDST commands cxgb4: use backdoor access to collect dumps when firmware crashed drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 51 +++--- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 6 ++- 2 files changed, 42 insertions(+), 15 deletions(-) -- 2.14.1
[PATCH net-next 1/3] cxgb4: reset FW_OK flag on firmware crash
If firmware reports error, reset FW_OK flag. Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index af27d2b0f79f..be795d0b0b7e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -195,9 +195,11 @@ static void t4_report_fw_error(struct adapter *adap) u32 pcie_fw; pcie_fw = t4_read_reg(adap, PCIE_FW_A); - if (pcie_fw & PCIE_FW_ERR_F) + if (pcie_fw & PCIE_FW_ERR_F) { dev_err(adap->pdev_dev, "Firmware reports adapter error: %s\n", reason[PCIE_FW_EVAL_G(pcie_fw)]); + adap->flags &= ~FW_OK; + } } /* -- 2.14.1
Re: [PATCH net] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
On 01/25/2018 09:03 PM, Eric Dumazet wrote: > On Thu, 2018-01-25 at 20:43 +0300, Alexey Kodanev wrote: >> ccid2_hc_tx_rto_expire() timer callback always restarts the timer >> again and can run indefinitely (unless it is stopped outside), and >> after commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at >> dismantle time"), which moved sk_stop_timer() to sk_destruct(), >> this started to happen quite often. The timer prevents releasing >> the socket, as a result, sk_destruct() won't be called. >> >> Found with LTP/dccp_ipsec tests running on the bonding device, >> which later couldn't be unloaded after the tests were completed: >> >> unregister_netdevice: waiting for bond0 to become free. Usage count = 148 >> >> Fixes: 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time") >> Signed-off-by: Alexey Kodanev >> --- > > I understand your fix, but not why commit 120e9dabaf55 is bug origin. > > Looks like this always had been buggy : Timer logic should have checked > socket state from day 0. Hi Eric, Agree, I'll change to the initial commit id. I've added commit 120e9dabaf55 because ccid_hc_tx_delete() (and sk_stop_timer()) moved from dccp_destroy_sock() to sk_destruct(), and only after this change the chances that the timer won't stop increased significantly. > > I did not move sk_stop_timer() to sk_destruct() > ccid_hc_tx_delete() includes sk_stop_timer(): ccid_hc_tx_delete() ccid2_hc_tx_exit(struct sock *sk) sk_stop_timer(sk, &hc->tx_rtotimer); ccid3_hc_tx_exit(struct sock *sk) sk_stop_timer(sk, &hc->tx_no_feedback_timer); Thanks, Alexey
[PATCH] VSOCK: set POLLOUT | POLLWRNORM for TCP_CLOSING
select(2) with wfds but no rfds must return when the socket is shut down by the peer. This way userspace notices socket activity and gets -EPIPE from the next write(2). Currently select(2) does not return for virtio-vsock when a SEND+RCV shutdown packet is received. This is because vsock_poll() only sets POLLOUT | POLLWRNORM for TCP_CLOSE, not the TCP_CLOSING state that the socket is in when the shutdown is received. Signed-off-by: Stefan Hajnoczi --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 5d28abf87fbf..c9473d698525 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -951,7 +951,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, * POLLOUT|POLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ - if (sk->sk_state == TCP_CLOSE) { + if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= POLLOUT | POLLWRNORM; -- 2.14.3
[PATCH net v2] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
ccid2_hc_tx_rto_expire() timer callback always restarts the timer again and can run indefinitely (unless it is stopped outside), and after commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from dccp_destroy_sock() to sk_destruct(), this started to happen quite often. The timer prevents releasing the socket, as a result, sk_destruct() won't be called. Found with LTP/dccp_ipsec tests running on the bonding device, which later couldn't be unloaded after the tests were completed: unregister_netdevice: waiting for bond0 to become free. Usage count = 148 Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") Signed-off-by: Alexey Kodanev --- v2: * corrected bug origin commit id * clarified commit message about sk_stop_timer() net/dccp/ccids/ccid2.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 1c75cd1..92d016e 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -140,6 +140,9 @@ static void ccid2_hc_tx_rto_expire(struct timer_list *t) ccid2_pr_debug("RTO_EXPIRE\n"); + if (sk->sk_state == DCCP_CLOSED) + goto out; + /* back-off timer */ hc->tx_rto <<= 1; if (hc->tx_rto > DCCP_RTO_MAX) -- 1.7.1
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: > After checking all possible call chains to fs_send() here, > my tool finds that fs_send() is never called in atomic context. > And this function is assigned to a function pointer "dev->ops->send", > which is only called by vcc_sendmsg() (net/atm/common.c) > through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), > it indicates that fs_send() can call functions which may sleep. > Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. > > This is found by a static analysis tool named DCNS written by myself. The trouble is, places like net/atm/raw.c:65: vcc->send = atm_send_aal0; net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) ? DROP_PACKET : 1; bh_unlock_sock(sk_atm(vcc)); in pppoatm_send() definitely is called under a spinlock. Looking through the driver (in advanced bitrot, as usual for drivers/atm), I'd say that submit_queue() is fucked in head in the "queue full" case. And judging by the history, had been thus since the original merge...
Re: aio poll, io_pgetevents and a new in-kernel poll API V4
On Thu, Jan 25, 2018 at 03:10:25PM -0500, Benjamin LaHaise wrote: > I implemented something similar back in December, but did so without > changing the in-kernel poll API. See below for the patch that implements > it. Is changing the in-kernel poll API really desirable given how many > drivers that will touch? I had various previous versions that did not touch the driver API, but there are a couple issues with that: (1) you cannot make the API race free. With the existing convoluted poll_table_struct-based API you can't check for pending items before adding yourself to the waitqueue in a race free manner. (2) you cannot make the submit non-blocking without deferring to a workqueue or similar and thus incurring another context switch (3) you cannot deliver events from the wakeup callback, incurring another context switch, this time in the wakeup path that actually matters for some applications (4) the in-kernel poll API really is broken to start with and needs to be fixed anyway. I'd rather rely on that instead of working around decades old cruft that has no reason to exist.
Re: [PATCH net-next 03/12] ptr_ring: READ/WRITE_ONCE for __ptr_ring_empty
On Fri, Jan 26, 2018 at 11:19:58AM +0800, Jason Wang wrote: > > > On 2018年01月26日 10:44, Michael S. Tsirkin wrote: > > On Fri, Jan 26, 2018 at 10:37:58AM +0800, Jason Wang wrote: > > > > > > On 2018年01月26日 07:36, Michael S. Tsirkin wrote: > > > > Lockless __ptr_ring_empty requires that consumer head is read and > > > > written at once, atomically. Annotate accordingly to make sure compiler > > > > does it correctly. Switch locked callers to __ptr_ring_peek which does > > > > not support the lockless operation. > > > > > > > > Signed-off-by: Michael S. Tsirkin > > > > --- > > > >include/linux/ptr_ring.h | 11 --- > > > >1 file changed, 8 insertions(+), 3 deletions(-) > > > > > > > > diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h > > > > index 8594c7b..9a72d8f 100644 > > > > --- a/include/linux/ptr_ring.h > > > > +++ b/include/linux/ptr_ring.h > > > > @@ -196,7 +196,9 @@ static inline void *__ptr_ring_peek(struct ptr_ring > > > > *r) > > > > */ > > > >static inline bool __ptr_ring_empty(struct ptr_ring *r) > > > >{ > > > > - return !__ptr_ring_peek(r); > > > > + if (likely(r->size)) > > > > + return !r->queue[READ_ONCE(r->consumer_head)]; > > > > + return true; > > > >} > > > So after patch 8, __ptr_ring_peek() did: > > > > > > static inline void *__ptr_ring_peek(struct ptr_ring *r) > > > { > > > if (likely(r->size)) > > > return READ_ONCE(r->queue[r->consumer_head]); > > > return NULL; > > > } > > > > > > Looks like a duplication. > > > > > > Thanks > > Nope - they are different. > > > > The reason is that __ptr_ring_peek does not need to read the consumer_head > > once > > since callers have a lock, > > I get this. > > > and __ptr_ring_empty does not need to read > > the queue once since it merely compares it to 0. > > > > Do this still work if it was called inside a loop? > > Thanks Sure because compiler does not know head didn't change. -- MST
Re: [PATCH net-next 12/12] tools/virtio: fix smp_mb on x86
On Fri, Jan 26, 2018 at 11:56:14AM +0800, Jason Wang wrote: > > > On 2018年01月26日 07:36, Michael S. Tsirkin wrote: > > Offset 128 overlaps the last word of the redzone. > > Use 132 which is always beyond that. > > > > Signed-off-by: Michael S. Tsirkin > > --- > > tools/virtio/ringtest/main.h | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/tools/virtio/ringtest/main.h b/tools/virtio/ringtest/main.h > > index 593a328..301d59b 100644 > > --- a/tools/virtio/ringtest/main.h > > +++ b/tools/virtio/ringtest/main.h > > @@ -111,7 +111,7 @@ static inline void busy_wait(void) > > } > > #if defined(__x86_64__) || defined(__i386__) > > -#define smp_mb() asm volatile("lock; addl $0,-128(%%rsp)" ::: > > "memory", "cc") > > Just wonder did "rsp" work for __i386__ ? > > Thanks Oh you are right of course. Probably no one ever run this one on i386 :) I'll add a patch on top as this is not a new bug. > > +#define smp_mb() asm volatile("lock; addl $0,-132(%%rsp)" ::: > > "memory", "cc") > > #else > > /* > >* Not using __ATOMIC_SEQ_CST since gcc docs say they are only > > synchronized
Re: [Patch net-next v3 3/3] net_sched: implement ->change_tx_queue_len() for pfifo_fast
On Fri, Jan 26, 2018 at 11:57:59AM +0800, Jason Wang wrote: > > > On 2018年01月26日 10:26, Cong Wang wrote: > > pfifo_fast used to drop based on qdisc_dev(qdisc)->tx_queue_len, > > so we have to resize skb array when we change tx_queue_len. > > > > Other qdiscs which read tx_queue_len are fine because they > > all save it to sch->limit or somewhere else in qdisc during init. > > They don't have to implement this, it is nicer if they do so > > that users don't have to re-configure qdisc after changing > > tx_queue_len. > > > > Cc: John Fastabend > > Signed-off-by: Cong Wang > > --- > > net/sched/sch_generic.c | 18 ++ > > 1 file changed, 18 insertions(+) > > > > diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c > > index 08f9fa27e06e..190570f21b20 100644 > > --- a/net/sched/sch_generic.c > > +++ b/net/sched/sch_generic.c > > @@ -763,6 +763,23 @@ static void pfifo_fast_destroy(struct Qdisc *sch) > > } > > } > > +static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, > > + unsigned int new_len) > > +{ > > + struct pfifo_fast_priv *priv = qdisc_priv(sch); > > + struct skb_array *bands[PFIFO_FAST_BANDS]; > > + int prio; > > + > > + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { > > + struct skb_array *q = band2list(priv, prio); > > + > > + bands[prio] = q; > > + } > > + > > + return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, > > +GFP_KERNEL); > > +} > > + > > struct Qdisc_ops pfifo_fast_ops __read_mostly = { > > .id = "pfifo_fast", > > .priv_size = sizeof(struct pfifo_fast_priv), > > @@ -773,6 +790,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { > > .destroy= pfifo_fast_destroy, > > .reset = pfifo_fast_reset, > > .dump = pfifo_fast_dump, > > + .change_tx_queue_len = pfifo_fast_change_tx_queue_len, > > .owner = THIS_MODULE, > > .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, > > }; > > Is __skb_array_empty() in pfifo_fast_dequeue() still safe after this change? > > Thanks I think it isn't. 
If you want to use resize *and* use unlocked variants, you must lock all producers and consumers when resizing yourself. -- MST
Re: [PATCH net-next] ptr_ring: fix integer overflow
On Fri, Jan 26, 2018 at 11:44:22AM +0800, Jason Wang wrote: > > > On 2018年01月26日 01:31, Michael S. Tsirkin wrote: > > On Thu, Jan 25, 2018 at 10:17:38PM +0800, Jason Wang wrote: > > > > > > On 2018年01月25日 21:45, Michael S. Tsirkin wrote: > > > > On Thu, Jan 25, 2018 at 03:31:42PM +0800, Jason Wang wrote: > > > > > We try to allocate one more entry for lockless peeking. The adding > > > > > operation may overflow which causes zero to be passed to kmalloc(). > > > > > In this case, it returns ZERO_SIZE_PTR without any notice by ptr > > > > > ring. Try to do producing or consuming on such ring will lead NULL > > > > > dereference. Fix this detect and fail early. > > > > > > > > > > Fixes: bcecb4bbf88a ("net: ptr_ring: otherwise safe empty checks can > > > > > overrun array bounds") > > > > > Reported-by:syzbot+87678bcf753b44c39...@syzkaller.appspotmail.com > > > > > Cc: John Fastabend > > > > > Signed-off-by: Jason Wang > > > > Ugh that's just way too ugly. > > > > I'll work on dropping the extra + 1 - but calling this > > > > function with -1 size is the real source of the bug. > > > > Do you know how come we do that? > > > > > > > It looks e.g try to change tx_queue_len to UINT_MAX. And we probably can't > > > prevent user form trying to do this? > > > > > > Thanks > > Right. BTW why net-next? Isn't the crash exploitable in net? > > > > Commit bcecb4bbf88a exists only in net-next. Right you are. > And in net we check r->size > before trying to dereference the queue. > > Thanks I was wondering what it's about btw. Does anyone really create 0 size rings? -- MST
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
On 2018/1/26 20:05, Al Viro wrote: On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: After checking all possible call chains to fs_send() here, my tool finds that fs_send() is never called in atomic context. And this function is assigned to a function pointer "dev->ops->send", which is only called by vcc_sendmsg() (net/atm/common.c) through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), it indicates that fs_send() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. The trouble is, places like net/atm/raw.c:65: vcc->send = atm_send_aal0; net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) ? DROP_PACKET : 1; bh_unlock_sock(sk_atm(vcc)); in pppoatm_send() definitely is called under a spinlock. Looking through the driver (in advanced bitrot, as usual for drivers/atm), I'd say that submit_queue() is fucked in head in the "queue full" case. And judging by the history, had been thus since the original merge... Thanks for reply :) I am sorry for this false positive. I think other ATM related patches that I submitted are also false positives, sorry. My tool did not handle this situation of passing function pointer, and I will improve the tool... Thanks, Jia-Ju Bai
Re: [Patch net-next v3 3/3] net_sched: implement ->change_tx_queue_len() for pfifo_fast
On Thu, Jan 25, 2018 at 08:01:42PM -0800, Cong Wang wrote: > On Thu, Jan 25, 2018 at 7:57 PM, Jason Wang wrote: > > > > > > On 2018年01月26日 10:26, Cong Wang wrote: > >> > >> pfifo_fast used to drop based on qdisc_dev(qdisc)->tx_queue_len, > >> so we have to resize skb array when we change tx_queue_len. > >> > >> Other qdiscs which read tx_queue_len are fine because they > >> all save it to sch->limit or somewhere else in qdisc during init. > >> They don't have to implement this, it is nicer if they do so > >> that users don't have to re-configure qdisc after changing > >> tx_queue_len. > >> > >> Cc: John Fastabend > >> Signed-off-by: Cong Wang > >> --- > >> net/sched/sch_generic.c | 18 ++ > >> 1 file changed, 18 insertions(+) > >> > >> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c > >> index 08f9fa27e06e..190570f21b20 100644 > >> --- a/net/sched/sch_generic.c > >> +++ b/net/sched/sch_generic.c > >> @@ -763,6 +763,23 @@ static void pfifo_fast_destroy(struct Qdisc *sch) > >> } > >> } > >> +static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, > >> + unsigned int new_len) > >> +{ > >> + struct pfifo_fast_priv *priv = qdisc_priv(sch); > >> + struct skb_array *bands[PFIFO_FAST_BANDS]; > >> + int prio; > >> + > >> + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { > >> + struct skb_array *q = band2list(priv, prio); > >> + > >> + bands[prio] = q; > >> + } > >> + > >> + return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, > >> +GFP_KERNEL); > >> +} > >> + > >> struct Qdisc_ops pfifo_fast_ops __read_mostly = { > >> .id = "pfifo_fast", > >> .priv_size = sizeof(struct pfifo_fast_priv), > >> @@ -773,6 +790,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { > >> .destroy= pfifo_fast_destroy, > >> .reset = pfifo_fast_reset, > >> .dump = pfifo_fast_dump, > >> + .change_tx_queue_len = pfifo_fast_change_tx_queue_len, > >> .owner = THIS_MODULE, > >> .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, > >> }; > > > > > > Is __skb_array_empty() in 
pfifo_fast_dequeue() still safe after this change? > > Yes, we sync with dequeue path before calling ->change_tx_queue_len(). > I already mentioned this in patch 2/3. This part? + bool up = dev->flags & IFF_UP; + unsigned int i; + int ret = 0; + + if (up) + dev_deactivate(dev); + + for (i = 0; i < dev->num_tx_queues; i++) { + ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); + + /* TODO: revert changes on a partial failure */ + if (ret) + break; + } + + if (up) + dev_activate(dev); I wonder whether it really is safe to read dev->flags like that without any locks. -- MST
Re: [Patch net-next v3 2/3] net_sched: plug in qdisc ops change_tx_queue_len
On Thu, Jan 25, 2018 at 06:26:23PM -0800, Cong Wang wrote: > Introduce a new qdisc ops ->change_tx_queue_len() so that > each qdisc could decide how to implement this if it wants. > Previously we simply read dev->tx_queue_len, after pfifo_fast > switches to skb array, we need this API to resize the skb array > when we change dev->tx_queue_len. > > To avoid handling race conditions with TX BH, we need to > deactivate all TX queues before change the value and bring them > back after we are done, this also makes implementation easier. > > Cc: John Fastabend > Signed-off-by: Cong Wang > --- > include/net/sch_generic.h | 2 ++ > net/core/dev.c| 1 + > net/sched/sch_generic.c | 33 + > 3 files changed, 36 insertions(+) > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h > index eac43e8ca96d..e2ab13687fb9 100644 > --- a/include/net/sch_generic.h > +++ b/include/net/sch_generic.h > @@ -200,6 +200,7 @@ struct Qdisc_ops { > struct nlattr *arg, > struct netlink_ext_ack *extack); > void(*attach)(struct Qdisc *sch); > + int (*change_tx_queue_len)(struct Qdisc *, unsigned > int); > > int (*dump)(struct Qdisc *, struct sk_buff *); > int (*dump_stats)(struct Qdisc *, struct gnet_dump > *); > @@ -489,6 +490,7 @@ void qdisc_class_hash_remove(struct Qdisc_class_hash *, > void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); > void qdisc_class_hash_destroy(struct Qdisc_class_hash *); > > +int dev_qdisc_change_tx_queue_len(struct net_device *dev); > void dev_init_scheduler(struct net_device *dev); > void dev_shutdown(struct net_device *dev); > void dev_activate(struct net_device *dev); > diff --git a/net/core/dev.c b/net/core/dev.c > index e0b0c2784070..c8443cfaa17a 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -7070,6 +7070,7 @@ int dev_change_tx_queue_len(struct net_device *dev, > unsigned long new_len) > dev->tx_queue_len = orig_len; > return res; > } > + return dev_qdisc_change_tx_queue_len(dev); > } > > return 0; > diff --git 
a/net/sched/sch_generic.c b/net/sched/sch_generic.c > index 1816bde47256..08f9fa27e06e 100644 > --- a/net/sched/sch_generic.c > +++ b/net/sched/sch_generic.c > @@ -1178,6 +1178,39 @@ void dev_deactivate(struct net_device *dev) > } > EXPORT_SYMBOL(dev_deactivate); > > +static int qdisc_change_tx_queue_len(struct net_device *dev, > + struct netdev_queue *dev_queue) > +{ > + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; > + const struct Qdisc_ops *ops = qdisc->ops; > + > + if (ops->change_tx_queue_len) > + return ops->change_tx_queue_len(qdisc, dev->tx_queue_len); > + return 0; > +} > + > +int dev_qdisc_change_tx_queue_len(struct net_device *dev) > +{ > + bool up = dev->flags & IFF_UP; > + unsigned int i; > + int ret = 0; > + > + if (up) > + dev_deactivate(dev); This drops all packets in the queue. I don't think tweaking the queue length did this previously - did it? If not this change might surprise some people. > + > + for (i = 0; i < dev->num_tx_queues; i++) { > + ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); > + > + /* TODO: revert changes on a partial failure */ > + if (ret) > + break; > + } > + > + if (up) > + dev_activate(dev); > + return ret; > +} > + > static void dev_init_scheduler_queue(struct net_device *dev, >struct netdev_queue *dev_queue, >void *_qdisc) > -- > 2.13.0
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
On 2018/1/26 21:56, Jia-Ju Bai wrote: On 2018/1/26 20:05, Al Viro wrote: On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: After checking all possible call chains to fs_send() here, my tool finds that fs_send() is never called in atomic context. And this function is assigned to a function pointer "dev->ops->send", which is only called by vcc_sendmsg() (net/atm/common.c) through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), it indicates that fs_send() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. The trouble is, places like net/atm/raw.c:65: vcc->send = atm_send_aal0; net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) ? DROP_PACKET : 1; bh_unlock_sock(sk_atm(vcc)); in pppoatm_send() definitely is called under a spinlock. Looking through the driver (in advanced bitrot, as usual for drivers/atm), I'd say that submit_queue() is fucked in head in the "queue full" case. And judging by the history, had been thus since the original merge... Thanks for reply :) I am sorry for this false positive. I think other ATM related patches that I submitted are also false positives, sorry. My tool did not handle this situation of passing function pointer, and I will improve the tool... Thanks, Jia-Ju Bai I checked the code again, and confirm only my patches about "send" are false positives. I think my other patches that are about "open" do not have this problem: https://marc.info/?l=linux-kernel&m=151693791432626&w=2 https://marc.info/?l=linux-kernel&m=151695475503314&w=2 https://marc.info/?l=linux-kernel&m=151693150131512&w=2 I hope you can have a check :) Thanks, Jia-Ju Bai
Re: dvb usb issues since kernel 4.9
Hi Alan, Em Mon, 8 Jan 2018 14:15:35 -0500 (EST) Alan Stern escreveu: > On Mon, 8 Jan 2018, Linus Torvalds wrote: > > > Can somebody tell which softirq it is that dvb/usb cares about? > > I don't know about the DVB part. The USB part is a little difficult to > analyze, mostly because the bug reports I've seen are mostly from > people running non-vanilla kernels. I suspect that the main reason for people not using non-vanilla Kernels is that, among other bugs, the dwc2 upstream driver has serious troubles handling ISOCH traffic. Using Kernel 4.15-rc7 from this git tree: https://git.linuxtv.org/mchehab/experimental.git/log/?h=softirq_fixup (e. g. with the softirq bug partially reverted with Linux patch, and the DWC2 deferred probe fixed) With a PCTV 461e device, with uses em28xx driver + Montage frontend (with is the same used on dvbsky hardware - except for em28xx). This device doesn't support bulk for DVB, just ISOCH. The drivers work fine on x86. Using a test signal at the bit rate of 56698,4 Kbits/s, that's what happens, when capturing less than one second of data: $ dvbv5-zap -c ~/dvb_channel.conf "tv brasil" -l universal -X 100 -m -t2dvbv5-zap -c ~/dvb_channel.conf "tv brasil" -l universal -X 100 -m -t2 Using LNBf UNIVERSAL Universal, Europe Freqs : 10800 to 11800 MHz, LO: 9750 MHz Freqs : 11600 to 12700 MHz, LO: 10600 MHz using demux 'dvb0.demux0' reading channels from file '/home/mchehab/dvb_channel.conf' tuning to 11468000 Hz (0x00) Signal= -33.90dBm Lock (0x1f) Signal= -33.90dBm C/N= 30.28dB postBER= 2.33x10^-6 dvb_dev_set_bufsize: buffer set to 6160384 dvb_set_pesfilter to 0x2000 354.08s: Starting capture 354.73s: only read 59220 bytes 354.73s: Stopping capture [ 354.000827] dwc2 3f98.usb: DWC OTG HCD EP DISABLE: bEndpointAddress=0x84, ep->hcpriv=116f41b2 [ 354.000859] dwc2 3f98.usb: DWC OTG HCD EP RESET: bEndpointAddress=0x84 [ 354.010744] dwc2 3f98.usb: --Host Channel 5 Interrupt: Frame Overrun-- ... 
(hundreds of thousands of Frame Overrun messages) [ 354.660857] dwc2 3f98.usb: --Host Channel 5 Interrupt: Frame Overrun-- [ 354.660935] dwc2 3f98.usb: DWC OTG HCD URB Dequeue [ 354.660959] dwc2 3f98.usb: Called usb_hcd_giveback_urb() [ 354.660966] dwc2 3f98.usb: urb->status = 0 [ 354.660992] dwc2 3f98.usb: DWC OTG HCD URB Dequeue [ 354.661001] dwc2 3f98.usb: Called usb_hcd_giveback_urb() [ 354.661008] dwc2 3f98.usb: urb->status = 0 [ 354.661054] dwc2 3f98.usb: DWC OTG HCD URB Dequeue [ 354.661065] dwc2 3f98.usb: Called usb_hcd_giveback_urb() [ 354.661072] dwc2 3f98.usb: urb->status = 0 [ 354.661107] dwc2 3f98.usb: DWC OTG HCD URB Dequeue [ 354.661120] dwc2 3f98.usb: Called usb_hcd_giveback_urb() [ 354.661127] dwc2 3f98.usb: urb->status = 0 [ 354.661146] dwc2 3f98.usb: DWC OTG HCD URB Dequeue [ 354.661158] dwc2 3f98.usb: Called usb_hcd_giveback_urb() [ 354.661165] dwc2 3f98.usb: urb->status = 0 Kernel was compiled with: CONFIG_USB_DWC2=y CONFIG_USB_DWC2_HOST=y # CONFIG_USB_DWC2_PERIPHERAL is not set # CONFIG_USB_DWC2_DUAL_ROLE is not set # CONFIG_USB_DWC2_PCI is not set CONFIG_USB_DWC2_DEBUG=y # CONFIG_USB_DWC2_VERBOSE is not set # CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set CONFIG_USB_DWC2_DEBUG_PERIODIC=y As reference, that's the output of lsusb for the PCTV usb hardware: $ lsusb -v -d 2013:0258 Bus 001 Device 005: ID 2013:0258 PCTV Systems Couldn't open device, some information will be missing Device Descriptor: bLength18 bDescriptorType 1 bcdUSB 2.00 bDeviceClass0 (Defined at Interface level) bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize064 idVendor 0x2013 PCTV Systems idProduct 0x0258 bcdDevice1.00 iManufacturer 3 iProduct1 iSerial 2 bNumConfigurations 1 Configuration Descriptor: bLength 9 bDescriptorType 2 wTotalLength 41 bNumInterfaces 1 bConfigurationValue 1 iConfiguration 0 bmAttributes 0x80 (Bus Powered) MaxPower 500mA Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber0 bAlternateSetting 0 bNumEndpoints 1 bInterfaceClass 255 Vendor 
Specific Class bInterfaceSubClass 0 bInterfaceProtocol 0 iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x84 EP 4 IN bmAttributes1 Transfer TypeIsochronous Synch Type None Usage Type Data wMaxPacketSize 0x 1x 0 bytes bInterva
Re: [RFC PATCH net-next] sfc: efx_default_channel_want_txqs() can be static
On 26/01/18 01:03, kbuild test robot wrote: > Fixes: 2935e3c38228 ("sfc: on 8000 series use TX queues for TX timestamps") > Signed-off-by: Fengguang Wu Acked-by: Edward Cree Dave, can you take this directly or do you need it reposted without RFC tags? I'm not sure what the procedure is for robopatches. > --- > efx.c |2 +- > ptp.c |4 ++-- > 2 files changed, 3 insertions(+), 3 deletions(-) > > diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c > index 456866b0..16757cf 100644 > --- a/drivers/net/ethernet/sfc/efx.c > +++ b/drivers/net/ethernet/sfc/efx.c > @@ -896,7 +896,7 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue) > mod_timer(&rx_queue->slow_fill, jiffies + msecs_to_jiffies(100)); > } > > -bool efx_default_channel_want_txqs(struct efx_channel *channel) > +static bool efx_default_channel_want_txqs(struct efx_channel *channel) > { > return channel->channel - channel->efx->tx_channel_offset < > channel->efx->n_tx_channels; > diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c > index 433d29d..3e2c5b1 100644 > --- a/drivers/net/ethernet/sfc/ptp.c > +++ b/drivers/net/ethernet/sfc/ptp.c > @@ -366,7 +366,7 @@ bool efx_ptp_use_mac_tx_timestamps(struct efx_nic *efx) > /* PTP 'extra' channel is still a traffic channel, but we only create TX > queues > * if PTP uses MAC TX timestamps, not if PTP uses the MC directly to > transmit. > */ > -bool efx_ptp_want_txqs(struct efx_channel *channel) > +static bool efx_ptp_want_txqs(struct efx_channel *channel) > { > return efx_ptp_use_mac_tx_timestamps(channel->efx); > } > @@ -2146,7 +2146,7 @@ static int efx_phc_enable(struct ptp_clock_info *ptp, > return 0; > } > > -const struct efx_channel_type efx_ptp_channel_type = { > +static const struct efx_channel_type efx_ptp_channel_type = { > .handle_no_channel = efx_ptp_handle_no_channel, > .pre_probe = efx_ptp_probe_channel, > .post_remove= efx_ptp_remove_channel,
Re: [PATCH v2 net-next 0/3] net/ipv6: Add support for ONLINK flag
From: David Ahern Date: Thu, 25 Jan 2018 16:55:06 -0800 > Add support for RTNH_F_ONLINK with ipv6 routes. > > First patch moves existing gateway validation into helper. The onlink > flag requires a different set of checks and the existing validation > makes ip6_route_info_create long enough. > > Second patch makes the table id and lookup flag an option to > ip6_nh_lookup_table. onlink check needs to verify the gateway without > the RT6_LOOKUP_F_IFACE flag and PBR with VRF means the table id can > vary between the table the route is inserted and the VRF the egress > device is enslaved to. > > Third patch adds support for RTNH_F_ONLINK. > > I have a set of test cases in a format based on the framework Ido and > Jiri are working on. Once that goes in I will adapt the script and > submit. > > v2 > - removed table id check. Too constraining for PBR with VRF use cases Series applied, thanks David.
Re: pull request (net-next): ipsec-next 2018-01-26
From: Steffen Klassert Date: Fri, 26 Jan 2018 08:59:54 +0100 > One last patch for this development cycle: > > 1) Add ESN support for IPSec HW offload. >From Yossef Efraim. > > Please pull or let me know if there are problems. Pulled, thanks Steffen.
[PATCH net-next] bnxt_en: cleanup DIM work on device shutdown
From: Andy Gospodarek Make sure to cancel any pending work that might update driver coalesce settings when taking down an interface. Fixes: 6a8788f25625 ("bnxt_en: add support for software dynamic interrupt moderation") Signed-off-by: Andy Gospodarek Cc: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4b001d2..1500243 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6082,8 +6082,14 @@ static void bnxt_disable_napi(struct bnxt *bp) if (!bp->bnapi) return; - for (i = 0; i < bp->cp_nr_rings; i++) + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring; + + if (bp->bnapi[i]->rx_ring) + cancel_work_sync(&cpr->dim.work); + napi_disable(&bp->bnapi[i]->napi); + } } static void bnxt_enable_napi(struct bnxt *bp) -- 2.7.4
Re: [PATCH net-next 0/5] net/smc: fixes 2018-01-26
From: Ursula Braun Date: Fri, 26 Jan 2018 09:28:45 +0100 > here are some more smc patches. The first 4 patches take care about > different aspects of smc socket closing, the 5th patch improves > coding style. Series applied, thank you.
Re: [PATCH net-next] sfc: add suffix to large constant in ptp
From: Bert Kenward Date: Fri, 26 Jan 2018 08:51:47 + > Fixes: 1280c0f8aafc ("sfc: support second + quarter ns time format for > receive datapath") > Reported-by: kbuild test robot > Signed-off-by: Bert Kenward Applied.
Re: [bpf PATCH 1/3] net: add a UID to use for ULP socket assignment
On 01/25/18 04:27 PM, John Fastabend wrote: > I did not however retest TLS with the small change to ULP layer. > Mostly because I don't have a TLS setup. I plan/hope to get around > to writing either a sample or preferably a selftest for this case > as well (assuming I didn't miss one). > @Dave Watson can you take a quick look to verify the changes are > good on TLS ULP side. Looks reasonable, and passes my test suite. One comment below Tested-by: Dave Watson > Signed-off-by: John Fastabend > --- > include/net/tcp.h |6 ++ > net/ipv4/tcp_ulp.c | 53 > +++- > net/tls/tls_main.c |2 ++ > 3 files changed, 56 insertions(+), 5 deletions(-) > > diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c > index 6bb9e14..8ef437d 100644 > --- a/net/ipv4/tcp_ulp.c > +++ b/net/ipv4/tcp_ulp.c > @@ -133,3 +157,22 @@ int tcp_set_ulp(struct sock *sk, const char *name) > icsk->icsk_ulp_ops = ulp_ops; > return 0; > } > + > +int tcp_set_ulp_id(struct sock *sk, int ulp) > +{ > + struct inet_connection_sock *icsk = inet_csk(sk); > + const struct tcp_ulp_ops *ulp_ops; > + int err; > + > + if (icsk->icsk_ulp_ops) > + return -EEXIST; > + > + ulp_ops = __tcp_ulp_lookup(ulp); > + if (!ulp_ops) > + return -ENOENT; > + > + err = ulp_ops->init(sk); > + if (!err) > + icsk->icsk_ulp_ops = ulp_ops; Does this need module_put on error, similar to tcp_set_ulp? > + return err; > +}
Re: pull-request: can-next 2018-01-26,pull-request: can-next 2018-01-26
From: Marc Kleine-Budde Date: Fri, 26 Jan 2018 11:01:41 +0100 > this is a pull request for net-next/master consisting of 3 patches. > > The first two patches target the CAN documentation. The first is by me > and fixes pointer to location of fsl,mpc5200-mscan node in the mpc5200 > documentation. The second patch is by Robert Schwebel and it converts > the plain ASCII documentation to restructured text. > > The third patch is by Fabrizio Castro add the r8a774[35] support to the > rcar_can dt-bindings documentation. Pulled, thanks Marc.
Re: [PATCH v2] net: macb: Handle HRESP error
From: Harini Katakam Date: Fri, 26 Jan 2018 16:12:11 +0530 > From: Harini Katakam > > Handle HRESP error by doing a SW reset of RX and TX and > re-initializing the descriptors, RX and TX queue pointers. > > Signed-off-by: Harini Katakam > Signed-off-by: Michal Simek > --- > v2: > Rebased on top of latest net-next and reinitialized > all rx queues. Your patch was corrupted by your email client, it changed TAB characters into sequences of SPACES amongst other things. We cannot integrate your changes until you fix this. > This email and any attachments are intended for the sole use of the named > recipient(s) and contain(s) confidential information that may be proprietary, > privileged or copyrighted under applicable law. If you are not the intended > recipient, do not read, copy, or forward this email message or any > attachments. Delete this email message and any attachments immediately. This is also completely inappropriate for a public development list and you must eliminate this signature on future postings or we will have to ignore them. Thank you.
Re: [PATCH net v2] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
On Fri, 2018-01-26 at 15:14 +0300, Alexey Kodanev wrote: > ccid2_hc_tx_rto_expire() timer callback always restarts the timer > again and can run indefinitely (unless it is stopped outside), and after > commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), > which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from > dccp_destroy_sock() to sk_destruct(), this started to happen quite often. > The timer prevents releasing the socket, as a result, sk_destruct() won't > be called. > > Found with LTP/dccp_ipsec tests running on the bonding device, > which later couldn't be unloaded after the tests were completed: > > unregister_netdevice: waiting for bond0 to become free. Usage count = 148 > > Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") > Signed-off-by: Alexey Kodanev > --- Reviewed-by: Eric Dumazet Thanks Alexey.
Re: [PATCH net-next 5/5] net/smc: return booleans instead of integers
On Fri, 2018-01-26 at 09:28 +0100, Ursula Braun wrote: > From: Gustavo A. R. Silva > > Return statements in functions returning bool should use > true/false instead of 1/0. > > This issue was detected with the help of Coccinelle. > > Signed-off-by: Gustavo A. R. Silva > Signed-off-by: Ursula Braun > --- > net/smc/smc.h | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/net/smc/smc.h b/net/smc/smc.h > index bfbe20234105..9518986c97b1 100644 > --- a/net/smc/smc.h > +++ b/net/smc/smc.h > @@ -252,12 +252,12 @@ static inline int smc_uncompress_bufsize(u8 compressed) > static inline bool using_ipsec(struct smc_sock *smc) > { > return (smc->clcsock->sk->sk_policy[0] || > - smc->clcsock->sk->sk_policy[1]) ? 1 : 0; > + smc->clcsock->sk->sk_policy[1]) ? true : false; > } Hello Ursula, If you ever have to touch this code again, please follow the style of other kernel code and leave out the "? true : false" part and also the parentheses. Both are superfluous. Thanks, Bart.
Re: [PATCH net-next 0/2] net: hns3: add support ethtool_ops.{set|get}_coalesce for VF
From: Peng Li Date: Fri, 26 Jan 2018 19:31:23 +0800 > This patch-set adds ethtool_ops.{get|set}_coalesce to VF and > fix one related bug. > > HNS3 PF and VF driver use the common enet layer, as the > ethtool_ops.{get|set}_coalesce to PF have upstreamed, just > need add the ops to hns3vf_ethtool_ops. > > [Patch 1/2] fix a related bug for the VF ethtool_ops.{set| > get}_coalesce. Series applied, thank you.
Re: [PATCH net-next 0/3] cxgb4: fix dump collection when firmware crashed
From: Rahul Lakkireddy Date: Fri, 26 Jan 2018 17:05:53 +0530 > Patch 1 resets FW_OK flag, if firmware reports error. > > Patch 2 fixes incorrect condition for using firmware LDST commands. > > Patch 3 fixes dump collection logic to use backdoor register > access to collect dumps when firmware is crashed. Series applied, thank you.
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
From: Al Viro Date: Fri, 26 Jan 2018 12:05:22 + > On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: >> After checking all possible call chains to fs_send() here, >> my tool finds that fs_send() is never called in atomic context. >> And this function is assigned to a function pointer "dev->ops->send", >> which is only called by vcc_sendmsg() (net/atm/common.c) >> through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), >> it indicates that fs_send() can call functions which may sleep. >> Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. >> >> This is found by a static analysis tool named DCNS written by myself. > > The trouble is, places like > net/atm/raw.c:65: vcc->send = atm_send_aal0; > net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; > net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; > mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. > ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) > ? DROP_PACKET : 1; > bh_unlock_sock(sk_atm(vcc)); > in pppoatm_send() definitely is called under a spinlock. > > Looking through the driver (in advanced bitrot, as usual for drivers/atm), > I'd say that submit_queue() is fucked in head in the "queue full" case. > And judging by the history, had been thus since the original merge... Jia-Ju, I'm probably not going to apply any of your GFP_KERNEL conversions. Al's analysis above is similar to how things looked for other patches you submited of this nature. So because of the lack of care and serious auditing you put into these conversions, I really have no choice but to drop them from my queue because on the whole they are adding bugs rather than improving the code. Thank you.
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
From: Jia-Ju Bai Date: Fri, 26 Jan 2018 21:56:32 +0800 > My tool did not handle this situation of passing function pointer, and > I will improve the tool... Never automate these kinds of changes. Actually audit the change fully _yourself_ as a human after the tool indicates a possibility to you. This is why most of your changes have bugs, you rely on the tool too much.
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
From: Jia-Ju Bai Date: Fri, 26 Jan 2018 22:17:08 +0800 > > > On 2018/1/26 21:56, Jia-Ju Bai wrote: >> >> >> On 2018/1/26 20:05, Al Viro wrote: >>> On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: After checking all possible call chains to fs_send() here, my tool finds that fs_send() is never called in atomic context. And this function is assigned to a function pointer "dev->ops->send", which is only called by vcc_sendmsg() (net/atm/common.c) through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), it indicates that fs_send() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. >>> The trouble is, places like >>> net/atm/raw.c:65: vcc->send = atm_send_aal0; >>> net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; >>> net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; >>> mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. >>> ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) >>> ? DROP_PACKET : 1; >>> bh_unlock_sock(sk_atm(vcc)); >>> in pppoatm_send() definitely is called under a spinlock. >>> >>> Looking through the driver (in advanced bitrot, as usual for >>> drivers/atm), >>> I'd say that submit_queue() is fucked in head in the "queue full" >>> case. >>> And judging by the history, had been thus since the original merge... >> >> Thanks for reply :) >> >> I am sorry for this false positive. >> I think other ATM related patches that I submitted are also false >> positives, sorry. >> My tool did not handle this situation of passing function pointer, and >> I will improve the tool... >> >> >> Thanks, >> Jia-Ju Bai > > I check the code again, and confirm only my patches about "send" are > false positives. 
> I think my other patches that are about "open" do not have this > problem: > https://marc.info/?l=linux-kernel&m=151693791432626&w=2 > https://marc.info/?l=linux-kernel&m=151695475503314&w=2 > https://marc.info/?l=linux-kernel&m=151693150131512&w=2 > > I hope you can have a check :) No, _you_ have a check. All of these patches will be dropped, sorry.
Re: [RFC PATCH net-next] sfc: efx_default_channel_want_txqs() can be static
From: Edward Cree Date: Fri, 26 Jan 2018 15:13:05 + > On 26/01/18 01:03, kbuild test robot wrote: >> Fixes: 2935e3c38228 ("sfc: on 8000 series use TX queues for TX timestamps") >> Signed-off-by: Fengguang Wu > Acked-by: Edward Cree > > Dave, can you take this directly or do you need it reposted without RFC tags? > I'm not sure what the procedure is for robopatches. No I cannot. Don't you even notice that the subject line and commit message are totally inaccurate? They say that one function is being marked static. But the patch actually marks two different functions static, as well as a structure which is also completely not mentioned in the commit message nor subject line.
Re: [PATCH net v2] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
From: Alexey Kodanev Date: Fri, 26 Jan 2018 15:14:16 +0300 > ccid2_hc_tx_rto_expire() timer callback always restarts the timer > again and can run indefinitely (unless it is stopped outside), and after > commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), > which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from > dccp_destroy_sock() to sk_destruct(), this started to happen quite often. > The timer prevents releasing the socket, as a result, sk_destruct() won't > be called. > > Found with LTP/dccp_ipsec tests running on the bonding device, > which later couldn't be unloaded after the tests were completed: > > unregister_netdevice: waiting for bond0 to become free. Usage count = 148 > > Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") > Signed-off-by: Alexey Kodanev > --- > > v2: * corrected bug origin commit id > * clarified commit message about sk_stop_timer() Applied and queued up for -stable.
Re: [PATCH] VSOCK: set POLLOUT | POLLWRNORM for TCP_CLOSING
From: Stefan Hajnoczi Date: Fri, 26 Jan 2018 11:48:25 + > select(2) with wfds but no rfds must return when the socket is shut down > by the peer. This way userspace notices socket activity and gets -EPIPE > from the next write(2). > > Currently select(2) does not return for virtio-vsock when a SEND+RCV > shutdown packet is received. This is because vsock_poll() only sets > POLLOUT | POLLWRNORM for TCP_CLOSE, not the TCP_CLOSING state that the > socket is in when the shutdown is received. > > Signed-off-by: Stefan Hajnoczi Applied and queued up for -stable, thank you.
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
On 2018/1/27 0:07, David Miller wrote: From: Al Viro Date: Fri, 26 Jan 2018 12:05:22 + On Fri, Jan 26, 2018 at 04:00:27PM +0800, Jia-Ju Bai wrote: After checking all possible call chains to fs_send() here, my tool finds that fs_send() is never called in atomic context. And this function is assigned to a function pointer "dev->ops->send", which is only called by vcc_sendmsg() (net/atm/common.c) through vcc->dev->ops->send(), and vcc_sendmsg() calls schedule(), it indicates that fs_send() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. The trouble is, places like net/atm/raw.c:65: vcc->send = atm_send_aal0; net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) ? DROP_PACKET : 1; bh_unlock_sock(sk_atm(vcc)); in pppoatm_send() definitely is called under a spinlock. Looking through the driver (in advanced bitrot, as usual for drivers/atm), I'd say that submit_queue() is fucked in head in the "queue full" case. And judging by the history, had been thus since the original merge... Jia-Ju, I'm probably not going to apply any of your GFP_KERNEL conversions. Al's analysis above is similar to how things looked for other patches you submited of this nature. So because of the lack of care and serious auditing you put into these conversions, I really have no choice but to drop them from my queue because on the whole they are adding bugs rather than improving the code. Thank you. Okay, I can understand that careless GFP_KERNEL conversions will introduce bugs. Your concern is right. Sorry for my previous incorrect patches... I will manually carefully check and audit my patches before I submit them. Thanks, Jia-Ju Bai
Re: [bpf PATCH 1/3] net: add a UID to use for ULP socket assignment
On 01/26/2018 07:52 AM, Dave Watson wrote: > On 01/25/18 04:27 PM, John Fastabend wrote: >> I did not however retest TLS with the small change to ULP layer. >> Mostly because I don't have a TLS setup. I plan/hope to get around >> to writing either a sample or preferably a selftest for this case >> as well (assuming I didn't miss one). > >> @Dave Watson can you take a quick look to verify the changes are >> good on TLS ULP side. > > Looks reasonable, and passes my test suite. One comment below > > Tested-by: Dave Watson > >> Signed-off-by: John Fastabend >> --- >> include/net/tcp.h |6 ++ >> net/ipv4/tcp_ulp.c | 53 >> +++- >> net/tls/tls_main.c |2 ++ >> 3 files changed, 56 insertions(+), 5 deletions(-) >> >> diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c >> index 6bb9e14..8ef437d 100644 >> --- a/net/ipv4/tcp_ulp.c >> +++ b/net/ipv4/tcp_ulp.c >> @@ -133,3 +157,22 @@ int tcp_set_ulp(struct sock *sk, const char *name) >> icsk->icsk_ulp_ops = ulp_ops; >> return 0; >> } >> + >> +int tcp_set_ulp_id(struct sock *sk, int ulp) >> +{ >> +struct inet_connection_sock *icsk = inet_csk(sk); >> +const struct tcp_ulp_ops *ulp_ops; >> +int err; >> + >> +if (icsk->icsk_ulp_ops) >> +return -EEXIST; >> + >> +ulp_ops = __tcp_ulp_lookup(ulp); >> +if (!ulp_ops) >> +return -ENOENT; >> + >> +err = ulp_ops->init(sk); >> +if (!err) >> +icsk->icsk_ulp_ops = ulp_ops; > > Does this need module_put on error, similar to tcp_set_ulp? Not needed in current use because its only being used with an in-kernel ULP. However, best to fix it so that we don't have a (hard to detect) bug first time someone uses this with a module ULP. Thanks I'll spin a v2 this morning. > >> +return err; >> +}
pull-request: wireless-drivers 2018-01-26
Hi Dave, this is a pull request to the net tree for 4.15. I hate to do late pull requests like this but today Sven Joachim found a serious regression in one of ssb patches, I hope there's still enough time to get this to 4.15. But if it's too late, just let me know and I'll queue this for 4.16. Kalle The following changes since commit 1e19c4d689dc1e95bafd23ef68fbc0c6b9e05180: net: vrf: Add support for sends to local broadcast address (2018-01-25 21:51:03 -0500) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers.git tags/wireless-drivers-for-davem-2018-01-26 for you to fetch changes up to a9e6d44ddeccd3522670e641f1ed9b068e746ff7: ssb: Do not disable PCI host on non-Mips (2018-01-26 16:55:34 +0200) wireless-drivers fixes for 4.15 ssb * last minute patch to fix b43 and b44 drivers on non-mips platforms, a regression from v4.15-rc9 Sven Joachim (1): ssb: Do not disable PCI host on non-Mips drivers/ssb/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
[GIT] Networking
1) The per-network-namespace loopback device, and thus it's namespace, can have it's teardown deferred for a long time if a kernel created TCP socket closes and the namespace is exiting meanwhile. The kernel keeps trying to finish the close sequence until it times out (which takes quite some time). Fix this by forcing the socket closed in this situation, from Dan Streetman. 2) Fix regression where we're trying to invoke the update_pmtu method on route types (in this case metadata tunnel routes) that don't implement the dst_ops method. Fix from Nicolas Dichtel. 3) Fix long standing memory corruption issues in r8169 driver by performing the chip statistics DMA programming more correctly. From Francois Romieu. 4) Handle local broadcast sends over VRF routes properly, from David Ahern. 5) Don't refire the DCCP CCID2 timer endlessly, otherwise the socket can never be released. From Alexey Kodanev. 6) Set poll flags properly in VSOCK protocol layer, from Stefan Hajnoczi. Please pull, thanks a lot! The following changes since commit 5b7d27967dabfb17c21b0d98b29153b9e3ee71e5: Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2018-01-24 17:24:30 -0800) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git for you to fetch changes up to ba3169fc7548759be986b168d662e0ba64c2fd88: VSOCK: set POLLOUT | POLLWRNORM for TCP_CLOSING (2018-01-26 11:16:27 -0500) Alexey Kodanev (1): dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state Dan Streetman (1): net: tcp: close sock if net namespace is exiting David Ahern (1): net: vrf: Add support for sends to local broadcast address Francois Romieu (1): r8169: fix memory corruption on retrieval of hardware statistics. 
Nicolas Dichtel (1): net: don't call update_pmtu unconditionally Stefan Hajnoczi (1): VSOCK: set POLLOUT | POLLWRNORM for TCP_CLOSING drivers/infiniband/ulp/ipoib/ipoib_cm.c | 3 +-- drivers/net/ethernet/realtek/r8169.c| 9 ++--- drivers/net/geneve.c| 4 ++-- drivers/net/vrf.c | 5 +++-- drivers/net/vxlan.c | 6 ++ include/net/dst.h | 8 include/net/net_namespace.h | 10 ++ net/dccp/ccids/ccid2.c | 3 +++ net/ipv4/ip_tunnel.c| 3 +-- net/ipv4/ip_vti.c | 2 +- net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_timer.c| 15 +++ net/ipv6/ip6_tunnel.c | 6 ++ net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 4 ++-- net/vmw_vsock/af_vsock.c| 2 +- 16 files changed, 57 insertions(+), 28 deletions(-)
Re: pull-request: wireless-drivers 2018-01-26
From: Kalle Valo Date: Fri, 26 Jan 2018 18:33:33 +0200 > this is a pull request to the net tree for 4.15. I hate to do late pull > requests like this but today Sven Joachim found a serious regression in > one of ssb patches, I hope there's still enough time to get this to > 4.15. > > But if it's too late, just let me know and I'll queue this for 4.16. I think this is going to have to wait for 4.16, I just sent what I hope is my last pull to Linus for this cycle. Thanks.
Re: pull-request: wireless-drivers 2018-01-26
David Miller writes: > From: Kalle Valo > Date: Fri, 26 Jan 2018 18:33:33 +0200 > >> this is a pull request to the net tree for 4.15. I hate to do late pull >> requests like this but today Sven Joachim found a serious regression in >> one of ssb patches, I hope there's still enough time to get this to >> 4.15. >> >> But if it's too late, just let me know and I'll queue this for 4.16. > > I think this is going to have to wait for 4.16, I just sent what I hope > is my last pull to Linus for this cycle. Yeah, I guessed that when I noticed you sent your pull request just 3 minutes after mine :) I'll send this to you next week in another pull request, please drop this one. -- Kalle Valo
[PATCH net-next,v2 0/2] net: sched: introduce em_ipt ematch
From: Eyal Birger The following patchset introduces a new tc ematch for matching using netfilter matches. This allows early classification as well as mirroring/redirecting traffic based on logic implemented in netfilter extensions. Example use case is classification based on the incoming IPSec policy used during decapsulation using the 'policy' iptables extension (xt_policy). This patchset is an enhancement of a former series ([1]) which allowed only policy matching following a suggestion by Pablo Neira Ayuso ([2]). [1] https://patchwork.ozlabs.org/cover/859887/ [2] https://patchwork.ozlabs.org/patch/859888/ v2: Remove skb push/pull and limit functionality to ingress Eyal Birger (2): net: sched: ematch: pass protocol to ematch 'change()' handlers net: sched: add em_ipt ematch for calling xtables matches include/net/pkt_cls.h| 2 +- include/uapi/linux/pkt_cls.h | 3 +- include/uapi/linux/tc_ematch/tc_em_ipt.h | 19 +++ net/sched/Kconfig| 10 ++ net/sched/Makefile | 1 + net/sched/em_canid.c | 4 +- net/sched/em_ipset.c | 4 +- net/sched/em_ipt.c | 244 +++ net/sched/em_meta.c | 2 +- net/sched/em_nbyte.c | 4 +- net/sched/em_text.c | 2 +- net/sched/ematch.c | 3 +- 12 files changed, 287 insertions(+), 11 deletions(-) create mode 100644 include/uapi/linux/tc_ematch/tc_em_ipt.h create mode 100644 net/sched/em_ipt.c -- 2.7.4
[PATCH net-next,v2 2/2] net: sched: add em_ipt ematch for calling xtables matches
From: Eyal Birger This module allows performing tc classification based on data structures and implementations provided by netfilter extensions. Example use case is classification based on the incoming IPSec policy used during decpsulation using the 'policy' iptables extension (xt_policy). Only tc ingress is supported in order to avoid modifying the skb at egress to suit xtable matches skb->data expectations. Signed-off-by: Eyal Birger Acked-by: Pablo Neira Ayuso --- This ematch tries its best to provide matches with a similar environment to the one they are used to; Due to the wide range of criteria supported by matches, I am not able to test every combination of match and traffic. --- include/uapi/linux/pkt_cls.h | 3 +- include/uapi/linux/tc_ematch/tc_em_ipt.h | 19 +++ net/sched/Kconfig| 10 ++ net/sched/Makefile | 1 + net/sched/em_ipt.c | 244 +++ 5 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/tc_ematch/tc_em_ipt.h create mode 100644 net/sched/em_ipt.c diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 46c5066..7cafb26d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -555,7 +555,8 @@ enum { #defineTCF_EM_VLAN 6 #defineTCF_EM_CANID7 #defineTCF_EM_IPSET8 -#defineTCF_EM_MAX 8 +#defineTCF_EM_IPT 9 +#defineTCF_EM_MAX 9 enum { TCF_EM_PROG_TC diff --git a/include/uapi/linux/tc_ematch/tc_em_ipt.h b/include/uapi/linux/tc_ematch/tc_em_ipt.h new file mode 100644 index 000..aecd252 --- /dev/null +++ b/include/uapi/linux/tc_ematch/tc_em_ipt.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_TC_EM_IPT_H +#define __LINUX_TC_EM_IPT_H + +#include +#include + +enum { + TCA_EM_IPT_UNSPEC, + TCA_EM_IPT_HOOK, + TCA_EM_IPT_MATCH_NAME, + TCA_EM_IPT_MATCH_REVISION, + TCA_EM_IPT_MATCH_DATA, + __TCA_EM_IPT_MAX +}; + +#define TCA_EM_IPT_MAX (__TCA_EM_IPT_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 
c03d86a..203e7f7 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -658,6 +658,16 @@ config NET_EMATCH_IPSET To compile this code as a module, choose M here: the module will be called em_ipset. +config NET_EMATCH_IPT + tristate "IPtables Matches" + depends on NET_EMATCH && NETFILTER && NETFILTER_XTABLES + ---help--- + Say Y here to be able to classify packets based on iptables + matches. + + To compile this code as a module, choose M here: the + module will be called em_ipt. + config NET_CLS_ACT bool "Actions" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 5b63544..8811d38 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -75,3 +75,4 @@ obj-$(CONFIG_NET_EMATCH_META) += em_meta.o obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o +obj-$(CONFIG_NET_EMATCH_IPT) += em_ipt.o diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c new file mode 100644 index 000..2103b30 --- /dev/null +++ b/net/sched/em_ipt.c @@ -0,0 +1,244 @@ +/* + * net/sched/em_ipt.c IPtables matches Ematch + * + * (c) 2018 Eyal Birger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct em_ipt_match { + const struct xt_match *match; + u32 hook; + u8 nfproto; + u8 match_data[0] __aligned(8); +}; + +static int check_match(struct net *net, struct em_ipt_match *im, int mdata_len) +{ + struct xt_mtchk_param mtpar = {}; + union { + struct ipt_entry e4; + struct ip6t_entry e6; + } e = {}; + + mtpar.net = net; + mtpar.table = "filter"; + mtpar.hook_mask = 1 << im->hook; + mtpar.family= im->nfproto; + mtpar.match = im->match; + mtpar.entryinfo = &e; + mtpar.matchinfo = (void *)im->match_data; + return xt_check_match(&mtpar, mdata_len, 0, 0); +} + +static const struct nla_policy em_ipt_policy[TCA_EM_IPT_MAX + 1] = { + [TCA_EM_IPT_HOOK] = { .type = NLA_U32 }, + [TCA_EM_IPT_MATCH_NAME] = { .type = NLA_STRING, + .len = XT_EXTENSION_MAXNAMELEN }, + [TCA_EM_IPT_MATCH_REVISION] = { .type = NLA_U8 }
[PATCH net-next,v2 1/2] net: sched: ematch: pass protocol to ematch 'change()' handlers
From: Eyal Birger In order to allow ematches to create their internal state based on the L3 protocol specified when creating the filter. Signed-off-by: Eyal Birger --- include/net/pkt_cls.h | 2 +- net/sched/em_canid.c | 4 ++-- net/sched/em_ipset.c | 4 ++-- net/sched/em_meta.c | 2 +- net/sched/em_nbyte.c | 4 ++-- net/sched/em_text.c | 2 +- net/sched/ematch.c| 3 ++- 7 files changed, 11 insertions(+), 10 deletions(-) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 8740625..ff5fcb1 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -474,7 +474,7 @@ struct tcf_ematch_tree { struct tcf_ematch_ops { int kind; int datalen; - int (*change)(struct net *net, void *, + int (*change)(struct net *net, __be16, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index ddd883c..445c10d 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -120,8 +120,8 @@ static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, return match; } -static int em_canid_change(struct net *net, void *data, int len, - struct tcf_ematch *m) +static int em_canid_change(struct net *net, __be16 protocol, void *data, + int len, struct tcf_ematch *m) { struct can_filter *conf = data; /* Array with rules */ struct canid_match *cm; diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index c1b23e3..50f7282 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -19,8 +19,8 @@ #include #include -static int em_ipset_change(struct net *net, void *data, int data_len, - struct tcf_ematch *em) +static int em_ipset_change(struct net *net, __be16 protocol, void *data, + int data_len, struct tcf_ematch *em) { struct xt_set_info *set = data; ip_set_id_t index; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index d6e9711..6892efc 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -904,7 +904,7 @@ static const struct nla_policy 
meta_policy[TCA_EM_META_MAX + 1] = { [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, }; -static int em_meta_change(struct net *net, void *data, int len, +static int em_meta_change(struct net *net, __be16 protocol, void *data, int len, struct tcf_ematch *m) { int err; diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 07c10ba..62a7a2e 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -23,8 +23,8 @@ struct nbyte_data { charpattern[0]; }; -static int em_nbyte_change(struct net *net, void *data, int data_len, - struct tcf_ematch *em) +static int em_nbyte_change(struct net *net, __be16 protocol, void *data, + int data_len, struct tcf_ematch *em) { struct tcf_em_nbyte *nbyte = data; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 73e2ed5..b5d9e21 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -44,7 +44,7 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, return skb_find_text(skb, from, to, tm->config) != UINT_MAX; } -static int em_text_change(struct net *net, void *data, int len, +static int em_text_change(struct net *net, __be16 protocol, void *data, int len, struct tcf_ematch *m) { struct text_match *tm; diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 1331a4c..a69abd8 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -242,7 +242,8 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; if (em->ops->change) { - err = em->ops->change(net, data, data_len, em); + err = em->ops->change(net, tp->protocol, data, data_len, + em); if (err < 0) goto errout; } else if (data_len > 0) { -- 2.7.4
Re: [PATCH 1/3] Partial revert "e1000e: Avoid receiver overrun interrupt bursts"
On Fri, Jan 26, 2018 at 1:12 AM, Benjamin Poirier wrote: > This reverts commit 4aea7a5c5e940c1723add439f4088844cd26196d. > > We keep the fix for the first part of the problem (1) described in the log > of that commit however we use the implementation of e1000_msix_other() from > before commit 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt"). > We remove the fix for the second part of the problem (2). > > Bursts of "Other" interrupts may once again occur during rxo (receive > overflow) traffic conditions. This is deemed acceptable in the interest of > reverting driver code back to its previous state. The e1000e driver should > be in "maintenance" mode and we want to avoid unforeseen fallout from > changes that are not strictly necessary. > > Link: https://www.spinics.net/lists/netdev/msg480675.html > Signed-off-by: Benjamin Poirier Thanks for doing this. Only a few minor tweaks I would recommend. Otherwise it looks good. > --- > drivers/net/ethernet/intel/e1000e/defines.h | 1 - > drivers/net/ethernet/intel/e1000e/netdev.c | 32 > +++-- > 2 files changed, 12 insertions(+), 21 deletions(-) > > diff --git a/drivers/net/ethernet/intel/e1000e/defines.h > b/drivers/net/ethernet/intel/e1000e/defines.h > index afb7ebe20b24..0641c0098738 100644 > --- a/drivers/net/ethernet/intel/e1000e/defines.h > +++ b/drivers/net/ethernet/intel/e1000e/defines.h > @@ -398,7 +398,6 @@ > #define E1000_ICR_LSC 0x0004 /* Link Status Change */ > #define E1000_ICR_RXSEQ 0x0008 /* Rx sequence error */ > #define E1000_ICR_RXDMT00x0010 /* Rx desc min. 
threshold (0) */ > -#define E1000_ICR_RXO 0x0040 /* Receiver Overrun */ > #define E1000_ICR_RXT0 0x0080 /* Rx timer intr (ring 0) */ > #define E1000_ICR_ECCER 0x0040 /* Uncorrectable ECC Error */ > /* If this bit asserted, the driver should claim the interrupt */ > diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c > b/drivers/net/ethernet/intel/e1000e/netdev.c > index 9f18d39bdc8f..398e940436f8 100644 > --- a/drivers/net/ethernet/intel/e1000e/netdev.c > +++ b/drivers/net/ethernet/intel/e1000e/netdev.c > @@ -1914,30 +1914,23 @@ static irqreturn_t e1000_msix_other(int > __always_unused irq, void *data) > struct net_device *netdev = data; > struct e1000_adapter *adapter = netdev_priv(netdev); > struct e1000_hw *hw = &adapter->hw; > - u32 icr; > - bool enable = true; > - > - icr = er32(ICR); > - if (icr & E1000_ICR_RXO) { > - ew32(ICR, E1000_ICR_RXO); > - enable = false; > - /* napi poll will re-enable Other, make sure it runs */ > - if (napi_schedule_prep(&adapter->napi)) { > - adapter->total_rx_bytes = 0; > - adapter->total_rx_packets = 0; > - __napi_schedule(&adapter->napi); > - } > - } > - if (icr & E1000_ICR_LSC) { > - ew32(ICR, E1000_ICR_LSC); > + u32 icr = er32(ICR); > + > + if (icr & adapter->eiac_mask) > + ew32(ICS, (icr & adapter->eiac_mask)); I'm not sure about this bit as it includes queue interrupts if I am not mistaken. What we should be focusing on are the bits that are not auto-cleared so this should probably be icr & ~adapter->eiac_mask. Actually what you might do is something like: icr &= ~adapter->eiac_mask; if (icr) ew32(ICS, icr); Also a comment explaining that we have to clear the bits that are not automatically cleared by other interrupt causes might be useful. > + if (icr & E1000_ICR_OTHER) { > + if (!(icr & E1000_ICR_LSC)) > + goto no_link_interrupt; I wouldn't bother with the jump tag. Let's just make this one if statement checking both OTHER && LSC. 
> hw->mac.get_link_status = true; > /* guard against interrupt when we're going down */ > if (!test_bit(__E1000_DOWN, &adapter->state)) > mod_timer(&adapter->watchdog_timer, jiffies + 1); > } > > - if (enable && !test_bit(__E1000_DOWN, &adapter->state)) > - ew32(IMS, E1000_IMS_OTHER); > +no_link_interrupt: If you just combine the if statement logic the code naturally flows to this point anyway without the jump label being needed. > + if (!test_bit(__E1000_DOWN, &adapter->state)) > + ew32(IMS, E1000_IMS_LSC | E1000_IMS_OTHER); > > return IRQ_HANDLED; > } > @@ -2707,8 +2700,7 @@ static int e1000e_poll(struct napi_struct *napi, int > weight) > napi_complete_done(napi, work_done); > if (!test_bit(__E1000_DOWN, &adapter->state)) { > if (adapter->msix_entries) > - ew32(IMS, adapter->rx_ring->ims_val | > -E1000_IMS_OTHER); > +
Re: [PATCH bpf-next 1/2] bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map
On 1/25/18 8:47 PM, Eric Dumazet wrote: On Thu, 2018-01-18 at 15:08 -0800, Yonghong Song wrote: +find_leftmost: + /* Find the leftmost non-intermediate node, all intermediate nodes +* have exact two children, so this function will never return NULL. +*/ syzbot [1] disagrees violently with this comment. + for (node = rcu_dereference(*root); node;) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + next_node = node; + node = rcu_dereference(node->child[0]); + } +do_copy: + next_key->prefixlen = next_node->prefixlen; + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + next_node->data, trie->data_size); Thanks for reporting the issue. I looked at the update and delete code in lpm_trie.c, it looks the comment that each intermediate node has two children still holds. But I do find a problem: 616 /* Empty trie */ 617 if (!rcu_dereference(trie->root)) 618 return -ENOENT; 619 620 /* For invalid key, find the leftmost node in the trie */ 621 if (!key || key->prefixlen > trie->max_prefixlen) { 622 root = &trie->root; 623 goto find_leftmost; 624 } .. 672 find_leftmost: 673 /* Find the leftmost non-intermediate node, all intermediate nodes 674 * have exact two children, so this function will never return NULL. 675 */ 676 for (node = rcu_dereference(*root); node;) { 677 if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) 678 next_node = node; 679 node = rcu_dereference(node->child[0]); 680 } It is possible that at line 617, trie->root is not NULL, but later at line 676 is NULL. This will lead to next_node is NULL. Will write a test to demonstrate this and examine intermediate node property and propose a fix soon. 
[1] syzbot hit the following crash on e9dcd80b9d77a92bfae6ce42a451f5c5fd318832 git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master config: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller-2Dbuganizer.googleplex.com_text-3Ftag-3DConfig-26id-3Db2216f04db2aa337e2bbf5ebd233919c3e2aa05f&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=aHeuptD-ytpgZCGZK_koCI1pUdJBACYveZ3RmN35g90&e= compiler: gcc (GCC) 7.1.1 20170620 Unfortunately, I don't have any reproducer for this bug yet. raw crash log: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller-2Dbuganizer.googleplex.com_text-3Ftag-3DCrashLog-26id-3D4f78be02e2cd37040b8796322e02b147caae6024&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=RS0jaMd3n5ipeDtuFCL8B0152mZoHBx0b8xNq9AjDes&e= dashboard link: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_bug-3Fextid-3D148b56534d9269ab7433&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=6L_1a0P1zZJmhAN49_cgfs8_LjnBW7h4L-SXEFF4orc&e= See https://urldefense.proofpoint.com/v2/url?u=http-3A__go_syzbot&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=iajwDJDDZzMqC0CxPLANoybk0lVIpuh2r7pz0rcD8kI&e= for details on how to handle this bug. 
kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 8033 Comm: syz-executor3 Not tainted 4.15.0-rc8+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682 RSP: 0018:8801aa44f628 EFLAGS: 00010202 RAX: dc00 RBX: RCX: 81829a9d RDX: 0004 RSI: c90003b7b000 RDI: 0020 RBP: 8801aa44f8b0 R08: 817e8f95 R09: 0002 R10: 8801aa44f790 R11: R12: R13: 110035489f01 R14: fff4 R15: FS: 7fbb3b39b700() GS:8801db30() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 2057a000 CR3: 0001c26e4005 CR4: 001606e0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 Call Trace: map_get_next_key kernel/bpf/syscall.c:842 [inline] SYSC_bpf kernel/bpf/syscall.c:1881 [inline] SyS_bpf+0x11b4/0x4860 kernel/bpf/syscall.c:1846 entry_SYSCALL_64_fastpath+0x29/0xa0 RIP: 0033:0x452f19 RSP: 002b:7fbb3b39ac58 EFLAGS: 0212 ORIG_RAX: 0141 RAX: ffda RBX: 0071bea0 RCX: 00452f19 RDX: 0018
Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
On 1/26/2018 12:14 AM, Siwei Liu wrote: On Tue, Jan 23, 2018 at 2:58 PM, Michael S. Tsirkin wrote: On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote: On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin wrote: On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote: First off, as mentioned in another thread, the model of stacking up virt-bond functionality over virtio seems a wrong direction to me. Essentially the migration process would need to carry over all guest side configurations previously done on the VF/PT and get them moved to the new device being it virtio or VF/PT. I might be wrong but I don't see why we should worry about this usecase. Whoever has a bond configured already has working config for migration. We are trying to help people who don't, not convert existig users. That has been placed in the view of cloud providers that the imported images from the store must be able to run unmodified thus no additional setup script is allowed (just as Stephen mentioned in another mail). Cloud users don't care about live migration themselves but the providers are required to implement such automation mechanism to make this process transparent if at all possible. The user does not care about the device underneath being VF or not, but they do care about consistency all across and the resulting performance acceleration in making VF the prefered datapath. It is not quite peculiar user cases but IMHO *any* approach proposed for live migration should be able to persist the state including network config e.g. as simple as MTU. Actually this requirement has nothing to do with virtio but our target users are live migration agnostic, being it tracking DMA through dirty pages, using virtio as the helper, or whatsoever, the goal of persisting configs across remains same. So the patching being discussed here will mostly do exactly that if your original config was simply a single virtio net device. 
True, but I don't see the patch being discussed starts with good foundation of supporting the same for VF/PT device. That is the core of the issue. What kind of configs do your users have right now? Any configs be it generic or driver specific that the VF/PT device supports and have been enabled/configured. General network configs (MAC, IP address, VLAN, MTU, iptables rules), ethtool settings (hardware offload, # of queues and ring entries, RSC options, rss rxfh-indir table, rx-flow-hash, et al) , bpf/XDP program being run, tc flower offload, just to name a few. As cloud providers we don't limit users from applying driver specific tuning to the NIC/VF, and sometimes this is essential to achieving best performance for their workload. We've seen cases like tuning coalescing parameters for getting low latency, changing rx-flow-hash function for better VXLAN throughput, or even adopting quite advanced NIC features such as flow director or cloud filter. We don't expect users to compromise even a little bit on these. That is once we turn on live migration for the VF or pass through devices in the VM, it all takes place under the hood, users (guest admins, applications) don't have to react upon it or even notice the change. I should note that the majority of live migrations take place between machines with completely identical hardware, it's more critical than necessary to keep the config as-is across the move, stealth while quiet. This usecase is much more complicated and different than what this patch is trying to address. Also your usecase seems to be assuming that source and destination hosts are identical and have the same HW. It makes it pretty hard to transparently migrate all these settings with live migration when we are looking at a solution that unplugs the VF interface from the host and the VF driver is unloaded. As you see generic bond or bridge cannot suffice the need. 
That's why we need a new customized virt bond driver, and tailor it for VM live migration specifically. Leveraging para-virtual e.g. virtio net device as the backup path is one thing, tracking through driver config changes in order to re-config as necessary is another. I would think without considering the latter, the proposal being discussed is rather incomplete, and remote to be useful in production. Without the help of a new upper layer bond driver that enslaves virtio and VF/PT devices underneath, virtio will be overloaded with too much specifics being a VF/PT backup in the future. So this paragraph already includes at least two conflicting proposals. On the one hand you want a separate device for the virtual bond, on the other you are saying a separate driver. Just to be crystal clear: separate virtual bond device (netdev ops, not necessarily bus device) for VM migration specifically with a separate driver. Okay, but note that any config someone had on a virtio device won't propagate to that bond. Further, the reason to have a separate *driver* was that some people wanted to share code with netvsc -
[net] i40e/i40evf: Update DESC_NEEDED value to reflect larger value
From: Alexander Duyck When compared to ixgbe and other previous Intel drivers the i40e and i40evf drivers actually reserve 2 additional descriptors in maybe_stop_tx for cache line alignment. We need to update DESC_NEEDED to reflect this as otherwise we are more likely to return TX_BUSY which will cause issues with things like xmit_more. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 2 +- drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index fbae1182e2ea..675923f47281 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -277,7 +277,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size) } /* Tx Descriptors needed, worst case */ -#define DESC_NEEDED (MAX_SKB_FRAGS + 4) +#define DESC_NEEDED (MAX_SKB_FRAGS + 6) #define I40E_MIN_DESC_PENDING 4 #define I40E_TX_FLAGS_HW_VLAN BIT(1) diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h index 8d26c85d12e1..1f2959e66c34 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h @@ -260,7 +260,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size) } /* Tx Descriptors needed, worst case */ -#define DESC_NEEDED (MAX_SKB_FRAGS + 4) +#define DESC_NEEDED (MAX_SKB_FRAGS + 6) #define I40E_MIN_DESC_PENDING 4 #define I40E_TX_FLAGS_HW_VLAN BIT(1) -- 2.14.3
Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
On Thu, Jan 11, 2018 at 09:58:39PM -0800, Sridhar Samudrala wrote: > @@ -2859,6 +3123,42 @@ static struct virtio_driver virtio_net_driver = { > #endif > }; > > +static int virtio_netdev_event(struct notifier_block *this, > +unsigned long event, void *ptr) > +{ > + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); > + > + /* Skip our own events */ > + if (event_dev->netdev_ops == &virtnet_netdev) > + return NOTIFY_DONE; > + > + /* Avoid non-Ethernet type devices */ > + if (event_dev->type != ARPHRD_ETHER) > + return NOTIFY_DONE; > + > + /* Avoid Vlan dev with same MAC registering as VF */ > + if (is_vlan_dev(event_dev)) > + return NOTIFY_DONE; > + > + /* Avoid Bonding master dev with same MAC registering as VF */ > + if ((event_dev->priv_flags & IFF_BONDING) && > + (event_dev->flags & IFF_MASTER)) > + return NOTIFY_DONE; > + > + switch (event) { > + case NETDEV_REGISTER: > + return virtnet_register_vf(event_dev); > + case NETDEV_UNREGISTER: > + return virtnet_unregister_vf(event_dev); > + default: > + return NOTIFY_DONE; > + } > +} > + > +static struct notifier_block virtio_netdev_notifier = { > + .notifier_call = virtio_netdev_event, > +}; > + > static __init int virtio_net_driver_init(void) > { > int ret; > @@ -2877,6 +3177,8 @@ static __init int virtio_net_driver_init(void) > ret = register_virtio_driver(&virtio_net_driver); > if (ret) > goto err_virtio; > + > + register_netdevice_notifier(&virtio_netdev_notifier); > return 0; > err_virtio: > cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); > @@ -2889,6 +3191,7 @@ module_init(virtio_net_driver_init); > > static __exit void virtio_net_driver_exit(void) > { > + unregister_netdevice_notifier(&virtio_netdev_notifier); > unregister_virtio_driver(&virtio_net_driver); > cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); > cpuhp_remove_multi_state(virtionet_online); I have a question here: what if PT device driver module loads and creates a device before virtio? > -- > 2.14.3
[PATCH net-next] sfc: mark some unexported symbols as static
From: kbuild test robot efx_default_channel_want_txqs() is only used in efx.c, while efx_ptp_want_txqs() and efx_ptp_channel_type (a struct) are only used in ptp.c. In all cases these symbols should be static. Fixes: 2935e3c38228 ("sfc: on 8000 series use TX queues for TX timestamps") Signed-off-by: Fengguang Wu [ec...@solarflare.com: rewrote commit message] Signed-off-by: Edward Cree --- drivers/net/ethernet/sfc/efx.c | 2 +- drivers/net/ethernet/sfc/ptp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 456866b05641..16757cfc5b29 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -896,7 +896,7 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue) mod_timer(&rx_queue->slow_fill, jiffies + msecs_to_jiffies(100)); } -bool efx_default_channel_want_txqs(struct efx_channel *channel) +static bool efx_default_channel_want_txqs(struct efx_channel *channel) { return channel->channel - channel->efx->tx_channel_offset < channel->efx->n_tx_channels; diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 433d29d6bc95..3e2c5b11b5ef 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -366,7 +366,7 @@ bool efx_ptp_use_mac_tx_timestamps(struct efx_nic *efx) /* PTP 'extra' channel is still a traffic channel, but we only create TX queues * if PTP uses MAC TX timestamps, not if PTP uses the MC directly to transmit. 
*/ -bool efx_ptp_want_txqs(struct efx_channel *channel) +static bool efx_ptp_want_txqs(struct efx_channel *channel) { return efx_ptp_use_mac_tx_timestamps(channel->efx); } @@ -2146,7 +2146,7 @@ static int efx_phc_enable(struct ptp_clock_info *ptp, return 0; } -const struct efx_channel_type efx_ptp_channel_type = { +static const struct efx_channel_type efx_ptp_channel_type = { .handle_no_channel = efx_ptp_handle_no_channel, .pre_probe = efx_ptp_probe_channel, .post_remove= efx_ptp_remove_channel,
Re: [PATCH 2/3] Revert "e1000e: Separate signaling for link check/link up"
On Fri, Jan 26, 2018 at 1:12 AM, Benjamin Poirier wrote: > This reverts commit 19110cfbb34d4af0cdfe14cd243f3b09dc95b013. > This reverts commit 4110e02eb45ea447ec6f5459c9934de0a273fb91. > > ... because they cause an extra 2s delay for the link to come up when > autoneg is off. > > After reverting, the race condition described in the log of commit > 19110cfbb34d ("e1000e: Separate signaling for link check/link up") is > reintroduced. It may still be triggered by LSC events but this should not > result in link flap. It may no longer be triggered by RXO events because > commit 4aea7a5c5e94 ("e1000e: Avoid receiver overrun interrupt bursts") > restored reading icr in the Other handler. With the RXO events removed the only cause for us to transition the bit should be LSC. I'm not sure if the race condition in that state is a valid concern or not as the LSC should only get triggered if the link state toggled, even briefly. The bigger concern I would have would be the opposite of the original race that was pointed out: \ e1000_watchdog_task \ e1000e_has_link \ hw->mac.ops.check_for_link() === e1000e_check_for_copper_link /* link is up */ mac->get_link_status = false; /* interrupt */ \ e1000_msix_other hw->mac.get_link_status = true; link_active = !hw->mac.get_link_status /* link_active is false, wrongly */ So the question I would have is what if we see the LSC for a link down just after the check_for_copper_link call completes? It may not be anything seen in the real world since I don't know if we have any link flapping issues on e1000e or not without this patch. It is something to keep in mind for the future though. > As discussed, the driver should be in "maintenance mode". In the interest > of stability, revert to the original code as much as possible instead of a > half-baked solution. If nothing else we may want to do a follow-up on this patch as we probably shouldn't be returning the error values to trigger link up. There are definitely issues to be found here. 
If nothing else we may want to explore just returning 1 if auto-neg is disabled instead of returning an error code. > Link: https://www.spinics.net/lists/netdev/msg479923.html > Signed-off-by: Benjamin Poirier > --- > drivers/net/ethernet/intel/e1000e/ich8lan.c | 11 +++ > drivers/net/ethernet/intel/e1000e/mac.c | 11 +++ > drivers/net/ethernet/intel/e1000e/netdev.c | 2 +- > 3 files changed, 7 insertions(+), 17 deletions(-) > > diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c > b/drivers/net/ethernet/intel/e1000e/ich8lan.c > index 31277d3bb7dc..d6d4ed7acf03 100644 > --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c > +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c > @@ -1367,9 +1367,6 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw > *hw, bool force) > * Checks to see of the link status of the hardware has changed. If a > * change in link status has been detected, then we read the PHY registers > * to get the current speed/duplex if link exists. > - * > - * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link > - * up). > **/ > static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) > { > @@ -1385,7 +1382,7 @@ static s32 e1000_check_for_copper_link_ich8lan(struct > e1000_hw *hw) > * Change or Rx Sequence Error interrupt. > */ > if (!mac->get_link_status) > - return 1; > + return 0; > > /* First we want to see if the MII Status Register reports > * link. If so, then we want to get the current speed/duplex > @@ -1616,12 +1613,10 @@ static s32 e1000_check_for_copper_link_ich8lan(struct > e1000_hw *hw) > * different link partner. 
> */ > ret_val = e1000e_config_fc_after_link_up(hw); > - if (ret_val) { > + if (ret_val) > e_dbg("Error configuring flow control\n"); > - return ret_val; > - } > > - return 1; > + return ret_val; > } > > static s32 e1000_get_variants_ich8lan(struct e1000_adapter *adapter) > diff --git a/drivers/net/ethernet/intel/e1000e/mac.c > b/drivers/net/ethernet/intel/e1000e/mac.c > index f457c5703d0c..b322011ec282 100644 > --- a/drivers/net/ethernet/intel/e1000e/mac.c > +++ b/drivers/net/ethernet/intel/e1000e/mac.c > @@ -410,9 +410,6 @@ void e1000e_clear_hw_cntrs_base(struct e1000_hw *hw) > * Checks to see of the link status of the hardware has changed. If a > * change in link status has been detected, then we read the PHY registers > * to get the current speed/duplex if link exists. > - * > - * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link > - * up). > **/ > s32 e1000e_check_for_copper_link(struct e1000_hw *hw) > { > @@ -426,7 +423,7 @@ s32 e1000e_check_for_cop
pull-request: wireless-drivers-next 2018-01-26
Hi Dave, this is a pull request to net-next for 4.16, more info in the signed tag below. Please let me know if you have any problems. Kalle The following changes since commit ebdd7b491b8a65d65936e07004caabca4a3c94a0: Merge branch 'mlxsw-Add-support-for-mirror-action-with-flower' (2018-01-21 18:21:31 -0500) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers-next.git tags/wireless-drivers-next-for-davem-2018-01-26 for you to fetch changes up to 30ce7f4456ae40e970d9e82fe63c5e55147af0c0: mt76: validate rx CCMP PN (2018-01-26 11:20:52 +0200) wireless-drivers-next patches for 4.16 Major changes: wil6210 * add PCI device id for Talyn * support flashless device ath9k * improve RSSI/signal accuracy on AR9003 series mt76 * validate CCMP PN from received frames to avoid replay attacks qtnfmac * support 64-bit network stats * report more hardware information to kernel log and some via ethtool Arend Van Spriel (2): brcmfmac: assure bcdc dcmd api does not return value > 0 brcmfmac: separate firmware errors from i/o errors Dedy Lansky (1): wil6210: support flashless device Felix Fietkau (10): mt76: fix transmission of encrypted management frames ath9k: discard undersized packets mt76: retry rx polling as long as there is budget left mt76: fix TSF value in probe responses mt76: add an intermediate struct for rx status information mt76: get station pointer by wcid and pass it to mac80211 mt76: implement A-MPDU rx reordering in the driver code mt76: split mt76_rx_complete mt76: pass the per-vif wcid to the core for multicast rx mt76: validate rx CCMP PN Igor Mitsyanko (2): qtnfmac: do not use mutexes in timer context qtnfmac: do not use bus mutex for events processing Kalle Valo (1): Merge ath-next from git://git.kernel.org/.../kvalo/ath.git Lior David (2): wil6210: fix random failure to bring network interface up wil6210: enlarge FW mac_rgf_ext section for Sparrow D0 Lorenzo Bianconi (5): mt76x2: fix WMM parameter 
configuration mt76x2: dfs: avoid tasklet scheduling during mt76x2_dfs_init_params() mt76x2: dfs: add set_domain handler mt76x2: dfs: take into account dfs region in mt76x2_dfs_init_params() mt76x2: init: disable all pending tasklets during device removal Luis de Bethencourt (1): rtl8xxxu: Fix trailing semicolon Maya Erez (5): wil6210: add Talyn PCIe device ID wil6210: recognize Talyn JTAG ID wil6210: add support for Talyn AHB address map wil6210: configure OTP HW vectors in SW reset flow wil6210: support parsing brd file address from fw file Ping-Ke Shih (10): rtlwifi: btcoex: extend get_wifi_bw to support bandwidth 80M rtlwifi: btcoex: Add switch band notify for btc rtlwifi: btcoex: Add variable ant_div_cfg to support antenna diversity rtlwifi: btcoex: add scan_notify within ips_notify if RFON rtlwifi: btcoex: Add wifi_only series ops to control solo card rtlwifi: btcoex: add boolean variables dbg_mode rtlwifi: 8822be has to report vht capability to mac80211 rtlwifi: Add ratr_table for newer IC rtlwifi: Add spec_ver to check whether use new rate-id or not rtlwifi: btcoex: Fix some static warnings from Sparse Ramon Fried (1): wcn36xx: release DMA memory in case of error Sergey Matyukevich (4): qtnfmac: modify supported interface combinations qtnfmac: validate interface combinations on changes qtnfmac: fix STA disconnect procedure qtnfmac: remove redundant 'unlikely' checks Vasily Ulyanov (5): qtnfmac: remove struct qlink_cmd_set_mac_acl qtnfmac: fix warnings when mBSS setup is stopped qtnfmac: support 64-bit network interface stats qtnfmac: get more hardware info from card qtnfmac: report hardware/firmware information via ethtool Wojciech Dubowik (4): ath9k: Alternative EEPROM size for AR9003 ath9k: Read noise floor calibration data from eeprom ath9k: Use calibrated noise floor value when available ath9k: Display calibration data piers in debugfs drivers/net/wireless/ath/ath9k/ar9003_eeprom.c | 133 ++- drivers/net/wireless/ath/ath9k/ar9003_eeprom.h | 10 + 
drivers/net/wireless/ath/ath9k/calib.c | 38 +-- drivers/net/wireless/ath/ath9k/hw.h| 2 + drivers/net/wireless/ath/ath9k/recv.c | 4 +- drivers/net/wireless/ath/wcn36xx/dxe.c | 46 +++- drivers/net/wireless/ath/wil6210/boot_loader.h | 9 +- drivers/net/wireless/ath/wil6210/fw.h | 18 +- drivers/net/wireless/ath/wil6210/fw_inc.c | 167 - drivers/net/wireless/ath/wil6210/interrupt.c | 6 +- drivers/
Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send
On Fri, Jan 26, 2018 at 11:07:39AM -0500, David Miller wrote: > >> This is found by a static analysis tool named DCNS written by myself. > > > > The trouble is, places like > > net/atm/raw.c:65: vcc->send = atm_send_aal0; > > net/atm/raw.c:74: vcc->send = vcc->dev->ops->send; > > net/atm/raw.c:83: vcc->send = vcc->dev->ops->send; > > mean extra call chains. It's *not* just vcc_sendmsg(), and e.g. > > ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) > > ? DROP_PACKET : 1; > > bh_unlock_sock(sk_atm(vcc)); > > in pppoatm_send() definitely is called under a spinlock. > > > > Looking through the driver (in advanced bitrot, as usual for drivers/atm), > > I'd say that submit_queue() is fucked in head in the "queue full" case. > > And judging by the history, had been thus since the original merge... > > Jia-Ju, I'm probably not going to apply any of your GFP_KERNEL > conversions. > > Al's analysis above is similar to how things looked for other patches > you submited of this nature. > > So because of the lack of care and serious auditing you put into these > conversions, I really have no choice but to drop them from my queue > because on the whole they are adding bugs rather than improving the > code. FWIW, the tool *does* promise to be useful - as far as I understand it looks for places where GFP_ATOMIC allocation goes with blocking operations near every callchain leading there. And that indicates something fishy going on - either pointless GFP_ATOMIC (in benign case), or something much nastier: a callchain that would require GFP_ATOMIC. In that case whatever blocking operation found along the way is a bug. This time it has, AFAICS, caught a genuine bug in drivers/atm/firestream.c - static void submit_qentry (struct fs_dev *dev, struct queue *q, struct FS_QENTRY *qe) { u32 wp; struct FS_QENTRY *cqe; /* XXX Sanity check: the write pointer can be checked to be still the same as the value passed as qe... 
-- REW */ /* udelay (5); */ while ((wp = read_fs (dev, Q_WP (q->offset))) & Q_FULL) { fs_dprintk (FS_DEBUG_TXQ, "Found queue at %x full. Waiting.\n", q->offset); schedule (); } ... static void submit_queue (struct fs_dev *dev, struct queue *q, u32 cmd, u32 p1, u32 p2, u32 p3) { struct FS_QENTRY *qe; qe = get_qentry (dev, q); qe->cmd = cmd; qe->p0 = p1; qe->p1 = p2; qe->p2 = p3; submit_qentry (dev, q, qe); ... static int fs_send (struct atm_vcc *atm_vcc, struct sk_buff *skb) { ... td = kmalloc (sizeof (struct FS_BPENTRY), GFP_ATOMIC); ... submit_queue (dev, &dev->hp_txq, QE_TRANSMIT_DE | vcc->channo, virt_to_bus (td), 0, virt_to_bus (td)); ... Either all callchains leading to fs_send() are in non-atomic contexts (in which case that GFP_ATOMIC would be pointless) or there's one where we cannot block. Any such would be a potential deadlock. And the latter appears to be the case - fs_send() is atmdev_ops ->send(), which can end up in atm_vcc ->send(), which can be called from under ->sk_lock.slock. What would be really useful: * list of "here's a list of locations where we do something blocking; each callchain to this GFP_ATOMIC allocation passes either upstream of one of those without leaving atomic context in between or downstream without entering one". * after that - backtracking these callchains further, watching for ones in atomic contexts. Can be done manually, but if that tool can assist in doing so, all the better. If we do find one, we have found a deadlock - just take the blocking operation reported for that callchain and that's it. If it's not an obvious false positive (e.g. if (!foo) spin_lock(&splat); . if (foo) schedule(); ), it's worth reporting to maintainers of the code in question. * if all callchains reach obviously non-atomic contexts (syscall entry, for example, or a kernel thread payload, or a function documented to require a mutex held by caller, etc.) - then a switch to GFP_KERNEL might be appropriate. 
With analysis of callchains posted as you are posting that. * either way, having the tool print the callchains out would be a good idea - for examining them, for writing reports, etc.
Re: [regression 4.15] Userspace compilation broken by uapi/linux/if_ether.h update
On Fri, Jan 26, 2018 at 11:51:38AM +0100, Guillaume Nault wrote: > On Thu, Jan 25, 2018 at 11:21:34PM +0100, Hauke Mehrtens wrote: > > On 01/25/2018 03:58 PM, Guillaume Nault wrote: > > > Now that linux/libc-compat.h is included in linux/if_ether.h, it is > > > processed before netinet/in.h. Therefore, it sets the relevant > > > __UAPI_DEF_* guards to 1 (as _NETINET_IN_H isn't yet defined). > > > Then netinet/in.h is included, followed by linux/in.h. The later > > > doesn't realise that what it defines has already been included by > > > netinet/in.h because the __UAPI_DEF_* guards were set too early. > > > > > This is about this commit: > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6926e041a8920c8ec27e4e155efa760aa01551fd > > > > On option would be to move this into include/uapi/linux/if_ether.h and > > remove the include for libc-compat.h: > > #ifndef __UAPI_DEF_ETHHDR > > #define __UAPI_DEF_ETHHDR 1 > > #endif > > > > This will only work if netinet/if_ether.h is included before > > linux/if_ether.h, but I think this is very likely. > > > I don't see what makes its likely. That's not directly related to your > point, but for example, glibc guarantees the opposite as it includes > linux/if_ether.h at the beginning of netinet/if_ether.h. > > > I think we can do this because we do not need some special libc handling > > like it is done for other symbols as __UAPI_DEF_ETHHDR is currently only > > needed by musl and not by glibc. > > > That's ok for me as long as existing projects keep compiling. But all > __UAPI_DEF_* are currently centralised in libc-compat.h. Adding > __UAPI_DEF_ETHHDR in if_ether.h looks like defeating the purpose of > libc-compat.h and I wonder if that'd be accepted. Maybe with a > different name. > > In any case, we're really late in the release cycle. If more discussion > is needed, it's probably better to revert and take time to work on a > solution for the next release. 
> Hi David, I just realised you've sent your last pull request for this release. I was waiting for feedback in order to avoid a revert. Should I send a revert now or do you prefer to sort this out later and backport a fix in 4.15.1?
[bpf PATCH v2 0/3] Series short description
The following series implements... --- John Fastabend (3): net: add a UID to use for ULP socket assignment bpf: sockmap, add sock close() hook to remove socks bpf: sockmap, fix leaking maps with attached but not detached progs include/net/tcp.h|8 ++ kernel/bpf/sockmap.c | 165 -- net/ipv4/tcp_ulp.c | 57 - net/tls/tls_main.c |2 + 4 files changed, 155 insertions(+), 77 deletions(-) -- Signature
Re: [bpf PATCH v2 0/3] Series short description
On 01/26/2018 09:43 AM, John Fastabend wrote: > The following series implements... > > --- > Sorry random noise on my side fat-fingered some keys. Please just ignore it. :/
Re: [RFC PATCH 0/2] hv_netvsc: Fix shutdown regression on Win2012 hosts
On Tue, 23 Jan 2018 10:34:03 +0100 Mohammed Gamal wrote: > Commit 0cf737808ae7 ("hv_netvsc: netvsc_teardown_gpadl() split") introduced > a regression that caused VMs not to shutdown after netvsc_device_remove() is > called. This is caused by GPADL teardown sequence change, and while that was > necessary to fix issues with Win2016 hosts, it did introduce a regression for > earlier versions. > > Prior to commit 0cf737808 the call sequence in netvsc_device_remove() was as > follows (as implemented in netvsc_destroy_buf()): > 1- Send NVSP_MSG1_TYPE_REVOKE_RECV_BUF message > 2- Teardown receive buffer GPADL > 3- Send NVSP_MSG1_TYPE_REVOKE_SEND_BUF message > 4- Teardown send buffer GPADL > 5- Close vmbus > > This didn't work for WS2016 hosts. Commit 0cf737808 split netvsc_destroy_buf() > into two functions and rearranged the order as follows > 1- Send NVSP_MSG1_TYPE_REVOKE_RECV_BUF message > 2- Send NVSP_MSG1_TYPE_REVOKE_SEND_BUF message > 3- Close vmbus > 4- Teardown receive buffer GPADL > 5- Teardown send buffer GPADL > > That worked well for WS2016 hosts, but for WS2012 hosts it prevented VMs from > shutting down. > > This patch series works around this problem. The first patch splits > netvsc_revoke_buf() and netvsc_teardown_gpadl() into two finer grained > functions for tearing down send and receive buffers individally. The second > patch > uses the finer grained functions to implement the teardown sequence according > to > the host's version. We keep the behavior introduced in 0cf737808ae7 for > Windows > 2016 hosts, while we re-introduce the old sequence for earlier verions. > > Mohammed Gamal (2): > hv_netvsc: Split netvsc_revoke_buf() and netvsc_teardown_gpadl() > hv_netvsc: Change GPADL teardown order according to Hyper-V version > > drivers/net/hyperv/netvsc.c | 50 > + > 1 file changed, 42 insertions(+), 8 deletions(-) > What I am experimenting with is sending an NDIS_RESET (instead of setting packet filter) as part of the close processing. 
This seems more like what the description of what Windows driver does and matches my reading of the public RNDIS specification.
[bpf-next PATCH v2 0/3] bpf: sockmap fixes
A set of fixes for sockmap to resolve map/prog not being cleaned up when maps are not cleaned up from the program side. For this we pull in the ULP infrastructure to hook into the close() hook of the sock layer. This seemed natural because we have additional sockmap features (to add support for TX hooks) that will also use the ULP infrastructure. This allows us to cleanup entries in the map when socks are closed() and avoid trying to get the sk_state_change() hook to fire in all cases. The second issue resolved here occurs when users don't detach programs. The gist is a refcnt issue resolved by implementing the release callback. See patch for details. For testing I ran both sample/sockmap and selftests bpf/test_maps.c. Dave Watson ran TLS test suite on v1 version of the patches without the put_module error path change. v2 changes rebased onto bpf-next with the following small updates Patch 1/3 Dave Watson pointed out that we should do a module put in case future users use this with another ULP other than sockmap. And module_put is a nop when owner = NULL (sockmap case) so no harm here and code is more robust. Patch 2/3 Be explicit and set user_visible to false just to avoid any reader confusion. --- John Fastabend (3): net: add a UID to use for ULP socket assignment bpf: sockmap, add sock close() hook to remove socks bpf: sockmap, fix leaking maps with attached but not detached progs include/net/tcp.h|8 ++ kernel/bpf/sockmap.c | 165 -- net/ipv4/tcp_ulp.c | 57 - net/tls/tls_main.c |2 + 4 files changed, 155 insertions(+), 77 deletions(-) -- Signature
[bpf-next PATCH v2 1/3] net: add a UID to use for ULP socket assignment
Create a UID field and enum that can be used to assign ULPs to sockets. This saves a set of string comparisons if the ULP id is known. For sockmap, which is added in the next patches, a ULP is used to hook into TCP sockets close state. In this case the ULP being added is done at map insert time and the ULP is known and done on the kernel side. In this case the named lookup is not needed. Because we don't want to expose psock internals to user space socket options a user visible flag is also added. For TLS this is set for BPF it will be cleared. Alos remove pr_notice, user gets an error code back and should check that rather than rely on logs. Signed-off-by: John Fastabend --- include/net/tcp.h |6 + net/ipv4/tcp_ulp.c | 57 +++- net/tls/tls_main.c |2 ++ 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 093e967..ba10ca7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1983,6 +1983,10 @@ static inline void tcp_listendrop(const struct sock *sk) #define TCP_ULP_MAX128 #define TCP_ULP_BUF_MAX(TCP_ULP_NAME_MAX*TCP_ULP_MAX) +enum { + TCP_ULP_TLS, +}; + struct tcp_ulp_ops { struct list_headlist; @@ -1991,7 +1995,9 @@ struct tcp_ulp_ops { /* cleanup ulp */ void (*release)(struct sock *sk); + int uid; charname[TCP_ULP_NAME_MAX]; + booluser_visible; struct module *owner; }; int tcp_register_ulp(struct tcp_ulp_ops *type); diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 6bb9e14..6d87e7a 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -29,6 +29,18 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name) return NULL; } +static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp) +{ + struct tcp_ulp_ops *e; + + list_for_each_entry_rcu(e, &tcp_ulp_list, list) { + if (e->uid == ulp) + return e; + } + + return NULL; +} + static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) { const struct tcp_ulp_ops *ulp = NULL; @@ -51,6 +63,18 @@ static const struct tcp_ulp_ops 
*__tcp_ulp_find_autoload(const char *name) return ulp; } +static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid) +{ + const struct tcp_ulp_ops *ulp = NULL; + + rcu_read_lock(); + ulp = tcp_ulp_find_id(uid); + if (!ulp || !try_module_get(ulp->owner)) + ulp = NULL; + rcu_read_unlock(); + return ulp; +} + /* Attach new upper layer protocol to the list * of available protocols. */ @@ -59,13 +83,10 @@ int tcp_register_ulp(struct tcp_ulp_ops *ulp) int ret = 0; spin_lock(&tcp_ulp_list_lock); - if (tcp_ulp_find(ulp->name)) { - pr_notice("%s already registered or non-unique name\n", - ulp->name); + if (tcp_ulp_find(ulp->name)) ret = -EEXIST; - } else { + else list_add_tail_rcu(&ulp->list, &tcp_ulp_list); - } spin_unlock(&tcp_ulp_list_lock); return ret; @@ -124,6 +145,32 @@ int tcp_set_ulp(struct sock *sk, const char *name) if (!ulp_ops) return -ENOENT; + if (!ulp_ops->user_visible) + return -ENOENT; + + err = ulp_ops->init(sk); + if (err) { + module_put(ulp_ops->owner); + return err; + } + + icsk->icsk_ulp_ops = ulp_ops; + return 0; +} + +int tcp_set_ulp_id(struct sock *sk, int ulp) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_ulp_ops *ulp_ops; + int err; + + if (icsk->icsk_ulp_ops) + return -EEXIST; + + ulp_ops = __tcp_ulp_lookup(ulp); + if (!ulp_ops) + return -ENOENT; + err = ulp_ops->init(sk); if (err) { module_put(ulp_ops->owner); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 736719c..b0d5fce 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -484,6 +484,8 @@ static int tls_init(struct sock *sk) static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", + .uid= TCP_ULP_TLS, + .user_visible = true, .owner = THIS_MODULE, .init = tls_init, };
[bpf-next PATCH v2 3/3] bpf: sockmap, fix leaking maps with attached but not detached progs
When a program is attached to a map we increment the program refcnt to ensure that the program is not removed while it is potentially being referenced from sockmap side. However, if this same program also references the map (this is a reasonably common pattern in my programs) then the verifier will also increment the maps refcnt from the verifier. This is to ensure the map doesn't get garbage collected while the program has a reference to it. So we are left in a state where the map holds the refcnt on the program stopping it from being removed and releasing the map refcnt. And vice versa the program holds a refcnt on the map stopping it from releasing the refcnt on the prog. All this is fine as long as users detach the program while the map fd is still around. But, if the user omits this detach command we are left with a dangling map we can no longer release. To resolve this when the map fd is released decrement the program references and remove any reference from the map to the program. This fixes the issue with possibly dangling map and creates a user side API constraint. That is, the map fd must be held open for programs to be attached to a map. 
Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: John Fastabend --- kernel/bpf/sockmap.c | 19 ++- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 4e7ba37..a30449d 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -598,11 +598,6 @@ static void sock_map_free(struct bpf_map *map) } rcu_read_unlock(); - if (stab->bpf_verdict) - bpf_prog_put(stab->bpf_verdict); - if (stab->bpf_parse) - bpf_prog_put(stab->bpf_parse); - sock_map_remove_complete(stab); } @@ -878,6 +873,19 @@ static int sock_map_update_elem(struct bpf_map *map, return err; } +static void sock_map_release(struct bpf_map *map, struct file *map_file) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *orig; + + orig = xchg(&stab->bpf_parse, NULL); + if (orig) + bpf_prog_put(orig); + orig = xchg(&stab->bpf_verdict, NULL); + if (orig) + bpf_prog_put(orig); +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -885,6 +893,7 @@ static int sock_map_update_elem(struct bpf_map *map, .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, + .map_release = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
[bpf-next PATCH v2 2/3] bpf: sockmap, add sock close() hook to remove socks
The selftests test_maps program was leaving dangling BPF sockmap programs around because not all psock elements were removed from the map. The elements in turn hold a reference on the BPF program they are attached to causing BPF programs to stay open even after test_maps has completed. The original intent was that sk_state_change() would be called when TCP socks went through TCP_CLOSE state. However, because socks may be in SOCK_DEAD state or the sock may be a listening socket the event is not always triggered. To resolve this use the ULP infrastructure and register our own proto close() handler. This fixes the above case. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: Prashant Bhole Signed-off-by: John Fastabend --- include/net/tcp.h|2 + kernel/bpf/sockmap.c | 146 +++--- 2 files changed, 81 insertions(+), 67 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index ba10ca7..e082101 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1985,6 +1985,7 @@ static inline void tcp_listendrop(const struct sock *sk) enum { TCP_ULP_TLS, + TCP_ULP_BPF, }; struct tcp_ulp_ops { @@ -2003,6 +2004,7 @@ struct tcp_ulp_ops { int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); +int tcp_set_ulp_id(struct sock *sk, const int ulp); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0314d17..4e7ba37 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -86,9 +86,10 @@ struct smap_psock { struct work_struct tx_work; struct work_struct gc_work; + struct proto *sk_proto; + void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); - void (*save_state_change)(struct sock *sk); }; static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -96,12 
+97,80 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +struct proto tcp_bpf_proto; +static int bpf_tcp_init(struct sock *sk) +{ + struct smap_psock *psock; + + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + return -EINVAL; + + if (unlikely(psock->sk_proto)) + return -EBUSY; + + psock->save_close = sk->sk_prot->close; + psock->sk_proto = sk->sk_prot; + sk->sk_prot = &tcp_bpf_proto; + return 0; +} + +static void bpf_tcp_release(struct sock *sk) +{ + struct smap_psock *psock = smap_psock_sk(sk); + + if (likely(psock)) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } +} + +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); + +void bpf_tcp_close(struct sock *sk, long timeout) +{ + struct smap_psock_map_entry *e, *tmp; + struct smap_psock *psock; + struct sock *osk; + + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + return sk->sk_prot->close(sk, timeout); + + write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } + write_unlock_bh(&sk->sk_callback_lock); + return psock->save_close(sk, timeout); +} + enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, }; +static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { + .name = "bpf_tcp", + .uid= TCP_ULP_BPF, + .user_visible = false, + .owner = NULL, + .init = bpf_tcp_init, + .release= bpf_tcp_release, +}; + +static int bpf_tcp_ulp_register(void) +{ + tcp_bpf_proto = tcp_prot; + tcp_bpf_proto.close = bpf_tcp_close; + return tcp_register_ulp(&bpf_tcp_ulp_ops); +} + static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -166,68 +235,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err) sk->sk_error_report(sk); } -static void 
smap_release_sock(struct smap_psock *psock, struct sock *sock); - -/* Called with lock_sock(sk) held */ -static void smap_state_change(struct sock *sk) -{ - struct smap_psock_map_entry *e, *tmp; - struct smap_psock *psock; - struct socket_wq *wq; - struct sock *osk; - - rcu_read_lock(); - - /* Allowing transitions into an established syn_recv states allows -* for early binding sockets
Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
On 1/26/2018 8:58 AM, Michael S. Tsirkin wrote: On Thu, Jan 11, 2018 at 09:58:39PM -0800, Sridhar Samudrala wrote: @@ -2859,6 +3123,42 @@ static struct virtio_driver virtio_net_driver = { #endif }; +static int virtio_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + /* Skip our own events */ + if (event_dev->netdev_ops == &virtnet_netdev) + return NOTIFY_DONE; + + /* Avoid non-Ethernet type devices */ + if (event_dev->type != ARPHRD_ETHER) + return NOTIFY_DONE; + + /* Avoid Vlan dev with same MAC registering as VF */ + if (is_vlan_dev(event_dev)) + return NOTIFY_DONE; + + /* Avoid Bonding master dev with same MAC registering as VF */ + if ((event_dev->priv_flags & IFF_BONDING) && + (event_dev->flags & IFF_MASTER)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + return virtnet_register_vf(event_dev); + case NETDEV_UNREGISTER: + return virtnet_unregister_vf(event_dev); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block virtio_netdev_notifier = { + .notifier_call = virtio_netdev_event, +}; + static __init int virtio_net_driver_init(void) { int ret; @@ -2877,6 +3177,8 @@ static __init int virtio_net_driver_init(void) ret = register_virtio_driver(&virtio_net_driver); if (ret) goto err_virtio; + + register_netdevice_notifier(&virtio_netdev_notifier); return 0; err_virtio: cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); @@ -2889,6 +3191,7 @@ module_init(virtio_net_driver_init); static __exit void virtio_net_driver_exit(void) { + unregister_netdevice_notifier(&virtio_netdev_notifier); unregister_virtio_driver(&virtio_net_driver); cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); cpuhp_remove_multi_state(virtionet_online); I have a question here: what if PT device driver module loads and creates a device before virtio? Initially i also had this question if we get NETDEV_REGISTER events for netdevs that are already present. 
But it looks like register_netdevice_notifier() will cause replay of all the registration and up events of existing devices. /** * register_netdevice_notifier - register a network notifier block * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ Thanks Sridhar
[net-next 15/15] ixgbe: don't set RXDCTL.RLPML for 82599
From: Emil Tantilov commit 2de6aa3a666e ("ixgbe: Add support for padding packet") Uses RXDCTL.RLPML to limit the maximum frame size on Rx when using build_skb. Unfortunately that register does not work on 82599. Added an explicit check to avoid setting this register on 82599 MAC. Extended the comment related to the setting of RXDCTL.RLPML to better explain its purpose. Signed-off-by: Emil Tantilov Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 643c7288ea0f..0da5aa2c8aba 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -4133,11 +4133,15 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter, rxdctl &= ~0x3F; rxdctl |= 0x080420; #if (PAGE_SIZE < 8192) - } else { + /* RXDCTL.RLPML does not work on 82599 */ + } else if (hw->mac.type != ixgbe_mac_82599EB) { rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK | IXGBE_RXDCTL_RLPML_EN); - /* Limit the maximum frame size so we don't overrun the skb */ + /* Limit the maximum frame size so we don't overrun the skb. +* This can happen in SRIOV mode when the MTU of the VF is +* higher than the MTU of the PF. +*/ if (ring_uses_build_skb(ring) && !test_bit(__IXGBE_RX_3K_BUFFER, &ring->state)) rxdctl |= IXGBE_MAX_2K_FRAME_BUILD_SKB | -- 2.14.3
[net-next 10/15] ixgbevf: use ARRAY_SIZE for various array sizing calculations
From: Colin Ian King Use the ARRAY_SIZE macro on various arrays to determine size of the arrays. Improvement suggested by coccinelle. Signed-off-by: Colin Ian King Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/vf.c | 17 +++-- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c index 64c93e8becc6..38d3a327c1bc 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.c +++ b/drivers/net/ethernet/intel/ixgbevf/vf.c @@ -286,7 +286,7 @@ static s32 ixgbevf_set_uc_addr_vf(struct ixgbe_hw *hw, u32 index, u8 *addr) ether_addr_copy(msg_addr, addr); ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, -sizeof(msgbuf) / sizeof(u32)); +ARRAY_SIZE(msgbuf)); if (!ret_val) { msgbuf[0] &= ~IXGBE_VT_MSGTYPE_CTS; @@ -456,8 +456,7 @@ static s32 ixgbevf_set_rar_vf(struct ixgbe_hw *hw, u32 index, u8 *addr, ether_addr_copy(msg_addr, addr); ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, -sizeof(msgbuf) / sizeof(u32)); - +ARRAY_SIZE(msgbuf)); msgbuf[0] &= ~IXGBE_VT_MSGTYPE_CTS; /* if nacked the address was rejected, use "perm_addr" */ @@ -574,7 +573,7 @@ static s32 ixgbevf_update_xcast_mode(struct ixgbe_hw *hw, int xcast_mode) msgbuf[1] = xcast_mode; err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, -sizeof(msgbuf) / sizeof(u32)); +ARRAY_SIZE(msgbuf)); if (err) return err; @@ -614,7 +613,7 @@ static s32 ixgbevf_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, msgbuf[0] |= vlan_on << IXGBE_VT_MSGINFO_SHIFT; err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, -sizeof(msgbuf) / sizeof(u32)); +ARRAY_SIZE(msgbuf)); if (err) goto mbx_err; @@ -826,7 +825,7 @@ static s32 ixgbevf_set_rlpml_vf(struct ixgbe_hw *hw, u16 max_size) msgbuf[1] = max_size; ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, -sizeof(msgbuf) / sizeof(u32)); +ARRAY_SIZE(msgbuf)); if (ret_val) return ret_val; if ((msgbuf[0] & IXGBE_VF_SET_LPE) && @@ -872,8 +871,7 @@ 
static int ixgbevf_negotiate_api_version_vf(struct ixgbe_hw *hw, int api) msg[1] = api; msg[2] = 0; - err = ixgbevf_write_msg_read_ack(hw, msg, msg, -sizeof(msg) / sizeof(u32)); + err = ixgbevf_write_msg_read_ack(hw, msg, msg, ARRAY_SIZE(msg)); if (!err) { msg[0] &= ~IXGBE_VT_MSGTYPE_CTS; @@ -924,8 +922,7 @@ int ixgbevf_get_queues(struct ixgbe_hw *hw, unsigned int *num_tcs, msg[0] = IXGBE_VF_GET_QUEUE; msg[1] = msg[2] = msg[3] = msg[4] = 0; - err = ixgbevf_write_msg_read_ack(hw, msg, msg, -sizeof(msg) / sizeof(u32)); + err = ixgbevf_write_msg_read_ack(hw, msg, msg, ARRAY_SIZE(msg)); if (!err) { msg[0] &= ~IXGBE_VT_MSGTYPE_CTS; -- 2.14.3
[net-next 14/15] ixgbe: Fix && vs || typo
From: Dan Carpenter "offset" can't be both 0x0 and 0xFFFF so presumably || was intended instead of &&. That matches with how this check is done in other functions. Fixes: 73834aec7199 ("ixgbe: extend firmware version support") Signed-off-by: Dan Carpenter Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 7ac7ef9b37ff..61188f343955 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -4087,7 +4087,7 @@ void ixgbe_get_oem_prod_version(struct ixgbe_hw *hw, hw->eeprom.ops.read(hw, NVM_OEM_PROD_VER_PTR, &offset); /* Return is offset to OEM Product Version block is invalid */ - if (offset == 0x0 && offset == NVM_INVALID_PTR) + if (offset == 0x0 || offset == NVM_INVALID_PTR) return; /* Read product version block */ -- 2.14.3
[net-next 07/15] ixgbevf: clear rx_buffer_info in configure instead of clean
From: Emil Tantilov Based on commit d2bead576e67 ("igb: Clear Rx buffer_info in configure instead of clean") This change makes it so that instead of going through the entire ring on Rx cleanup we only go through the region that was designated to be cleaned up and stop when we reach the region where new allocations should start. In addition we can avoid having to perform a memset on the Rx buffer_info structures until we are about to start using the ring again. Signed-off-by: Emil Tantilov Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 26 +++ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 350afec3dde8..a793f9ea05e7 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1773,6 +1773,10 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, IXGBE_WRITE_REG(hw, IXGBE_VFRDT(reg_idx), 0); ring->tail = adapter->io_addr + IXGBE_VFRDT(reg_idx); + /* initialize rx_buffer_info */ + memset(ring->rx_buffer_info, 0, + sizeof(struct ixgbevf_rx_buffer) * ring->count); + /* initialize Rx descriptor 0 */ rx_desc = IXGBEVF_RX_DESC(ring, 0); rx_desc->wb.upper.length = 0; @@ -2131,8 +2135,7 @@ void ixgbevf_up(struct ixgbevf_adapter *adapter) **/ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) { - unsigned long size; - unsigned int i; + u16 i = rx_ring->next_to_clean; /* Free Rx ring sk_buff */ if (rx_ring->skb) { @@ -2140,17 +2143,11 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) rx_ring->skb = NULL; } - /* ring already cleared, nothing to do */ - if (!rx_ring->rx_buffer_info) - return; - /* Free all the Rx ring pages */ - for (i = 0; i < rx_ring->count; i++) { + while (i != rx_ring->next_to_alloc) { struct ixgbevf_rx_buffer *rx_buffer; rx_buffer = &rx_ring->rx_buffer_info[i]; - 
if (!rx_buffer->page) - continue; /* Invalidate cache lines that may have been written to by * device so that we avoid corrupting memory. @@ -2171,11 +2168,14 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) __page_frag_cache_drain(rx_buffer->page, rx_buffer->pagecnt_bias); - rx_buffer->page = NULL; + i++; + if (i == rx_ring->count) + i = 0; } - size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count; - memset(rx_ring->rx_buffer_info, 0, size); + rx_ring->next_to_alloc = 0; + rx_ring->next_to_clean = 0; + rx_ring->next_to_use = 0; } /** @@ -3090,7 +3090,7 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_ring *rx_ring) int size; size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count; - rx_ring->rx_buffer_info = vzalloc(size); + rx_ring->rx_buffer_info = vmalloc(size); if (!rx_ring->rx_buffer_info) goto err; -- 2.14.3
[net-next 13/15] ixgbe: add support for reporting 5G link speed
From: Paul Greenwalt Since 5G link speed is supported by some devices, add reporting of 5G link speed. Signed-off-by: Paul Greenwalt Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index bbb622f15a77..643c7288ea0f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7259,6 +7259,9 @@ static void ixgbe_watchdog_link_is_up(struct ixgbe_adapter *adapter) case IXGBE_LINK_SPEED_10GB_FULL: speed_str = "10 Gbps"; break; + case IXGBE_LINK_SPEED_5GB_FULL: + speed_str = "5 Gbps"; + break; case IXGBE_LINK_SPEED_2_5GB_FULL: speed_str = "2.5 Gbps"; break; -- 2.14.3
[net-next 04/15] ixgbevf: add support for DMA_ATTR_SKIP_CPU_SYNC/WEAK_ORDERING
From: Emil Tantilov Based on commit 5be5955425c2 ("igb: update driver to make use of DMA_ATTR_SKIP_CPU_SYNC") and commit 7bd175928280 ("igb: Add support for DMA_ATTR_WEAK_ORDERING") Convert the calls to dma_map/unmap_page() to the attributes version and add DMA_ATTR_SKIP_CPU_SYNC/WEAK_ORDERING which should help improve performance on some platforms. Move sync_for_cpu call before we perform a prefetch to avoid invalidating the first 128 bytes of the packet on architectures where that call may invalidate the cache. Signed-off-by: Emil Tantilov Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 3 ++ drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 57 ++- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 581f44bbd7b3..b1da9f41c1dc 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -260,6 +260,9 @@ static inline void ixgbevf_write_tail(struct ixgbevf_ring *ring, u32 value) #define MIN_MSIX_Q_VECTORS 1 #define MIN_MSIX_COUNT (MIN_MSIX_Q_VECTORS + NON_Q_VECTORS) +#define IXGBEVF_RX_DMA_ATTR \ + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + /* board specific private data structure */ struct ixgbevf_adapter { /* this field must be first, see ixgbevf_process_skb_fields */ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 725fe2dca868..fbd493efd14e 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -595,8 +595,8 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring, } /* map page for use */ - dma = dma_map_page(rx_ring->dev, page, 0, - PAGE_SIZE, DMA_FROM_DEVICE); + dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, +DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR); /* if mapping failed free 
memory back to system since * there isn't much point in holding memory we can't use @@ -639,6 +639,12 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, if (!ixgbevf_alloc_mapped_page(rx_ring, bi)) break; + /* sync the buffer for use by the device */ + dma_sync_single_range_for_device(rx_ring->dev, bi->dma, +bi->page_offset, +IXGBEVF_RX_BUFSZ, +DMA_FROM_DEVICE); + /* Refresh the desc even if pkt_addr didn't change * because each write-back erases this info. */ @@ -741,12 +747,6 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring, new_buff->page = old_buff->page; new_buff->dma = old_buff->dma; new_buff->page_offset = old_buff->page_offset; - - /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma, -new_buff->page_offset, -IXGBEVF_RX_BUFSZ, -DMA_FROM_DEVICE); } static inline bool ixgbevf_page_is_reserved(struct page *page) @@ -862,6 +862,13 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring, page = rx_buffer->page; prefetchw(page); + /* we are reusing so sync this buffer for CPU use */ + dma_sync_single_range_for_cpu(rx_ring->dev, + rx_buffer->dma, + rx_buffer->page_offset, + size, + DMA_FROM_DEVICE); + if (likely(!skb)) { void *page_addr = page_address(page) + rx_buffer->page_offset; @@ -887,21 +894,15 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring, prefetchw(skb->data); } - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_buffer->dma, - rx_buffer->page_offset, - size, - DMA_FROM_DEVICE); - /* pull page into skb */ if (ixgbevf_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) { /* hand second half of page back to the ring */ ixgbevf_reuse_rx_page(rx_ring, rx_buffer); } else { /* we are not reusing the buffer so unmap it */ - dma_unmap_page(rx_ring->dev, rx_buf
[net-next 05/15] ixgbevf: update code to better handle incrementing page count
From: Emil Tantilov Based on commit bd4171a5d4c2 ("igb: update code to better handle incrementing page count") Update the driver code so that we do bulk updates of the page reference count instead of just incrementing it by one reference at a time. The advantage to doing this is that we cut down on atomic operations and this in turn should give us a slight improvement in cycles per packet. In addition if we eventually move this over to using build_skb the gains will be more noticeable. Signed-off-by: Emil Tantilov Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 7 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 30 +-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index b1da9f41c1dc..c70a789035ae 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -62,7 +62,12 @@ struct ixgbevf_tx_buffer { struct ixgbevf_rx_buffer { dma_addr_t dma; struct page *page; - unsigned int page_offset; +#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) + __u32 page_offset; +#else + __u16 page_offset; +#endif + __u16 pagecnt_bias; }; struct ixgbevf_stats { diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index fbd493efd14e..ae2402ddd9fb 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -611,6 +611,7 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring, bi->dma = dma; bi->page = page; bi->page_offset = 0; + bi->pagecnt_bias = 1; return true; } @@ -747,6 +748,7 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring, new_buff->page = old_buff->page; new_buff->dma = old_buff->dma; new_buff->page_offset = old_buff->page_offset; + new_buff->pagecnt_bias = old_buff->pagecnt_bias; } static inline bool 
ixgbevf_page_is_reserved(struct page *page) @@ -758,13 +760,15 @@ static bool ixgbevf_can_reuse_rx_page(struct ixgbevf_rx_buffer *rx_buffer, struct page *page, const unsigned int truesize) { + unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--; + /* avoid re-using remote pages */ if (unlikely(ixgbevf_page_is_reserved(page))) return false; #if (PAGE_SIZE < 8192) /* if we are only owner of page we can reuse it */ - if (unlikely(page_count(page) != 1)) + if (unlikely(page_ref_count(page) != pagecnt_bias)) return false; /* flip page offset to other buffer */ @@ -778,10 +782,15 @@ static bool ixgbevf_can_reuse_rx_page(struct ixgbevf_rx_buffer *rx_buffer, return false; #endif - /* Even if we own the page, we are not allowed to use atomic_set() -* This would break get_page_unless_zero() users. + + /* If we have drained the page fragment pool we need to update +* the pagecnt_bias and page count so that we fully restock the +* number of references the driver holds. */ - page_ref_inc(page); + if (unlikely(pagecnt_bias == 1)) { + page_ref_add(page, USHRT_MAX); + rx_buffer->pagecnt_bias = USHRT_MAX; + } return true; } @@ -827,7 +836,6 @@ static bool ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring, return true; /* this page cannot be reused so discard it */ - put_page(page); return false; } @@ -899,10 +907,13 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring, /* hand second half of page back to the ring */ ixgbevf_reuse_rx_page(rx_ring, rx_buffer); } else { - /* we are not reusing the buffer so unmap it */ + /* We are not reusing the buffer so unmap it and free +* any references we are holding to it +*/ dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE, DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR); + __page_frag_cache_drain(page, rx_buffer->pagecnt_bias); } /* clear contents of buffer_info */ @@ -2135,6 +2146,8 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) struct ixgbevf_rx_buffer *rx_buffer; rx_buffer = 
&rx_ring->rx_buffer_info[i]; + if (!rx_buffer->page) + continue; /* Invalidate cache lines that may have been written to by * device so that we avoid corruptin
[net-next 08/15] ixgbevf: improve performance and reduce size of ixgbevf_tx_map()
From: Emil Tantilov Based on commit ec718254cbfe ("ixgbe: Improve performance and reduce size of ixgbe_tx_map") This change is meant to both improve the performance and reduce the size of ixgbevf_tx_map(). Expand the work done in the main loop by pushing first into tx_buffer. This allows us to pull in the dma_mapping_error check, the tx_buffer value assignment, and the initial DMA value assignment to the Tx descriptor. Signed-off-by: Emil Tantilov Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 45 ++- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index a793f9ea05e7..d3415ee38597 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -3532,34 +3532,37 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring, struct ixgbevf_tx_buffer *first, const u8 hdr_len) { - dma_addr_t dma; struct sk_buff *skb = first->skb; struct ixgbevf_tx_buffer *tx_buffer; union ixgbe_adv_tx_desc *tx_desc; - struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0]; - unsigned int data_len = skb->data_len; - unsigned int size = skb_headlen(skb); - unsigned int paylen = skb->len - hdr_len; + struct skb_frag_struct *frag; + dma_addr_t dma; + unsigned int data_len, size; u32 tx_flags = first->tx_flags; - __le32 cmd_type; + __le32 cmd_type = ixgbevf_tx_cmd_type(tx_flags); u16 i = tx_ring->next_to_use; tx_desc = IXGBEVF_TX_DESC(tx_ring, i); - ixgbevf_tx_olinfo_status(tx_desc, tx_flags, paylen); - cmd_type = ixgbevf_tx_cmd_type(tx_flags); + ixgbevf_tx_olinfo_status(tx_desc, tx_flags, skb->len - hdr_len); + + size = skb_headlen(skb); + data_len = skb->data_len; dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE); - if (dma_mapping_error(tx_ring->dev, dma)) - goto dma_error; - /* record length, and DMA address */ - 
dma_unmap_len_set(first, len, size); - dma_unmap_addr_set(first, dma, dma); + tx_buffer = first; - tx_desc->read.buffer_addr = cpu_to_le64(dma); + for (frag = &skb_shinfo(skb)->frags[0];; frag++) { + if (dma_mapping_error(tx_ring->dev, dma)) + goto dma_error; + + /* record length, and DMA address */ + dma_unmap_len_set(tx_buffer, len, size); + dma_unmap_addr_set(tx_buffer, dma, dma); + + tx_desc->read.buffer_addr = cpu_to_le64(dma); - for (;;) { while (unlikely(size > IXGBE_MAX_DATA_PER_TXD)) { tx_desc->read.cmd_type_len = cmd_type | cpu_to_le32(IXGBE_MAX_DATA_PER_TXD); @@ -3570,12 +3573,12 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring, tx_desc = IXGBEVF_TX_DESC(tx_ring, 0); i = 0; } + tx_desc->read.olinfo_status = 0; dma += IXGBE_MAX_DATA_PER_TXD; size -= IXGBE_MAX_DATA_PER_TXD; tx_desc->read.buffer_addr = cpu_to_le64(dma); - tx_desc->read.olinfo_status = 0; } if (likely(!data_len)) @@ -3589,23 +3592,15 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring, tx_desc = IXGBEVF_TX_DESC(tx_ring, 0); i = 0; } + tx_desc->read.olinfo_status = 0; size = skb_frag_size(frag); data_len -= size; dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size, DMA_TO_DEVICE); - if (dma_mapping_error(tx_ring->dev, dma)) - goto dma_error; tx_buffer = &tx_ring->tx_buffer_info[i]; - dma_unmap_len_set(tx_buffer, len, size); - dma_unmap_addr_set(tx_buffer, dma, dma); - - tx_desc->read.buffer_addr = cpu_to_le64(dma); - tx_desc->read.olinfo_status = 0; - - frag++; } /* write last descriptor with RS and EOP bits */ -- 2.14.3