[PATCH v3 net-next 2/7] rds: hold a sock ref from rds_message to the rds_sock

2018-02-15 Thread Sowmini Varadhan
The existing model holds a reference from the rds_sock to the
rds_message, but the rds_message does not itself hold a sock_put()
on the rds_sock. Instead the m_rs field in the rds_message is
assigned when the message is queued on the sock, and nulled when
the message is dequeued from the sock.

We want to be able to notify userspace when the rds_message
is actually freed (from rds_message_purge(), after the refcounts
to the rds_message go to 0). At the time that rds_message_purge()
is called, the message is no longer on the rds_sock retransmit
queue. Thus the explicit reference for the m_rs is needed to
send a notification that will signal to userspace that
it is now safe to free/reuse any pages that may have
been pinned down for zerocopy.

This patch manages the m_rs assignment in the rds_message with
the necessary refcount book-keeping.

Signed-off-by: Sowmini Varadhan 
---
 net/rds/message.c |8 +++-
 net/rds/send.c|7 +--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/net/rds/message.c b/net/rds/message.c
index 4318cc9..ef3daaf 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -58,7 +58,7 @@ void rds_message_addref(struct rds_message *rm)
  */
 static void rds_message_purge(struct rds_message *rm)
 {
-   unsigned long i;
+   unsigned long i, flags;
 
if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
return;
@@ -69,6 +69,12 @@ static void rds_message_purge(struct rds_message *rm)
__free_page(sg_page(&rm->data.op_sg[i]));
}
rm->data.op_nents = 0;
+   spin_lock_irqsave(&rm->m_rs_lock, flags);
+   if (rm->m_rs) {
+   sock_put(rds_rs_to_sk(rm->m_rs));
+   rm->m_rs = NULL;
+   }
+   spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
if (rm->rdma.op_active)
rds_rdma_free_op(&rm->rdma);
diff --git a/net/rds/send.c b/net/rds/send.c
index b1b0022..e8f3ff4 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -649,7 +649,6 @@ static void rds_send_remove_from_sock(struct list_head 
*messages, int status)
rm->rdma.op_notifier = NULL;
}
was_on_sock = 1;
-   rm->m_rs = NULL;
}
spin_unlock(&rs->rs_lock);
 
@@ -756,9 +755,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct 
sockaddr_in *dest)
 */
if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
spin_unlock_irqrestore(&cp->cp_lock, flags);
-   spin_lock_irqsave(&rm->m_rs_lock, flags);
-   rm->m_rs = NULL;
-   spin_unlock_irqrestore(&rm->m_rs_lock, flags);
continue;
}
list_del_init(&rm->m_conn_item);
@@ -774,7 +770,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct 
sockaddr_in *dest)
__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
spin_unlock(&rs->rs_lock);
 
-   rm->m_rs = NULL;
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
rds_message_put(rm);
@@ -798,7 +793,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct 
sockaddr_in *dest)
__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
spin_unlock(&rs->rs_lock);
 
-   rm->m_rs = NULL;
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
rds_message_put(rm);
@@ -849,6 +843,7 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct 
rds_connection *conn,
list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
rds_message_addref(rm);
+   sock_hold(rds_rs_to_sk(rs));
rm->m_rs = rs;
 
/* The code ordering is a little weird, but we're
-- 
1.7.1



[PATCH v2] ss: prepare rth when killing inet sock

2018-02-15 Thread Masatake YAMATO
kill_inet_sock() expects rhn_handle instance is passed
via inet_diag_arg argument. However on the following calling path:

generic_show_sock
=> show_one_inet_sock
   => kill_inet_sock

rth field of inet_diag_arg is not filled with the address of
rhn_handle instance. As the result ss crashes.

This commit fills the field with newly created rhn_handle
instance.

Changes in v2:
Instead of creating rtn_handle instances for each socket, create
one in upper layer and reuse it.

Signed-off-by: Masatake YAMATO 
---
 misc/ss.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/misc/ss.c b/misc/ss.c
index 29a25070..e047f9c0 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -239,6 +239,7 @@ struct filter {
uint64_t families;
struct ssfilter *f;
bool kill;
+   struct rtnl_handle *rth_for_killing;
 };
 
 #define FAMILY_MASK(family) ((uint64_t)1 << (family))
@@ -4262,6 +4263,7 @@ static int generic_show_sock(const struct sockaddr_nl 
*addr,
switch (r->sdiag_family) {
case AF_INET:
case AF_INET6:
+   inet_arg.rth = inet_arg.f->rth_for_killing;
return show_one_inet_sock(addr, nlh, &inet_arg);
case AF_UNIX:
return unix_show_sock(addr, nlh, arg);
@@ -4280,7 +4282,7 @@ static int handle_follow_request(struct filter *f)
 {
int ret = 0;
int groups = 0;
-   struct rtnl_handle rth;
+   struct rtnl_handle rth, rth2;
 
if (f->families & FAMILY_MASK(AF_INET) && f->dbs & (1 << TCP_DB))
groups |= 1 << (SKNLGRP_INET_TCP_DESTROY - 1);
@@ -4300,10 +4302,20 @@ static int handle_follow_request(struct filter *f)
rth.dump = 0;
rth.local.nl_pid = 0;
 
+   if (f->kill) {
+   if (rtnl_open_byproto(&rth2, groups, NETLINK_SOCK_DIAG)) {
+   rtnl_close(&rth);
+   return -1;
+   }
+   f->rth_for_killing = &rth2;
+   }
+
if (rtnl_dump_filter(&rth, generic_show_sock, f))
ret = -1;
 
rtnl_close(&rth);
+   if (f->rth_for_killing)
+   rtnl_close(f->rth_for_killing);
return ret;
 }
 
-- 
2.14.3



[PATCH][next] net: dsa: mv88e6xxx: avoid unintended sign extension on a 16 bit shift

2018-02-15 Thread Colin King
From: Colin Ian King 

The shifting of timehi by 16 bits to the left will be promoted to
a 32 bit signed int and then sign-extended to an u64. If the top bit
of timehi is set then all then all the upper bits of ns end up as also
being set because of the sign-extension. Fix this by casting timehi
to u64 before the shift.

Detected by CoverityScan, CID#1465288 ("Unintended sign extension")

Fixes: c6fe0ad2c349 ("net: dsa: mv88e6xxx: add rx/tx timestamping support")
Signed-off-by: Colin Ian King 
---
 drivers/net/dsa/mv88e6xxx/hwtstamp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.c 
b/drivers/net/dsa/mv88e6xxx/hwtstamp.c
index b251d534b70d..758e35fa69ab 100644
--- a/drivers/net/dsa/mv88e6xxx/hwtstamp.c
+++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.c
@@ -321,7 +321,7 @@ static void mv88e6xxx_get_rxts(struct mv88e6xxx_chip *chip,
 */
for ( ; skb; skb = skb_dequeue(rxq)) {
if (mv88e6xxx_ts_valid(status) && seq_match(skb, seq_id)) {
-   u64 ns = timehi << 16 | timelo;
+   u64 ns = (u64)timehi << 16 | timelo;
 
mutex_lock(&chip->reg_lock);
ns = timecounter_cyc2time(&chip->tstamp_tc, ns);
-- 
2.15.1



[PATCH v3 net-next 7/7] selftests/net: add zerocopy support for PF_RDS test case

2018-02-15 Thread Sowmini Varadhan
Send a cookie with sendmsg() on PF_RDS sockets, and process the
returned batched cookies in do_recv_completion()

Signed-off-by: Sowmini Varadhan 
---
v2:
  - restructured do_recv_completion to avoid excessive code re-indent
  - on-stack allocation of cmsghdr for cookie in do_sendmsg
  - Additional verification: Verify ncookies <= MAX_.., verify ret ==
ncookies * sizeof(uint32_t)

 tools/testing/selftests/net/msg_zerocopy.c |   68 ++--
 1 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/net/msg_zerocopy.c 
b/tools/testing/selftests/net/msg_zerocopy.c
index 7a5b353..5cc2a53 100644
--- a/tools/testing/selftests/net/msg_zerocopy.c
+++ b/tools/testing/selftests/net/msg_zerocopy.c
@@ -168,17 +168,39 @@ static int do_accept(int fd)
return fd;
 }
 
-static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy)
+static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
+{
+   struct cmsghdr *cm;
+
+   if (!msg->msg_control)
+   error(1, errno, "NULL cookie");
+   cm = (void *)msg->msg_control;
+   cm->cmsg_len = CMSG_LEN(sizeof(cookie));
+   cm->cmsg_level = SOL_RDS;
+   cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
+   memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
+}
+
+static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int 
domain)
 {
int ret, len, i, flags;
+   static uint32_t cookie;
+   char ckbuf[CMSG_SPACE(sizeof(cookie))];
 
len = 0;
for (i = 0; i < msg->msg_iovlen; i++)
len += msg->msg_iov[i].iov_len;
 
flags = MSG_DONTWAIT;
-   if (do_zerocopy)
+   if (do_zerocopy) {
flags |= MSG_ZEROCOPY;
+   if (domain == PF_RDS) {
+   memset(&msg->msg_control, 0, sizeof(msg->msg_control));
+   msg->msg_controllen = CMSG_SPACE(sizeof(cookie));
+   msg->msg_control = (struct cmsghdr *)ckbuf;
+   add_zcopy_cookie(msg, ++cookie);
+   }
+   }
 
ret = sendmsg(fd, msg, flags);
if (ret == -1 && errno == EAGAIN)
@@ -194,6 +216,10 @@ static bool do_sendmsg(int fd, struct msghdr *msg, bool 
do_zerocopy)
if (do_zerocopy && ret)
expected_completions++;
}
+   if (do_zerocopy && domain == PF_RDS) {
+   msg->msg_control = NULL;
+   msg->msg_controllen = 0;
+   }
 
return true;
 }
@@ -220,7 +246,9 @@ static void do_sendmsg_corked(int fd, struct msghdr *msg)
msg->msg_iov[0].iov_len = payload_len + extra_len;
extra_len = 0;
 
-   do_sendmsg(fd, msg, do_zerocopy);
+   do_sendmsg(fd, msg, do_zerocopy,
+  (cfg_dst_addr.ss_family == AF_INET ?
+   PF_INET : PF_INET6));
}
 
do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
@@ -316,6 +344,26 @@ static int do_setup_tx(int domain, int type, int protocol)
return fd;
 }
 
+static int do_process_zerocopy_cookies(struct sock_extended_err *serr,
+  uint32_t *ckbuf, size_t nbytes)
+{
+   int ncookies, i;
+
+   if (serr->ee_errno != 0)
+   error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
+   ncookies = serr->ee_data;
+   if (ncookies > SO_EE_ORIGIN_MAX_ZCOOKIES)
+   error(1, 0, "Returned %d cookies, max expected %d\n",
+ ncookies, SO_EE_ORIGIN_MAX_ZCOOKIES);
+   if (nbytes != ncookies * sizeof(uint32_t))
+   error(1, 0, "Expected %d cookies, got %ld\n",
+ ncookies, nbytes/sizeof(uint32_t));
+   for (i = 0; i < ncookies; i++)
+   if (cfg_verbose >= 2)
+   fprintf(stderr, "%d\n", ckbuf[i]);
+   return ncookies;
+}
+
 static bool do_recv_completion(int fd)
 {
struct sock_extended_err *serr;
@@ -324,10 +372,17 @@ static bool do_recv_completion(int fd)
uint32_t hi, lo, range;
int ret, zerocopy;
char control[100];
+   uint32_t ckbuf[SO_EE_ORIGIN_MAX_ZCOOKIES];
+   struct iovec iov;
 
msg.msg_control = control;
msg.msg_controllen = sizeof(control);
 
+   iov.iov_base = ckbuf;
+   iov.iov_len = (SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(ckbuf[0]));
+   msg.msg_iov = &iov;
+   msg.msg_iovlen = 1;
+
ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
if (ret == -1 && errno == EAGAIN)
return false;
@@ -346,6 +401,11 @@ static bool do_recv_completion(int fd)
  cm->cmsg_level, cm->cmsg_type);
 
serr = (void *) CMSG_DATA(cm);
+
+   if (serr->ee_origin == SO_EE_ORIGIN_ZCOOKIE) {
+   completions += do_process_zerocopy_cookies(serr, ckbuf, ret);
+   return true;
+   }
if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
error(1, 0, "serr

[PATCH v3 net-next 6/7] selftests/net: add support for PF_RDS sockets

2018-02-15 Thread Sowmini Varadhan
Add support for basic PF_RDS client-server testing in msg_zerocopy

Signed-off-by: Sowmini Varadhan 
---
 tools/testing/selftests/net/msg_zerocopy.c |   65 +++-
 1 files changed, 64 insertions(+), 1 deletions(-)

diff --git a/tools/testing/selftests/net/msg_zerocopy.c 
b/tools/testing/selftests/net/msg_zerocopy.c
index e11fe84..7a5b353 100644
--- a/tools/testing/selftests/net/msg_zerocopy.c
+++ b/tools/testing/selftests/net/msg_zerocopy.c
@@ -14,6 +14,9 @@
  * - SOCK_DGRAM
  * - SOCK_RAW
  *
+ * PF_RDS
+ * - SOCK_SEQPACKET
+ *
  * Start this program on two connected hosts, one in send mode and
  * the other with option '-r' to put it in receiver mode.
  *
@@ -53,6 +56,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifndef SO_EE_ORIGIN_ZEROCOPY
 #define SO_EE_ORIGIN_ZEROCOPY  5
@@ -300,10 +304,15 @@ static int do_setup_tx(int domain, int type, int protocol)
if (cfg_zerocopy)
do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
 
-   if (domain != PF_PACKET)
+   if (domain != PF_PACKET && domain != PF_RDS)
if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
error(1, errno, "connect");
 
+   if (domain == PF_RDS) {
+   if (bind(fd, (void *) &cfg_src_addr, cfg_alen))
+   error(1, errno, "bind");
+   }
+
return fd;
 }
 
@@ -444,6 +453,13 @@ static void do_tx(int domain, int type, int protocol)
msg.msg_iovlen++;
}
 
+   if (domain == PF_RDS) {
+   msg.msg_name = &cfg_dst_addr;
+   msg.msg_namelen =  (cfg_dst_addr.ss_family == AF_INET ?
+   sizeof(struct sockaddr_in) :
+   sizeof(struct sockaddr_in6));
+   }
+
iov[2].iov_base = payload;
iov[2].iov_len = cfg_payload_len;
msg.msg_iovlen++;
@@ -555,6 +571,40 @@ static void do_flush_datagram(int fd, int type)
bytes += cfg_payload_len;
 }
 
+
+static void do_recvmsg(int fd)
+{
+   int ret, off = 0;
+   char *buf;
+   struct iovec iov;
+   struct msghdr msg;
+   struct sockaddr_storage din;
+
+   buf = calloc(cfg_payload_len, sizeof(char));
+   iov.iov_base = buf;
+   iov.iov_len = cfg_payload_len;
+
+   memset(&msg, 0, sizeof(msg));
+   msg.msg_name = &din;
+   msg.msg_namelen = sizeof(din);
+   msg.msg_iov = &iov;
+   msg.msg_iovlen = 1;
+
+   ret = recvmsg(fd, &msg, MSG_TRUNC);
+
+   if (ret == -1)
+   error(1, errno, "recv");
+   if (ret != cfg_payload_len)
+   error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
+
+   if (memcmp(buf + off, payload, ret))
+   error(1, 0, "recv: data mismatch");
+
+   free(buf);
+   packets++;
+   bytes += cfg_payload_len;
+}
+
 static void do_rx(int domain, int type, int protocol)
 {
uint64_t tstop;
@@ -566,6 +616,8 @@ static void do_rx(int domain, int type, int protocol)
do {
if (type == SOCK_STREAM)
do_flush_tcp(fd);
+   else if (domain == PF_RDS)
+   do_recvmsg(fd);
else
do_flush_datagram(fd, type);
 
@@ -610,6 +662,7 @@ static void parse_opts(int argc, char **argv)
40 /* max tcp options */;
int c;
char *daddr = NULL, *saddr = NULL;
+   char *cfg_test;
 
cfg_payload_len = max_payload_len;
 
@@ -667,6 +720,14 @@ static void parse_opts(int argc, char **argv)
break;
}
}
+
+   cfg_test = argv[argc - 1];
+   if (strcmp(cfg_test, "rds") == 0) {
+   if (!daddr)
+   error(1, 0, "-D  required for PF_RDS\n");
+   if (!cfg_rx && !saddr)
+   error(1, 0, "-S  required for PF_RDS\n");
+   }
setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
 
@@ -699,6 +760,8 @@ int main(int argc, char **argv)
do_test(cfg_family, SOCK_STREAM, 0);
else if (!strcmp(cfg_test, "udp"))
do_test(cfg_family, SOCK_DGRAM, 0);
+   else if (!strcmp(cfg_test, "rds"))
+   do_test(PF_RDS, SOCK_SEQPACKET, 0);
else
error(1, 0, "unknown cfg_test %s", cfg_test);
 
-- 
1.7.1



[PATCH v3 net-next 4/7] rds: support for zcopy completion notification

2018-02-15 Thread Sowmini Varadhan
RDS removes a datagram (rds_message) from the retransmit queue when
an ACK is received. The ACK indicates that the receiver has queued
the RDS datagram, so that the sender can safely forget the datagram.
When all references to the rds_message are quiesced, rds_message_purge
is called to release resources used by the rds_message

If the datagram to be removed had pinned pages set up, add
an entry to the rs->rs_znotify_queue so that the notifcation
will be sent up via rds_rm_zerocopy_callback() when the
rds_message is eventually freed by rds_message_purge.

rds_rm_zerocopy_callback() attempts to batch the number of cookies
sent with each notification  to a max of SO_EE_ORIGIN_MAX_ZCOOKIES.
This is achieved by checking the tail skb in the sk_error_queue:
if this has room for one more cookie, the cookie from the
current notification is added; else a new skb is added to the
sk_error_queue. Every invocation of rds_rm_zerocopy_callback() will
trigger a ->sk_error_report to notify the application.

Signed-off-by: Sowmini Varadhan 
---
v2:
  - make sure to always sock_put m_rs even if there is no znotifier.
  - major rewrite of notification, resulting in much simplification.
v3:
  - fix fragile use of skb->cb[], do not set ee_code incorrectly.
 include/uapi/linux/errqueue.h |2 +
 net/rds/af_rds.c  |2 +
 net/rds/message.c |   83 +---
 net/rds/rds.h |   14 +++
 net/rds/recv.c|2 +
 5 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h
index dc64cfa..28812ed 100644
--- a/include/uapi/linux/errqueue.h
+++ b/include/uapi/linux/errqueue.h
@@ -20,11 +20,13 @@ struct sock_extended_err {
 #define SO_EE_ORIGIN_ICMP6 3
 #define SO_EE_ORIGIN_TXSTATUS  4
 #define SO_EE_ORIGIN_ZEROCOPY  5
+#define SO_EE_ORIGIN_ZCOOKIE   6
 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
 
 #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1))
 
 #define SO_EE_CODE_ZEROCOPY_COPIED 1
+#defineSO_EE_ORIGIN_MAX_ZCOOKIES   8
 
 /**
  * struct scm_timestamping - timestamps exposed through cmsg
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 0a8eefd..a937f18 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -182,6 +182,8 @@ static __poll_t rds_poll(struct file *file, struct socket 
*sock,
mask |= (EPOLLIN | EPOLLRDNORM);
if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
mask |= (EPOLLOUT | EPOLLWRNORM);
+   if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+   mask |= POLLERR;
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 
/* clear state any time we wake a seen-congested socket */
diff --git a/net/rds/message.c b/net/rds/message.c
index ef3daaf..bf1a656 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -33,6 +33,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include "rds.h"
 
@@ -53,29 +56,95 @@ void rds_message_addref(struct rds_message *rm)
 }
 EXPORT_SYMBOL_GPL(rds_message_addref);
 
+static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie)
+{
+   struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+   int ncookies;
+   u32 *ptr;
+
+   if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE)
+   return false;
+   ncookies = serr->ee.ee_data;
+   if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES)
+   return false;
+   ptr = skb_put(skb, sizeof(u32));
+   *ptr = cookie;
+   serr->ee.ee_data = ++ncookies;
+   return true;
+}
+
+static void rds_rm_zerocopy_callback(struct rds_sock *rs,
+struct rds_znotifier *znotif)
+{
+   struct sock *sk = rds_rs_to_sk(rs);
+   struct sk_buff *skb, *tail;
+   struct sock_exterr_skb *serr;
+   unsigned long flags;
+   struct sk_buff_head *q;
+   u32 cookie = znotif->z_cookie;
+
+   q = &sk->sk_error_queue;
+   spin_lock_irqsave(&q->lock, flags);
+   tail = skb_peek_tail(q);
+
+   if (tail && skb_zcookie_add(tail, cookie)) {
+   spin_unlock_irqrestore(&q->lock, flags);
+   mm_unaccount_pinned_pages(&znotif->z_mmp);
+   consume_skb(rds_skb_from_znotifier(znotif));
+   sk->sk_error_report(sk);
+   return;
+   }
+
+   skb = rds_skb_from_znotifier(znotif);
+   serr = SKB_EXT_ERR(skb);
+   memset(&serr->ee, 0, sizeof(serr->ee));
+   serr->ee.ee_errno = 0;
+   serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE;
+   serr->ee.ee_info = 0;
+   WARN_ON(!skb_zcookie_add(skb, cookie));
+
+   __skb_queue_tail(q, skb);
+
+   spin_unlock_irqrestore(&q->lock, flags);
+   sk->sk_error_report(sk);
+
+   mm_unaccount_pinned_pages(&znotif->z_mmp);
+}
+
 /*
  * This relies on dma_map_sg() not touching sg[].page during merging.
  */
 static void rds_message_purge(struct rds_message 

[PATCH v3 net-next 5/7] rds: zerocopy Tx support.

2018-02-15 Thread Sowmini Varadhan
If the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and,
if the SO_ZEROCOPY socket option has been set on the PF_RDS socket,
application pages sent down with rds_sendmsg() are pinned.

The pinning uses the accounting infrastructure added by
Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages")

The payload bytes in the message may not be modified for the
duration that the message has been pinned. A multi-threaded
application using this infrastructure may thus need to be notified
about send-completion so that it can free/reuse the buffers
passed to rds_sendmsg(). Notification of send-completion will
identify each message-buffer by a cookie that the application
must specify as ancillary data to rds_sendmsg().
The ancillary data in this case has cmsg_level == SOL_RDS
and cmsg_type == RDS_CMSG_ZCOPY_COOKIE.

Signed-off-by: Sowmini Varadhan 
---
v2:
  - remove unused data_len argument to rds_rm_size;
  - unmap as necessary if we fail in the middle of zerocopy setup
v3: remove needless bzero of skb->cb[], consolidate err cleanup
 include/uapi/linux/rds.h |1 +
 net/rds/message.c|   51 +-
 net/rds/rds.h|3 +-
 net/rds/send.c   |   44 ++-
 4 files changed, 91 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index e71d449..12e3bca 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -103,6 +103,7 @@
 #define RDS_CMSG_MASKED_ATOMIC_FADD8
 #define RDS_CMSG_MASKED_ATOMIC_CSWP9
 #define RDS_CMSG_RXPATH_LATENCY11
+#defineRDS_CMSG_ZCOPY_COOKIE   12
 
 #define RDS_INFO_FIRST 1
 #define RDS_INFO_COUNTERS  1
diff --git a/net/rds/message.c b/net/rds/message.c
index bf1a656..6518345 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -341,12 +341,14 @@ struct rds_message *rds_message_map_pages(unsigned long 
*page_addrs, unsigned in
return rm;
 }
 
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
+int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
+  bool zcopy)
 {
unsigned long to_copy, nbytes;
unsigned long sg_off;
struct scatterlist *sg;
int ret = 0;
+   int length = iov_iter_count(from);
 
rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
 
@@ -356,6 +358,53 @@ int rds_message_copy_from_user(struct rds_message *rm, 
struct iov_iter *from)
sg = rm->data.op_sg;
sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
 
+   if (zcopy) {
+   int total_copied = 0;
+   struct sk_buff *skb;
+
+   skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32),
+   GFP_KERNEL);
+   if (!skb)
+   return -ENOMEM;
+   rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
+   if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
+   length)) {
+   ret = -ENOMEM;
+   goto err;
+   }
+   while (iov_iter_count(from)) {
+   struct page *pages;
+   size_t start;
+   ssize_t copied;
+
+   copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
+   1, &start);
+   if (copied < 0) {
+   struct mmpin *mmp;
+   int i;
+
+   for (i = 0; i < rm->data.op_nents; i++)
+   put_page(sg_page(&rm->data.op_sg[i]));
+   mmp = &rm->data.op_mmp_znotifier->z_mmp;
+   mm_unaccount_pinned_pages(mmp);
+   ret = -EFAULT;
+   goto err;
+   }
+   total_copied += copied;
+   iov_iter_advance(from, copied);
+   length -= copied;
+   sg_set_page(sg, pages, copied, start);
+   rm->data.op_nents++;
+   sg++;
+   }
+   WARN_ON_ONCE(length != 0);
+   return ret;
+err:
+   consume_skb(skb);
+   rm->data.op_mmp_znotifier = NULL;
+   return ret;
+   } /* zcopy */
+
while (iov_iter_count(from)) {
if (!sg_page(sg)) {
ret = rds_page_remainder_alloc(sg, iov_iter_count(from),
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 24576bc..31cd388 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -785,7 +785,8 @@ void rds_for_each_conn_info(struct socket *sock, unsigned 
int len,
 /* message.c */
 struct

Re: ppp/pppoe, still panic 4.15.3 in ppp_push

2018-02-15 Thread Guillaume Nault
On Wed, Feb 14, 2018 at 06:49:19PM +0200, Denys Fedoryshchenko wrote:
> On 2018-02-14 18:47, Guillaume Nault wrote:
> > On Wed, Feb 14, 2018 at 06:29:34PM +0200, Denys Fedoryshchenko wrote:
> > > On 2018-02-14 18:07, Guillaume Nault wrote:
> > > > On Wed, Feb 14, 2018 at 03:17:23PM +0200, Denys Fedoryshchenko wrote:
> > > > > Hi,
> > > > >
> > > > > Upgraded kernel to 4.15.3, still it crashes after while (several
> > > > > hours,
> > > > > cannot do bisect, as it is production server).
> > > > >
> > > > > dev ppp # gdb ppp_generic.o
> > > > > GNU gdb (Gentoo 7.12.1 vanilla) 7.12.1
> > > > > <>
> > > > > Reading symbols from ppp_generic.o...done.
> > > > > (gdb) list *ppp_push+0x73
> > > > > 0x681 is in ppp_push (drivers/net/ppp/ppp_generic.c:1663).
> > > > > 1658  list = list->next;
> > > > > 1659  pch = list_entry(list, struct channel, clist);
> > > > > 1660
> > > > > 1661  spin_lock(&pch->downl);
> > > > > 1662  if (pch->chan) {
> > > > > 1663  if 
> > > > > (pch->chan->ops->start_xmit(pch->chan, skb))
> > > > > 1664  ppp->xmit_pending = NULL;
> > > > > 1665  } else {
> > > > > 1666  /* channel got unregistered */
> > > > > 1667  kfree_skb(skb);
> > > > >
> > > > >
> > > > I expect a memory corruption. Do you have the possibility to run with
> > > > KASAN by any chance?
> > > I will try to enable it tonight. For now i reverted "drivers, net,
> > > ppp:
> > > convert ppp_file.refcnt from atomic_t to refcount_t" for test.
> > > 
> > This commit looks good to me. Do you have doubts about it because it's
> > new in 4.15? Does it mean that your last known-good kernel is 4.14?
> 
> I am just doing "manual" bisect, checking all possibilities, and picking
> patch to revert randomly.
> Yes, correct, my known-good is 4.14.2.
> 
Then maybe try reverting commit 0171c4183559 ("ppp: unlock all_ppp_mutex before 
registering device").
I can't see how it could lead to the bug you observed, but the other
ppp_generic patches introduced since 4.14 were rather trivial.


Hello Dear,

2018-02-15 Thread Musa


Hello Dear,

I am Mr.Musa Ali
I have a Geniue business transaction of (10.5Million Dollars) to do with You
Hence You Co-operate with me  I am assured you that within (7) seven banking 
working days, this said amount will enter your given Bank account with 
immediate alacrity. If you agree to my business proposal, further details of 
the transfer will be forwarded to you as soon as I receive your wiliness to 
join hand with me.
Am awaiting your urgent response with this informations
Name:...
Sex:...
Age:...
Occupation:
Address:...
Tel/ Fax:...
State:.
Country Of origin:..
Have a nice day!!
Mr.Musa Ali


[PATCH] i40evf: pass struct virtchnl_filter by reference rather than by value

2018-02-15 Thread Colin King
From: Colin Ian King 

Passing struct virtchnl_filter f by value requires a 272 byte copy
on x86_64, so instead pass it by reference is much more efficient. Also
adjust some lines that are over 80 chars.

Detected by CoverityScan, CID#1465285 ("Big parameter passed by value")

Signed-off-by: Colin Ian King 
---
 .../net/ethernet/intel/i40evf/i40evf_virtchnl.c| 32 --
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
index 6134b61e0938..3c76c817ca1a 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
@@ -1048,24 +1048,28 @@ void i40evf_disable_channels(struct i40evf_adapter 
*adapter)
  * Print the cloud filter
  **/
 static void i40evf_print_cloud_filter(struct i40evf_adapter *adapter,
- struct virtchnl_filter f)
+ struct virtchnl_filter *f)
 {
-   switch (f.flow_type) {
+   switch (f->flow_type) {
case VIRTCHNL_TCP_V4_FLOW:
dev_info(&adapter->pdev->dev, "dst_mac: %pM src_mac: %pM 
vlan_id: %hu dst_ip: %pI4 src_ip %pI4 dst_port %hu src_port %hu\n",
-&f.data.tcp_spec.dst_mac, &f.data.tcp_spec.src_mac,
-ntohs(f.data.tcp_spec.vlan_id),
-&f.data.tcp_spec.dst_ip[0], &f.data.tcp_spec.src_ip[0],
-ntohs(f.data.tcp_spec.dst_port),
-ntohs(f.data.tcp_spec.src_port));
+&f->data.tcp_spec.dst_mac,
+&f->data.tcp_spec.src_mac,
+ntohs(f->data.tcp_spec.vlan_id),
+&f->data.tcp_spec.dst_ip[0],
+&f->data.tcp_spec.src_ip[0],
+ntohs(f->data.tcp_spec.dst_port),
+ntohs(f->data.tcp_spec.src_port));
break;
case VIRTCHNL_TCP_V6_FLOW:
dev_info(&adapter->pdev->dev, "dst_mac: %pM src_mac: %pM 
vlan_id: %hu dst_ip: %pI6 src_ip %pI6 dst_port %hu src_port %hu\n",
-&f.data.tcp_spec.dst_mac, &f.data.tcp_spec.src_mac,
-ntohs(f.data.tcp_spec.vlan_id),
-&f.data.tcp_spec.dst_ip, &f.data.tcp_spec.src_ip,
-ntohs(f.data.tcp_spec.dst_port),
-ntohs(f.data.tcp_spec.src_port));
+&f->data.tcp_spec.dst_mac,
+&f->data.tcp_spec.src_mac,
+ntohs(f->data.tcp_spec.vlan_id),
+&f->data.tcp_spec.dst_ip,
+&f->data.tcp_spec.src_ip,
+ntohs(f->data.tcp_spec.dst_port),
+ntohs(f->data.tcp_spec.src_port));
break;
}
 }
@@ -1303,7 +1307,7 @@ void i40evf_virtchnl_completion(struct i40evf_adapter 
*adapter,
 i40evf_stat_str(&adapter->hw,
 v_retval));
i40evf_print_cloud_filter(adapter,
- cf->f);
+ &cf->f);
list_del(&cf->list);
kfree(cf);
adapter->num_cloud_filters--;
@@ -1322,7 +1326,7 @@ void i40evf_virtchnl_completion(struct i40evf_adapter 
*adapter,
 i40evf_stat_str(&adapter->hw,
 v_retval));
i40evf_print_cloud_filter(adapter,
- cf->f);
+ &cf->f);
}
}
}
-- 
2.15.1



Re: [PATCH iproute2-next v4 8/9] utils: Introduce and use print_name_and_link() to print name@link

2018-02-15 Thread David Ahern
On 2/15/18 11:59 AM, Stephen Hemminger wrote:
> On Thu, 15 Feb 2018 20:34:07 +0200
> Serhey Popovych  wrote:
> 
>> +fprintf(fp, "%d: ", ifi->ifi_index);
>> +
>> +print_name_and_link("%s: ", COLOR_NONE, name, tb);
> 
> This should be COLOR_IFNAME but I am working on that.
> 

Let me commit this series and adjust from there.


Re: ppp/pppoe, still panic 4.15.3 in ppp_push

2018-02-15 Thread Guillaume Nault
On Thu, Feb 15, 2018 at 06:01:16PM +0200, Denys Fedoryshchenko wrote:
> On 2018-02-15 17:55, Guillaume Nault wrote:
> > On Thu, Feb 15, 2018 at 12:19:52PM +0200, Denys Fedoryshchenko wrote:
> > > Here we go:
> > > 
> > >   [24558.921549]
> > > ==
> > >   [24558.922167] BUG: KASAN: use-after-free in
> > > ppp_ioctl+0xa6a/0x1522
> > > [ppp_generic]
> > >   [24558.922776] Write of size 8 at addr 8803d35bf3f8 by task
> > > accel-pppd/12622
> > >   [24558.923113]
> > >   [24558.923451] CPU: 0 PID: 12622 Comm: accel-pppd Tainted: G
> > > W
> > > 4.15.3-build-0134 #1
> > >   [24558.924058] Hardware name: HP ProLiant DL320e Gen8 v2,
> > > BIOS P80
> > > 04/02/2015
> > >   [24558.924406] Call Trace:
> > >   [24558.924753]  dump_stack+0x46/0x59
> > >   [24558.925103]  print_address_description+0x6b/0x23b
> > >   [24558.925451]  ? ppp_ioctl+0xa6a/0x1522 [ppp_generic]
> > >   [24558.925797]  kasan_report+0x21b/0x241
> > >   [24558.926136]  ppp_ioctl+0xa6a/0x1522 [ppp_generic]
> > >   [24558.926479]  ? ppp_nl_newlink+0x1da/0x1da [ppp_generic]
> > >   [24558.926829]  ? sock_sendmsg+0x89/0x99
> > >   [24558.927176]  ? __vfs_write+0xd9/0x4ad
> > >   [24558.927523]  ? kernel_read+0xed/0xed
> > >   [24558.927872]  ? SyS_getpeername+0x18c/0x18c
> > >   [24558.928213]  ? bit_waitqueue+0x2a/0x2a
> > >   [24558.928561]  ? wake_atomic_t_function+0x115/0x115
> > >   [24558.928898]  vfs_ioctl+0x6e/0x81
> > >   [24558.929228]  do_vfs_ioctl+0xa00/0xb10
> > >   [24558.929571]  ? sigprocmask+0x1a6/0x1d0
> > >   [24558.929907]  ? sigsuspend+0x13e/0x13e
> > >   [24558.930239]  ? ioctl_preallocate+0x14e/0x14e
> > >   [24558.930568]  ? SyS_rt_sigprocmask+0xf1/0x142
> > >   [24558.930904]  ? sigprocmask+0x1d0/0x1d0
> > >   [24558.931252]  SyS_ioctl+0x39/0x55
> > >   [24558.931595]  ? do_vfs_ioctl+0xb10/0xb10
> > >   [24558.931942]  do_syscall_64+0x1b1/0x31f
> > >   [24558.932288]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> > >   [24558.932627] RIP: 0033:0x7f302849d8a7
> > >   [24558.932965] RSP: 002b:7f3029a52af8 EFLAGS: 0206
> > > ORIG_RAX:
> > > 0010
> > >   [24558.933578] RAX: ffda RBX: 7f3027d861e3 RCX:
> > > 7f302849d8a7
> > >   [24558.933927] RDX: 7f3023f49468 RSI: 4004743a RDI:
> > > 3a67
> > >   [24558.934266] RBP: 7f3029a52b20 R08:  R09:
> > > 55c8308d8e40
> > >   [24558.934607] R10: 0008 R11: 0206 R12:
> > > 7f3023f49358
> > >   [24558.934947] R13: 7ffe86e5723f R14:  R15:
> > > 7f3029a53700
> > >   [24558.935288]
> > >   [24558.935626] Allocated by task 12622:
> > >   [24558.935972]  ppp_register_net_channel+0x5f/0x5c6
> > > [ppp_generic]
> > >   [24558.936306]  pppoe_connect+0xab7/0xc71 [pppoe]
> > >   [24558.936640]  SyS_connect+0x14b/0x1b7
> > >   [24558.936975]  do_syscall_64+0x1b1/0x31f
> > >   [24558.937319]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> > >   [24558.937655]
> > >   [24558.937993] Freed by task 12622:
> > >   [24558.938321]  kfree+0xb0/0x11d
> > >   [24558.938658]  ppp_release+0x111/0x120 [ppp_generic]
> > >   [24558.938994]  __fput+0x2ba/0x51a
> > >   [24558.939332]  task_work_run+0x11c/0x13d
> > >   [24558.939676]  exit_to_usermode_loop+0x7c/0xaf
> > >   [24558.940022]  do_syscall_64+0x2ea/0x31f
> > >   [24558.940368]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> > >   [24558.947099]
> > 
> > Your first guess was right. It looks like we have an issue with
> > reference counting on the channels. Can you send me your ppp_generic.o?
> http://nuclearcat.com/ppp_generic.o
> Compiled with gcc version 6.4.0 (Gentoo 6.4.0-r1 p1.3)
> 
>From what I can see, ppp_release() and ioctl(PPPIOCCONNECT) are called
concurrently on the same ppp_file. Even if this ppp_file was pointed at
by two different file descriptors, I can't see how this could defeat
the reference counting mechanism. I'm going to think more about it.

Can you test with CONFIG_REFCOUNT_FULL? (and keep
d780cd44e3ce ("drivers, net, ppp: convert ppp_file.refcnt from atomic_t to 
refcount_t")).


Re: ppp/pppoe, still panic 4.15.3 in ppp_push

2018-02-15 Thread Denys Fedoryshchenko

On 2018-02-15 21:31, Guillaume Nault wrote:

On Thu, Feb 15, 2018 at 06:01:16PM +0200, Denys Fedoryshchenko wrote:

On 2018-02-15 17:55, Guillaume Nault wrote:
> On Thu, Feb 15, 2018 at 12:19:52PM +0200, Denys Fedoryshchenko wrote:
> > Here we go:
> >
> >   [24558.921549]
> > ==
> >   [24558.922167] BUG: KASAN: use-after-free in
> > ppp_ioctl+0xa6a/0x1522
> > [ppp_generic]
> >   [24558.922776] Write of size 8 at addr 8803d35bf3f8 by task
> > accel-pppd/12622
> >   [24558.923113]
> >   [24558.923451] CPU: 0 PID: 12622 Comm: accel-pppd Tainted: G
> > W
> > 4.15.3-build-0134 #1
> >   [24558.924058] Hardware name: HP ProLiant DL320e Gen8 v2,
> > BIOS P80
> > 04/02/2015
> >   [24558.924406] Call Trace:
> >   [24558.924753]  dump_stack+0x46/0x59
> >   [24558.925103]  print_address_description+0x6b/0x23b
> >   [24558.925451]  ? ppp_ioctl+0xa6a/0x1522 [ppp_generic]
> >   [24558.925797]  kasan_report+0x21b/0x241
> >   [24558.926136]  ppp_ioctl+0xa6a/0x1522 [ppp_generic]
> >   [24558.926479]  ? ppp_nl_newlink+0x1da/0x1da [ppp_generic]
> >   [24558.926829]  ? sock_sendmsg+0x89/0x99
> >   [24558.927176]  ? __vfs_write+0xd9/0x4ad
> >   [24558.927523]  ? kernel_read+0xed/0xed
> >   [24558.927872]  ? SyS_getpeername+0x18c/0x18c
> >   [24558.928213]  ? bit_waitqueue+0x2a/0x2a
> >   [24558.928561]  ? wake_atomic_t_function+0x115/0x115
> >   [24558.928898]  vfs_ioctl+0x6e/0x81
> >   [24558.929228]  do_vfs_ioctl+0xa00/0xb10
> >   [24558.929571]  ? sigprocmask+0x1a6/0x1d0
> >   [24558.929907]  ? sigsuspend+0x13e/0x13e
> >   [24558.930239]  ? ioctl_preallocate+0x14e/0x14e
> >   [24558.930568]  ? SyS_rt_sigprocmask+0xf1/0x142
> >   [24558.930904]  ? sigprocmask+0x1d0/0x1d0
> >   [24558.931252]  SyS_ioctl+0x39/0x55
> >   [24558.931595]  ? do_vfs_ioctl+0xb10/0xb10
> >   [24558.931942]  do_syscall_64+0x1b1/0x31f
> >   [24558.932288]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> >   [24558.932627] RIP: 0033:0x7f302849d8a7
> >   [24558.932965] RSP: 002b:7f3029a52af8 EFLAGS: 0206
> > ORIG_RAX:
> > 0010
> >   [24558.933578] RAX: ffda RBX: 7f3027d861e3 RCX:
> > 7f302849d8a7
> >   [24558.933927] RDX: 7f3023f49468 RSI: 4004743a RDI:
> > 3a67
> >   [24558.934266] RBP: 7f3029a52b20 R08:  R09:
> > 55c8308d8e40
> >   [24558.934607] R10: 0008 R11: 0206 R12:
> > 7f3023f49358
> >   [24558.934947] R13: 7ffe86e5723f R14:  R15:
> > 7f3029a53700
> >   [24558.935288]
> >   [24558.935626] Allocated by task 12622:
> >   [24558.935972]  ppp_register_net_channel+0x5f/0x5c6
> > [ppp_generic]
> >   [24558.936306]  pppoe_connect+0xab7/0xc71 [pppoe]
> >   [24558.936640]  SyS_connect+0x14b/0x1b7
> >   [24558.936975]  do_syscall_64+0x1b1/0x31f
> >   [24558.937319]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> >   [24558.937655]
> >   [24558.937993] Freed by task 12622:
> >   [24558.938321]  kfree+0xb0/0x11d
> >   [24558.938658]  ppp_release+0x111/0x120 [ppp_generic]
> >   [24558.938994]  __fput+0x2ba/0x51a
> >   [24558.939332]  task_work_run+0x11c/0x13d
> >   [24558.939676]  exit_to_usermode_loop+0x7c/0xaf
> >   [24558.940022]  do_syscall_64+0x2ea/0x31f
> >   [24558.940368]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> >   [24558.947099]
>
> Your first guess was right. It looks like we have an issue with
> reference counting on the channels. Can you send me your ppp_generic.o?
http://nuclearcat.com/ppp_generic.o
Compiled with gcc version 6.4.0 (Gentoo 6.4.0-r1 p1.3)


From what I can see, ppp_release() and ioctl(PPPIOCCONNECT) are called
concurrently on the same ppp_file. Even if this ppp_file was pointed at
by two different file descriptors, I can't see how this could defeat
the reference counting mechanism. I'm going to think more about it.

Can you test with CONFIG_REFCOUNT_FULL? (and keep
d780cd44e3ce ("drivers, net, ppp: convert ppp_file.refcnt from
atomic_t to refcount_t")).
Ok, i will try that tonight. On vanilla kernel or reversing mentioned in 
previous email patch?


[PATCH] i40evf: remove redundant array comparisons to 0 checks

2018-02-15 Thread Colin King
From: Colin Ian King 

The checks to see if key->dst.s6_addr and key->src.s6_addr are null
pointers are redundant because these are constant size arrays and
so the checks always return true.  Fix this by removing the redundant
checks.

Detected by CoverityScan, CID#1465279 ("Array compared to 0")

Signed-off-by: Colin Ian King 
---
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 25 +++--
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 4955ce3ab6a2..63e4169828ea 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -2710,22 +2710,19 @@ static int i40evf_parse_cls_flower(struct 
i40evf_adapter *adapter,
if (!ipv6_addr_any(&mask->dst) || !ipv6_addr_any(&mask->src))
field_flags |= I40EVF_CLOUD_FIELD_IIP;
 
-   if (key->dst.s6_addr) {
-   for (i = 0; i < 4; i++)
-   filter->f.mask.tcp_spec.dst_ip[i] |=
+   for (i = 0; i < 4; i++)
+   filter->f.mask.tcp_spec.dst_ip[i] |=
cpu_to_be32(0x);
-   memcpy(&filter->f.data.tcp_spec.dst_ip,
-  &key->dst.s6_addr32,
-  sizeof(filter->f.data.tcp_spec.dst_ip));
-   }
-   if (key->src.s6_addr) {
-   for (i = 0; i < 4; i++)
-   filter->f.mask.tcp_spec.src_ip[i] |=
+   memcpy(&filter->f.data.tcp_spec.dst_ip,
+  &key->dst.s6_addr32,
+  sizeof(filter->f.data.tcp_spec.dst_ip));
+
+   for (i = 0; i < 4; i++)
+   filter->f.mask.tcp_spec.src_ip[i] |=
cpu_to_be32(0x);
-   memcpy(&filter->f.data.tcp_spec.src_ip,
-  &key->src.s6_addr32,
-  sizeof(filter->f.data.tcp_spec.src_ip));
-   }
+   memcpy(&filter->f.data.tcp_spec.src_ip,
+  &key->src.s6_addr32,
+  sizeof(filter->f.data.tcp_spec.src_ip));
}
if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_PORTS)) {
struct flow_dissector_key_ports *key =
-- 
2.15.1



Re: ppp/pppoe, still panic 4.15.3 in ppp_push

2018-02-15 Thread Guillaume Nault
On Thu, Feb 15, 2018 at 09:34:42PM +0200, Denys Fedoryshchenko wrote:
> On 2018-02-15 21:31, Guillaume Nault wrote:
> > On Thu, Feb 15, 2018 at 06:01:16PM +0200, Denys Fedoryshchenko wrote:
> > > On 2018-02-15 17:55, Guillaume Nault wrote:
> > > > On Thu, Feb 15, 2018 at 12:19:52PM +0200, Denys Fedoryshchenko wrote:
> > > > > Here we go:
> > > > >
> > > > >   [24558.921549]
> > > > > ==
> > > > >   [24558.922167] BUG: KASAN: use-after-free in
> > > > > ppp_ioctl+0xa6a/0x1522
> > > > > [ppp_generic]
> > > > >   [24558.922776] Write of size 8 at addr 8803d35bf3f8 by task
> > > > > accel-pppd/12622
> > > > >   [24558.923113]
> > > > >   [24558.923451] CPU: 0 PID: 12622 Comm: accel-pppd Tainted: G
> > > > > W
> > > > > 4.15.3-build-0134 #1
> > > > >   [24558.924058] Hardware name: HP ProLiant DL320e Gen8 v2,
> > > > > BIOS P80
> > > > > 04/02/2015
> > > > >   [24558.924406] Call Trace:
> > > > >   [24558.924753]  dump_stack+0x46/0x59
> > > > >   [24558.925103]  print_address_description+0x6b/0x23b
> > > > >   [24558.925451]  ? ppp_ioctl+0xa6a/0x1522 [ppp_generic]
> > > > >   [24558.925797]  kasan_report+0x21b/0x241
> > > > >   [24558.926136]  ppp_ioctl+0xa6a/0x1522 [ppp_generic]
> > > > >   [24558.926479]  ? ppp_nl_newlink+0x1da/0x1da [ppp_generic]
> > > > >   [24558.926829]  ? sock_sendmsg+0x89/0x99
> > > > >   [24558.927176]  ? __vfs_write+0xd9/0x4ad
> > > > >   [24558.927523]  ? kernel_read+0xed/0xed
> > > > >   [24558.927872]  ? SyS_getpeername+0x18c/0x18c
> > > > >   [24558.928213]  ? bit_waitqueue+0x2a/0x2a
> > > > >   [24558.928561]  ? wake_atomic_t_function+0x115/0x115
> > > > >   [24558.928898]  vfs_ioctl+0x6e/0x81
> > > > >   [24558.929228]  do_vfs_ioctl+0xa00/0xb10
> > > > >   [24558.929571]  ? sigprocmask+0x1a6/0x1d0
> > > > >   [24558.929907]  ? sigsuspend+0x13e/0x13e
> > > > >   [24558.930239]  ? ioctl_preallocate+0x14e/0x14e
> > > > >   [24558.930568]  ? SyS_rt_sigprocmask+0xf1/0x142
> > > > >   [24558.930904]  ? sigprocmask+0x1d0/0x1d0
> > > > >   [24558.931252]  SyS_ioctl+0x39/0x55
> > > > >   [24558.931595]  ? do_vfs_ioctl+0xb10/0xb10
> > > > >   [24558.931942]  do_syscall_64+0x1b1/0x31f
> > > > >   [24558.932288]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> > > > >   [24558.932627] RIP: 0033:0x7f302849d8a7
> > > > >   [24558.932965] RSP: 002b:7f3029a52af8 EFLAGS: 0206
> > > > > ORIG_RAX:
> > > > > 0010
> > > > >   [24558.933578] RAX: ffda RBX: 7f3027d861e3 RCX:
> > > > > 7f302849d8a7
> > > > >   [24558.933927] RDX: 7f3023f49468 RSI: 4004743a RDI:
> > > > > 3a67
> > > > >   [24558.934266] RBP: 7f3029a52b20 R08:  R09:
> > > > > 55c8308d8e40
> > > > >   [24558.934607] R10: 0008 R11: 0206 R12:
> > > > > 7f3023f49358
> > > > >   [24558.934947] R13: 7ffe86e5723f R14:  R15:
> > > > > 7f3029a53700
> > > > >   [24558.935288]
> > > > >   [24558.935626] Allocated by task 12622:
> > > > >   [24558.935972]  ppp_register_net_channel+0x5f/0x5c6
> > > > > [ppp_generic]
> > > > >   [24558.936306]  pppoe_connect+0xab7/0xc71 [pppoe]
> > > > >   [24558.936640]  SyS_connect+0x14b/0x1b7
> > > > >   [24558.936975]  do_syscall_64+0x1b1/0x31f
> > > > >   [24558.937319]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> > > > >   [24558.937655]
> > > > >   [24558.937993] Freed by task 12622:
> > > > >   [24558.938321]  kfree+0xb0/0x11d
> > > > >   [24558.938658]  ppp_release+0x111/0x120 [ppp_generic]
> > > > >   [24558.938994]  __fput+0x2ba/0x51a
> > > > >   [24558.939332]  task_work_run+0x11c/0x13d
> > > > >   [24558.939676]  exit_to_usermode_loop+0x7c/0xaf
> > > > >   [24558.940022]  do_syscall_64+0x2ea/0x31f
> > > > >   [24558.940368]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> > > > >   [24558.947099]
> > > >
> > > > Your first guess was right. It looks like we have an issue with
> > > > reference counting on the channels. Can you send me your ppp_generic.o?
> > > http://nuclearcat.com/ppp_generic.o
> > > Compiled with gcc version 6.4.0 (Gentoo 6.4.0-r1 p1.3)
> > > 
> > From what I can see, ppp_release() and ioctl(PPPIOCCONNECT) are called
> > concurrently on the same ppp_file. Even if this ppp_file was pointed at
> > by two different file descriptors, I can't see how this could defeat
> > the reference counting mechanism. I'm going to think more about it.
> > 
> > Can you test with CONFIG_REFCOUNT_FULL? (and keep
> > d780cd44e3ce ("drivers, net, ppp: convert ppp_file.refcnt from
> > atomic_t to refcount_t")).
> Ok, i will try that tonight. On vanilla kernel or reversing mentioned in
> previous email patch?
On vanilla kernel. The other is really a shot in the dark.


Re: [PATCH v3 net-next 2/7] rds: hold a sock ref from rds_message to the rds_sock

2018-02-15 Thread Santosh Shilimkar

On 2/15/2018 10:49 AM, Sowmini Varadhan wrote:

The existing model holds a reference from the rds_sock to the
rds_message, but the rds_message does not itself hold a sock_put()
on the rds_sock. Instead the m_rs field in the rds_message is
assigned when the message is queued on the sock, and nulled when
the message is dequeued from the sock.

We want to be able to notify userspace when the rds_message
is actually freed (from rds_message_purge(), after the refcounts
to the rds_message go to 0). At the time that rds_message_purge()
is called, the message is no longer on the rds_sock retransmit
queue. Thus the explicit reference for the m_rs is needed to
send a notification that will signal to userspace that
it is now safe to free/reuse any pages that may have
been pinned down for zerocopy.

This patch manages the m_rs assignment in the rds_message with
the necessary refcount book-keeping.

Signed-off-by: Sowmini Varadhan 
---

Acked-by: Santosh Shilimkar 


Re: [PATCH v3 net-next 4/7] rds: support for zcopy completion notification

2018-02-15 Thread Santosh Shilimkar

On 2/15/2018 10:49 AM, Sowmini Varadhan wrote:

RDS removes a datagram (rds_message) from the retransmit queue when
an ACK is received. The ACK indicates that the receiver has queued
the RDS datagram, so that the sender can safely forget the datagram.
When all references to the rds_message are quiesced, rds_message_purge
is called to release resources used by the rds_message

If the datagram to be removed had pinned pages set up, add
an entry to the rs->rs_znotify_queue so that the notifcation
will be sent up via rds_rm_zerocopy_callback() when the
rds_message is eventually freed by rds_message_purge.

rds_rm_zerocopy_callback() attempts to batch the number of cookies
sent with each notification  to a max of SO_EE_ORIGIN_MAX_ZCOOKIES.
This is achieved by checking the tail skb in the sk_error_queue:
if this has room for one more cookie, the cookie from the
current notification is added; else a new skb is added to the
sk_error_queue. Every invocation of rds_rm_zerocopy_callback() will
trigger a ->sk_error_report to notify the application.

Signed-off-by: Sowmini Varadhan 
---

Acked-by: Santosh Shilimkar 


Re: [PATCH v3 net-next 5/7] rds: zerocopy Tx support.

2018-02-15 Thread Santosh Shilimkar

On 2/15/2018 10:49 AM, Sowmini Varadhan wrote:

If the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and,
if the SO_ZEROCOPY socket option has been set on the PF_RDS socket,
application pages sent down with rds_sendmsg() are pinned.

The pinning uses the accounting infrastructure added by
Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages")

The payload bytes in the message may not be modified for the
duration that the message has been pinned. A multi-threaded
application using this infrastructure may thus need to be notified
about send-completion so that it can free/reuse the buffers
passed to rds_sendmsg(). Notification of send-completion will
identify each message-buffer by a cookie that the application
must specify as ancillary data to rds_sendmsg().
The ancillary data in this case has cmsg_level == SOL_RDS
and cmsg_type == RDS_CMSG_ZCOPY_COOKIE.

Signed-off-by: Sowmini Varadhan 
---


Acked-by: Santosh Shilimkar 



[PATCH] i40e: check that pointer vsi is not null before dereferencing it

2018-02-15 Thread Colin King
From: Colin Ian King 

Function i40e_find_vsi_from_id can potentially return null, hence
vsi may be null, so defensively check it is non-null before
dereferencing it to check the seid.

Fixes: e284fc280473 ("i40e: Add and delete cloud filter")
Signed-off-by: Colin Ian King 
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 5cca083da93c..14bbd5c1db78 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -3062,7 +3062,7 @@ static struct i40e_vsi *i40e_find_vsi_from_seid(struct 
i40e_vf *vf, u16 seid)
 
for (i = 0; i < vf->num_tc ; i++) {
vsi = i40e_find_vsi_from_id(pf, vf->ch[i].vsi_id);
-   if (vsi->seid == seid)
+   if (vsi && vsi->seid == seid)
return vsi;
}
return NULL;
-- 
2.15.1



Re: [PATCH iproute2-next v4 7/9] utils: Introduce and use get_ifname_rta()

2018-02-15 Thread David Ahern
On 2/15/18 11:34 AM, Serhey Popovych wrote:
> Be consistent in handling of IFLA_IFNAME attribute in all places: if
> there is no attribute report bug to stderr and use ll_idx_n2a() as
> last measure to get name in "if%u" format instead of "".
> 
> Use check_ifname() to validate network device name: this catches both
> unexpected return from kernel and ll_idx_n2a().
> 
> Signed-off-by: Serhey Popovych 
> ---
>  bridge/link.c   |8 
>  include/utils.h |1 +
>  ip/ipaddress.c  |   20 
>  lib/utils.c |   19 +++
>  4 files changed, 32 insertions(+), 16 deletions(-)
> 
> diff --git a/bridge/link.c b/bridge/link.c
> index 870ebe0..a11cbb1 100644
> --- a/bridge/link.c
> +++ b/bridge/link.c
> @@ -99,9 +99,10 @@ int print_linkinfo(const struct sockaddr_nl *who,
>  struct nlmsghdr *n, void *arg)
>  {
>   FILE *fp = arg;
> - int len = n->nlmsg_len;
>   struct ifinfomsg *ifi = NLMSG_DATA(n);
>   struct rtattr *tb[IFLA_MAX+1];
> + int len = n->nlmsg_len;
> + const char *name;
>  
>   len -= NLMSG_LENGTH(sizeof(*ifi));
>   if (len < 0) {
> @@ -117,10 +118,9 @@ int print_linkinfo(const struct sockaddr_nl *who,
>  
>   parse_rtattr_flags(tb, IFLA_MAX, IFLA_RTA(ifi), len, NLA_F_NESTED);
>  
> - if (tb[IFLA_IFNAME] == NULL) {
> - fprintf(stderr, "BUG: nil ifname\n");
> + name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
> + if (!name)
>   return -1;
> - }
>  
>   if (n->nlmsg_type == RTM_DELLINK)
>   fprintf(fp, "Deleted ");
> diff --git a/include/utils.h b/include/utils.h
> index 867e540..84ca873 100644
> --- a/include/utils.h
> +++ b/include/utils.h
> @@ -183,6 +183,7 @@ void duparg(const char *, const char *) 
> __attribute__((noreturn));
>  void duparg2(const char *, const char *) __attribute__((noreturn));
>  int check_ifname(const char *);
>  int get_ifname(char *, const char *);
> +const char *get_ifname_rta(int ifindex, const struct rtattr *rta);
>  int matches(const char *arg, const char *pattern);
>  int inet_addr_match(const inet_prefix *a, const inet_prefix *b, int bits);
>  int inet_addr_match_rta(const inet_prefix *m, const struct rtattr *rta);
> diff --git a/ip/ipaddress.c b/ip/ipaddress.c
> index 749178d..08d2576 100644
> --- a/ip/ipaddress.c
> +++ b/ip/ipaddress.c
> @@ -776,12 +776,10 @@ int print_linkinfo_brief(const struct sockaddr_nl *who,
>   return -1;
>  
>   parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
> - if (tb[IFLA_IFNAME] == NULL) {
> - fprintf(stderr, "BUG: device with ifindex %d has nil ifname\n", 
> ifi->ifi_index);
> - name = ll_idx_n2a(ifi->ifi_index);
> - } else {
> - name = rta_getattr_str(tb[IFLA_IFNAME]);
> - }
> +
> + name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
> + if (!name)
> + return -1;
>  
>   if (filter.label &&
>   (!filter.family || filter.family == AF_PACKET) &&
> @@ -903,12 +901,10 @@ int print_linkinfo(const struct sockaddr_nl *who,
>   return -1;
>  
>   parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
> - if (tb[IFLA_IFNAME] == NULL) {
> - fprintf(stderr, "BUG: device with ifindex %d has nil ifname\n", 
> ifi->ifi_index);
> - name = ll_idx_n2a(ifi->ifi_index);
> - } else {
> - name = rta_getattr_str(tb[IFLA_IFNAME]);
> - }
> +
> + name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
> + if (!name)
> + return -1;
>  
>   if (filter.label &&
>   (!filter.family || filter.family == AF_PACKET) &&
> diff --git a/lib/utils.c b/lib/utils.c
> index d86c2ee..572d42a 100644
> --- a/lib/utils.c
> +++ b/lib/utils.c
> @@ -871,6 +871,25 @@ int get_ifname(char *buf, const char *name)
>   return ret;
>  }
>  
> +const char *get_ifname_rta(int ifindex, const struct rtattr *rta)
> +{
> + const char *name;
> +
> + if (rta) {
> + name = rta_getattr_str(rta);
> + } else {
> + fprintf(stderr,
> + "BUG: device with ifindex %d has nil ifname\n",
> + ifindex);
> + name = ll_idx_n2a(ifindex);
> + }
> +
> + if (check_ifname(name))
> + return NULL;
> +
> + return name;
> +}
> +
>  int matches(const char *cmd, const char *pattern)
>  {
>   int len = strlen(cmd);
> 


Getting build failures with this one:
$ make clean; make -j 8
...

devlink
CC   devlink.o
CC   mnlg.o
LINK devlink
../lib/libutil.a(ll_map.o): In function `ll_remember_index':
ll_map.c:(.text+0xf6): undefined reference to `parse_rtattr'
ll_map.c:(.text+0x1c5): undefined reference to `parse_rtattr'
../lib/libutil.a(ll_map.o): In function `ll_init_map':
ll_map.c:(.text+0x47c): undefined reference to `rtnl_wilddump_request'
ll_map.c:(.text+0x493): undefined reference to `rtnl_dump_filter_nc'




Re: Potential deadlock BUG in drivers/net/wireless/st/cw1200/sta.c (Linux 4.9)

2018-02-15 Thread Solomon Peachy
On Tue, Feb 06, 2018 at 09:04:29PM +0100, Iago Abal wrote:
> This still looks like a deadlock bug to me, could someone take a look
> as well and confirm? I will help preparing a patch if needed.

The short answer:  yes, it deadlocks.

I'll be glad to review patches (this code is pretty tricky, but I 
believe the deadlock got introduced/exposed by different code changes)
but unfortunately my last move killed the hardware capable of 
accepting my SDIO CW1200 dongles.

Hmm, might be worth trolling ebay or something.

 - Solomon
-- 
Solomon Peachy pizza at shaftnet dot org
Coconut Creek, FL  ^^ (email/xmpp) ^^
Quidquid latine dictum sit, altum videtur.


signature.asc
Description: PGP signature


Re: [RFC][PATCH bpf v2 1/2] bpf: allow 64-bit offsets for bpf function calls

2018-02-15 Thread Daniel Borkmann
On 02/15/2018 05:25 PM, Daniel Borkmann wrote:
> On 02/13/2018 05:05 AM, Sandipan Das wrote:
>> The imm field of a bpf_insn is a signed 32-bit integer. For
>> JIT-ed bpf-to-bpf function calls, it stores the offset from
>> __bpf_call_base to the start of the callee function.
>>
>> For some architectures, such as powerpc64, it was found that
>> this offset may be as large as 64 bits because of which this
>> cannot be accomodated in the imm field without truncation.
>>
>> To resolve this, we additionally make aux->func within each
>> bpf_prog associated with the functions to point to the list
>> of all function addresses determined by the verifier.
>>
>> We keep the value assigned to the off field of the bpf_insn
>> as a way to index into aux->func and also set aux->func_cnt
>> so that this can be used for performing basic upper bound
>> checks for the off field.
>>
>> Signed-off-by: Sandipan Das 
>> ---
>> v2: Make aux->func point to the list of functions determined
>> by the verifier rather than allocating a separate callee
>> list for each function.
> 
> Approach looks good to me; do you know whether s390x JIT would
> have similar requirement? I think one limitation that would still
> need to be addressed later with such approach would be regarding the
> xlated prog dump in bpftool, see 'BPF calls via JIT' in 7105e828c087
> ("bpf: allow for correlation of maps and helpers in dump"). Any
> ideas for this (potentially if we could use off + imm for calls,
> we'd get to 48 bits, but that seems still not be enough as you say)?

One other random thought, although I'm not sure how feasible this
is for ppc64 JIT to realize ... but idea would be to have something
like the below:

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 29ca920..daa7258 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -512,6 +512,11 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long 
*value, char *type,
return ret;
 }

+void * __weak bpf_jit_image_alloc(unsigned long size)
+{
+   return module_alloc(size);
+}
+
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 unsigned int alignment,
@@ -525,7 +530,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 * random section of illegal instructions.
 */
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
-   hdr = module_alloc(size);
+   hdr = bpf_jit_image_alloc(size);
if (hdr == NULL)
return NULL;

And ppc64 JIT could override bpf_jit_image_alloc() in a similar way
like some archs would override the module_alloc() helper through a
custom implementation, usually via __vmalloc_node_range(), so we
could perhaps fit the range for BPF JITed images in a way that they
could use the 32bit imm in the end? There are not that many progs
loaded typically, so the range could be a bit narrower in such case
anyway. (Not sure if this would work out though, but I thought to
bring it up.)

Thanks,
Daniel


[PATCH v1 0/1] Parfait changes

2018-02-15 Thread Joe Moriarty
The following patch(s) are bugs found by the static compiler
'Parfait'.  Care was taken to make sure false positive results
were removed from this patchset.

Parfait Overview


https://labs.oracle.com/pls/apex/f?p=labs:49:P49_PROJECT_ID:13

v1:
Initial release

Joe Moriarty (1):
  drivers: isdn: NULL pointer dereference [null-pointer-deref] (CWE 476)
problem

 drivers/isdn/mISDN/core.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

-- 
2.15.0



[PATCH v1 1/1] drivers: isdn: NULL pointer dereference [null-pointer-deref] (CWE 476) problem

2018-02-15 Thread Joe Moriarty
The Parfait (version 2.1.0) static code analysis tool found the
following NULL pointer dereference problem.

- drivers/isdn/mISDN/core.c
function channelmap_show() does not check the returned mdev
variable from dev_to_mISDN() for NULL.  Added the check for
NULL, which results in a value of 0 being returned.

Signed-off-by: Joe Moriarty 
Reviewed-by: Jonathan Helman 
---
 drivers/isdn/mISDN/core.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c
index faf505462a4f..aec7e2706109 100644
--- a/drivers/isdn/mISDN/core.c
+++ b/drivers/isdn/mISDN/core.c
@@ -129,8 +129,10 @@ static ssize_t channelmap_show(struct device *dev,
char *bp = buf;
int i;
 
-   for (i = 0; i <= mdev->nrbchan; i++)
-   *bp++ = test_channelmap(i, mdev->channelmap) ? '1' : '0';
+   if (mdev)
+   for (i = 0; i <= mdev->nrbchan; i++)
+   *bp++ = test_channelmap(i, mdev->channelmap) ?
+   '1' : '0';
 
return bp - buf;
 }
-- 
2.15.0



Re: [PATCH][next] net: dsa: mv88e6xxx: avoid unintended sign extension on a 16 bit shift

2018-02-15 Thread Andrew Lunn
> diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.c 
> b/drivers/net/dsa/mv88e6xxx/hwtstamp.c
> index b251d534b70d..758e35fa69ab 100644
> --- a/drivers/net/dsa/mv88e6xxx/hwtstamp.c
> +++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.c
> @@ -321,7 +321,7 @@ static void mv88e6xxx_get_rxts(struct mv88e6xxx_chip 
> *chip,
>*/
>   for ( ; skb; skb = skb_dequeue(rxq)) {
>   if (mv88e6xxx_ts_valid(status) && seq_match(skb, seq_id)) {
> - u64 ns = timehi << 16 | timelo;
> + u64 ns = (u64)timehi << 16 | timelo;

Hi Richard

Do you prefer this, or making timehi and timelo a u64?

   Andrew


Re: [PATCH net] xfrm: reuse uncached_list to track xdsts

2018-02-15 Thread David Miller
From: Xin Long 
Date: Wed, 14 Feb 2018 19:06:02 +0800

> In early time, when freeing a xdst, it would be inserted into
> dst_garbage.list first. Then if it's refcnt was still held
> somewhere, later it would be put into dst_busy_list in
> dst_gc_task().
> 
> When one dev was being unregistered, the dev of these dsts in
> dst_busy_list would be set with loopback_dev and put this dev.
> So that this dev's removal wouldn't get blocked, and avoid the
> kmsg warning:
> 
>   kernel:unregister_netdevice: waiting for veth0 to become \
>   free. Usage count = 2
> 
> However after Commit 52df157f17e5 ("xfrm: take refcnt of dst
> when creating struct xfrm_dst bundle"), the xdst will not be
> freed with dst gc, and this warning happens.
> 
> To fix it, we need to find these xdsts that are still held by
> others when removing the dev, and free xdst's dev and set it
> with loopback_dev.
> 
> But unfortunately after flow_cache for xfrm was deleted, no
> list tracks them anymore. So we need to save these xdsts
> somewhere to release the xdst's dev later.
> 
> To make this easier, this patch is to reuse uncached_list to
> track xdsts, so that the dev refcnt can be released in the
> event NETDEV_UNREGISTER process of fib_netdev_notifier.
> 
> Thanks to Florian, we could move forward this fix quickly.
> 
> Fixes: 52df157f17e5 ("xfrm: take refcnt of dst when creating struct xfrm_dst 
> bundle")
> Reported-by: Jianlin Shi 
> Reported-by: Hangbin Liu 
> Tested-by: Eyal Birger 
> Signed-off-by: Xin Long 

Steffen, I assume you will take this via your ipsec tree?

Thank you.


[PATCH v3 net-next 0/7] RDS: zerocopy support

2018-02-15 Thread Sowmini Varadhan
This is version 3 of the series, following up on review comments for
 http://patchwork.ozlabs.org/project/netdev/list/?series=28530

Review comments addressed
Patch 4
  - fix fragile use of skb->cb[], do not set ee_code incorrectly.
Patch 5:
  - remove needless bzero of skb->cb[], consolidate err cleanup

A brief overview of this feature follows.

This patch series provides support for MSG_ZERCOCOPY
on a PF_RDS socket based on the APIs and infrastructure added
by Commit f214f915e7db ("tcp: enable MSG_ZEROCOPY")

For single threaded rds-stress testing using rds-tcp with the
ixgbe driver using 1M message sizes (-a 1M -q 1M) preliminary
results show that  there is a significant reduction in latency: about
90 usec with zerocopy, compared with 200 usec without zerocopy.

This patchset modifies the above for zerocopy in the following manner.
- if the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and,
- if the SO_ZEROCOPY  socket option has been set on the PF_RDS socket,
application pages sent down with rds_sendmsg are pinned. The pinning
uses the accounting infrastructure added by a91dbff551a6 ("sock: ulimit 
on MSG_ZEROCOPY pages"). The message is unpinned when all references
to the message go down to 0, and the message is freed by rds_message_purge.

A multithreaded application using this infrastructure must send down 
a unique 32 bit cookie as ancillary data with each sendmsg invocation.
The format of this ancillary data is described in Patch 5 of the series.
The cookie is passed up to the application on the sk_error_queue when 
the message is unpinned, indicating to the application that it is now
safe to free/reuse the message buffer. The details of the completion
notification are provided in Patch 4 of this series.

Sowmini Varadhan (7):
  skbuff: export mm_[un]account_pinned_pages for other modules
  rds: hold a sock ref from rds_message to the rds_sock
  sock: permit SO_ZEROCOPY on PF_RDS socket
  rds: support for zcopy completion notification
  rds: zerocopy Tx support.
  selftests/net: add support for PF_RDS sockets
  selftests/net: add zerocopy support for PF_RDS test case

 include/linux/skbuff.h |3 +
 include/uapi/linux/errqueue.h  |2 +
 include/uapi/linux/rds.h   |1 +
 net/core/skbuff.c  |6 +-
 net/core/sock.c|   25 +++---
 net/rds/af_rds.c   |2 +
 net/rds/message.c  |  132 ++-
 net/rds/rds.h  |   17 -
 net/rds/recv.c |2 +
 net/rds/send.c |   51 ---
 tools/testing/selftests/net/msg_zerocopy.c |  133 ++-
 11 files changed, 339 insertions(+), 35 deletions(-)



Re: [PATCH net-next v2 0/3] Add ioctl() SIOCGSKNS cmd to allow obtaining net ns of tun device

2018-02-15 Thread David Miller
From: Kirill Tkhai 
Date: Wed, 14 Feb 2018 16:39:45 +0300

> Currently, it's not possible to get or check net namespace,
> which was used to create tun socket. User may have two tun
> devices with the same names in different nets, and there
> is no way to differ them each other.
> 
> The patchset adds support for ioctl() cmd SIOCGSKNS for tun
> devices. It will allow people to obtain net namespace file
> descriptor like we allow to do that for sockets in general.
> 
> v2: Add new patch [2/3] to export open_related_ns().

Series applied, thanks.


Re: [PATCH net-next 0/7] tools: tc-testing: Plugin Architecture

2018-02-15 Thread David Miller
From: "Brenda J. Butler" 
Date: Wed, 14 Feb 2018 14:09:18 -0500

> To make tdc.py more general, we are introducing a plugin architecture.
> 
> This patch set first organizes the command line parameters, then
> introduces the plugin architecture and some example plugins.

Series applied, thanks Brenda.


Re: [PATCH v3 net-next 3/7] sock: permit SO_ZEROCOPY on PF_RDS socket

2018-02-15 Thread Willem de Bruijn
On Thu, Feb 15, 2018 at 1:49 PM, Sowmini Varadhan
 wrote:
> allow the application to set SO_ZEROCOPY on the underlying sk
> of a PF_RDS socket
>
> Signed-off-by: Sowmini Varadhan 

Acked-by: Willem de Bruijn 


Re: [PATCH net-next] net/ipv4: Remove fib table id from rtable

2018-02-15 Thread David Miller
From: David Ahern 
Date: Wed, 14 Feb 2018 14:24:28 -0800

> Remove rt_table_id from rtable. It was added for getroute to return the
> table id that was hit in the lookup. With the changes for fibmatch the
> table id can be extracted from the fib_info returned in the fib_result
> so it no longer needs to be in rtable directly.
> 
> Signed-off-by: David Ahern 

Nice, applied, thanks David.


Re: [PATCH v3 net-next 1/7] skbuff: export mm_[un]account_pinned_pages for other modules

2018-02-15 Thread Willem de Bruijn
On Thu, Feb 15, 2018 at 1:49 PM, Sowmini Varadhan
 wrote:
> RDS would like to use the helper functions for managing pinned pages
> added by Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages")
>
> Signed-off-by: Sowmini Varadhan 

Acked-by: Willem de Bruijn 


Re: [PATCH v3 net-next 4/7] rds: support for zcopy completion notification

2018-02-15 Thread Willem de Bruijn
On Thu, Feb 15, 2018 at 2:46 PM, Santosh Shilimkar
 wrote:
> On 2/15/2018 10:49 AM, Sowmini Varadhan wrote:
>>
>> RDS removes a datagram (rds_message) from the retransmit queue when
>> an ACK is received. The ACK indicates that the receiver has queued
>> the RDS datagram, so that the sender can safely forget the datagram.
>> When all references to the rds_message are quiesced, rds_message_purge
>> is called to release resources used by the rds_message
>>
>> If the datagram to be removed had pinned pages set up, add
>> an entry to the rs->rs_znotify_queue so that the notifcation
>> will be sent up via rds_rm_zerocopy_callback() when the
>> rds_message is eventually freed by rds_message_purge.
>>
>> rds_rm_zerocopy_callback() attempts to batch the number of cookies
>> sent with each notification  to a max of SO_EE_ORIGIN_MAX_ZCOOKIES.
>> This is achieved by checking the tail skb in the sk_error_queue:
>> if this has room for one more cookie, the cookie from the
>> current notification is added; else a new skb is added to the
>> sk_error_queue. Every invocation of rds_rm_zerocopy_callback() will
>> trigger a ->sk_error_report to notify the application.
>>
>> Signed-off-by: Sowmini Varadhan 
>> ---
>
> Acked-by: Santosh Shilimkar 

Acked-by: Willem de Bruijn 


Re: [PATCH net-next v2] selftests/net: fixes psock_fanout eBPF test case

2018-02-15 Thread David Miller
From: Prashant Bhole 
Date: Thu, 15 Feb 2018 09:19:26 +0900

> eBPF test fails due to verifier failure because log_buf is too small.
> Fixed by increasing log_buf size
> 
> Signed-off-by: Prashant Bhole 
> ---
> v2: log_buf is statically allocated

Applied, thanks.


Re: [PATCH v3 net-next 5/7] rds: zerocopy Tx support.

2018-02-15 Thread Willem de Bruijn
On Thu, Feb 15, 2018 at 2:47 PM, Santosh Shilimkar
 wrote:
> On 2/15/2018 10:49 AM, Sowmini Varadhan wrote:
>>
>> If the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and,
>> if the SO_ZEROCOPY socket option has been set on the PF_RDS socket,
>> application pages sent down with rds_sendmsg() are pinned.
>>
>> The pinning uses the accounting infrastructure added by
>> Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages")
>>
>> The payload bytes in the message may not be modified for the
>> duration that the message has been pinned. A multi-threaded
>> application using this infrastructure may thus need to be notified
>> about send-completion so that it can free/reuse the buffers
>> passed to rds_sendmsg(). Notification of send-completion will
>> identify each message-buffer by a cookie that the application
>> must specify as ancillary data to rds_sendmsg().
>> The ancillary data in this case has cmsg_level == SOL_RDS
>> and cmsg_type == RDS_CMSG_ZCOPY_COOKIE.
>>
>> Signed-off-by: Sowmini Varadhan 
>> ---
>
>
> Acked-by: Santosh Shilimkar 

Acked-by: Willem de Bruijn 


Re: [PATCH net v2] fib_semantics: Don't match route with mismatching tclassid

2018-02-15 Thread David Ahern
On 2/15/18 1:46 AM, Stefano Brivio wrote:
> In fib_nh_match(), if output interface or gateway are passed in
> the FIB configuration, we don't have to check next hops of
> multipath routes to conclude whether we have a match or not.
> 
> However, we might still have routes with different realms
> matching the same output interface and gateway configuration,
> and this needs to cause the match to fail. Otherwise the first
> route inserted in the FIB will match, regardless of the realms:
> 
>  # ip route add 1.1.1.1 dev eth0 table 1234 realms 1/2
>  # ip route append 1.1.1.1 dev eth0 table 1234 realms 3/4
>  # ip route list table 1234
>  1.1.1.1 dev eth0 scope link realms 1/2
>  1.1.1.1 dev eth0 scope link realms 3/4
>  # ip route del 1.1.1.1 dev ens3 table 1234 realms 3/4
>  # ip route list table 1234
>  1.1.1.1 dev ens3 scope link realms 3/4
> 
> whereas route with realms 3/4 should have been deleted instead.
> 
> Explicitly check for fc_flow passed in the FIB configuration
> (this comes from RTA_FLOW extracted by rtm_to_fib_config()) and
> fail matching if it differs from nh_tclassid.
> 
> The handling of RTA_FLOW for multipath routes later in
> fib_nh_match() is still needed, as we can have multiple RTA_FLOW
> attributes that need to be matched against the tclassid of each
> next hop.
> 
> v2: Check that fc_flow is set before discarding the match, so
> that the user can still select the first matching rule by
> not specifying any realm, as suggested by David Ahern.
> 
> Reported-by: Jianlin Shi 
> Signed-off-by: Stefano Brivio 
> ---
>  net/ipv4/fib_semantics.c | 5 +
>  1 file changed, 5 insertions(+)
> 

Acked-by: David Ahern 



Re: [PATCH v3 net-next 6/7] selftests/net: add support for PF_RDS sockets

2018-02-15 Thread Willem de Bruijn
On Thu, Feb 15, 2018 at 1:49 PM, Sowmini Varadhan
 wrote:
> Add support for basic PF_RDS client-server testing in msg_zerocopy
>
> Signed-off-by: Sowmini Varadhan 

Acked-by: Willem de Bruijn 


Re: [PATCH v3 net-next 7/7] selftests/net: add zerocopy support for PF_RDS test case

2018-02-15 Thread Willem de Bruijn
On Thu, Feb 15, 2018 at 1:49 PM, Sowmini Varadhan
 wrote:
> Send a cookie with sendmsg() on PF_RDS sockets, and process the
> returned batched cookies in do_recv_completion()
>
> Signed-off-by: Sowmini Varadhan 

Acked-by: Willem de Bruijn 


Re: pull request: bluetooth-next 2018-02-15

2018-02-15 Thread David Miller
From: Johan Hedberg 
Date: Thu, 15 Feb 2018 10:33:59 +0200

> Here's the first bluetooth-next pull request targetting the 4.17 kernel
> release.
> 
>  - Fixes & cleanups to Atheros and Marvell drivers
>  - Support for two new Realtek controllers
>  - Support for new Intel Bluetooth controller
>  - Fix for supporting multiple slave-role Bluetooth LE connections
> 
> Please let me know if there are any issues pulling. Thanks.

Pulled, thanks Johan.


TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-15 Thread Oleksandr Natalenko
Hello.

I've faced an issue with a limited TCP bandwidth between my laptop and a 
server in my 1 Gbps LAN while using BBR as a congestion control mechanism. To 
verify my observations, I've set up 2 KVM VMs with the following parameters:

1) Linux v4.15.3
2) virtio NICs
3) 128 MiB of RAM
4) 2 vCPUs
5) tested on both non-PREEMPT/100 Hz and PREEMPT/1000 Hz

The VMs are interconnected via host bridge (-netdev bridge). I was running 
iperf3 in the default and reverse mode. Here are the results:

1) BBR on both VMs

upload: 3.42 Gbits/sec, cwnd ~ 320 KBytes
download: 3.39 Gbits/sec, cwnd ~ 320 KBytes

2) Reno on both VMs

upload: 5.50 Gbits/sec, cwnd = 976 KBytes (constant)
download: 5.22 Gbits/sec, cwnd = 1.20 MBytes (constant)

3) Reno on client, BBR on server

upload: 5.29 Gbits/sec, cwnd = 952 KBytes (constant)
download: 3.45 Gbits/sec, cwnd ~ 320 KBytes

4) BBR on client, Reno on server

upload: 3.36 Gbits/sec, cwnd ~ 370 KBytes
download: 5.21 Gbits/sec, cwnd = 887 KBytes (constant)

So, as you may see, when BBR is in use, upload rate is bad and cwnd is low. If 
using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput to 
~100 Mbps (verifiable not only by iperf3, but also by scp while transferring 
some files between hosts).

Also, I've tried to use YeAH instead of Reno, and it gives me the same results 
as Reno (IOW, YeAH works fine too).

Questions:

1) is this expected?
2) or am I missing some extra BBR tuneable?
3) if it is not a regression (I don't have any previous data to compare with), 
how can I fix this?
4) if it is a bug in BBR, what else should I provide or check for a proper 
investigation?

Thanks.

Regards,
  Oleksandr




Re: [PATCH v2] net: dsa: mv88e6xxx: hwtstamp: fix potential negative array index read

2018-02-15 Thread Andrew Lunn
On Thu, Feb 15, 2018 at 12:31:39PM -0600, Gustavo A. R. Silva wrote:
> _port_ is being used as index to array port_hwtstamp before verifying
> it is a non-negative number and a valid index at line 209 and 258:
> 
> if (port < 0 || port >= mv88e6xxx_num_ports(chip))
> 
> Fix this by checking _port_ before using it as index to array
> port_hwtstamp.
> 
> Addresses-Coverity-ID: 1465287 ("Negative array index read")
> Addresses-Coverity-ID: 1465291 ("Negative array index read")
> Fixes: c6fe0ad2c349 ("net: dsa: mv88e6xxx: add rx/tx timestamping support")
> Signed-off-by: Gustavo A. R. Silva 

Reviewed-by: Andrew Lunn 

Andrew


[PATCH iproute2-next v5 0/9] ipaddress: Make print_linkinfo_brief() static

2018-02-15 Thread Serhey Popovych
With this series I propose to make print_linkinfo_brief() static in
favor of print_linkinfo() as single point for linkinfo printing.

Changes presented with this series tested using following script:

\#!/bin/bash

iproute2_dir="$1"
iface='eth0.2'

pushd "$iproute2_dir" &>/dev/null

for i in new old; do
DIR="/tmp/$i"
mkdir -p "$DIR"

ln -snf ip.$i ip/ip

# normal
ip/ip link show  >"$DIR/ip-link-show"
ip/ip -4 addr show   >"$DIR/ip-4-addr-show"
ip/ip -6 addr show   >"$DIR/ip-6-addr-show"
ip/ip addr show dev "$iface" >"$DIR/ip-addr-show-$iface"

# brief
ip/ip -br link show  >"$DIR/ip-br-link-show"
ip/ip -br -4 addr show   >"$DIR/ip-br-4-addr-show"
ip/ip -br -6 addr show   >"$DIR/ip-br-6-addr-show"
ip/ip -br addr show dev "$iface" >"$DIR/ip-br-addr-show-$iface"
done
rm -f ip/ip

diff -urN /tmp/{old,new} |sed -n -Ee'/^(-{3}|\+{3})[[:space:]]+/!p'
rc=$?

popd &>/dev/null
exit $rc

Expected results : 
Actual results   : 

Although test coverage is far from ideal in my opinion it covers most
important aspects of the changes presented by the series.

All this work is done in prepare of iplink_get() enhancements to support
attribute parse that finally will be used to simplify ip/tunnel
RTM_GETLINK code.

As always reviews, comments, suggestions and criticism is welcome.

v5
  Fix build failures caused by incorrect dependency: libnetlink.a does
  not depend on ll_map.o, instead ll_map.o depends on it.

v4
  Print master network device name using correct color.

v3
  Fixed subject line.

v2
  Rebased to current iproute2-next/master. No changes.

Thanks,
Serhii

Serhey Popovych (9):
  ipaddress: Abstract IFA_LABEL matching code
  ipaddress: ll_map: Replace ll_idx_n2a() with ll_index_to_name()
  utils: Reimplement ll_idx_n2a() and introduce ll_idx_a2n()
  ipaddress: Improve print_linkinfo()
  ipaddress: Simplify print_linkinfo_brief() and it's usage
  lib: Correct object file dependencies
  utils: Introduce and use get_ifname_rta()
  utils: Introduce and use print_name_and_link() to print name@link
  ipaddress: Make print_linkinfo_brief() static

 Makefile |2 +-
 bridge/link.c|   21 ++---
 include/ll_map.h |4 +-
 include/utils.h  |5 ++
 ip/ip_common.h   |2 -
 ip/ipaddress.c   |  231 ++
 ip/iplink.c  |5 +-
 lib/Makefile |4 +-
 lib/ll_map.c |   31 +---
 lib/utils.c  |   68 
 10 files changed, 167 insertions(+), 206 deletions(-)

-- 
1.7.10.4



[PATCH iproute2-next v5 2/9] ipaddress: ll_map: Replace ll_idx_n2a() with ll_index_to_name()

2018-02-15 Thread Serhey Popovych
There is no reentrancy as well as deferred result usage for all cases
where ll_idx_n2a() being used: it is safe to use ll_index_to_name() that
internally calls ll_idx_n2a() with static buffer to hold result.

While there print master network device name using correct color.

Signed-off-by: Serhey Popovych 
---
 ip/ipaddress.c |   21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index ad69d09..0daba8c 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -813,14 +813,13 @@ int print_linkinfo_brief(const struct sockaddr_nl *who,
print_bool(PRINT_ANY, "deleted", "Deleted ", true);
 
if (tb[IFLA_LINK]) {
-   SPRINT_BUF(b1);
int iflink = rta_getattr_u32(tb[IFLA_LINK]);
 
if (iflink == 0) {
snprintf(buf, sizeof(buf), "%s@NONE", name);
print_null(PRINT_JSON, "link", NULL, NULL);
} else {
-   const char *link = ll_idx_n2a(iflink, b1);
+   const char *link = ll_index_to_name(iflink);
 
print_string(PRINT_JSON, "link", NULL, link);
snprintf(buf, sizeof(buf), "%s@%s", name, link);
@@ -957,12 +956,10 @@ int print_linkinfo(const struct sockaddr_nl *who,
print_int(PRINT_ANY,
  "link_index", "@if%d: ", iflink);
else {
-   SPRINT_BUF(b1);
-
print_string(PRINT_ANY,
 "link",
 "@%s: ",
-ll_idx_n2a(iflink, b1));
+ll_index_to_name(iflink));
m_flag = ll_index_to_flags(iflink);
m_flag = !(m_flag & IFF_UP);
}
@@ -984,12 +981,13 @@ int print_linkinfo(const struct sockaddr_nl *who,
 "qdisc %s ",
 rta_getattr_str(tb[IFLA_QDISC]));
if (tb[IFLA_MASTER]) {
-   SPRINT_BUF(b1);
+   int master = rta_getattr_u32(tb[IFLA_MASTER]);
 
-   print_string(PRINT_ANY,
-"master",
-"master %s ",
-ll_idx_n2a(rta_getattr_u32(tb[IFLA_MASTER]), b1));
+   print_color_string(PRINT_ANY,
+  COLOR_IFNAME,
+  "master",
+  "master %s ",
+  ll_index_to_name(master));
}
 
if (tb[IFLA_OPERSTATE])
@@ -1308,7 +1306,6 @@ static int get_filter(const char *arg)
 static int ifa_label_match_rta(int ifindex, const struct rtattr *rta)
 {
const char *label;
-   SPRINT_BUF(b1);
 
if (!filter.label)
return 0;
@@ -1316,7 +1313,7 @@ static int ifa_label_match_rta(int ifindex, const struct 
rtattr *rta)
if (rta)
label = RTA_DATA(rta);
else
-   label = ll_idx_n2a(ifindex, b1);
+   label = ll_index_to_name(ifindex);
 
return fnmatch(filter.label, label, 0);
 }
-- 
1.7.10.4



[PATCH iproute2-next v5 9/9] ipaddress: Make print_linkinfo_brief() static

2018-02-15 Thread Serhey Popovych
It shares lot of code with print_linkinfo(): drop duplicated part,
change parameters list, make it static and call from print_linkinfo()
after common path.

While there move SPRINT_BUF() to the function scope from blocks to
avoid duplication and use "%s" to print "\n" to help compiler optimize
exit for both print_linkinfo_brief() and normal paths.

Signed-off-by: Serhey Popovych 
---
 ip/ip_common.h |2 --
 ip/ipaddress.c |   76 
 ip/iplink.c|5 +---
 3 files changed, 11 insertions(+), 72 deletions(-)

diff --git a/ip/ip_common.h b/ip/ip_common.h
index 1397d99..e4e628b 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -29,8 +29,6 @@ struct link_filter {
 int get_operstate(const char *name);
 int print_linkinfo(const struct sockaddr_nl *who,
   struct nlmsghdr *n, void *arg);
-int print_linkinfo_brief(const struct sockaddr_nl *who,
-struct nlmsghdr *n, void *arg);
 int print_addrinfo(const struct sockaddr_nl *who,
   struct nlmsghdr *n, void *arg);
 int print_addrlabel(const struct sockaddr_nl *who,
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 90cb5e5..1380453 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -752,63 +752,12 @@ static void print_link_stats(FILE *fp, struct nlmsghdr *n)
fprintf(fp, "%s", _SL_);
 }
 
-int print_linkinfo_brief(const struct sockaddr_nl *who,
-struct nlmsghdr *n, void *arg)
+static int print_linkinfo_brief(FILE *fp, const char *name,
+   const struct ifinfomsg *ifi,
+   struct rtattr *tb[])
 {
-   FILE *fp = (FILE *)arg;
-   struct ifinfomsg *ifi = NLMSG_DATA(n);
-   struct rtattr *tb[IFLA_MAX+1];
-   int len = n->nlmsg_len;
-   const char *name;
unsigned int m_flag = 0;
 
-   if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK)
-   return -1;
-
-   len -= NLMSG_LENGTH(sizeof(*ifi));
-   if (len < 0)
-   return -1;
-
-   if (filter.ifindex && ifi->ifi_index != filter.ifindex)
-   return -1;
-   if (filter.up && !(ifi->ifi_flags&IFF_UP))
-   return -1;
-
-   parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
-
-   name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
-   if (!name)
-   return -1;
-
-   if (filter.label &&
-   (!filter.family || filter.family == AF_PACKET) &&
-   fnmatch(filter.label, name, 0))
-   return -1;
-
-   if (tb[IFLA_GROUP]) {
-   int group = rta_getattr_u32(tb[IFLA_GROUP]);
-
-   if (filter.group != -1 && group != filter.group)
-   return -1;
-   }
-
-   if (tb[IFLA_MASTER]) {
-   int master = rta_getattr_u32(tb[IFLA_MASTER]);
-
-   if (filter.master > 0 && master != filter.master)
-   return -1;
-   } else if (filter.master > 0)
-   return -1;
-
-   if (filter.kind && match_link_kind(tb, filter.kind, 0))
-   return -1;
-
-   if (filter.slave_kind && match_link_kind(tb, filter.slave_kind, 1))
-   return -1;
-
-   if (n->nlmsg_type == RTM_DELLINK)
-   print_bool(PRINT_ANY, "deleted", "Deleted ", true);
-
m_flag = print_name_and_link("%-16s ", COLOR_NONE, name, tb);
 
if (tb[IFLA_OPERSTATE])
@@ -868,6 +817,7 @@ int print_linkinfo(const struct sockaddr_nl *who,
int len = n->nlmsg_len;
const char *name;
unsigned int m_flag = 0;
+   SPRINT_BUF(b1);
 
if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK)
return 0;
@@ -916,6 +866,9 @@ int print_linkinfo(const struct sockaddr_nl *who,
if (n->nlmsg_type == RTM_DELLINK)
print_bool(PRINT_ANY, "deleted", "Deleted ", true);
 
+   if (brief)
+   return print_linkinfo_brief(fp, name, ifi, tb);
+
print_int(PRINT_ANY, "ifindex", "%d: ", ifi->ifi_index);
 
m_flag = print_name_and_link("%s: ", COLOR_IFNAME, name, tb);
@@ -949,7 +902,6 @@ int print_linkinfo(const struct sockaddr_nl *who,
print_linkmode(fp, tb[IFLA_LINKMODE]);
 
if (tb[IFLA_GROUP]) {
-   SPRINT_BUF(b1);
int group = rta_getattr_u32(tb[IFLA_GROUP]);
 
print_string(PRINT_ANY,
@@ -965,8 +917,6 @@ int print_linkinfo(const struct sockaddr_nl *who,
print_link_event(fp, rta_getattr_u32(tb[IFLA_EVENT]));
 
if (!filter.family || filter.family == AF_PACKET || show_details) {
-   SPRINT_BUF(b1);
-
print_string(PRINT_FP, NULL, "%s", _SL_);
print_string(PRINT_ANY,
 "link_type",
@@ -1066,7 +1016,6 @@ int print_linkinfo(const struct sockaddr_nl *who,
 rta_getattr_str(tb[IFLA_PHYS_PORT_NAME]));
 
  

[PATCH iproute2-next v5 7/9] utils: Introduce and use get_ifname_rta()

2018-02-15 Thread Serhey Popovych
Be consistent in handling of IFLA_IFNAME attribute in all places: if
there is no attribute report bug to stderr and use ll_idx_n2a() as
last measure to get name in "if%u" format instead of "".

Use check_ifname() to validate network device name: this catches both
unexpected return from kernel and ll_idx_n2a().

Signed-off-by: Serhey Popovych 
---
 bridge/link.c   |8 
 include/utils.h |1 +
 ip/ipaddress.c  |   20 
 lib/utils.c |   19 +++
 4 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/bridge/link.c b/bridge/link.c
index 870ebe0..a11cbb1 100644
--- a/bridge/link.c
+++ b/bridge/link.c
@@ -99,9 +99,10 @@ int print_linkinfo(const struct sockaddr_nl *who,
   struct nlmsghdr *n, void *arg)
 {
FILE *fp = arg;
-   int len = n->nlmsg_len;
struct ifinfomsg *ifi = NLMSG_DATA(n);
struct rtattr *tb[IFLA_MAX+1];
+   int len = n->nlmsg_len;
+   const char *name;
 
len -= NLMSG_LENGTH(sizeof(*ifi));
if (len < 0) {
@@ -117,10 +118,9 @@ int print_linkinfo(const struct sockaddr_nl *who,
 
parse_rtattr_flags(tb, IFLA_MAX, IFLA_RTA(ifi), len, NLA_F_NESTED);
 
-   if (tb[IFLA_IFNAME] == NULL) {
-   fprintf(stderr, "BUG: nil ifname\n");
+   name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
+   if (!name)
return -1;
-   }
 
if (n->nlmsg_type == RTM_DELLINK)
fprintf(fp, "Deleted ");
diff --git a/include/utils.h b/include/utils.h
index 867e540..84ca873 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -183,6 +183,7 @@ void duparg(const char *, const char *) 
__attribute__((noreturn));
 void duparg2(const char *, const char *) __attribute__((noreturn));
 int check_ifname(const char *);
 int get_ifname(char *, const char *);
+const char *get_ifname_rta(int ifindex, const struct rtattr *rta);
 int matches(const char *arg, const char *pattern);
 int inet_addr_match(const inet_prefix *a, const inet_prefix *b, int bits);
 int inet_addr_match_rta(const inet_prefix *m, const struct rtattr *rta);
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 749178d..08d2576 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -776,12 +776,10 @@ int print_linkinfo_brief(const struct sockaddr_nl *who,
return -1;
 
parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
-   if (tb[IFLA_IFNAME] == NULL) {
-   fprintf(stderr, "BUG: device with ifindex %d has nil ifname\n", 
ifi->ifi_index);
-   name = ll_idx_n2a(ifi->ifi_index);
-   } else {
-   name = rta_getattr_str(tb[IFLA_IFNAME]);
-   }
+
+   name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
+   if (!name)
+   return -1;
 
if (filter.label &&
(!filter.family || filter.family == AF_PACKET) &&
@@ -903,12 +901,10 @@ int print_linkinfo(const struct sockaddr_nl *who,
return -1;
 
parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
-   if (tb[IFLA_IFNAME] == NULL) {
-   fprintf(stderr, "BUG: device with ifindex %d has nil ifname\n", 
ifi->ifi_index);
-   name = ll_idx_n2a(ifi->ifi_index);
-   } else {
-   name = rta_getattr_str(tb[IFLA_IFNAME]);
-   }
+
+   name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
+   if (!name)
+   return -1;
 
if (filter.label &&
(!filter.family || filter.family == AF_PACKET) &&
diff --git a/lib/utils.c b/lib/utils.c
index d86c2ee..572d42a 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -871,6 +871,25 @@ int get_ifname(char *buf, const char *name)
return ret;
 }
 
+const char *get_ifname_rta(int ifindex, const struct rtattr *rta)
+{
+   const char *name;
+
+   if (rta) {
+   name = rta_getattr_str(rta);
+   } else {
+   fprintf(stderr,
+   "BUG: device with ifindex %d has nil ifname\n",
+   ifindex);
+   name = ll_idx_n2a(ifindex);
+   }
+
+   if (check_ifname(name))
+   return NULL;
+
+   return name;
+}
+
 int matches(const char *cmd, const char *pattern)
 {
int len = strlen(cmd);
-- 
1.7.10.4



[PATCH iproute2-next v5 3/9] utils: Reimplement ll_idx_n2a() and introduce ll_idx_a2n()

2018-02-15 Thread Serhey Popovych
Now all users of ll_idx_n2a() replaced with ll_index_to_name() we can
move it's functionality to ll_index_to_name() and implement index to
name conversion using snprintf() and "if%u".

Use %u specifier in "if%..." template consistently: network device
indexes are always greather than zero.

Also introduce ll_idx_n2a() conterpart: ll_idx_a2n() that is used
to translate name of the "if%u" form to index using sscanf().

Signed-off-by: Serhey Popovych 
---
 include/ll_map.h |4 +++-
 lib/ll_map.c |   31 +--
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/include/ll_map.h b/include/ll_map.h
index c8474e6..8546ff9 100644
--- a/include/ll_map.h
+++ b/include/ll_map.h
@@ -8,9 +8,11 @@ int ll_remember_index(const struct sockaddr_nl *who,
 void ll_init_map(struct rtnl_handle *rth);
 unsigned ll_name_to_index(const char *name);
 const char *ll_index_to_name(unsigned idx);
-const char *ll_idx_n2a(unsigned idx, char *buf);
 int ll_index_to_type(unsigned idx);
 int ll_index_to_flags(unsigned idx);
 unsigned namehash(const char *str);
 
+const char *ll_idx_n2a(unsigned int idx);
+unsigned int ll_idx_a2n(const char *name);
+
 #endif /* __LL_MAP_H__ */
diff --git a/lib/ll_map.c b/lib/ll_map.c
index f65614f..0afe689 100644
--- a/lib/ll_map.c
+++ b/lib/ll_map.c
@@ -136,8 +136,26 @@ int ll_remember_index(const struct sockaddr_nl *who,
return 0;
 }
 
-const char *ll_idx_n2a(unsigned idx, char *buf)
+const char *ll_idx_n2a(unsigned int idx)
 {
+   static char buf[IFNAMSIZ];
+
+   snprintf(buf, sizeof(buf), "if%u", idx);
+   return buf;
+}
+
+unsigned int ll_idx_a2n(const char *name)
+{
+   unsigned int idx;
+
+   if (sscanf(name, "if%u", &idx) != 1)
+   return 0;
+   return idx;
+}
+
+const char *ll_index_to_name(unsigned int idx)
+{
+   static char buf[IFNAMSIZ];
const struct ll_cache *im;
 
if (idx == 0)
@@ -148,18 +166,11 @@ const char *ll_idx_n2a(unsigned idx, char *buf)
return im->name;
 
if (if_indextoname(idx, buf) == NULL)
-   snprintf(buf, IFNAMSIZ, "if%d", idx);
+   snprintf(buf, IFNAMSIZ, "if%u", idx);
 
return buf;
 }
 
-const char *ll_index_to_name(unsigned idx)
-{
-   static char nbuf[IFNAMSIZ];
-
-   return ll_idx_n2a(idx, nbuf);
-}
-
 int ll_index_to_type(unsigned idx)
 {
const struct ll_cache *im;
@@ -196,7 +207,7 @@ unsigned ll_name_to_index(const char *name)
 
idx = if_nametoindex(name);
if (idx == 0)
-   sscanf(name, "if%u", &idx);
+   idx = ll_idx_a2n(name);
return idx;
 }
 
-- 
1.7.10.4



[PATCH iproute2-next v5 6/9] lib: Correct object file dependencies

2018-02-15 Thread Serhey Popovych
Neither internal libnetlink nor libgenl depends on ll_map.o: prepare for
upcoming changes that brings much more cleaner dependency between
utils.o and ll_map.o.

However ll_map.o depends on libnetlink.o functions so we need to provide
libnetlink.a after libutil.a in LIBNETLINK at global Makefile.

Tested using make clean && make -j4. No problems so far.

Signed-off-by: Serhey Popovych 
---
 Makefile |2 +-
 lib/Makefile |4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 32587db..b526d3b 100644
--- a/Makefile
+++ b/Makefile
@@ -59,7 +59,7 @@ YACCFLAGS = -d -t -v
 
 SUBDIRS=lib ip tc bridge misc netem genl tipc devlink rdma man
 
-LIBNETLINK=../lib/libnetlink.a ../lib/libutil.a
+LIBNETLINK=../lib/libutil.a ../lib/libnetlink.a
 LDLIBS += $(LIBNETLINK)
 
 all: config.mk
diff --git a/lib/Makefile b/lib/Makefile
index 7b34ed5..bab8cbf 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -3,11 +3,11 @@ include ../config.mk
 
 CFLAGS += -fPIC
 
-UTILOBJ = utils.o rt_names.o ll_types.o ll_proto.o ll_addr.o \
+UTILOBJ = utils.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \
inet_proto.o namespace.o json_writer.o json_print.o \
names.o color.o bpf.o exec.o fs.o
 
-NLOBJ=libgenl.o ll_map.o libnetlink.o
+NLOBJ=libgenl.o libnetlink.o
 
 all: libnetlink.a libutil.a
 
-- 
1.7.10.4



[PATCH iproute2-next v5 5/9] ipaddress: Simplify print_linkinfo_brief() and it's usage

2018-02-15 Thread Serhey Popovych
Simplify calling code in ipaddr_list_flush_or_save() by introducing
intermediate variable of @struct nlmsghdr, drop duplicated code:
print_linkinfo_brief() never returns values other than <= 0 so we can
move print_selected_addrinfo() outside of each block.

Signed-off-by: Serhey Popovych 
---
 ip/ipaddress.c |   31 ++-
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 6eac370..749178d 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -753,7 +753,7 @@ static void print_link_stats(FILE *fp, struct nlmsghdr *n)
 }
 
 int print_linkinfo_brief(const struct sockaddr_nl *who,
-   struct nlmsghdr *n, void *arg)
+struct nlmsghdr *n, void *arg)
 {
FILE *fp = (FILE *)arg;
struct ifinfomsg *ifi = NLMSG_DATA(n);
@@ -2013,24 +2013,21 @@ static int ipaddr_list_flush_or_save(int argc, char 
**argv, int action)
ipaddr_filter(&linfo, ainfo);
 
for (l = linfo.head; l; l = l->next) {
-   int res = 0;
-   struct ifinfomsg *ifi = NLMSG_DATA(&l->h);
+   struct nlmsghdr *n = &l->h;
+   struct ifinfomsg *ifi = NLMSG_DATA(n);
+   int res;
 
open_json_object(NULL);
-   if (brief) {
-   if (print_linkinfo_brief(NULL, &l->h, stdout) == 0)
-   if (filter.family != AF_PACKET)
-   print_selected_addrinfo(ifi,
-   ainfo->head,
-   stdout);
-   } else if (no_link ||
-  (res = print_linkinfo(NULL, &l->h, stdout)) >= 0) {
-   if (filter.family != AF_PACKET)
-   print_selected_addrinfo(ifi,
-   ainfo->head, stdout);
-   if (res > 0 && !do_link && show_stats)
-   print_link_stats(stdout, &l->h);
-   }
+   if (brief)
+   res = print_linkinfo_brief(NULL, n, stdout);
+   else if (no_link)
+   res = 0;
+   else
+   res = print_linkinfo(NULL, n, stdout);
+   if (res >= 0 && filter.family != AF_PACKET)
+   print_selected_addrinfo(ifi, ainfo->head, stdout);
+   if (res > 0 && !do_link && show_stats)
+   print_link_stats(stdout, n);
close_json_object();
}
fflush(stdout);
-- 
1.7.10.4



[PATCH iproute2-next v5 8/9] utils: Introduce and use print_name_and_link() to print name@link

2018-02-15 Thread Serhey Popovych
There is at least three places implementing same things: two in
ipaddress.c print_linkinfo() & print_linkinfo_brief() and one in
bridge/link.c.

They are diverge from each other very little: bridge/link.c does not
support JSON output at the moment and print_linkinfo_brief() does not
handle IFLA_LINK_NETNS case.

Introduce and use print_name_and_link() routine to handle name@link
output in all possible variations; respect IFLA_LINK_NETNS attribute to
handle case when link is in different namespace; use ll_idx_n2a() for
interface name instead of "" to share logic with other code (e.g.
ll_name_to_index() and ll_index_to_name()) supporting such template.

Signed-off-by: Serhey Popovych 
---
 bridge/link.c   |   13 +++--
 include/utils.h |4 
 ip/ipaddress.c  |   44 ++--
 lib/utils.c |   49 +
 4 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/bridge/link.c b/bridge/link.c
index a11cbb1..90c9734 100644
--- a/bridge/link.c
+++ b/bridge/link.c
@@ -125,20 +125,13 @@ int print_linkinfo(const struct sockaddr_nl *who,
if (n->nlmsg_type == RTM_DELLINK)
fprintf(fp, "Deleted ");
 
-   fprintf(fp, "%d: %s ", ifi->ifi_index,
-   tb[IFLA_IFNAME] ? rta_getattr_str(tb[IFLA_IFNAME]) : "");
+   fprintf(fp, "%d: ", ifi->ifi_index);
+
+   print_name_and_link("%s: ", COLOR_NONE, name, tb);
 
if (tb[IFLA_OPERSTATE])
print_operstate(fp, rta_getattr_u8(tb[IFLA_OPERSTATE]));
 
-   if (tb[IFLA_LINK]) {
-   int iflink = rta_getattr_u32(tb[IFLA_LINK]);
-
-   fprintf(fp, "@%s: ",
-   iflink ? ll_index_to_name(iflink) : "NONE");
-   } else
-   fprintf(fp, ": ");
-
print_link_flags(fp, ifi->ifi_flags);
 
if (tb[IFLA_MTU])
diff --git a/include/utils.h b/include/utils.h
index 84ca873..75ddb4a 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -12,6 +12,7 @@
 #include "libnetlink.h"
 #include "ll_map.h"
 #include "rtm_map.h"
+#include "json_print.h"
 
 extern int preferred_family;
 extern int human_readable;
@@ -250,6 +251,9 @@ void print_escape_buf(const __u8 *buf, size_t len, const 
char *escape);
 int print_timestamp(FILE *fp);
 void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr *n);
 
+unsigned int print_name_and_link(const char *fmt, enum color_attr color,
+const char *name, struct rtattr *tb[]);
+
 #define BIT(nr) (1UL << (nr))
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 08d2576..90cb5e5 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -760,7 +760,6 @@ int print_linkinfo_brief(const struct sockaddr_nl *who,
struct rtattr *tb[IFLA_MAX+1];
int len = n->nlmsg_len;
const char *name;
-   char buf[32] = { 0, };
unsigned int m_flag = 0;
 
if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK)
@@ -810,25 +809,7 @@ int print_linkinfo_brief(const struct sockaddr_nl *who,
if (n->nlmsg_type == RTM_DELLINK)
print_bool(PRINT_ANY, "deleted", "Deleted ", true);
 
-   if (tb[IFLA_LINK]) {
-   int iflink = rta_getattr_u32(tb[IFLA_LINK]);
-
-   if (iflink == 0) {
-   snprintf(buf, sizeof(buf), "%s@NONE", name);
-   print_null(PRINT_JSON, "link", NULL, NULL);
-   } else {
-   const char *link = ll_index_to_name(iflink);
-
-   print_string(PRINT_JSON, "link", NULL, link);
-   snprintf(buf, sizeof(buf), "%s@%s", name, link);
-   m_flag = ll_index_to_flags(iflink);
-   m_flag = !(m_flag & IFF_UP);
-   }
-   } else
-   snprintf(buf, sizeof(buf), "%s", name);
-
-   print_string(PRINT_FP, NULL, "%-16s ", buf);
-   print_string(PRINT_JSON, "ifname", NULL, name);
+   m_flag = print_name_and_link("%-16s ", COLOR_NONE, name, tb);
 
if (tb[IFLA_OPERSTATE])
print_operstate(fp, rta_getattr_u8(tb[IFLA_OPERSTATE]));
@@ -936,29 +917,8 @@ int print_linkinfo(const struct sockaddr_nl *who,
print_bool(PRINT_ANY, "deleted", "Deleted ", true);
 
print_int(PRINT_ANY, "ifindex", "%d: ", ifi->ifi_index);
-   print_color_string(PRINT_ANY, COLOR_IFNAME, "ifname", "%s", name);
-
-   if (tb[IFLA_LINK]) {
-   int iflink = rta_getattr_u32(tb[IFLA_LINK]);
 
-   if (iflink == 0)
-   print_null(PRINT_ANY, "link", "@%s: ", "NONE");
-   else {
-   if (tb[IFLA_LINK_NETNSID])
-   print_int(PRINT_ANY,
- "link_index", "@if%d: ", iflink);
-   else {
-   print_string(PRINT

[PATCH iproute2-next v5 1/9] ipaddress: Abstract IFA_LABEL matching code

2018-02-15 Thread Serhey Popovych
There at least two places in ip/ipaddress.c where we match IFA_LABEL
against filter.label if that is given.

Get rid of "common" if () statement for inet_addr_match_rta() and
ifa_label_match_rta(): it is not common because first will check for
filter.pfx.family != AF_UNSPEC inside and second for filter.label being
non NULL.

This allows us to further simplify down code and prepare for
ll_idx_n2a() replacement with ll_index_to_name() without 80 columns
checkpatch notice.

Signed-off-by: Serhey Popovych 
---
 ip/ipaddress.c |   57 +++-
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 6990b81..ad69d09 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -1305,6 +1305,22 @@ static int get_filter(const char *arg)
return 0;
 }
 
+static int ifa_label_match_rta(int ifindex, const struct rtattr *rta)
+{
+   const char *label;
+   SPRINT_BUF(b1);
+
+   if (!filter.label)
+   return 0;
+
+   if (rta)
+   label = RTA_DATA(rta);
+   else
+   label = ll_idx_n2a(ifindex, b1);
+
+   return fnmatch(filter.label, label, 0);
+}
+
 int print_addrinfo(const struct sockaddr_nl *who, struct nlmsghdr *n,
   void *arg)
 {
@@ -1343,21 +1359,13 @@ int print_addrinfo(const struct sockaddr_nl *who, 
struct nlmsghdr *n,
return 0;
if ((filter.flags ^ ifa_flags) & filter.flagmask)
return 0;
-   if (filter.label) {
-   SPRINT_BUF(b1);
-   const char *label;
-
-   if (rta_tb[IFA_LABEL])
-   label = RTA_DATA(rta_tb[IFA_LABEL]);
-   else
-   label = ll_idx_n2a(ifa->ifa_index, b1);
-   if (fnmatch(filter.label, label, 0) != 0)
-   return 0;
-   }
 
if (filter.family && filter.family != ifa->ifa_family)
return 0;
 
+   if (ifa_label_match_rta(ifa->ifa_index, rta_tb[IFA_LABEL]))
+   return 0;
+
if (inet_addr_match_rta(&filter.pfx, rta_tb[IFA_LOCAL]))
return 0;
 
@@ -1713,25 +1721,14 @@ static void ipaddr_filter(struct nlmsg_chain *linfo, 
struct nlmsg_chain *ainfo)
 
if ((filter.flags ^ ifa_flags) & filter.flagmask)
continue;
-   if (filter.pfx.family || filter.label) {
-   struct rtattr *rta =
-   tb[IFA_LOCAL] ? : tb[IFA_ADDRESS];
-
-   if (inet_addr_match_rta(&filter.pfx, rta))
-   continue;
-
-   if (filter.label) {
-   SPRINT_BUF(b1);
-   const char *label;
-
-   if (tb[IFA_LABEL])
-   label = RTA_DATA(tb[IFA_LABEL]);
-   else
-   label = 
ll_idx_n2a(ifa->ifa_index, b1);
-   if (fnmatch(filter.label, label, 0) != 
0)
-   continue;
-   }
-   }
+
+   if (ifa_label_match_rta(ifa->ifa_index, tb[IFA_LABEL]))
+   continue;
+
+   if (!tb[IFA_LOCAL])
+   tb[IFA_LOCAL] = tb[IFA_ADDRESS];
+   if (inet_addr_match_rta(&filter.pfx, tb[IFA_LOCAL]))
+   continue;
 
ok = 1;
break;
-- 
1.7.10.4



[PATCH iproute2-next v5 4/9] ipaddress: Improve print_linkinfo()

2018-02-15 Thread Serhey Popovych
There are few places to improve:

  1) return -1 when entry is filtered instead of zero, which means
 accept entry: ipaddress_list_flush_or_save() the only user of this

  2) use ll_idx_n2a() as last resort to translate name to index for
 "should never happen" cases when cache shouldn't be considered

  3) replace open coded access to IFLA_IFNAME attribute data by
 RTA_DATA() with rta_getattr_str()

  4) simplify ifname printing since name is never NULL, thanks to (2).

Signed-off-by: Serhey Popovych 
---
 ip/ipaddress.c |   30 +-
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 0daba8c..6eac370 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -778,14 +778,14 @@ int print_linkinfo_brief(const struct sockaddr_nl *who,
parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
if (tb[IFLA_IFNAME] == NULL) {
fprintf(stderr, "BUG: device with ifindex %d has nil ifname\n", 
ifi->ifi_index);
-   name = "";
+   name = ll_idx_n2a(ifi->ifi_index);
} else {
name = rta_getattr_str(tb[IFLA_IFNAME]);
}
 
if (filter.label &&
(!filter.family || filter.family == AF_PACKET) &&
-   fnmatch(filter.label, RTA_DATA(tb[IFLA_IFNAME]), 0))
+   fnmatch(filter.label, name, 0))
return -1;
 
if (tb[IFLA_GROUP]) {
@@ -887,6 +887,7 @@ int print_linkinfo(const struct sockaddr_nl *who,
struct ifinfomsg *ifi = NLMSG_DATA(n);
struct rtattr *tb[IFLA_MAX+1];
int len = n->nlmsg_len;
+   const char *name;
unsigned int m_flag = 0;
 
if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK)
@@ -897,18 +898,22 @@ int print_linkinfo(const struct sockaddr_nl *who,
return -1;
 
if (filter.ifindex && ifi->ifi_index != filter.ifindex)
-   return 0;
+   return -1;
if (filter.up && !(ifi->ifi_flags&IFF_UP))
-   return 0;
+   return -1;
 
parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
-   if (tb[IFLA_IFNAME] == NULL)
+   if (tb[IFLA_IFNAME] == NULL) {
fprintf(stderr, "BUG: device with ifindex %d has nil ifname\n", 
ifi->ifi_index);
+   name = ll_idx_n2a(ifi->ifi_index);
+   } else {
+   name = rta_getattr_str(tb[IFLA_IFNAME]);
+   }
 
if (filter.label &&
(!filter.family || filter.family == AF_PACKET) &&
-   fnmatch(filter.label, RTA_DATA(tb[IFLA_IFNAME]), 0))
-   return 0;
+   fnmatch(filter.label, name, 0))
+   return -1;
 
if (tb[IFLA_GROUP]) {
int group = rta_getattr_u32(tb[IFLA_GROUP]);
@@ -935,16 +940,7 @@ int print_linkinfo(const struct sockaddr_nl *who,
print_bool(PRINT_ANY, "deleted", "Deleted ", true);
 
print_int(PRINT_ANY, "ifindex", "%d: ", ifi->ifi_index);
-   if (tb[IFLA_IFNAME]) {
-   print_color_string(PRINT_ANY,
-  COLOR_IFNAME,
-  "ifname", "%s",
-  rta_getattr_str(tb[IFLA_IFNAME]));
-   } else {
-   print_null(PRINT_JSON, "ifname", NULL, NULL);
-   print_color_null(PRINT_FP, COLOR_IFNAME,
-"ifname", "%s", "");
-   }
+   print_color_string(PRINT_ANY, COLOR_IFNAME, "ifname", "%s", name);
 
if (tb[IFLA_LINK]) {
int iflink = rta_getattr_u32(tb[IFLA_LINK]);
-- 
1.7.10.4



Re: [PATCH iproute2-next v4 7/9] utils: Introduce and use get_ifname_rta()

2018-02-15 Thread Serhey Popovych
David Ahern wrote:
> On 2/15/18 11:34 AM, Serhey Popovych wrote:
>> Be consistent in handling of IFLA_IFNAME attribute in all places: if
>> there is no attribute report bug to stderr and use ll_idx_n2a() as
>> last measure to get name in "if%u" format instead of "".
>>
>> Use check_ifname() to validate network device name: this catches both
>> unexpected return from kernel and ll_idx_n2a().
>>
>> Signed-off-by: Serhey Popovych 
>> ---
>>  bridge/link.c   |8 
>>  include/utils.h |1 +
>>  ip/ipaddress.c  |   20 
>>  lib/utils.c |   19 +++
>>  4 files changed, 32 insertions(+), 16 deletions(-)
>>
>> diff --git a/bridge/link.c b/bridge/link.c
>> index 870ebe0..a11cbb1 100644
>> --- a/bridge/link.c
>> +++ b/bridge/link.c
>> @@ -99,9 +99,10 @@ int print_linkinfo(const struct sockaddr_nl *who,
>> struct nlmsghdr *n, void *arg)
>>  {
>>  FILE *fp = arg;
>> -int len = n->nlmsg_len;
>>  struct ifinfomsg *ifi = NLMSG_DATA(n);
>>  struct rtattr *tb[IFLA_MAX+1];
>> +int len = n->nlmsg_len;
>> +const char *name;
>>  
>>  len -= NLMSG_LENGTH(sizeof(*ifi));
>>  if (len < 0) {
>> @@ -117,10 +118,9 @@ int print_linkinfo(const struct sockaddr_nl *who,
>>  
>>  parse_rtattr_flags(tb, IFLA_MAX, IFLA_RTA(ifi), len, NLA_F_NESTED);
>>  
>> -if (tb[IFLA_IFNAME] == NULL) {
>> -fprintf(stderr, "BUG: nil ifname\n");
>> +name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
>> +if (!name)
>>  return -1;
>> -}
>>  
>>  if (n->nlmsg_type == RTM_DELLINK)
>>  fprintf(fp, "Deleted ");
>> diff --git a/include/utils.h b/include/utils.h
>> index 867e540..84ca873 100644
>> --- a/include/utils.h
>> +++ b/include/utils.h
>> @@ -183,6 +183,7 @@ void duparg(const char *, const char *) 
>> __attribute__((noreturn));
>>  void duparg2(const char *, const char *) __attribute__((noreturn));
>>  int check_ifname(const char *);
>>  int get_ifname(char *, const char *);
>> +const char *get_ifname_rta(int ifindex, const struct rtattr *rta);
>>  int matches(const char *arg, const char *pattern);
>>  int inet_addr_match(const inet_prefix *a, const inet_prefix *b, int bits);
>>  int inet_addr_match_rta(const inet_prefix *m, const struct rtattr *rta);
>> diff --git a/ip/ipaddress.c b/ip/ipaddress.c
>> index 749178d..08d2576 100644
>> --- a/ip/ipaddress.c
>> +++ b/ip/ipaddress.c
>> @@ -776,12 +776,10 @@ int print_linkinfo_brief(const struct sockaddr_nl *who,
>>  return -1;
>>  
>>  parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
>> -if (tb[IFLA_IFNAME] == NULL) {
>> -fprintf(stderr, "BUG: device with ifindex %d has nil ifname\n", 
>> ifi->ifi_index);
>> -name = ll_idx_n2a(ifi->ifi_index);
>> -} else {
>> -name = rta_getattr_str(tb[IFLA_IFNAME]);
>> -}
>> +
>> +name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
>> +if (!name)
>> +return -1;
>>  
>>  if (filter.label &&
>>  (!filter.family || filter.family == AF_PACKET) &&
>> @@ -903,12 +901,10 @@ int print_linkinfo(const struct sockaddr_nl *who,
>>  return -1;
>>  
>>  parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
>> -if (tb[IFLA_IFNAME] == NULL) {
>> -fprintf(stderr, "BUG: device with ifindex %d has nil ifname\n", 
>> ifi->ifi_index);
>> -name = ll_idx_n2a(ifi->ifi_index);
>> -} else {
>> -name = rta_getattr_str(tb[IFLA_IFNAME]);
>> -}
>> +
>> +name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
>> +if (!name)
>> +return -1;
>>  
>>  if (filter.label &&
>>  (!filter.family || filter.family == AF_PACKET) &&
>> diff --git a/lib/utils.c b/lib/utils.c
>> index d86c2ee..572d42a 100644
>> --- a/lib/utils.c
>> +++ b/lib/utils.c
>> @@ -871,6 +871,25 @@ int get_ifname(char *buf, const char *name)
>>  return ret;
>>  }
>>  
>> +const char *get_ifname_rta(int ifindex, const struct rtattr *rta)
>> +{
>> +const char *name;
>> +
>> +if (rta) {
>> +name = rta_getattr_str(rta);
>> +} else {
>> +fprintf(stderr,
>> +"BUG: device with ifindex %d has nil ifname\n",
>> +ifindex);
>> +name = ll_idx_n2a(ifindex);
>> +}
>> +
>> +if (check_ifname(name))
>> +return NULL;
>> +
>> +return name;
>> +}
>> +
>>  int matches(const char *cmd, const char *pattern)
>>  {
>>  int len = strlen(cmd);
>>
> 
> 
> Getting build failures with this one:
> $ make clean; make -j 8
> ...
> 
> devlink
> CC   devlink.o
> CC   mnlg.o
> LINK devlink
> ../lib/libutil.a(ll_map.o): In function `ll_remember_index':
> ll_map.c:(.text+0xf6): undefined reference to `parse_rtattr'
> ll_map.c:(.text+0x1c5): undefined reference to `parse_rtattr'
> ../lib/libutil.a(ll_map.o): In function `ll_init_map':
> ll_map.c:(.text+0x47c): undefined reference to `rtnl_wilddump_re

[RFC net PATCH] virtio_net: disable XDP_REDIRECT in receive_mergeable() case

2018-02-15 Thread Jesper Dangaard Brouer
The virtio_net code have three different RX code-paths in receive_buf().
Two of these code paths can handle XDP, but one of them is broken for
at least XDP_REDIRECT.

Function(1): receive_big() does not support XDP.
Function(2): receive_small() support XDP fully and uses build_skb().
Function(3): receive_mergeable() broken XDP_REDIRECT uses napi_alloc_skb().

The simple explanation is that receive_mergeable() is broken because
it uses napi_alloc_skb(), which violates XDP given XDP assumes packet
header+data in single page and enough tail room for skb_shared_info.

The longer explaination is that receive_mergeable() tries to
work-around and satisfy these XDP requiresments e.g. by having a
function xdp_linearize_page() that allocates and memcpy RX buffers
around (in case packet is scattered across multiple rx buffers).  This
does currently satisfy XDP_PASS, XDP_DROP and XDP_TX (but only because
we have not implemented bpf_xdp_adjust_tail yet).

The XDP_REDIRECT action combined with cpumap is broken, and cause hard
to debug crashes.  The main issue is that the RX packet does not have
the needed tail-room (SKB_DATA_ALIGN(skb_shared_info)), causing
skb_shared_info to overlap the next packets head-room (in which cpumap
stores info).

Reproducing depend on the packet payload length and if RX-buffer size
happened to have tail-room for skb_shared_info or not.  But to make
this even harder to troubleshoot, the RX-buffer size is runtime
dynamically change based on an Exponentially Weighted Moving Average
(EWMA) over the packet length, when refilling RX rings.

This patch only disable XDP_REDIRECT support in receive_mergeable()
case, because it can cause a real crash.

But IMHO we should NOT support XDP in receive_mergeable() at all,
because the principles behind XDP are to gain speed by (1) code
simplicity, (2) sacrificing memory and (3) where possible moving
runtime checks to setup time.  These principles are clearly being
violated in receive_mergeable(), that e.g. runtime track average
buffer size to save memory consumption.

Besides the described bug:

Update(1): There is also a OOM leak in the XDP_REDIRECT code, which
receive_small() is likely also affected by.

Update(2): Also observed a guest crash when redirecting out an
another virtio_net device, when device is down.

Fixes: 186b3c998c50 ("virtio-net: support XDP_REDIRECT")
Signed-off-by: Jesper Dangaard Brouer 
---
 drivers/net/virtio_net.c |7 ---
 1 file changed, 7 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 626c27352ae2..0ca91942a884 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -677,7 +677,6 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
struct bpf_prog *xdp_prog;
unsigned int truesize;
unsigned int headroom = mergeable_ctx_to_headroom(ctx);
-   int err;
 
head_skb = NULL;
 
@@ -754,12 +753,6 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
goto err_xdp;
rcu_read_unlock();
goto xdp_xmit;
-   case XDP_REDIRECT:
-   err = xdp_do_redirect(dev, &xdp, xdp_prog);
-   if (!err)
-   *xdp_xmit = true;
-   rcu_read_unlock();
-   goto xdp_xmit;
default:
bpf_warn_invalid_xdp_action(act);
case XDP_ABORTED:



[PATCH net] tun: fix tun_napi_alloc_frags() frag allocator

2018-02-15 Thread Eric Dumazet
From: Eric Dumazet 


While fuzzing arm64 v4.16-rc1 with Syzkaller, I've been hitting a
misaligned atomic in __skb_clone:

atomic_inc(&(skb_shinfo(skb)->dataref));

   where dataref doesn't have the required natural alignment, and the
   atomic operation faults. e.g. i often see it aligned to a single
   byte boundary rather than a four byte boundary.

   AFAICT, the skb_shared_info is misaligned at the instant it's
   allocated in __napi_alloc_skb()  __napi_alloc_skb()


Problem is caused by tun_napi_alloc_frags() using
napi_alloc_frag() with user provided seg sizes,
leading to other users of this API getting unaligned
page fragments.

Since we would like to not necessarily add paddings or alignments to
the frags that tun_napi_alloc_frags() attaches to the skb, switch to
another page frag allocator.

As a bonus skb_page_frag_refill() can use GFP_KERNEL allocations,
meaning that we can not deplete memory reserves as easily.

Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver")
Signed-off-by: Eric Dumazet 
Reported-by: Mark Rutland 
Tested-by: Mark Rutland 
---
 drivers/net/tun.c |   16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 
81e6cc951e7fc7c983919365c34842c34bcaedcf..b52258c327d2e1d7c7476de345e49f082909c246
 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1489,27 +1489,23 @@ static struct sk_buff *tun_napi_alloc_frags(struct 
tun_file *tfile,
skb->truesize += skb->data_len;
 
for (i = 1; i < it->nr_segs; i++) {
+   struct page_frag *pfrag = ¤t->task_frag;
size_t fragsz = it->iov[i].iov_len;
-   unsigned long offset;
-   struct page *page;
-   void *data;
 
if (fragsz == 0 || fragsz > PAGE_SIZE) {
err = -EINVAL;
goto free;
}
 
-   local_bh_disable();
-   data = napi_alloc_frag(fragsz);
-   local_bh_enable();
-   if (!data) {
+   if (!skb_page_frag_refill(fragsz, pfrag, GFP_KERNEL)) {
err = -ENOMEM;
goto free;
}
 
-   page = virt_to_head_page(data);
-   offset = data - page_address(page);
-   skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+   skb_fill_page_desc(skb, i - 1, pfrag->page,
+  pfrag->offset, fragsz);
+   page_ref_inc(pfrag->page);
+   pfrag->offset += fragsz;
}
 
return skb;



[PATCH net] rxrpc: Work around usercopy check

2018-02-15 Thread David Howells
Due to a check recently added to copy_to_user(), it's now not permitted to
copy from slab-held data to userspace unless the slab is whitelisted.  This
affects rxrpc_recvmsg() when it attempts to place an RXRPC_USER_CALL_ID
control message in the userspace control message buffer.  A warning is
generated by usercopy_warn() because the source is the copy of the
user_call_ID retained in the rxrpc_call struct.

Work around the issue by copying the user_call_ID to a variable on the
stack and passing that to put_cmsg().

The warning generated looks like:

Bad or missing usercopy whitelist? Kernel memory exposure attempt 
detected from SLUB object 'dmaengine-unmap-128' (offset 680, size 8)!
WARNING: CPU: 0 PID: 1401 at mm/usercopy.c:81 usercopy_warn+0x7e/0xa0
...
RIP: 0010:usercopy_warn+0x7e/0xa0
...
Call Trace:
 __check_object_size+0x9c/0x1a0
 put_cmsg+0x98/0x120
 rxrpc_recvmsg+0x6fc/0x1010 [rxrpc]
 ? finish_wait+0x80/0x80
 ___sys_recvmsg+0xf8/0x240
 ? __clear_rsb+0x25/0x3d
 ? __clear_rsb+0x15/0x3d
 ? __clear_rsb+0x25/0x3d
 ? __clear_rsb+0x15/0x3d
 ? __clear_rsb+0x25/0x3d
 ? __clear_rsb+0x15/0x3d
 ? __clear_rsb+0x25/0x3d
 ? __clear_rsb+0x15/0x3d
 ? finish_task_switch+0xa6/0x2b0
 ? trace_hardirqs_on_caller+0xed/0x180
 ? _raw_spin_unlock_irq+0x29/0x40
 ? __sys_recvmsg+0x4e/0x90
 __sys_recvmsg+0x4e/0x90
 do_syscall_64+0x7a/0x220
 entry_SYSCALL_64_after_hwframe+0x26/0x9b

Reported-by: Jonathan Billings 
Signed-off-by: David Howells 
Acked-by: Kees Cook 
Tested-by: Jonathan Billings 
---

 net/rxrpc/recvmsg.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index cc21e8db25b0..9d45d8b56744 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -517,9 +517,10 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, 
size_t len,
ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
   sizeof(unsigned int), &id32);
} else {
+   unsigned long idl = call->user_call_ID;
+
ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
-  sizeof(unsigned long),
-  &call->user_call_ID);
+  sizeof(unsigned long), &idl);
}
if (ret < 0)
goto error_unlock_call;



[PATCH bpf] bpf: fix mlock precharge on arraymaps

2018-02-15 Thread Daniel Borkmann
syzkaller recently triggered OOM during percpu map allocation;
while there is work in progress by Dennis Zhou to add __GFP_NORETRY
semantics for percpu allocator under pressure, there seems also a
missing bpf_map_precharge_memlock() check in array map allocation.

Given today the actual bpf_map_charge_memlock() happens after the
find_and_alloc_map() in syscall path, the bpf_map_precharge_memlock()
is there to bail out early before we go and do the map setup work
when we find that we hit the limits anyway. Therefore add this for
array map as well.

Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
Fixes: a10423b87a7e ("bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map")
Reported-by: syzbot+adb03f3f0bb57ce3a...@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann 
Cc: Dennis Zhou 
---
 kernel/bpf/arraymap.c | 28 
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b1f6648..a364c40 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -73,11 +73,11 @@ static int array_map_alloc_check(union bpf_attr *attr)
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
-   int numa_node = bpf_map_attr_numa_node(attr);
+   int ret, numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
bool unpriv = !capable(CAP_SYS_ADMIN);
+   u64 cost, array_size, mask64;
struct bpf_array *array;
-   u64 array_size, mask64;
 
elem_size = round_up(attr->value_size, 8);
 
@@ -109,8 +109,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr 
*attr)
array_size += (u64) max_entries * elem_size;
 
/* make sure there is no u32 overflow later in round_up() */
-   if (array_size >= U32_MAX - PAGE_SIZE)
+   cost = array_size;
+   if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-ENOMEM);
+   if (percpu) {
+   cost += (u64)attr->max_entries * elem_size * 
num_possible_cpus();
+   if (cost >= U32_MAX - PAGE_SIZE)
+   return ERR_PTR(-ENOMEM);
+   }
+   cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+   ret = bpf_map_precharge_memlock(cost);
+   if (ret < 0)
+   return ERR_PTR(ret);
 
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(array_size, numa_node);
@@ -121,20 +132,13 @@ static struct bpf_map *array_map_alloc(union bpf_attr 
*attr)
 
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
+   array->map.pages = cost;
array->elem_size = elem_size;
 
-   if (!percpu)
-   goto out;
-
-   array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
-
-   if (array_size >= U32_MAX - PAGE_SIZE ||
-   bpf_array_alloc_percpu(array)) {
+   if (percpu && bpf_array_alloc_percpu(array)) {
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
-out:
-   array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
 
return &array->map;
 }
-- 
2.9.5



Re: KASAN: use-after-free Read in sit_tunnel_xmit

2018-02-15 Thread Cong Wang
On Tue, Feb 13, 2018 at 10:48 AM, Dmitry Vyukov  wrote:
> On Mon, Oct 30, 2017 at 7:41 PM, Cong Wang  wrote:
>> On Mon, Oct 30, 2017 at 8:34 AM, syzbot
>> 
>> wrote:
>>> Hello,
>>>
>>> syzkaller hit the following crash on
>>> 4dc12ffeaeac939097a3f55c881d3dc3523dff0c
>>> git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/master
>>> compiler: gcc (GCC) 7.1.1 20170620
>>> .config is attached
>>> Raw console output is attached.
>>>
>>> skbuff: bad partial csum: csum=53081/14726 len=2273
>>> ==
>>> BUG: KASAN: use-after-free in ipv6_get_dsfield include/net/dsfield.h:23
>>> [inline]
>>> BUG: KASAN: use-after-free in ipip6_tunnel_xmit net/ipv6/sit.c:968 [inline]
>>> BUG: KASAN: use-after-free in sit_tunnel_xmit+0x2a41/0x3130
>>> net/ipv6/sit.c:1016
>>> Read of size 2 at addr 8801c64afd00 by task syz-executor3/16942
>>>
>>> CPU: 0 PID: 16942 Comm: syz-executor3 Not tainted 4.14.0-rc5+ #97
>>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
>>> Google 01/01/2011
>>> Call Trace:
>>>  __dump_stack lib/dump_stack.c:16 [inline]
>>>  dump_stack+0x194/0x257 lib/dump_stack.c:52
>>>  print_address_description+0x73/0x250 mm/kasan/report.c:252
>>>  kasan_report_error mm/kasan/report.c:351 [inline]
>>>  kasan_report+0x25b/0x340 mm/kasan/report.c:409
>>>  __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:428
>>>  ipv6_get_dsfield include/net/dsfield.h:23 [inline]
>>>  ipip6_tunnel_xmit net/ipv6/sit.c:968 [inline]
>>>  sit_tunnel_xmit+0x2a41/0x3130 net/ipv6/sit.c:1016
>>>  __netdev_start_xmit include/linux/netdevice.h:4022 [inline]
>>>  netdev_start_xmit include/linux/netdevice.h:4031 [inline]
>>>  xmit_one net/core/dev.c:3008 [inline]
>>>  dev_hard_start_xmit+0x248/0xac0 net/core/dev.c:3024
>>>  __dev_queue_xmit+0x17d2/0x2070 net/core/dev.c:3505
>>>  dev_queue_xmit+0x17/0x20 net/core/dev.c:3538
>>>  neigh_direct_output+0x15/0x20 net/core/neighbour.c:1390
>>>  neigh_output include/net/neighbour.h:481 [inline]
>>>  ip6_finish_output2+0xad1/0x22a0 net/ipv6/ip6_output.c:120
>>>  ip6_fragment+0x25ae/0x3420 net/ipv6/ip6_output.c:723
>>>  ip6_finish_output+0x319/0x920 net/ipv6/ip6_output.c:144
>>>  NF_HOOK_COND include/linux/netfilter.h:238 [inline]
>>>  ip6_output+0x1f4/0x850 net/ipv6/ip6_output.c:163
>>>  dst_output include/net/dst.h:459 [inline]
>>>  ip6_local_out+0x95/0x160 net/ipv6/output_core.c:176
>>>  ip6_send_skb+0xa1/0x330 net/ipv6/ip6_output.c:1658
>>>  ip6_push_pending_frames+0xb3/0xe0 net/ipv6/ip6_output.c:1678
>>>  rawv6_push_pending_frames net/ipv6/raw.c:616 [inline]
>>>  rawv6_sendmsg+0x2eb9/0x3e40 net/ipv6/raw.c:935
>>>  inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763
>>>  sock_sendmsg_nosec net/socket.c:633 [inline]
>>>  sock_sendmsg+0xca/0x110 net/socket.c:643
>>>  SYSC_sendto+0x352/0x5a0 net/socket.c:1750
>>>  SyS_sendto+0x40/0x50 net/socket.c:1718
>>>  entry_SYSCALL_64_fastpath+0x1f/0xbe
>>> RIP: 0033:0x452869
>>> RSP: 002b:7fe3c12e5be8 EFLAGS: 0212 ORIG_RAX: 002c
>>> RAX: ffda RBX: 007580d8 RCX: 00452869
>>> RDX: 07f1 RSI: 2013b7ff RDI: 0014
>>> RBP: 0161 R08: 204e8fe4 R09: 001c
>>> R10: 0100 R11: 0212 R12: 006f01b8
>>> R13:  R14: 7fe3c12e66d4 R15: 0017
>>>
>>> Allocated by task 16924:
>>>  save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
>>>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>>>  set_track mm/kasan/kasan.c:459 [inline]
>>>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
>>>  __do_kmalloc_node mm/slab.c:3689 [inline]
>>>  __kmalloc_node_track_caller+0x47/0x70 mm/slab.c:3703
>>>  __kmalloc_reserve.isra.40+0x41/0xd0 net/core/skbuff.c:138
>>>  __alloc_skb+0x13b/0x780 net/core/skbuff.c:206
>>>  alloc_skb include/linux/skbuff.h:985 [inline]
>>>  sock_wmalloc+0x140/0x1d0 net/core/sock.c:1932
>>>  __ip6_append_data.isra.43+0x2681/0x3340 net/ipv6/ip6_output.c:1397
>>>  ip6_append_data+0x189/0x290 net/ipv6/ip6_output.c:1552
>>>  rawv6_sendmsg+0x1dd9/0x3e40 net/ipv6/raw.c:928
>>>  inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763
>>>  sock_sendmsg_nosec net/socket.c:633 [inline]
>>>  sock_sendmsg+0xca/0x110 net/socket.c:643
>>>  SYSC_sendto+0x352/0x5a0 net/socket.c:1750
>>>  SyS_sendto+0x40/0x50 net/socket.c:1718
>>>  entry_SYSCALL_64_fastpath+0x1f/0xbe
>>>
>>> Freed by task 16942:
>>>  save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
>>>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>>>  set_track mm/kasan/kasan.c:459 [inline]
>>>  kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
>>>  __cache_free mm/slab.c:3503 [inline]
>>>  kfree+0xca/0x250 mm/slab.c:3820
>>>  skb_free_head+0x74/0xb0 net/core/skbuff.c:554
>>>  pskb_expand_head+0x36b/0x1210 net/core/skbuff.c:1494
>>>  __pskb_pull_tail+0x14a/0x17c0 net/core/skbuff.c:1877
>>>  pskb_may_pull include/linux/skbuff.h:2102 [inline]
>>>  _decode_session6+0x8a4/0x1100 net/ipv6

Re: [PATCH net v4 02/13] net/8390: Fix msg_enable patch snafu

2018-02-15 Thread Finn Thain
On Wed, 14 Feb 2018, David Miller wrote:

> > Have you considered that implementing the ethtool hooks in the core 
> > driver might allow removal of all 8390 driver 'msg_enable' module 
> > parameters and msglevel ethtool hooks added by c45f812f0280, excepting 
> > those in the core driver? But even if we did that, it seems to me that 
> > we still need this patch.
> 
> No, because the module parameter lets you set the default msg level at 
> the time the driver loads, so you can control messages printed very 
> early on before it is practical to invoke ethtool and set the msg level.
> 
> This is why most drivers have this module parameter, and implement
> such a scheme.

Among the 8390 drivers, so far only ne2k-pci implements that scheme.

This patch implements that scheme for ax88796 and etherh as well, by 
making better use of the msg_enable module parameter in lib8390.c.

The axnet_cs and pcnet_cs modules lack the msglevel ethtool ops and the 
msg_enable module parameters.

The remaining nine modules lack just the ethtool ops. If you like I will 
write additional patches to implement the missing ethtool ops or module 
parameters or both (?)

This small patch already addresses the use-case in which the end-user 
needs to enable (for example) probe messages for some or all 8390 drivers.

Thoughts?

-- 


Re: [RFC PATCH] net/ncsi: Add generic netlink family

2018-02-15 Thread Samuel Mendoza-Jonas
On Thu, 2018-02-15 at 14:50 +1030, Joel Stanley wrote:
> Hey Sam,
> 
> On Thu, Feb 15, 2018 at 2:00 PM, Samuel Mendoza-Jonas
>  wrote:
> > Add a generic netlink family for NCSI. This supports two commands;
> > NCSI_CMD_PKG_INFO which returns information on packages and their
> > associated channels, and NCSI_CMD_SET_INTERFACE which allows a specific
> > package or package/channel combination to be set as the preferred
> > choice.
> > 
> > Signed-off-by: Samuel Mendoza-Jonas 
> > ---
> > Fielding an RFC first to gauge what sort of information people may want
> > out of an NCSI user API before it gets carved in stone. This RFC exposes
> > a few main things such as link state, active link, channel versions, and
> > active vlan ids, is there more that could be helpful in version 1 of the
> > UAPI?
> > The big drawcard here is of course the ability to set preferred packages
> > and channels so that the NCSI link can be bound to *only* port 0 for
> > example. If you have opinions about how this should function now is the
> > time to speak up :)
> 
> I'd recommend ccing the netdev mailing list in addition to OpenBMC.

True, I was mostly thinking of checking in with the OpenBMC group but
I'll add in netdev now as well in case I've made a netlink faux pas :)

> 
> I was chatting with Facebook people about some advanced uses of NCSI.
> I've added Sai to cc, hopefully he can loop in the right people.
> 
> The advanced uses included per-manufacturer OEM commands for MAC
> address retrieval, firmware updates, and I heard someone mention temp
> sensors-over-NCSI. I'm not that all of those would fall into the
> category of a netlink API or not, but it's worth considering those
> requirements in your design, even if you're not implementing it at
> this stage.

Yep there's been some brainstorming about this as well. I suspect at
least a generic "send an NCSI packet with this header and these
parameters" Netlink command could be a good fit - that way the NCSI
driver is aware of any response frames instead of receiving unexpected
frames and throwing them away / logging an error.
OEM-specific stuff is probably best left out of the driver but with that
interface could be handled nicely.

> 
> Would a netlink API mean we would have to write some userspace tools
> to perform this configuration? Do you have any code for that?

Yes and yes - I've got a little python script that matches this patch,
I'll link it once I've cleaned it up a little.

> 
> Cheers,
> 
> Joel
> 
> 
> > 
> >  include/uapi/linux/ncsi.h |  65 
> >  net/ncsi/Makefile |   2 +-
> >  net/ncsi/internal.h   |   3 +
> >  net/ncsi/ncsi-manage.c|  28 +++-
> >  net/ncsi/ncsi-netlink.c   | 394 
> > ++
> >  net/ncsi/ncsi-netlink.h   |  20 +++
> >  6 files changed, 508 insertions(+), 4 deletions(-)
> >  create mode 100644 include/uapi/linux/ncsi.h
> >  create mode 100644 net/ncsi/ncsi-netlink.c
> >  create mode 100644 net/ncsi/ncsi-netlink.h
> > 
> > diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h
> > new file mode 100644
> > index ..45de201569e3
> > --- /dev/null
> > +++ b/include/uapi/linux/ncsi.h
> > @@ -0,0 +1,65 @@
> > +/*
> > + * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + */
> > +
> > +#ifndef __UAPI_NCSI_NETLINK_H__
> > +#define __UAPI_NCSI_NETLINK_H__
> > +
> > +enum ncsi_nl_commands {
> > +   NCSI_CMD_UNSPEC,
> > +   NCSI_CMD_SET_INTERFACE,
> > +   NCSI_CMD_PKG_INFO,
> > +   NCSI_CMD_MAX,
> > +};
> > +
> > +#define NCSI_CMD_MAX (NCSI_CMD_MAX - 1)
> > +
> > +enum ncsi_nl_attrs {
> > +   NCSI_ATTR_UNSPEC,
> > +   NCSI_ATTR_IFINDEX,
> > +   NCSI_ATTR_PACKAGE_LIST,
> > +   NCSI_ATTR_PACKAGE_ID,
> > +   NCSI_ATTR_CHANNEL_ID,
> > +   NCSI_ATTR_MAX,
> > +};
> > +
> > +enum ncsi_nl_pkg_attrs {
> > +   NCSI_PKG_ATTR_UNSPEC,
> > +   NCSI_PKG_ATTR,
> > +   NCSI_PKG_ATTR_ID,
> > +   NCSI_PKG_ATTR_CHANNEL_LIST,
> > +   NCSI_PKG_ATTR_MAX,
> > +};
> > +
> > +enum ncsi_nl_channel_attrs {
> > +   NCSI_CHANNEL_ATTR_UNSPEC,
> > +   NCSI_CHANNEL_ATTR,
> > +   NCSI_CHANNEL_ATTR_ID,
> > +   NCSI_CHANNEL_ATTR_VERSION_MAJOR,
> > +   NCSI_CHANNEL_ATTR_VERSION_MINOR,
> > +   NCSI_CHANNEL_ATTR_VERSION_STR,
> > +   NCSI_CHANNEL_ATTR_LINK_STATE,
> > +   NCSI_CHANNEL_ATTR_ACTIVE,
> > +   NCSI_CHANNEL_ATTR_VLAN_LIST,
> > +   NCSI_CHANNEL_ATTR_MAX,
> > +};
> > +
> > +enum ncsi_nl_vlan_attrs {
> > +   NCSI_VLAN_UNSPEC,
> > +   NCSI_VLAN_ATTR,
> > +   NCSI_VLAN_ATTR_ID,
> > +   NCSI_VLAN_ATTR_PROTO,
> > +   NCSI_VLAN_ATTR_MAX,
> > +};
> > +
> > +#define NCSI_ATTR_MAX (NCSI_ATTR_MAX - 1)
> >

[PATCH net-next 0/2] nfp: whitespace sync and flower TCP flags

2018-02-15 Thread Jakub Kicinski
Hi!

Whitespace cleanup from Michael and flower offload support for matching
on TCP flags from Pieter.

Michael Rapson (1):
  nfp: standardize FW header whitespace

Pieter Jansen van Vuuren (1):
  nfp: flower: implement tcp flag match offload

 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   |  11 +-
 drivers/net/ethernet/netronome/nfp/flower/main.h   |   1 +
 drivers/net/ethernet/netronome/nfp/flower/match.c  |  20 ++
 .../net/ethernet/netronome/nfp/flower/offload.c|  34 +++
 drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h  | 280 ++---
 5 files changed, 204 insertions(+), 142 deletions(-)

-- 
2.15.1



[PATCH net-next 2/2] nfp: flower: implement tcp flag match offload

2018-02-15 Thread Jakub Kicinski
From: Pieter Jansen van Vuuren 

Implement tcp flag match offloading. Current tcp flag match support include
FIN, SYN, RST, PSH and URG flags, other flags are unsupported. The PSH and
URG flags are only set in the hardware fast path when used in combination
with the SYN, RST and PSH flags.

Signed-off-by: Pieter Jansen van Vuuren 
Reviewed-by: John Hurley 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   | 11 +--
 drivers/net/ethernet/netronome/nfp/flower/main.h   |  1 +
 drivers/net/ethernet/netronome/nfp/flower/match.c  | 20 +
 .../net/ethernet/netronome/nfp/flower/offload.c| 34 ++
 4 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h 
b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index adfe474c2cf0..28c1cd5b823b 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -61,6 +61,13 @@
 #define NFP_FLOWER_MASK_MPLS_BOS   BIT(8)
 #define NFP_FLOWER_MASK_MPLS_Q BIT(0)
 
+/* Compressed HW representation of TCP Flags */
+#define NFP_FL_TCP_FLAG_URGBIT(4)
+#define NFP_FL_TCP_FLAG_PSHBIT(3)
+#define NFP_FL_TCP_FLAG_RSTBIT(2)
+#define NFP_FL_TCP_FLAG_SYNBIT(1)
+#define NFP_FL_TCP_FLAG_FINBIT(0)
+
 #define NFP_FL_SC_ACT_DROP 0x8000
 #define NFP_FL_SC_ACT_USER 0x7D00
 #define NFP_FL_SC_ACT_POPV 0x6A00
@@ -257,7 +264,7 @@ struct nfp_flower_tp_ports {
  *3   2   1
  *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * |DSCP   |ECN|   protocol|   reserved|
+ * |DSCP   |ECN|   protocol|  ttl  | flags |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  * |ipv4_addr_src  |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@@ -268,7 +275,7 @@ struct nfp_flower_ipv4 {
u8 tos;
u8 proto;
u8 ttl;
-   u8 reserved;
+   u8 flags;
__be32 ipv4_src;
__be32 ipv4_dst;
 };
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h 
b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 332ff0fdc038..c5cebf6fb1d3 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 struct net_device;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c 
b/drivers/net/ethernet/netronome/nfp/flower/match.c
index 37c2ecae2a7a..b3bc8279d4fb 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/match.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/match.c
@@ -181,6 +181,26 @@ nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame,
frame->tos = flow_ip->tos;
frame->ttl = flow_ip->ttl;
}
+
+   if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_TCP)) {
+   struct flow_dissector_key_tcp *tcp;
+   u32 tcp_flags;
+
+   tcp = skb_flow_dissector_target(flow->dissector,
+   FLOW_DISSECTOR_KEY_TCP, target);
+   tcp_flags = be16_to_cpu(tcp->flags);
+
+   if (tcp_flags & TCPHDR_FIN)
+   frame->flags |= NFP_FL_TCP_FLAG_FIN;
+   if (tcp_flags & TCPHDR_SYN)
+   frame->flags |= NFP_FL_TCP_FLAG_SYN;
+   if (tcp_flags & TCPHDR_RST)
+   frame->flags |= NFP_FL_TCP_FLAG_RST;
+   if (tcp_flags & TCPHDR_PSH)
+   frame->flags |= NFP_FL_TCP_FLAG_PSH;
+   if (tcp_flags & TCPHDR_URG)
+   frame->flags |= NFP_FL_TCP_FLAG_URG;
+   }
 }
 
 static void
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c 
b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index eb5c13dea8f5..f3586c519805 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -44,11 +44,16 @@
 #include "../nfp_net.h"
 #include "../nfp_port.h"
 
+#define NFP_FLOWER_SUPPORTED_TCPFLAGS \
+   (TCPHDR_FIN | TCPHDR_SYN | TCPHDR_RST | \
+TCPHDR_PSH | TCPHDR_URG)
+
 #define NFP_FLOWER_WHITELIST_DISSECTOR \
(BIT(FLOW_DISSECTOR_KEY_CONTROL) | \
 BIT(FLOW_DISSECTOR_KEY_BASIC) | \
 BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | \
 BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | \
+BIT(FLOW_DISSECTOR_KEY_TCP) | \
 BIT(FLOW_DISSECTOR_KEY_PORTS) | \
 BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | \
 BIT(FLOW_DISSECTOR_KEY_VLAN) | \
@@ -288,6 +293,35 @@ nfp_flower_calculate_key_layers(struct nfp_app *app,
}
 

[PATCH net-next 1/2] nfp: standardize FW header whitespace

2018-02-15 Thread Jakub Kicinski
From: Michael Rapson 

The nfp_net_ctrl.h file used spaces for indentation in the past but
tabs have crept in.  Host driver files use tabs for indentation by
default, so let's convert to tabs for consistency across the file
and our drivers.

Signed-off-by: Michael Rapson 
Acked-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h | 280 +++---
 1 file changed, 140 insertions(+), 140 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
index 4499a7333078..bb63c115537d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2015-2017 Netronome Systems, Inc.
+ * Copyright (C) 2015-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -51,12 +51,12 @@
  * The configuration BAR is 8K in size, but due to
  * THB-350, 32k needs to be reserved.
  */
-#define NFP_NET_CFG_BAR_SZ  (32 * 1024)
+#define NFP_NET_CFG_BAR_SZ (32 * 1024)
 
 /**
  * Offset in Freelist buffer where packet starts on RX
  */
-#define NFP_NET_RX_OFFSET   32
+#define NFP_NET_RX_OFFSET  32
 
 /**
  * LSO parameters
@@ -75,65 +75,65 @@
 #define NFP_NET_META_PORTID5
 #define NFP_NET_META_CSUM  6 /* checksum complete type */
 
-#defineNFP_META_PORT_ID_CTRL   ~0U
+#define NFP_META_PORT_ID_CTRL  ~0U
 
 /**
  * Hash type pre-pended when a RSS hash was computed
  */
-#define NFP_NET_RSS_NONE0
-#define NFP_NET_RSS_IPV41
-#define NFP_NET_RSS_IPV62
-#define NFP_NET_RSS_IPV6_EX 3
-#define NFP_NET_RSS_IPV4_TCP4
-#define NFP_NET_RSS_IPV6_TCP5
-#define NFP_NET_RSS_IPV6_EX_TCP 6
-#define NFP_NET_RSS_IPV4_UDP7
-#define NFP_NET_RSS_IPV6_UDP8
-#define NFP_NET_RSS_IPV6_EX_UDP 9
+#define NFP_NET_RSS_NONE   0
+#define NFP_NET_RSS_IPV4   1
+#define NFP_NET_RSS_IPV6   2
+#define NFP_NET_RSS_IPV6_EX3
+#define NFP_NET_RSS_IPV4_TCP   4
+#define NFP_NET_RSS_IPV6_TCP   5
+#define NFP_NET_RSS_IPV6_EX_TCP6
+#define NFP_NET_RSS_IPV4_UDP   7
+#define NFP_NET_RSS_IPV6_UDP   8
+#define NFP_NET_RSS_IPV6_EX_UDP9
 
 /**
  * Ring counts
- * %NFP_NET_TXR_MAX: Maximum number of TX rings
- * %NFP_NET_RXR_MAX: Maximum number of RX rings
+ * %NFP_NET_TXR_MAX:Maximum number of TX rings
+ * %NFP_NET_RXR_MAX:Maximum number of RX rings
  */
-#define NFP_NET_TXR_MAX 64
-#define NFP_NET_RXR_MAX 64
+#define NFP_NET_TXR_MAX64
+#define NFP_NET_RXR_MAX64
 
 /**
  * Read/Write config words (0x - 0x002c)
- * %NFP_NET_CFG_CTRL:Global control
+ * %NFP_NET_CFG_CTRL:   Global control
  * %NFP_NET_CFG_UPDATE:  Indicate which fields are updated
  * %NFP_NET_CFG_TXRS_ENABLE: Bitmask of enabled TX rings
  * %NFP_NET_CFG_RXRS_ENABLE: Bitmask of enabled RX rings
- * %NFP_NET_CFG_MTU: Set MTU size
+ * %NFP_NET_CFG_MTU:Set MTU size
  * %NFP_NET_CFG_FLBUFSZ: Set freelist buffer size (must be larger than MTU)
- * %NFP_NET_CFG_EXN: MSI-X table entry for exceptions
- * %NFP_NET_CFG_LSC: MSI-X table entry for link state changes
+ * %NFP_NET_CFG_EXN:MSI-X table entry for exceptions
+ * %NFP_NET_CFG_LSC:MSI-X table entry for link state changes
  * %NFP_NET_CFG_MACADDR: MAC address
  *
  * TODO:
  * - define Error details in UPDATE
  */
-#define NFP_NET_CFG_CTRL0x
-#define   NFP_NET_CFG_CTRL_ENABLE (0x1 <<  0) /* Global enable */
-#define   NFP_NET_CFG_CTRL_PROMISC(0x1 <<  1) /* Enable Promisc mode */
-#define   NFP_NET_CFG_CTRL_L2BC   (0x1 <<  2) /* Allow L2 Broadcast */
-#define   NFP_NET_CFG_CTRL_L2MC   (0x1 <<  3) /* Allow L2 Multicast */
-#define   NFP_NET_CFG_CTRL_RXCSUM (0x1 <<  4) /* Enable RX Checksum */
-#define   NFP_NET_CFG_CTRL_TXCSUM (0x1 <<  5) /* Enable TX Checksum */
-#define   NFP_NET_CFG_CTRL_RXVLAN (0x1 <<  6) /* Enable VLAN strip */
-#define   NFP_NET_CFG_CTRL_TXVLAN (0x1 <<  7) /* Enable VLAN insert */
-#define   NFP_NET_CFG_CTRL_SCATTER(0x1 <<  8) /* Scatter DMA */
-#define   NFP_NET_CFG_CTRL_GATHER (0x1 <<  9) /* Gather DMA */
-#define   NFP_NET_CFG_CTRL_LSO(0x1 << 10) /* LSO/TSO (version 1) */
+#define NFP_NET_CFG_CTRL   0x
+#define   NFP_NET_CFG_CTRL_ENABLE(0x1 <<  0) /* Global enable */
+#define   NFP_NET_CFG_CTRL_PROMISC   (0x1 <<  1) /* Enable Promisc mode */
+#define   NFP_NET_CFG_CTRL_L2BC 

[PATCH net] tg3: APE heartbeat changes

2018-02-15 Thread Satish Baddipadige
From: Prashant Sreedharan 

In ungraceful host shutdown or driver crash case BMC connectivity is
lost. APE firmware is missing the driver state in this
case to keep the BMC connectivity alive.
This patch has below change to address this issue.

Heartbeat mechanism with APE firmware. This heartbeat mechanism
is needed to notify the APE firmware about driver state.

This patch also has the change in wait time for APE event from
1ms to 20ms as there can be some delay in getting response.

Signed-off-by: Prashant Sreedharan 
Signed-off-by: Satish Baddipadige 
Signed-off-by: Siva Reddy Kallam 
Acked-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/tg3.c | 35 ---
 drivers/net/ethernet/broadcom/tg3.h |  5 +
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index a77ee2f..36d8d1e 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -820,7 +820,7 @@ static int tg3_ape_event_lock(struct tg3 *tp, u32 
timeout_us)
 
tg3_ape_unlock(tp, TG3_APE_LOCK_MEM);
 
-   udelay(10);
+   usleep_range(10, 20);
timeout_us -= (timeout_us > 10) ? 10 : timeout_us;
}
 
@@ -922,8 +922,8 @@ static int tg3_ape_send_event(struct tg3 *tp, u32 event)
if (!(apedata & APE_FW_STATUS_READY))
return -EAGAIN;
 
-   /* Wait for up to 1 millisecond for APE to service previous event. */
-   err = tg3_ape_event_lock(tp, 1000);
+   /* Wait for up to 20 millisecond for APE to service previous event. */
+   err = tg3_ape_event_lock(tp, 2);
if (err)
return err;
 
@@ -946,6 +946,7 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, int 
kind)
 
switch (kind) {
case RESET_KIND_INIT:
+   tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_COUNT, tp->ape_hb++);
tg3_ape_write32(tp, TG3_APE_HOST_SEG_SIG,
APE_HOST_SEG_SIG_MAGIC);
tg3_ape_write32(tp, TG3_APE_HOST_SEG_LEN,
@@ -962,13 +963,6 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, 
int kind)
event = APE_EVENT_STATUS_STATE_START;
break;
case RESET_KIND_SHUTDOWN:
-   /* With the interface we are currently using,
-* APE does not track driver state.  Wiping
-* out the HOST SEGMENT SIGNATURE forces
-* the APE to assume OS absent status.
-*/
-   tg3_ape_write32(tp, TG3_APE_HOST_SEG_SIG, 0x0);
-
if (device_may_wakeup(&tp->pdev->dev) &&
tg3_flag(tp, WOL_ENABLE)) {
tg3_ape_write32(tp, TG3_APE_HOST_WOL_SPEED,
@@ -990,6 +984,18 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, 
int kind)
tg3_ape_send_event(tp, event);
 }
 
+static inline void tg3_send_ape_heartbeat(struct tg3 *tp,
+ unsigned long interval)
+{
+   /* Check if hb interval has exceeded */
+   if (!tg3_flag(tp, ENABLE_APE) ||
+   time_before(jiffies, tp->ape_hb_jiffies + interval))
+   return;
+
+   tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_COUNT, tp->ape_hb++);
+   tp->ape_hb_jiffies = jiffies;
+}
+
 static void tg3_disable_ints(struct tg3 *tp)
 {
int i;
@@ -7262,6 +7268,7 @@ static int tg3_poll_msix(struct napi_struct *napi, int 
budget)
}
}
 
+   tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL << 1);
return work_done;
 
 tx_recovery:
@@ -7344,6 +7351,7 @@ static int tg3_poll(struct napi_struct *napi, int budget)
}
}
 
+   tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL << 1);
return work_done;
 
 tx_recovery:
@@ -10732,7 +10740,7 @@ static int tg3_reset_hw(struct tg3 *tp, bool reset_phy)
if (tg3_flag(tp, ENABLE_APE))
/* Write our heartbeat update interval to APE. */
tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_INT_MS,
-   APE_HOST_HEARTBEAT_INT_DISABLE);
+   APE_HOST_HEARTBEAT_INT_5SEC);
 
tg3_write_sig_post_reset(tp, RESET_KIND_INIT);
 
@@ -11077,6 +11085,9 @@ static void tg3_timer(struct timer_list *t)
tp->asf_counter = tp->asf_multiplier;
}
 
+   /* Update the APE heartbeat every 5 seconds.*/
+   tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL);
+
spin_unlock(&tp->lock);
 
 restart_timer:
@@ -16653,6 +16664,8 @@ static int tg3_get_invariants(struct tg3 *tp, const 
struct pci_device_id *ent)
   pci_state_reg);
 
tg3_ape_lock_init(tp);
+   tp->ape_hb_interval =
+   msecs_to_jiffies(APE_HOST_HEARTBEAT_INT_5SEC);
}
 
/* Set up tp->grc_local_ctrl before calling
diff --

[PATCH net] tg3: APE heartbeat changes

2018-02-15 Thread Satish Baddipadige
From: Prashant Sreedharan 

In ungraceful host shutdown or driver crash case BMC connectivity is
lost. APE firmware is missing the driver state in this
case to keep the BMC connectivity alive.
This patch has below change to address this issue.

Heartbeat mechanism with APE firmware. This heartbeat mechanism
is needed to notify the APE firmware about driver state.

This patch also has the change in wait time for APE event from
1ms to 20ms as there can be some delay in getting response.

Signed-off-by: Prashant Sreedharan 
Signed-off-by: Satish Baddipadige 
Signed-off-by: Siva Reddy Kallam 
Acked-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/tg3.c | 35 ---
 drivers/net/ethernet/broadcom/tg3.h |  5 +
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index a77ee2f..36d8d1e 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -820,7 +820,7 @@ static int tg3_ape_event_lock(struct tg3 *tp, u32 
timeout_us)
 
tg3_ape_unlock(tp, TG3_APE_LOCK_MEM);
 
-   udelay(10);
+   usleep_range(10, 20);
timeout_us -= (timeout_us > 10) ? 10 : timeout_us;
}
 
@@ -922,8 +922,8 @@ static int tg3_ape_send_event(struct tg3 *tp, u32 event)
if (!(apedata & APE_FW_STATUS_READY))
return -EAGAIN;
 
-   /* Wait for up to 1 millisecond for APE to service previous event. */
-   err = tg3_ape_event_lock(tp, 1000);
+   /* Wait for up to 20 millisecond for APE to service previous event. */
+   err = tg3_ape_event_lock(tp, 2);
if (err)
return err;
 
@@ -946,6 +946,7 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, int 
kind)
 
switch (kind) {
case RESET_KIND_INIT:
+   tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_COUNT, tp->ape_hb++);
tg3_ape_write32(tp, TG3_APE_HOST_SEG_SIG,
APE_HOST_SEG_SIG_MAGIC);
tg3_ape_write32(tp, TG3_APE_HOST_SEG_LEN,
@@ -962,13 +963,6 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, 
int kind)
event = APE_EVENT_STATUS_STATE_START;
break;
case RESET_KIND_SHUTDOWN:
-   /* With the interface we are currently using,
-* APE does not track driver state.  Wiping
-* out the HOST SEGMENT SIGNATURE forces
-* the APE to assume OS absent status.
-*/
-   tg3_ape_write32(tp, TG3_APE_HOST_SEG_SIG, 0x0);
-
if (device_may_wakeup(&tp->pdev->dev) &&
tg3_flag(tp, WOL_ENABLE)) {
tg3_ape_write32(tp, TG3_APE_HOST_WOL_SPEED,
@@ -990,6 +984,18 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, 
int kind)
tg3_ape_send_event(tp, event);
 }
 
+static inline void tg3_send_ape_heartbeat(struct tg3 *tp,
+ unsigned long interval)
+{
+   /* Check if hb interval has exceeded */
+   if (!tg3_flag(tp, ENABLE_APE) ||
+   time_before(jiffies, tp->ape_hb_jiffies + interval))
+   return;
+
+   tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_COUNT, tp->ape_hb++);
+   tp->ape_hb_jiffies = jiffies;
+}
+
 static void tg3_disable_ints(struct tg3 *tp)
 {
int i;
@@ -7262,6 +7268,7 @@ static int tg3_poll_msix(struct napi_struct *napi, int 
budget)
}
}
 
+   tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL << 1);
return work_done;
 
 tx_recovery:
@@ -7344,6 +7351,7 @@ static int tg3_poll(struct napi_struct *napi, int budget)
}
}
 
+   tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL << 1);
return work_done;
 
 tx_recovery:
@@ -10732,7 +10740,7 @@ static int tg3_reset_hw(struct tg3 *tp, bool reset_phy)
if (tg3_flag(tp, ENABLE_APE))
/* Write our heartbeat update interval to APE. */
tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_INT_MS,
-   APE_HOST_HEARTBEAT_INT_DISABLE);
+   APE_HOST_HEARTBEAT_INT_5SEC);
 
tg3_write_sig_post_reset(tp, RESET_KIND_INIT);
 
@@ -11077,6 +11085,9 @@ static void tg3_timer(struct timer_list *t)
tp->asf_counter = tp->asf_multiplier;
}
 
+   /* Update the APE heartbeat every 5 seconds.*/
+   tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL);
+
spin_unlock(&tp->lock);
 
 restart_timer:
@@ -16653,6 +16664,8 @@ static int tg3_get_invariants(struct tg3 *tp, const 
struct pci_device_id *ent)
   pci_state_reg);
 
tg3_ape_lock_init(tp);
+   tp->ape_hb_interval =
+   msecs_to_jiffies(APE_HOST_HEARTBEAT_INT_5SEC);
}
 
/* Set up tp->grc_local_ctrl before calling
diff --

Re: [RFC net PATCH] virtio_net: disable XDP_REDIRECT in receive_mergeable() case

2018-02-15 Thread Jason Wang



On 2018年02月16日 06:43, Jesper Dangaard Brouer wrote:

The virtio_net code have three different RX code-paths in receive_buf().
Two of these code paths can handle XDP, but one of them is broken for
at least XDP_REDIRECT.

Function(1): receive_big() does not support XDP.
Function(2): receive_small() support XDP fully and uses build_skb().
Function(3): receive_mergeable() broken XDP_REDIRECT uses napi_alloc_skb().

The simple explanation is that receive_mergeable() is broken because
it uses napi_alloc_skb(), which violates XDP given XDP assumes packet
header+data in single page and enough tail room for skb_shared_info.

The longer explaination is that receive_mergeable() tries to
work-around and satisfy these XDP requiresments e.g. by having a
function xdp_linearize_page() that allocates and memcpy RX buffers
around (in case packet is scattered across multiple rx buffers).  This
does currently satisfy XDP_PASS, XDP_DROP and XDP_TX (but only because
we have not implemented bpf_xdp_adjust_tail yet).

The XDP_REDIRECT action combined with cpumap is broken, and cause hard
to debug crashes.  The main issue is that the RX packet does not have
the needed tail-room (SKB_DATA_ALIGN(skb_shared_info)), causing
skb_shared_info to overlap the next packets head-room (in which cpumap
stores info).

Reproducing depend on the packet payload length and if RX-buffer size
happened to have tail-room for skb_shared_info or not.  But to make
this even harder to troubleshoot, the RX-buffer size is runtime
dynamically change based on an Exponentially Weighted Moving Average
(EWMA) over the packet length, when refilling RX rings.

This patch only disable XDP_REDIRECT support in receive_mergeable()
case, because it can cause a real crash.

But IMHO we should NOT support XDP in receive_mergeable() at all,
because the principles behind XDP are to gain speed by (1) code
simplicity, (2) sacrificing memory and (3) where possible moving
runtime checks to setup time.  These principles are clearly being
violated in receive_mergeable(), that e.g. runtime track average
buffer size to save memory consumption.


I agree to disable it for -net now. For net-next, we probably can do:

- drop xdp_linearize_page() and do XDP through generic XDP helper after 
skb was built

- disable EWMA when XDP is set and reserve enough tailroom.



Besides the described bug:

Update(1): There is also a OOM leak in the XDP_REDIRECT code, which
receive_small() is likely also affected by.

Update(2): Also observed a guest crash when redirecting out an
another virtio_net device, when device is down.


Will have a look at these issues. (Holiday in china now, so will do it 
after).


Thanks



Fixes: 186b3c998c50 ("virtio-net: support XDP_REDIRECT")
Signed-off-by: Jesper Dangaard Brouer 
---
  drivers/net/virtio_net.c |7 ---
  1 file changed, 7 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 626c27352ae2..0ca91942a884 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -677,7 +677,6 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
struct bpf_prog *xdp_prog;
unsigned int truesize;
unsigned int headroom = mergeable_ctx_to_headroom(ctx);
-   int err;
  
  	head_skb = NULL;
  
@@ -754,12 +753,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,

goto err_xdp;
rcu_read_unlock();
goto xdp_xmit;
-   case XDP_REDIRECT:
-   err = xdp_do_redirect(dev, &xdp, xdp_prog);
-   if (!err)
-   *xdp_xmit = true;
-   rcu_read_unlock();
-   goto xdp_xmit;
default:
bpf_warn_invalid_xdp_action(act);
case XDP_ABORTED:





Re: [PATCH bpf] bpf: fix mlock precharge on arraymaps

2018-02-15 Thread Alexei Starovoitov

On 2/15/18 4:10 PM, Daniel Borkmann wrote:

syzkaller recently triggered OOM during percpu map allocation;
while there is work in progress by Dennis Zhou to add __GFP_NORETRY
semantics for percpu allocator under pressure, there seems also a
missing bpf_map_precharge_memlock() check in array map allocation.

Given today the actual bpf_map_charge_memlock() happens after the
find_and_alloc_map() in syscall path, the bpf_map_precharge_memlock()
is there to bail out early before we go and do the map setup work
when we find that we hit the limits anyway. Therefore add this for
array map as well.

Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
Fixes: a10423b87a7e ("bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map")
Reported-by: syzbot+adb03f3f0bb57ce3a...@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann 
Cc: Dennis Zhou 


Applied to bpf tree, Thanks Daniel.



[PATCH net] tg3: APE heartbeat changes

2018-02-15 Thread Satish Baddipadige
From: Prashant Sreedharan 

In ungraceful host shutdown or driver crash case BMC connectivity is
lost. APE firmware is missing the driver state in this
case to keep the BMC connectivity alive.
This patch has below change to address this issue.

Heartbeat mechanism with APE firmware. This heartbeat mechanism
is needed to notify the APE firmware about driver state.

This patch also has the change in wait time for APE event from
1ms to 20ms as there can be some delay in getting response.

Signed-off-by: Prashant Sreedharan 
Signed-off-by: Satish Baddipadige 
Signed-off-by: Siva Reddy Kallam 
Acked-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/tg3.c | 35 ---
 drivers/net/ethernet/broadcom/tg3.h |  5 +
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index a77ee2f..36d8d1e 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -820,7 +820,7 @@ static int tg3_ape_event_lock(struct tg3 *tp, u32 
timeout_us)
 
tg3_ape_unlock(tp, TG3_APE_LOCK_MEM);
 
-   udelay(10);
+   usleep_range(10, 20);
timeout_us -= (timeout_us > 10) ? 10 : timeout_us;
}
 
@@ -922,8 +922,8 @@ static int tg3_ape_send_event(struct tg3 *tp, u32 event)
if (!(apedata & APE_FW_STATUS_READY))
return -EAGAIN;
 
-   /* Wait for up to 1 millisecond for APE to service previous event. */
-   err = tg3_ape_event_lock(tp, 1000);
+   /* Wait for up to 20 millisecond for APE to service previous event. */
+   err = tg3_ape_event_lock(tp, 2);
if (err)
return err;
 
@@ -946,6 +946,7 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, int 
kind)
 
switch (kind) {
case RESET_KIND_INIT:
+   tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_COUNT, tp->ape_hb++);
tg3_ape_write32(tp, TG3_APE_HOST_SEG_SIG,
APE_HOST_SEG_SIG_MAGIC);
tg3_ape_write32(tp, TG3_APE_HOST_SEG_LEN,
@@ -962,13 +963,6 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, 
int kind)
event = APE_EVENT_STATUS_STATE_START;
break;
case RESET_KIND_SHUTDOWN:
-   /* With the interface we are currently using,
-* APE does not track driver state.  Wiping
-* out the HOST SEGMENT SIGNATURE forces
-* the APE to assume OS absent status.
-*/
-   tg3_ape_write32(tp, TG3_APE_HOST_SEG_SIG, 0x0);
-
if (device_may_wakeup(&tp->pdev->dev) &&
tg3_flag(tp, WOL_ENABLE)) {
tg3_ape_write32(tp, TG3_APE_HOST_WOL_SPEED,
@@ -990,6 +984,18 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, 
int kind)
tg3_ape_send_event(tp, event);
 }
 
+static inline void tg3_send_ape_heartbeat(struct tg3 *tp,
+ unsigned long interval)
+{
+   /* Check if hb interval has exceeded */
+   if (!tg3_flag(tp, ENABLE_APE) ||
+   time_before(jiffies, tp->ape_hb_jiffies + interval))
+   return;
+
+   tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_COUNT, tp->ape_hb++);
+   tp->ape_hb_jiffies = jiffies;
+}
+
 static void tg3_disable_ints(struct tg3 *tp)
 {
int i;
@@ -7262,6 +7268,7 @@ static int tg3_poll_msix(struct napi_struct *napi, int 
budget)
}
}
 
+   tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL << 1);
return work_done;
 
 tx_recovery:
@@ -7344,6 +7351,7 @@ static int tg3_poll(struct napi_struct *napi, int budget)
}
}
 
+   tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL << 1);
return work_done;
 
 tx_recovery:
@@ -10732,7 +10740,7 @@ static int tg3_reset_hw(struct tg3 *tp, bool reset_phy)
if (tg3_flag(tp, ENABLE_APE))
/* Write our heartbeat update interval to APE. */
tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_INT_MS,
-   APE_HOST_HEARTBEAT_INT_DISABLE);
+   APE_HOST_HEARTBEAT_INT_5SEC);
 
tg3_write_sig_post_reset(tp, RESET_KIND_INIT);
 
@@ -11077,6 +11085,9 @@ static void tg3_timer(struct timer_list *t)
tp->asf_counter = tp->asf_multiplier;
}
 
+   /* Update the APE heartbeat every 5 seconds.*/
+   tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL);
+
spin_unlock(&tp->lock);
 
 restart_timer:
@@ -16653,6 +16664,8 @@ static int tg3_get_invariants(struct tg3 *tp, const 
struct pci_device_id *ent)
   pci_state_reg);
 
tg3_ape_lock_init(tp);
+   tp->ape_hb_interval =
+   msecs_to_jiffies(APE_HOST_HEARTBEAT_INT_5SEC);
}
 
/* Set up tp->grc_local_ctrl before calling
diff --

[RFC PATCH] ptr_ring: linked list fallback

2018-02-15 Thread Michael S. Tsirkin
So pointer rings work fine, but they have a problem:
make them too small and not enough entries fit.
Make them too large and you start flushing your cache
and running out of memory.

This is a new idea of mine: a ring backed by a
linked list. Once you run out of rin entries,
instead of a drop you fall back on a list with
a common lock.

Should work well for the case where the ring is typically sized
correctly, but will help address the fact that some user try to set e.g.
tx queue length to 100.

My hope this will move us closer to direction where e.g. fw codel can
use ptr rings without locking at all.
The API is still very rough, and I really need to take a hard look
at lock nesting.

Completely untested, sending for early feedback/flames.

Signed-off-by: Michael S. Tsirkin 
---
 include/linux/ptr_ring.h | 64 +---
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index d72b2e7..7bf2b7b 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -31,11 +31,18 @@
 #include 
 #endif
 
+/* entries must start with the following structure */
+struct plist {
+   struct plist *next;
+   struct plist *last; /* only valid in the 1st entry */
+};
+
 struct ptr_ring {
int producer cacheline_aligned_in_smp;
spinlock_t producer_lock;
int consumer_head cacheline_aligned_in_smp; /* next valid entry */
int consumer_tail; /* next entry to invalidate */
+   struct plist *consumer_list;
spinlock_t consumer_lock;
/* Shared consumer/producer data */
/* Read-only by both the producer and the consumer */
@@ -120,10 +127,40 @@ static inline int __ptr_ring_produce(struct ptr_ring *r, 
void *ptr)
 }
 
 /*
- * Note: resize (below) nests producer lock within consumer lock, so if you
- * consume in interrupt or BH context, you must disable interrupts/BH when
- * calling this.
+ * Note: resize API with the _fallback should be used when calling this.
  */
+static inline int ptr_ring_produce_fallback(struct ptr_ring *r, void *ptr)
+{
+   int ret;
+   unsigned long flags;
+   plit *p = ptr;
+
+   p->next = NULL;
+   p->last = p;
+
+   spin_lock_irqsave(&r->producer_lock, flags);
+   ret = __ptr_ring_produce(r, ptr);
+   if (ret) {
+   spin_lock(&r->consumer_lock);
+   ret = __ptr_ring_produce(r, ptr);
+   if (ret) {
+   int producer = r->producer ? r->producer - 1 :
+   r->size - 1;
+   plist *first = r->queue[producer];
+
+   BUG_ON(!first);
+
+   first->last->next = p;
+   first->last = p;
+   }
+   spin_unlock(&r->consumer_lock);
+   }
+
+   spin_unlock_irqrestore(&r->producer_lock, flags);
+
+   return ret;
+}
+
 static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr)
 {
int ret;
@@ -135,6 +172,7 @@ static inline int ptr_ring_produce(struct ptr_ring *r, void 
*ptr)
return ret;
 }
 
+
 static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr)
 {
int ret;
@@ -359,6 +397,26 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
return ptr;
 }
 
+static inline void *ptr_ring_consume_fallback(struct ptr_ring *r)
+{
+   unsigned long flags;
+   struct plist *ptr;
+
+   spin_lock_irqsave(&r->consumer_lock, flags);
+   if (r->consumer_list) {
+   ptr = r->consumer_list;
+   r->consumer_list = ptr->next;
+   } else {
+   ptr = __ptr_ring_consume(r);
+   if (ptr) {
+   r->consumer_list = ptr->next;
+   }
+   }
+   spin_unlock_irqrestore(&r->consumer_lock, flags);
+
+   return ptr;
+}
+
 static inline int ptr_ring_consume_batched(struct ptr_ring *r,
   void **array, int n)
 {
-- 
MST


<    1   2