[tipc-discussion] [PATCH net-next v2 1/3] tipc: unify tipc_wait_for_sndpkt() and tipc_wait_for_sndmsg() functions

2016-11-29 Thread Jon Maloy
The functions tipc_wait_for_sndpkt() and tipc_wait_for_sndmsg() are very
similar. The latter is also called from two locations, and more callers
will be added in the coming commits, each of which will need to wait for
a different condition.

Instead of creating yet another duplicate of the function, we now
introduce a new macro tipc_wait_for_cond() where the wakeup condition
can be stated as an argument to the call. This macro replaces all
current and future uses of the two functions, which can now be
eliminated.
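
As an illustration (not itself part of the patch), a blocking sender now
states its wakeup condition directly at the call site. A minimal sketch,
reusing the existing tsk->link_cong flag and the timeo send timeout as in
tipc_sendmcast():

	rc = tipc_wait_for_cond(sock, &timeo, !tsk->link_cong);
	if (rc)
		return rc;  /* -EPIPE, -ENOTCONN, -EAGAIN or signal errno */
	/* Condition holds here; the send can be re-attempted. */

Because the condition is an arbitrary expression, each caller can wait for
whatever state it needs without a dedicated wait function.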

Signed-off-by: Jon Maloy 
---
 net/tipc/socket.c | 110 +-
 1 file changed, 51 insertions(+), 59 deletions(-)

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 333c5da..30732a8 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -110,7 +110,6 @@ static void tipc_write_space(struct sock *sk);
 static void tipc_sock_destruct(struct sock *sk);
 static int tipc_release(struct socket *sock);
static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags);
-static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p);
 static void tipc_sk_timeout(unsigned long data);
 static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
   struct tipc_name_seq const *seq);
@@ -334,6 +333,51 @@ static int tipc_set_sk_state(struct sock *sk, int state)
return res;
 }
 
+static int tipc_sk_sock_err(struct socket *sock, long *timeout)
+{
+   struct sock *sk = sock->sk;
+   int err = sock_error(sk);
+   int typ = sock->type;
+
+   if (err)
+   return err;
+   if (typ == SOCK_STREAM || typ == SOCK_SEQPACKET) {
+   if (sk->sk_state == TIPC_DISCONNECTING)
+   return -EPIPE;
+   else if (!tipc_sk_connected(sk))
+   return -ENOTCONN;
+   } else if (sk->sk_shutdown & SEND_SHUTDOWN) {
+   return -EPIPE;
+   }
+   if (!*timeout)
+   return -EAGAIN;
+   if (signal_pending(current))
+   return sock_intr_errno(*timeout);
+
+   return 0;
+}
+
+#define tipc_wait_for_cond(sock_, timeout_, condition_)	\
+({								\
+	int rc_ = 0;						\
+	int done_ = 0;						\
+								\
+	while (!(condition_) && !done_) {			\
+		struct sock *sk_ = (sock_)->sk;			\
+		DEFINE_WAIT_FUNC(wait_, woken_wake_function);	\
+								\
+		rc_ = tipc_sk_sock_err((sock_), timeout_);	\
+		if (rc_)					\
+			break;					\
+		prepare_to_wait(sk_sleep(sk_), &wait_,		\
+				TASK_INTERRUPTIBLE);		\
+		done_ = sk_wait_event(sk_, timeout_,		\
+				      (condition_), &wait_);	\
+		remove_wait_queue(sk_sleep(sk_), &wait_);	\
+	}							\
+	rc_;							\
+})
+
 /**
  * tipc_sk_create - create a TIPC socket
  * @net: network namespace (must be default network)
@@ -719,7 +763,7 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
 
if (rc == -ELINKCONG) {
tsk->link_cong = 1;
-   rc = tipc_wait_for_sndmsg(sock, &timeo);
+   rc = tipc_wait_for_cond(sock, &timeo, !tsk->link_cong);
if (!rc)
continue;
}
@@ -828,31 +872,6 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
kfree_skb(skb);
 }
 
-static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p)
-{
-   DEFINE_WAIT_FUNC(wait, woken_wake_function);
-   struct sock *sk = sock->sk;
-   struct tipc_sock *tsk = tipc_sk(sk);
-   int done;
-
-   do {
-   int err = sock_error(sk);
-   if (err)
-   return err;
-   if (sk->sk_shutdown & SEND_SHUTDOWN)
-   return -EPIPE;
-   if (!*timeo_p)
-   return -EAGAIN;
-   if (signal_pending(current))
-   return sock_intr_errno(*timeo_p);
-
-   add_wait_queue(sk_sleep(sk), &wait);
-   done = sk_wait_event(sk, timeo_p, !tsk->link_cong, &wait);
-   remove_wait_queue(sk_sleep(sk), &wait);
-   } while (!done);
-   

[tipc-discussion] [PATCH net-next v2 0/3] tipc: improve interaction socket-link

2016-11-29 Thread Jon Maloy
We fix a very real starvation problem that may occur when the link
level runs into send buffer congestion. At the same time we make the 
interaction between the socket and link layer simpler and more 
consistent.

v2: - Simplified link congestion check to only check against own
  importance limit. This reduces the risk of higher levels
  starving out lower levels.
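
  As an illustration of the simplified check (not itself part of the
  series), congestion is now judged only against the sender's own
  importance level. A minimal sketch, assuming the link keeps
  per-importance backlog counters with len/limit fields (names are
  illustrative):

	/* A sender at level imp only sees congestion when its own
	 * level's quota is exhausted, so bursts at higher importance
	 * levels can no longer starve out lower-level senders.
	 */
	static bool link_level_congested(struct tipc_link *l, int imp)
	{
		return l->backlog[imp].len >= l->backlog[imp].limit;
	}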

Jon Maloy (3):
  tipc: unify tipc_wait_for_sndpkt() and tipc_wait_for_sndmsg()
functions
  tipc: modify struct tipc_plist to be more versatile
  tipc: reduce risk of user starvation during link congestion

 net/tipc/bcast.c  |   2 +-
 net/tipc/link.c   |  81 -
 net/tipc/msg.h|   8 +-
 net/tipc/name_table.c | 100 +++
 net/tipc/name_table.h |  21 +--
 net/tipc/node.c   |   2 +-
 net/tipc/socket.c | 450 ++
 7 files changed, 327 insertions(+), 337 deletions(-)

-- 
2.7.4




[tipc-discussion] [PATCH net-next v2 3/3] tipc: reduce risk of user starvation during link congestion

2016-11-29 Thread Jon Maloy
The socket code currently handles link congestion either by blocking
and trying to send again when the congestion has abated, or by
returning -EAGAIN to the user and letting them retry later.

This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. Between the moment the link issues a wakeup signal and the
moment the socket wakes up and re-attempts sending, other senders may
slip in and occupy the freed buffer space in the link. This in turn
may force a socket to make many send attempts before it succeeds. On
extremely loaded systems we have observed latencies of several seconds
before a low-priority socket is able to send out a message.

In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is sent to a congested
link, the link now keeps the message in the wakeup item it has to
create anyway, and immediately adds it to its send queue when enough
space has been freed up. Only when this is done do we issue a wakeup
signal to the socket, which can now immediately go on and send the
next message, if any.

The fact that a socket can now consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Since this is also a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
refactor those functions completely.
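
As an illustration of the resulting simplification (not itself part of the
patch), -ELINKCONG no longer means "retry": the link has parked the packet
chain in the wakeup item and will move it to its send queue by itself. A
minimal sketch, reusing the tsk->link_cong flag from patch 1/3; pkts, dnode
and dlen stand for the local packet chain, destination node and user data
length:

	rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
	if (unlikely(rc == -ELINKCONG)) {
		tsk->link_cong = 1;  /* block further sends until woken */
		rc = 0;              /* chain is parked in the link, not lost */
	}
	return rc ? rc : dlen;       /* report the data as sent */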

Signed-off-by: Jon Maloy 
---
 net/tipc/bcast.c  |   2 +-
 net/tipc/link.c   |  81 +++--
 net/tipc/msg.h|   8 +-
 net/tipc/node.c   |   2 +-
 net/tipc/socket.c | 346 --
 5 files changed, 200 insertions(+), 239 deletions(-)

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index aa1babb..1a56cab 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -174,7 +174,7 @@ static void tipc_bcbase_xmit(struct net *net, struct 
sk_buff_head *xmitq)
  *and to identified node local sockets
  * @net: the applicable net namespace
  * @list: chain of buffers containing message
- * Consumes the buffer chain, except when returning -ELINKCONG
+ * Consumes the buffer chain.
  * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
  */
 int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
diff --git a/net/tipc/link.c b/net/tipc/link.c
index ecc12411..428437c 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -776,58 +776,60 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
 
 /**
  * link_schedule_user - schedule a message sender for wakeup after congestion
- * @link: congested link
+ * @l: congested link
  * @list: message that was attempted sent
  * Create pseudo msg to send back to user when congestion abates
- * Does not consume buffer list
+ * Consumes buffer list
  */
-static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
+static int link_schedule_user(struct tipc_link *l, struct sk_buff_head *list)
 {
-   struct tipc_msg *msg = buf_msg(skb_peek(list));
-   int imp = msg_importance(msg);
-   u32 oport = msg_origport(msg);
-   u32 addr = tipc_own_addr(link->net);
+   struct tipc_msg *hdr = buf_msg(skb_peek(list));
+   int imp = msg_importance(hdr);
+   u32 oport = msg_origport(hdr);
+   u32 dnode = tipc_own_addr(l->net);
struct sk_buff *skb;
+   struct sk_buff_head *pkts;
 
/* This really cannot happen...  */
if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
-   pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
+   pr_warn("%s<%s>, send queue full", link_rst_msg, l->name);
return -ENOBUFS;
}
-   /* Non-blocking sender: */
-   if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
-   return -ELINKCONG;
 
/* Create and schedule wakeup pseudo message */
skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
- addr, addr, oport, 0, 0);
+ dnode, l->addr, oport, 0, 0);
if (!skb)
return -ENOBUFS;
-   TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list);
-   TIPC_SKB_CB(skb)->chain_imp = imp;
-   skb_queue_tail(&link->wakeupq, skb);
-   link->stats.link_congs++;
+   msg_set_dest_droppable(buf_msg(skb), true);
+   skb_queue_tail(&l->wakeupq, skb);
+
+   /* Keep the packet chain until we can send it */
+   pkts = &TIPC_SKB_CB(skb)->pkts;
+   __skb_queue_head_init(pkts);
+   skb_queue_splice_init(list, pkts);
+   l->stats.link_congs++;
return -ELINKCONG;
 }
 
 /**
  * link_prepare_wakeup - prepare users for wakeup after congestion
- * @link: congested link
- * Move a number of waiting users, as permitted by available space in
- * the send queue, from link wait queue to node wait queue for wakeup
+ * @l: congested link
+ * Wake up a 

Re: [tipc-discussion] soft lockup for TIPC

2016-11-29 Thread Xue, Ying
Hi Jon,

As I am just back from travel today, I will look at your question and the issue
tomorrow.

Sorry for the late response.

Regards,
Ying

-Original Message-
From: Jon Maloy [mailto:jon.ma...@ericsson.com] 
Sent: Friday, November 25, 2016 11:50 PM
To: XIANG Haiming; tipc-discussion@lists.sourceforge.net; Xue, Ying
Subject: RE: soft lockup for TIPC

Ying,
I looked into the patches you posted to remove the bearer lock around February 
2014, but could not find any obvious candidate addressing this problem. I 
suspect your series of patches simply "eliminated" the potential issue. But you 
probably remember better than I do what was done, and which of the patches are 
needed to resolve the issue.

BR
///jon


> -Original Message-
> From: XIANG Haiming [mailto:haiming.xi...@alcatel-sbell.com.cn]
> Sent: Wednesday, 23 November, 2016 20:41
> To: Jon Maloy ; tipc-discussion@lists.sourceforge.net
> Subject: RE: soft lockup for TIPC
> 
> Hi Jon,
> 
> I am OK with installing own patches.
> 
> Thank you for your help.
> 
> 
> -Original Message-
> From: Jon Maloy [mailto:jon.ma...@ericsson.com]
> Sent: Thursday, 24 November, 2016 1:03
> To: XIANG Haiming; tipc-discussion@lists.sourceforge.net
> Subject: RE: soft lockup for TIPC
> 
> Hi,
> I will take a look at this in the next few days. Are you OK with installing
> your own patches?
> 
> ///jon
> 
> 
> > -Original Message-
> > From: XIANG Haiming [mailto:haiming.xi...@alcatel-sbell.com.cn]
> > Sent: Sunday, 20 November, 2016 21:35
> > To: tipc-discussion@lists.sourceforge.net
> > Subject: [tipc-discussion] soft lockup for TIPC
> >
> > Hi all,
> >
> > The TIPC version we use is 2.0.0.
> > The OS info is as follow:
> >
> > Red Hat Enterprise Linux Server 7.2 (Maipo)
> > Kernel 3.10.0-327.18.2.el7.x86_64 on an x86_64
> >
> > We have hit two soft lockup issues with TIPC; please help us solve them.
> > Thank you.
> > Thank you
> >
> > One issue is as follow:
> >
> >
> > [85502.601198] BUG: soft lockup - CPU#0 stuck for 22s! [scm:2649]
> > [85502.603585] Modules linked in: iptable_filter ip6table_mangle xt_limit
> > iptable_mangle ip6table_filter ip6_tables igb_uio(OE) uio tipc(OE) 8021q
> > garp stp mrp llc bonding dm_mirror dm_region_hash dm_log dm_mod ppdev
> > crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper
> > ablk_helper cryptd pcspkr i6300esb virtio_balloon cirrus syscopyarea
> > sysfillrect sysimgblt ttm drm_kms_helper drm i2c_piix4 i2c_core parport_pc
> > parport binfmt_misc nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables
> > ext4 mbcache jbd2 virtio_blk virtio_console crct10dif_pclmul
> > crct10dif_common crc32c_intel serio_raw ixgbevf virtio_pci virtio_ring
> > virtio ata_generic pata_acpi ata_piix libata floppy
> > [85502.618210] CPU: 0 PID: 2649 Comm: scm Tainted: G   OEL  3.10.0-327.18.2.el7.x86_64 #1
> > [85502.620482] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> > [85502.622361] task: 880405da5080 ti: 8800db68c000 task.ti: 8800db68c000
> > [85502.624400] RIP: 0010:[]  [] _raw_spin_lock_bh+0x3d/0x50
> > [85502.626558] RSP: 0018:8800db68fa68  EFLAGS: 0202
> > [85502.628412] RAX: 56f6 RBX: 880406600800 RCX: 712a
> > [85502.630419] RDX: 712c RSI: 712c RDI: a03f8548
> > [85502.632423] RBP: 8800db68fa70 R08: 02c0 R09: 0500
> > [85502.634421] R10: 88040f001500 R11: 8800db68fc58 R12: 81519be1
> > [85502.636413] R13: 8800db68fa08 R14:  R15: 0240
> > [85502.638415] FS:  () GS:88041fc0(0063) knlGS:d1495b40
> > [85502.640514] CS:  0010 DS: 002b ES: 002b CR0: 80050033
> > [85502.642399] CR2: d37c8208 CR3: 0003cfff7000 CR4: 000406f0
> > [85502.644420] DR0:  DR1:  DR2: 
> > [85502.646416] DR3:  DR6: 0ff0 DR7: 0400
> > [85502.648393] Stack:
> > [85502.649916]  a03f8548 8800db68fa90 a03e2afb 880036242c00
> > [85502.652019]  880406600800 8800db68fac0 a03e5c1b 880036242c00
> > [85502.654108]  880036b5a484 88003698e800 8800db68fb30 8800db68fba8
> > [85502.656179] Call Trace:
> > [85502.657742]  [] tipc_bearer_blocked+0x1b/0x30 [tipc]
> > [85502.659678]  [] link_send_buf_fast+0x5b/0xb0 [tipc]
> > [85502.661592]  [] tipc_link_send_sections_fast+0xe6/0x630 [tipc]
> > [85502.663594]  [] ? _raw_read_unlock_bh+0x16/0x20
> > [85502.665473]  [] ? tipc_nametbl_translate+0xc0/0x1f0 [tipc]
> > [85502.667441]  [] tipc_send2name+0x155/0x1d0 [tipc]
> > [85502.669351]  [] send_msg+0x1eb/0x530 [tipc]
> > [85502.671205]  [] sock_sendmsg+0xb0/0xf0
> > [85502.673028]  [] ? futex_wait+0x193/0x280
> > [85502.674859]