[Cluster-devel] [PATCH 16/17] dlm: fix to use sk_callback_lock correctly

2017-08-08 Thread tsutomu.owa
Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/lowcomms.c | 29 +
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3e7c096..f4ba4a8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -411,17 +411,23 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage 
*addr, int len)
 /* Data available on socket or listen socket received a connect */
 static void lowcomms_data_ready(struct sock *sk)
 {
-   struct connection *con = sock2con(sk);
+   struct connection *con;
+
+   read_lock_bh(>sk_callback_lock);
+   con = sock2con(sk);
if (con && !test_and_set_bit(CF_READ_PENDING, >flags))
queue_work(recv_workqueue, >rwork);
+   read_unlock_bh(>sk_callback_lock);
 }
 
 static void lowcomms_write_space(struct sock *sk)
 {
-   struct connection *con = sock2con(sk);
+   struct connection *con;
 
+   read_lock_bh(>sk_callback_lock);
+   con = sock2con(sk);
if (!con)
-   return;
+   goto out;
 
clear_bit(SOCK_NOSPACE, >sock->flags);
 
@@ -431,6 +437,8 @@ static void lowcomms_write_space(struct sock *sk)
}
 
queue_work(send_workqueue, >swork);
+out:
+   read_unlock_bh(>sk_callback_lock);
 }
 
 static inline void lowcomms_connect_sock(struct connection *con)
@@ -797,8 +805,6 @@ static int tcp_accept_from_sock(struct connection *con)
mutex_lock_nested(>sock_mutex, 2);
if (!othercon->sock) {
newcon->othercon = othercon;
-   othercon->sock = newsock;
-   newsock->sk->sk_user_data = othercon;
add_sock(newsock, othercon);
addcon = othercon;
mutex_unlock(>sock_mutex);
@@ -812,7 +818,6 @@ static int tcp_accept_from_sock(struct connection *con)
}
}
else {
-   newsock->sk->sk_user_data = newcon;
newcon->rx_action = receive_from_sock;
/* accept copies the sk after we've saved the callbacks, so we
   don't want to save them a second time or comm errors will
@@ -918,8 +923,6 @@ static int sctp_accept_from_sock(struct connection *con)
mutex_lock_nested(>sock_mutex, 2);
if (!othercon->sock) {
newcon->othercon = othercon;
-   othercon->sock = newsock;
-   newsock->sk->sk_user_data = othercon;
add_sock(newsock, othercon);
addcon = othercon;
mutex_unlock(>sock_mutex);
@@ -931,7 +934,6 @@ static int sctp_accept_from_sock(struct connection *con)
goto accept_err;
}
} else {
-   newsock->sk->sk_user_data = newcon;
newcon->rx_action = receive_from_sock;
add_sock(newsock, newcon);
addcon = newcon;
@@ -1058,7 +1060,6 @@ static void sctp_connect_to_sock(struct connection *con)
if (result < 0)
goto socket_err;
 
-   sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
con->connect_action = sctp_connect_to_sock;
add_sock(sock, con);
@@ -1143,7 +1144,6 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
}
 
-   sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
con->connect_action = tcp_connect_to_sock;
add_sock(sock, con);
@@ -1233,10 +1233,12 @@ static struct socket *tcp_create_listen_sock(struct 
connection *con,
if (result < 0) {
log_print("Failed to set SO_REUSEADDR on socket: %d", result);
}
+   write_lock_bh(>sk->sk_callback_lock);
sock->sk->sk_user_data = con;
save_listen_callbacks(sock);
con->rx_action = tcp_accept_from_sock;
con->connect_action = tcp_connect_to_sock;
+   write_unlock_bh(>sk->sk_callback_lock);
 
/* Bind to our port */
make_sockaddr(saddr, dlm_config.ci_tcp_port, _len);
@@ -1639,8 +1641,11 @@ static void _stop_conn(struct connection *con, bool 
and_other)
set_bit(CF_CLOSE, >flags);
set_bit(CF_READ_PENDING, >flags);
set_bit(CF_WRITE_PENDING, >flags);
-   if (con->sock && con->sock->sk)
+   if (con->sock && con->sock->sk) {
+   write_lock_bh(>sock->sk->sk_callback_lock);
con->sock->sk->sk_user_data = NULL;
+   write_unlock_bh(>sock->sk->sk_callback_lock);
+   }
if (con->othercon && and_other)
_stop_conn(con->othercon, false);
mutex_unlock(>sock_mutex);
-- 
2.7.4





[Cluster-devel] [PATCH 15/17] dlm: fix overflow dlm_cb_seq

2017-08-08 Thread tsutomu.owa
dlm_cb_seq is 64 bits. If dlm_cb_seq overflows and returns to 0,
dlm_rem_lkb_callback() will not work properly.

Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/ast.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 07fed83..562fa8c 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -181,6 +181,8 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int 
mode, int status,
 
spin_lock(_cb_seq_spin);
new_seq = ++dlm_cb_seq;
+   if (!dlm_cb_seq)
+   new_seq = ++dlm_cb_seq;
spin_unlock(_cb_seq_spin);
 
if (lkb->lkb_flags & DLM_IFL_USER) {
-- 
2.7.4





[Cluster-devel] [PATCH 17/17] dlm: fix to reschedule rwork

2017-08-08 Thread tsutomu.owa
When an error occurs in kernel_recvmsg or kernel_sendpage and
close_connection is called and receive work is already scheduled,
receive work is canceled. In that case, the receive work will not
be scheduled forever after reconnection, because CF_READ_PENDING
flag is established.

Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/lowcomms.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index f4ba4a8..4884e00 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -593,10 +593,14 @@ static void close_connection(struct connection *con, bool 
and_other,
 {
bool closing = test_and_set_bit(CF_CLOSING, >flags);
 
-   if (tx && !closing && cancel_work_sync(>swork))
+   if (tx && !closing && cancel_work_sync(>swork)) {
log_print("canceled swork for node %d", con->nodeid);
-   if (rx && !closing && cancel_work_sync(>rwork))
+   clear_bit(CF_WRITE_PENDING, >flags);
+   }
+   if (rx && !closing && cancel_work_sync(>rwork)) {
log_print("canceled rwork for node %d", con->nodeid);
+   clear_bit(CF_READ_PENDING, >flags);
+   }
 
mutex_lock(>sock_mutex);
if (con->sock) {
-- 
2.7.4






[Cluster-devel] [PATCH 14/17] dlm: fix memory leak in tcp_accept_from_sock()

2017-08-08 Thread tsutomu.owa
The sk member of the socket generated by sock_create_kern() is overwritten
by ops->accept(). So the previous sk will not be released.
We use kernel_accept() instead of sock_create_kern() and ops->accept().

Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/lowcomms.c | 21 +++--
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 74ed41e..3e7c096 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -732,22 +732,14 @@ static int tcp_accept_from_sock(struct connection *con)
}
mutex_unlock(_lock);
 
-   memset(, 0, sizeof(peeraddr));
-   result = sock_create_kern(_net, dlm_local_addr[0]->ss_family,
- SOCK_STREAM, IPPROTO_TCP, );
-   if (result < 0)
-   return -ENOMEM;
-
mutex_lock_nested(>sock_mutex, 0);
 
-   result = -ENOTCONN;
-   if (con->sock == NULL)
-   goto accept_err;
-
-   newsock->type = con->sock->type;
-   newsock->ops = con->sock->ops;
+   if (!con->sock) {
+   mutex_unlock(>sock_mutex);
+   return -ENOTCONN;
+   }
 
-   result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
+   result = kernel_accept(con->sock, , O_NONBLOCK);
if (result < 0)
goto accept_err;
 
@@ -844,7 +836,8 @@ static int tcp_accept_from_sock(struct connection *con)
 
 accept_err:
mutex_unlock(>sock_mutex);
-   sock_release(newsock);
+   if (newsock)
+   sock_release(newsock);
 
if (result != -EAGAIN)
log_print("error accepting connection from node: %d", result);
-- 
2.7.4







[Cluster-devel] [PATCH 13/17] dlm: fix _can_be_granted() for lock at the head of covert queue.

2017-08-08 Thread tsutomu.owa
If there is a lock resource conflict on multiple nodes, the lock on
convert queue may not be granted forever.

EX.)
grant queue:
node0 grmode NL / rqmode IV
node1 grmode NL / rqmode IV

convert queue:
node2 grmode NL / rqmode EX
node3 grmode PR / rqmode EX

wait queue:
node4 grmode IV / rqmode PR
node5 grmode IV / rqmode PR

When the lock conversion (node PR -> NL) of node 0 is completed, the lock
of node 2 should be grantable. However, __can_be_granted() returns 0
because the grmode of the lock on node 3 in convert queue is PR.

When checking the lock at the head of convert queue, exclude
queue_conflict() targeting convert queue.

Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/lock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 35502d4..dcdd26d 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2336,7 +2336,8 @@ static int _can_be_granted(struct dlm_rsb *r, struct 
dlm_lkb *lkb, int now,
 * locks
 */
 
-   if (queue_conflict(>res_convertqueue, lkb))
+   if (!first_in_list(lkb, >res_convertqueue) &&
+   queue_conflict(>res_convertqueue, lkb))
return 0;
 
/*
-- 
2.7.4






[Cluster-devel] [PATCH 12/17] dlm: use CF_CLOSE flag to stop dlm_send correctly

2017-08-08 Thread tsutomu.owa
If reconnection fails while executing dlm_lowcomms_stop,
dlm_send will not stop.

Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/lowcomms.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 438fa92..74ed41e 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1643,6 +1643,7 @@ static int work_start(void)
 static void _stop_conn(struct connection *con, bool and_other)
 {
mutex_lock(>sock_mutex);
+   set_bit(CF_CLOSE, >flags);
set_bit(CF_READ_PENDING, >flags);
set_bit(CF_WRITE_PENDING, >flags);
if (con->sock && con->sock->sk)
-- 
2.7.4






[Cluster-devel] [PATCH 09/17] dlm: close othercon at send/receive error

2017-08-08 Thread tsutomu.owa
Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/lowcomms.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 4b53c32..76c6cdd 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -703,7 +703,7 @@ static int receive_from_sock(struct connection *con)
 out_close:
mutex_unlock(>sock_mutex);
if (ret != -EAGAIN) {
-   close_connection(con, false, true, false);
+   close_connection(con, true, true, false);
/* Reconnect when there is something to send */
}
/* Don't return success if we really got EOF */
@@ -1530,7 +1530,7 @@ static void send_to_sock(struct connection *con)
 
 send_error:
mutex_unlock(>sock_mutex);
-   close_connection(con, false, false, true);
+   close_connection(con, true, false, true);
/* Requeue the send work. When the work daemon runs again, it will try
   a new connection, then call this function again. */
queue_work(send_workqueue, >swork);
-- 
2.7.4






[Cluster-devel] [PATCH 10/17] dlm: use schedule_timeout instead of schedule in dlm_recoverd

2017-08-08 Thread tsutomu.owa
When dlm_recoverd_stop() is called between kthread_should_stop() and
set_task_state(), dlm_recoverd will not wake up.

Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/recoverd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 6859b4b..d3956cc 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -276,6 +276,7 @@ static void do_ls_recovery(struct dlm_ls *ls)
 static int dlm_recoverd(void *arg)
 {
struct dlm_ls *ls;
+   unsigned long timeout = (dlm_config.ci_recover_timer * HZ) >> 1;
 
ls = dlm_find_lockspace_local(arg);
if (!ls) {
@@ -291,7 +292,7 @@ static int dlm_recoverd(void *arg)
set_current_state(TASK_INTERRUPTIBLE);
if (!test_bit(LSFL_RECOVER_WORK, >ls_flags) &&
!test_bit(LSFL_RECOVER_DOWN, >ls_flags))
-   schedule();
+   schedule_timeout(timeout);
set_current_state(TASK_RUNNING);
 
if (test_and_clear_bit(LSFL_RECOVER_DOWN, >ls_flags)) {
-- 
2.7.4






[Cluster-devel] [PATCH 11/17] dlm: Reanimate CF_WRITE_PENDING flag

2017-08-08 Thread tsutomu.owa
CF_WRITE_PENDING flag has been reanimated to make dlm_send stop properly
when running dlm_lowcomms_stop.

Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/lowcomms.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 76c6cdd..438fa92 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
struct mutex sock_mutex;
unsigned long flags;
 #define CF_READ_PENDING 1
+#define CF_WRITE_PENDING 2
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
@@ -1599,6 +1600,7 @@ static void process_send_sockets(struct work_struct *work)
 {
struct connection *con = container_of(work, struct connection, swork);
 
+   clear_bit(CF_WRITE_PENDING, >flags);
if (con->sock == NULL) /* not mutex protected so check it inside too */
con->connect_action(con);
if (!list_empty(>writequeue))
@@ -1642,6 +1644,7 @@ static void _stop_conn(struct connection *con, bool 
and_other)
 {
mutex_lock(>sock_mutex);
set_bit(CF_READ_PENDING, >flags);
+   set_bit(CF_WRITE_PENDING, >flags);
if (con->sock && con->sock->sk)
con->sock->sk->sk_user_data = NULL;
if (con->othercon && and_other)
@@ -1681,9 +1684,13 @@ static void work_flush(void)
hlist_for_each_entry_safe(con, n,
  _hash[i], list) {
ok &= test_bit(CF_READ_PENDING, >flags);
-   if (con->othercon)
+   ok &= test_bit(CF_WRITE_PENDING, >flags);
+   if (con->othercon) {
ok &= test_bit(CF_READ_PENDING,
   >othercon->flags);
+   ok &= test_bit(CF_WRITE_PENDING,
+  >othercon->flags);
+   }
}
}
} while (!ok);
-- 
2.7.4






[Cluster-devel] [PATCH 08/17] dlm: retry rcom when dlm_wait_function is timed out.

2017-08-08 Thread tsutomu.owa
If a node sends a DLM_RCOM_STATUS command and an error occurs on the
receiving side, the DLM_RCOM_STATUS_REPLY response may not be returned.
We retransmitted the DLM_RCOM_STATUS command so that we do not wait for
an infinite response.

Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/rcom.c| 6 ++
 fs/dlm/recover.c | 4 
 2 files changed, 10 insertions(+)

diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f3f5e72..4ff061d 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -155,6 +155,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t 
status_flags)
goto out;
}
 
+retry:
error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
sizeof(struct rcom_status), , );
if (error)
@@ -169,6 +170,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t 
status_flags)
 
error = dlm_wait_function(ls, _response);
disallow_sync_reply(ls);
+   if (error == -ETIMEDOUT)
+   goto retry;
if (error)
goto out;
 
@@ -276,6 +279,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char 
*last_name, int last_len)
 
ls->ls_recover_nodeid = nodeid;
 
+retry:
error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, , );
if (error)
goto out;
@@ -288,6 +292,8 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char 
*last_name, int last_len)
 
error = dlm_wait_function(ls, _response);
disallow_sync_reply(ls);
+   if (error == -ETIMEDOUT)
+   goto retry;
  out:
return error;
 }
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index eaea789..ce2aa54 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -52,6 +52,10 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) 
(struct dlm_ls *ls))
dlm_config.ci_recover_timer * HZ);
if (rv)
break;
+   if (test_bit(LSFL_RCOM_WAIT, >ls_flags)) {
+   log_debug(ls, "dlm_wait_function timed out");
+   return -ETIMEDOUT;
+   }
}
 
if (dlm_recovery_stopped(ls)) {
-- 
2.7.4





[Cluster-devel] [PATCH 05/17] dlm: fix double list_del()

2017-08-08 Thread tsutomu.owa
dlm_lowcomms_stop() was not functioning properly. Correctly, we have to
wait until all processing is finished with send_workqueue and
recv_workqueue.
This problem causes the following issue. Scenario is

1. dlm_send thread:
send_to_sock refers con->writequeue
2. main thread:
dlm_lowcomms_stop calls list_del
3. dlm_send thread:
send_to_sock calls list_del in writequeue_entry_complete

[ 1925.770305] dlm: canceled swork for node 4
[ 1925.772374] general protection fault:  [#1] SMP
[ 1925.777930] Modules linked in: ocfs2_stack_user ocfs2 ocfs2_nodemanager 
ocfs2_stackglue dlm fmxnet(O) fmx_api(O) fmx_cu(O) igb(O) kvm_intel kvm 
irqbypass autofs4
[ 1925.794131] CPU: 3 PID: 6994 Comm: kworker/u8:0 Tainted: G   O
4.4.39 #1
[ 1925.802684] Hardware name: TOSHIBA OX/OX, BIOS OX-P0015 12/03/2015
[ 1925.809595] Workqueue: dlm_send process_send_sockets [dlm]
[ 1925.815714] task: 8804398d3c00 ti: 88046910c000 task.ti: 
88046910c000
[ 1925.824072] RIP: 0010:[]  [] 
process_send_sockets+0xf8/0x280 [dlm]
[ 1925.834480] RSP: 0018:88046910fde0  EFLAGS: 00010246
[ 1925.840411] RAX: dead0200 RBX: 0001 RCX: 000a
[ 1925.848372] RDX: 88046bd980c0 RSI:  RDI: 8804673c5670
[ 1925.856341] RBP: 88046910fe20 R08: 00c9 R09: 0010
[ 1925.864311] R10: 81e22fc0 R11:  R12: 8804673c56d8
[ 1925.872281] R13: 8804673c5660 R14: 88046bd98440 R15: 0058
[ 1925.880251] FS:  () GS:88047fd8() 
knlGS:
[ 1925.889280] CS:  0010 DS:  ES:  CR0: 8005003b
[ 1925.895694] CR2: 7fff09eadf58 CR3: 0004690f5000 CR4: 001006e0
[ 1925.903663] Stack:
[ 1925.905903]  8804673c5630 8804673c5620 8804673c5670 
88007d219b40
[ 1925.914181]  88046f095800 0100 8800717a1400 
8804673c56d8
[ 1925.922459]  88046910fe60 81073db2 00ff8804 
88007d219b40
[ 1925.930736] Call Trace:
[ 1925.933468]  [] process_one_work+0x162/0x450
[ 1925.939983]  [] worker_thread+0x69/0x4a0
[ 1925.946109]  [] ? rescuer_thread+0x350/0x350
[ 1925.952622]  [] kthread+0xef/0x110
[ 1925.958165]  [] ? kthread_park+0x60/0x60
[ 1925.964283]  [] ret_from_fork+0x3f/0x70
[ 1925.970312]  [] ? kthread_park+0x60/0x60
[ 1925.976436] Code: 01 00 00 48 8b 7d d0 e8 07 d3 3a e1 45 01 7e 18 45 29 7e 
1c 75 ab 41 8b 46 24 85 c0 75 a3 49 8b 16 49 8b 46 08 31 f6 48 89 42 08 <48> 89 
10 48 b8 00 01 00 00 00 00 ad de 49 8b 7e 10 49 89 06 66
[ 1925.997791] RIP  [] process_send_sockets+0xf8/0x280 [dlm]
[ 1926.005577]  RSP 

Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/lowcomms.c | 44 +++-
 1 file changed, 39 insertions(+), 5 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index be62cf1..9f7932c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1628,11 +1628,20 @@ static int work_start(void)
return 0;
 }
 
-static void stop_conn(struct connection *con)
+static void _stop_conn(struct connection *con, bool and_other)
 {
-   con->flags |= 0x0F;
+   mutex_lock(>sock_mutex);
+   set_bit(CF_READ_PENDING, >flags);
if (con->sock && con->sock->sk)
con->sock->sk->sk_user_data = NULL;
+   if (con->othercon && and_other)
+   _stop_conn(con->othercon, false);
+   mutex_unlock(>sock_mutex);
+}
+
+static void stop_conn(struct connection *con)
+{
+   _stop_conn(con, true);
 }
 
 static void free_conn(struct connection *con)
@@ -1644,6 +1653,32 @@ static void free_conn(struct connection *con)
kmem_cache_free(con_cache, con);
 }
 
+static void work_flush(void)
+{
+   int ok;
+   int i;
+   struct hlist_node *n;
+   struct connection *con;
+
+   flush_workqueue(recv_workqueue);
+   flush_workqueue(send_workqueue);
+   do {
+   ok = 1;
+   foreach_conn(stop_conn);
+   flush_workqueue(recv_workqueue);
+   flush_workqueue(send_workqueue);
+   for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
+   hlist_for_each_entry_safe(con, n,
+ _hash[i], list) {
+   ok &= test_bit(CF_READ_PENDING, >flags);
+   if (con->othercon)
+   ok &= test_bit(CF_READ_PENDING,
+  >othercon->flags);
+   }
+   }
+   } while (!ok);
+}
+
 void dlm_lowcomms_stop(void)
 {
/* Set all the flags to prevent any
@@ -1651,11 +1686,10 @@ void dlm_lowcomms_stop(void)
*/
mutex_lock(_lock);
dlm_allow_conn = 0;
-   foreach_conn(stop_conn);
+   mutex_unlock(_lock);
+   work_flush();

[Cluster-devel] [PATCH 07/17] dlm: fix to use sock_mutex correctly in xxx_accept_from_sock

2017-08-08 Thread tsutomu.owa
Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/lowcomms.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index aec3c59..4b53c32 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -801,16 +801,19 @@ static int tcp_accept_from_sock(struct connection *con)
INIT_WORK(>rwork, process_recv_sockets);
set_bit(CF_IS_OTHERCON, >flags);
}
+   mutex_lock_nested(>sock_mutex, 2);
if (!othercon->sock) {
newcon->othercon = othercon;
othercon->sock = newsock;
newsock->sk->sk_user_data = othercon;
add_sock(newsock, othercon);
addcon = othercon;
+   mutex_unlock(>sock_mutex);
}
else {
printk("Extra connection from node %d attempted\n", 
nodeid);
result = -EAGAIN;
+   mutex_unlock(>sock_mutex);
mutex_unlock(>sock_mutex);
goto accept_err;
}
@@ -918,15 +921,18 @@ static int sctp_accept_from_sock(struct connection *con)
INIT_WORK(>rwork, process_recv_sockets);
set_bit(CF_IS_OTHERCON, >flags);
}
+   mutex_lock_nested(>sock_mutex, 2);
if (!othercon->sock) {
newcon->othercon = othercon;
othercon->sock = newsock;
newsock->sk->sk_user_data = othercon;
add_sock(newsock, othercon);
addcon = othercon;
+   mutex_unlock(>sock_mutex);
} else {
printk("Extra connection from node %d attempted\n", 
nodeid);
ret = -EAGAIN;
+   mutex_unlock(>sock_mutex);
mutex_unlock(>sock_mutex);
goto accept_err;
}
-- 
2.7.4






[Cluster-devel] [PATCH 02/17] DLM: Eliminate CF_WRITE_PENDING flag

2017-08-08 Thread tsutomu.owa
From: Bob Peterson 

Before this patch the CF_WRITE_PENDING flag was used to indicate
when writes to the socket were pending. This caused race conditions
whereby one process set the bit and another cleared it. Instead,
we just check to see if there's anything there to be sent. This
makes the code more intuitive and bullet-proof.

Signed-off-by: Bob Peterson 
Reviewed-by: Tadashi Miyauchi 

---
 fs/dlm/lowcomms.c | 21 -
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 41bf93a..a9b2483 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,7 +106,6 @@ struct connection {
struct mutex sock_mutex;
unsigned long flags;
 #define CF_READ_PENDING 1
-#define CF_WRITE_PENDING 2
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
@@ -426,8 +425,7 @@ static void lowcomms_write_space(struct sock *sk)
clear_bit(SOCKWQ_ASYNC_NOSPACE, >sock->flags);
}
 
-   if (!test_and_set_bit(CF_WRITE_PENDING, >flags))
-   queue_work(send_workqueue, >swork);
+   queue_work(send_workqueue, >swork);
 }
 
 static inline void lowcomms_connect_sock(struct connection *con)
@@ -578,7 +576,6 @@ static void make_sockaddr(struct sockaddr_storage *saddr, 
uint16_t port,
 static void close_connection(struct connection *con, bool and_other,
 bool tx, bool rx)
 {
-   clear_bit(CF_WRITE_PENDING, >flags);
if (tx && cancel_work_sync(>swork))
log_print("canceled swork for node %d", con->nodeid);
if (rx && cancel_work_sync(>rwork))
@@ -1077,7 +1074,6 @@ static void sctp_connect_to_sock(struct connection *con)
if (result == 0)
goto out;
 
-
 bind_err:
con->sock = NULL;
sock_release(sock);
@@ -1102,7 +1098,6 @@ static void sctp_connect_to_sock(struct connection *con)
 
 out:
mutex_unlock(>sock_mutex);
-   set_bit(CF_WRITE_PENDING, >flags);
 }
 
 /* Connect a new socket to its peer */
@@ -1196,7 +1191,6 @@ static void tcp_connect_to_sock(struct connection *con)
}
 out:
mutex_unlock(>sock_mutex);
-   set_bit(CF_WRITE_PENDING, >flags);
return;
 }
 
@@ -1452,9 +1446,7 @@ void dlm_lowcomms_commit_buffer(void *mh)
e->len = e->end - e->offset;
spin_unlock(>writequeue_lock);
 
-   if (!test_and_set_bit(CF_WRITE_PENDING, >flags)) {
-   queue_work(send_workqueue, >swork);
-   }
+   queue_work(send_workqueue, >swork);
return;
 
 out:
@@ -1524,12 +1516,15 @@ static void send_to_sock(struct connection *con)
 send_error:
mutex_unlock(>sock_mutex);
close_connection(con, false, false, true);
-   lowcomms_connect_sock(con);
+   /* Requeue the send work. When the work daemon runs again, it will try
+  a new connection, then call this function again. */
+   queue_work(send_workqueue, >swork);
return;
 
 out_connect:
mutex_unlock(>sock_mutex);
-   lowcomms_connect_sock(con);
+   cond_resched();
+   queue_work(send_workqueue, >swork);
 }
 
 static void clean_one_writequeue(struct connection *con)
@@ -1591,7 +1586,7 @@ static void process_send_sockets(struct work_struct *work)
 
if (con->sock == NULL) /* not mutex protected so check it inside too */
con->connect_action(con);
-   if (test_and_clear_bit(CF_WRITE_PENDING, >flags))
+   if (!list_empty(>writequeue))
send_to_sock(con);
 }
 
-- 
2.7.4






[Cluster-devel] [PATCH 03/17] DLM: Fix saving of NULL callbacks

2017-08-08 Thread tsutomu.owa
From: Bob Peterson 

In a previous patch I noted that accept() often copies the struct
sock (sk) which overwrites the sock callbacks. However, in testing
we discovered that the dlm connection structures (con) are sometimes
deleted and recreated as connections come and go, and since they're
zeroed out by kmem_cache_zalloc, the saved callback pointers are
also initialized to zero. But with today's DLM code, the callbacks
are only saved when a socket is added.

During recovery testing, we discovered a common situation in which
the new con is initialized to zero, then a socket is added after
accept(). In this case, the sock's saved values are all NULL, but
the saved values are wiped out, due to accept(). Therefore, we
don't have a known good copy of the callbacks from which we can
restore.

Since the struct sock callbacks are always good after listen(),
this patch saves the known good values after listen(). These good
values are then used for subsequent restores.

Signed-off-by: Bob Peterson 
Reviewed-by: Tadashi Miyauchi 
---
 fs/dlm/lowcomms.c | 45 +
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index a9b2483..ede4522 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -122,10 +122,6 @@ struct connection {
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
-   void (*orig_error_report)(struct sock *);
-   void (*orig_data_ready)(struct sock *);
-   void (*orig_state_change)(struct sock *);
-   void (*orig_write_space)(struct sock *);
 };
 #define sock2con(x) ((struct connection *)(x)->sk_user_data)
 
@@ -148,6 +144,13 @@ struct dlm_node_addr {
struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
 };
 
+static struct listen_sock_callbacks {
+   void (*sk_error_report)(struct sock *);
+   void (*sk_data_ready)(struct sock *);
+   void (*sk_state_change)(struct sock *);
+   void (*sk_write_space)(struct sock *);
+} listen_sock;
+
 static LIST_HEAD(dlm_node_addrs);
 static DEFINE_SPINLOCK(dlm_node_addrs_spin);
 
@@ -477,7 +480,7 @@ static void lowcomms_error_report(struct sock *sk)
if (con == NULL)
goto out;
 
-   orig_report = con->orig_error_report;
+   orig_report = listen_sock.sk_error_report;
if (con->sock == NULL ||
kernel_getpeername(con->sock, (struct sockaddr *), )) {
printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
@@ -514,22 +517,26 @@ static void lowcomms_error_report(struct sock *sk)
 }
 
 /* Note: sk_callback_lock must be locked before calling this function. */
-static void save_callbacks(struct connection *con, struct sock *sk)
+static void save_listen_callbacks(struct socket *sock)
 {
-   con->orig_data_ready = sk->sk_data_ready;
-   con->orig_state_change = sk->sk_state_change;
-   con->orig_write_space = sk->sk_write_space;
-   con->orig_error_report = sk->sk_error_report;
+   struct sock *sk = sock->sk;
+
+   listen_sock.sk_data_ready = sk->sk_data_ready;
+   listen_sock.sk_state_change = sk->sk_state_change;
+   listen_sock.sk_write_space = sk->sk_write_space;
+   listen_sock.sk_error_report = sk->sk_error_report;
 }
 
-static void restore_callbacks(struct connection *con, struct sock *sk)
+static void restore_callbacks(struct socket *sock)
 {
+   struct sock *sk = sock->sk;
+
write_lock_bh(>sk_callback_lock);
sk->sk_user_data = NULL;
-   sk->sk_data_ready = con->orig_data_ready;
-   sk->sk_state_change = con->orig_state_change;
-   sk->sk_write_space = con->orig_write_space;
-   sk->sk_error_report = con->orig_error_report;
+   sk->sk_data_ready = listen_sock.sk_data_ready;
+   sk->sk_state_change = listen_sock.sk_state_change;
+   sk->sk_write_space = listen_sock.sk_write_space;
+   sk->sk_error_report = listen_sock.sk_error_report;
write_unlock_bh(>sk_callback_lock);
 }
 
@@ -542,8 +549,6 @@ static void add_sock(struct socket *sock, struct connection 
*con, bool save_cb)
con->sock = sock;
 
sk->sk_user_data = con;
-   if (save_cb)
-   save_callbacks(con, sk);
/* Install a data_ready callback */
sk->sk_data_ready = lowcomms_data_ready;
sk->sk_write_space = lowcomms_write_space;
@@ -583,8 +588,7 @@ static void close_connection(struct connection *con, bool 
and_other,
 
mutex_lock(>sock_mutex);
if (con->sock) {
-   if (!test_bit(CF_IS_OTHERCON, >flags))
-   restore_callbacks(con, con->sock->sk);
+   restore_callbacks(con->sock);
sock_release(con->sock);
con->sock = NULL;
}
@@ -1226,7 +1230,7 @@ static struct socket *tcp_create_listen_sock(struct 
connection 

[Cluster-devel] [PATCH 04/17] dlm: fix remove save_cb argument from add_sock()

2017-08-08 Thread tsutomu.owa
To remove an unused argument

Signed-off-by: Tadashi Miyauchi 
Signed-off-by: Tsutomu Owa 
---
 fs/dlm/lowcomms.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index ede4522..be62cf1 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -541,7 +541,7 @@ static void restore_callbacks(struct socket *sock)
 }
 
 /* Make a socket active */
-static void add_sock(struct socket *sock, struct connection *con, bool save_cb)
+static void add_sock(struct socket *sock, struct connection *con)
 {
struct sock *sk = sock->sk;
 
@@ -801,7 +801,7 @@ static int tcp_accept_from_sock(struct connection *con)
newcon->othercon = othercon;
othercon->sock = newsock;
newsock->sk->sk_user_data = othercon;
-   add_sock(newsock, othercon, false);
+   add_sock(newsock, othercon);
addcon = othercon;
}
else {
@@ -817,7 +817,7 @@ static int tcp_accept_from_sock(struct connection *con)
/* accept copies the sk after we've saved the callbacks, so we
   don't want to save them a second time or comm errors will
   result in calling sk_error_report recursively. */
-   add_sock(newsock, newcon, false);
+   add_sock(newsock, newcon);
addcon = newcon;
}
 
@@ -918,7 +918,7 @@ static int sctp_accept_from_sock(struct connection *con)
newcon->othercon = othercon;
othercon->sock = newsock;
newsock->sk->sk_user_data = othercon;
-   add_sock(newsock, othercon, false);
+   add_sock(newsock, othercon);
addcon = othercon;
} else {
printk("Extra connection from node %d attempted\n", 
nodeid);
@@ -929,7 +929,7 @@ static int sctp_accept_from_sock(struct connection *con)
} else {
newsock->sk->sk_user_data = newcon;
newcon->rx_action = receive_from_sock;
-   add_sock(newsock, newcon, false);
+   add_sock(newsock, newcon);
addcon = newcon;
}
 
@@ -1057,7 +1057,7 @@ static void sctp_connect_to_sock(struct connection *con)
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
con->connect_action = sctp_connect_to_sock;
-   add_sock(sock, con, true);
+   add_sock(sock, con);
 
/* Bind to all addresses. */
if (sctp_bind_addrs(con, 0))
@@ -1142,7 +1142,7 @@ static void tcp_connect_to_sock(struct connection *con)
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
con->connect_action = tcp_connect_to_sock;
-   add_sock(sock, con, true);
+   add_sock(sock, con);
 
/* Bind to our cluster-known address connecting to avoid
   routing problems */
@@ -1361,7 +1361,7 @@ static int tcp_listen_for_all(void)
 
sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
if (sock) {
-   add_sock(sock, con, true);
+   add_sock(sock, con);
result = 0;
}
else {
-- 
2.7.4





[Cluster-devel] [PATCH 01/17] DLM: Eliminate CF_CONNECT_PENDING flag

2017-08-08 Thread tsutomu.owa
From: Bob Peterson 

Before this patch, there was a flag in the con structure that was
used to determine whether or not a connect was needed. The bit was
set here and there, and cleared here and there, so it left some
race conditions: the bit was set, work was queued, then the worker
cleared the bit, allowing someone else to set it while the worker
ran. For the most part, this worked okay, but we got into trouble
if connections were lost and it needed to reconnect.

This patch eliminates the flag in favor of simply checking if we
actually have a sock pointer while protected by the mutex.

Signed-off-by: Bob Peterson 
Reviewed-by: Tadashi Miyauchi 

---
 fs/dlm/lowcomms.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 7d398d3..41bf93a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -107,7 +107,6 @@ struct connection {
unsigned long flags;
 #define CF_READ_PENDING 1
 #define CF_WRITE_PENDING 2
-#define CF_CONNECT_PENDING 3
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
@@ -435,8 +434,8 @@ static inline void lowcomms_connect_sock(struct connection 
*con)
 {
if (test_bit(CF_CLOSE, >flags))
return;
-   if (!test_and_set_bit(CF_CONNECT_PENDING, >flags))
-   queue_work(send_workqueue, >swork);
+   queue_work(send_workqueue, >swork);
+   cond_resched();
 }
 
 static void lowcomms_state_change(struct sock *sk)
@@ -579,7 +578,6 @@ static void make_sockaddr(struct sockaddr_storage *saddr, 
uint16_t port,
 static void close_connection(struct connection *con, bool and_other,
 bool tx, bool rx)
 {
-   clear_bit(CF_CONNECT_PENDING, >flags);
clear_bit(CF_WRITE_PENDING, >flags);
if (tx && cancel_work_sync(>swork))
log_print("canceled swork for node %d", con->nodeid);
@@ -1098,7 +1096,6 @@ static void sctp_connect_to_sock(struct connection *con)
  con->retries, result);
mutex_unlock(>sock_mutex);
msleep(1000);
-   clear_bit(CF_CONNECT_PENDING, >flags);
lowcomms_connect_sock(con);
return;
}
@@ -1194,7 +1191,6 @@ static void tcp_connect_to_sock(struct connection *con)
  con->retries, result);
mutex_unlock(>sock_mutex);
msleep(1000);
-   clear_bit(CF_CONNECT_PENDING, >flags);
lowcomms_connect_sock(con);
return;
}
@@ -1593,7 +1589,7 @@ static void process_send_sockets(struct work_struct *work)
 {
struct connection *con = container_of(work, struct connection, swork);
 
-   if (test_and_clear_bit(CF_CONNECT_PENDING, >flags))
+   if (con->sock == NULL) /* not mutex protected so check it inside too */
con->connect_action(con);
if (test_and_clear_bit(CF_WRITE_PENDING, >flags))
send_to_sock(con);
-- 
2.7.4






[Cluster-devel] [PATCH 00/17] DLM: dlm patches need review

2017-08-08 Thread tsutomu.owa
Hi,

This series of patches is to fix various bugs.  This patch set is against the 
mainline kernel.
I'd like reviewed to make sure those changes are fine.

Patch number 01/02/03 were posted on this mailing list by Bob Peterson 
.
Patch 03 is modified to correct a bug acquiring rwlock multiple times.
Other than that, there is no changes.

01:
https://www.redhat.com/archives/cluster-devel/2017-April/msg00021.html
02:
https://www.redhat.com/archives/cluster-devel/2017-April/msg00022.html
03:
https://www.redhat.com/archives/cluster-devel/2017-April/msg00023.html

thanks in advance.
-- owa
ps. Sorry to bother you if you got those mails several times.




Re: [Cluster-devel] [PATCH] dlm: use sock_create_lite inside tcp_accept_from_sock

2017-08-08 Thread Steven Whitehouse

Hi,


On 07/08/17 20:10, Bob Peterson wrote:

| | Signed-off-by: Guoqing Jiang 
| | ---
| |  fs/dlm/lowcomms.c | 2 +-
| |  1 file changed, 1 insertion(+), 1 deletion(-)
| |
| | diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
| | index 9382db9..4813d0e 100644
| | --- a/fs/dlm/lowcomms.c
| | +++ b/fs/dlm/lowcomms.c
| | @@ -729,7 +729,7 @@ static int tcp_accept_from_sock(struct connection *con)
| | mutex_unlock(_lock);
| |
| | memset(, 0, sizeof(peeraddr));
| | -   result = sock_create_kern(_net, dlm_local_addr[0]->ss_family,
| | +   result = sock_create_lite(dlm_local_addr[0]->ss_family,
| |   SOCK_STREAM, IPPROTO_TCP, );
| | if (result < 0)
| | return -ENOMEM;
|
| Isn't this also a problem for the sctp equivalent, sctp_connect_to_sock?
|
| Regards,
|
| Bob Peterson
| Red Hat File Systems
|

In fact, I see 5 different calls to sock_create_kern in DLM.
Shouldn't it be done to all of them?

One could also argue that sock_create_kern should itself be fixed,
not its callers.
The two functions do different things. Normally you want to create both 
a struct socket and a struct sock, which is what sock_create_kern does. 
In accept though, you need to pass in a struct socket, and the struct 
sock is created during accept and grafted on to it. That is why you were 
having such trouble with the callbacks, because it was leaking the 
struct sock that was originally created by sock_create_kern. Since DLM 
usually doesn't make many connections, that would not have shown up in 
any greatly increased memory consumption at any stage.


If you look at sctp, it calls kernel_accept() which uses 
sock_create_lite anyway, so that is already correct. We should probably 
be using that helper for the tcp case too, which would be perhaps a 
better way to resolve that problem, since it reduces code duplication,


Steve.


Regards,

Bob Peterson
Red Hat File Systems





[Cluster-devel] [PATCH v3 42/49] gfs2: convert to bio_for_each_segment_all_sp()

2017-08-08 Thread Ming Lei
Cc: Steven Whitehouse 
Cc: Bob Peterson 
Cc: cluster-devel@redhat.com
Signed-off-by: Ming Lei 
---
 fs/gfs2/lops.c| 3 ++-
 fs/gfs2/meta_io.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 3010f9edd177..d1fd8ed01b9e 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -206,11 +206,12 @@ static void gfs2_end_log_write(struct bio *bio)
struct bio_vec *bvec;
struct page *page;
int i;
+   struct bvec_iter_all bia;
 
if (bio->bi_status)
fs_err(sdp, "Error %d writing to log\n", bio->bi_status);
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
page = bvec->bv_page;
if (page_has_buffers(page))
gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index fabe1614f879..6879b0103539 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
struct page *page = bvec->bv_page;
struct buffer_head *bh = page_buffers(page);
unsigned int len = bvec->bv_len;
-- 
2.9.4