Re: [PATCH 1/6] target: fix hang in target_wait_for_sess_cmds()

2016-10-21 Thread Nicholas A. Bellinger
On Mon, 2016-08-22 at 10:54 +0200, Hannes Reinecke wrote:
> When shutting down a session I'm seeing this hung_task message:
> 
> INFO: task kworker/u128:0:6 blocked for more than 480 seconds.
>   Tainted: GE   N  4.4.17-default+ #241
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> kworker/u128:0  D 880237615c00 0 6  2 0x
>  Workqueue: fc_rport_eq fc_rport_work [libfc]
>  88017edd7c48 81c11500 88017edd0180 88017edd8000
>  88035b6860a8 88017edd0180 88036461eac0 
>  88017edd7c60 815db4d5 7fff 88017edd7d10
> Call Trace:
>  [] schedule+0x35/0x80
>  [] schedule_timeout+0x237/0x2d0
>  [] wait_for_completion+0xa3/0x110
>  [] target_wait_for_sess_cmds+0x45/0x190 [target_core_mod]
>  [] ft_close_sess+0x24/0x30 [tcm_fc]
>  [] ft_prlo+0x8a/0x95 [tcm_fc]
>  [] fc_rport_work+0x3b8/0x7c0 [libfc]
>  [] process_one_work+0x14e/0x410
>  [] worker_thread+0x116/0x490
> 
> Looking at the code it looks as if there is some confusion about
> when to call 'wait_for_completion(>cmd_wait_comp).
> The problem is that there are _two_ sections where the code
> waits for 'cmd_wait_comp' completion, but the completion
> is only called with 'complete', not 'complete_all'.
> So we cannot have both waiting for it, doing so will lead
> to a stuck wait_for_completion.
> 
> Signed-off-by: Hannes Reinecke 
> ---
>  drivers/target/target_core_transport.c | 16 ++--
>  1 file changed, 10 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/target/target_core_transport.c 
> b/drivers/target/target_core_transport.c
> index 6094a6b..2e1a6d8 100644
> --- a/drivers/target/target_core_transport.c
> +++ b/drivers/target/target_core_transport.c
> @@ -2522,7 +2522,7 @@ int transport_generic_free_cmd(struct se_cmd *cmd, int 
> wait_for_tasks)
>* the remaining calls to target_put_sess_cmd(), and not the
>* callers of this function.
>*/
> - if (aborted) {
> + if (aborted && !cmd->cmd_wait_set) {
>   pr_debug("Detected CMD_T_ABORTED for ITT: %llu\n", cmd->tag);
>   wait_for_completion(>cmd_wait_comp);
>   cmd->se_tfo->release_cmd(cmd);

Do you recall if hung se_cmd shutdown was with both CMD_T_ABORTED and
!CMD_T_ABORTED descriptors, or former only..?

Using v4.4.y code for CMD_T_ABORTED, you likely hit the missing
SCF_ACK_KREF assignment regression leak:

http://www.spinics.net/lists/target-devel/msg13530.html

> @@ -2642,9 +2642,11 @@ void target_sess_cmd_list_set_waiting(struct 
> se_session *se_sess)
>   list_for_each_entry(se_cmd, _sess->sess_wait_list, se_cmd_list) {
>   rc = kref_get_unless_zero(_cmd->cmd_kref);
>   if (rc) {
> - se_cmd->cmd_wait_set = 1;
>   spin_lock(_cmd->t_state_lock);
> - se_cmd->transport_state |= CMD_T_FABRIC_STOP;
> + if (!(se_cmd->transport_state & CMD_T_FABRIC_STOP)) {
> + se_cmd->cmd_wait_set = 1;
> + se_cmd->transport_state |= CMD_T_FABRIC_STOP;
> + }
>   spin_unlock(_cmd->t_state_lock);
>   }
>   }

So tcm_fc uses transport_generic_free_cmd(se_cmd, wait_for_tasks=0) in
ft_check_stop_free() to drop the extra SCF_ACK_KREF reference, which
does the extra if (aborted) check + possible sleep mentioned above.

AFAICT, ft_check_stop_free() response callback should be using
target_put_sess_cmd() to drop se_cmd->cmd_kref, and not attempt to block
for aborted status at all..

> @@ -2664,24 +2666,26 @@ void target_wait_for_sess_cmds(struct se_session 
> *se_sess)
>  
>   list_for_each_entry_safe(se_cmd, tmp_cmd,
>   _sess->sess_wait_list, se_cmd_list) {
> + int cmd_wait;
>   pr_debug("Waiting for se_cmd: %p t_state: %d, fabric state:"
>   " %d\n", se_cmd, se_cmd->t_state,
>   se_cmd->se_tfo->get_cmd_state(se_cmd));
>  
>   spin_lock_irqsave(_cmd->t_state_lock, flags);
>   tas = (se_cmd->transport_state & CMD_T_TAS);
> + cmd_wait = se_cmd->cmd_wait_set;
>   spin_unlock_irqrestore(_cmd->t_state_lock, flags);
> -
>   if (!target_put_sess_cmd(se_cmd)) {
>   if (tas)
>   target_put_sess_cmd(se_cmd);
>   }
> -
> + if (!cmd_wait)
> + continue;
>   wait_for_completion(_cmd->cmd_wait_comp);
>   pr_debug("After cmd_wait_comp: se_cmd: %p t_state: %d"
>   " fabric state: %d\n", se_cmd, se_cmd->t_state,
>   se_cmd->se_tfo->get_cmd_state(se_cmd));
> -
> + se_cmd->cmd_wait_set = 0;
>   se_cmd->se_tfo->release_cmd(se_cmd);
>   }
>  

target_wait_for_sess_cmd() 

[PATCH 1/6] target: fix hang in target_wait_for_sess_cmds()

2016-08-22 Thread Hannes Reinecke
When shutting down a session I'm seeing this hung_task message:

INFO: task kworker/u128:0:6 blocked for more than 480 seconds.
  Tainted: GE   N  4.4.17-default+ #241
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kworker/u128:0  D 880237615c00 0 6  2 0x
 Workqueue: fc_rport_eq fc_rport_work [libfc]
 88017edd7c48 81c11500 88017edd0180 88017edd8000
 88035b6860a8 88017edd0180 88036461eac0 
 88017edd7c60 815db4d5 7fff 88017edd7d10
Call Trace:
 [] schedule+0x35/0x80
 [] schedule_timeout+0x237/0x2d0
 [] wait_for_completion+0xa3/0x110
 [] target_wait_for_sess_cmds+0x45/0x190 [target_core_mod]
 [] ft_close_sess+0x24/0x30 [tcm_fc]
 [] ft_prlo+0x8a/0x95 [tcm_fc]
 [] fc_rport_work+0x3b8/0x7c0 [libfc]
 [] process_one_work+0x14e/0x410
 [] worker_thread+0x116/0x490

Looking at the code it looks as if there is some confusion about
when to call 'wait_for_completion(>cmd_wait_comp).
The problem is that there are _two_ sections where the code
waits for 'cmd_wait_comp' completion, but the completion
is only called with 'complete', not 'complete_all'.
So we cannot have both waiting for it, doing so will lead
to a stuck wait_for_completion.

Signed-off-by: Hannes Reinecke 
---
 drivers/target/target_core_transport.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/target/target_core_transport.c 
b/drivers/target/target_core_transport.c
index 6094a6b..2e1a6d8 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2522,7 +2522,7 @@ int transport_generic_free_cmd(struct se_cmd *cmd, int 
wait_for_tasks)
 * the remaining calls to target_put_sess_cmd(), and not the
 * callers of this function.
 */
-   if (aborted) {
+   if (aborted && !cmd->cmd_wait_set) {
pr_debug("Detected CMD_T_ABORTED for ITT: %llu\n", cmd->tag);
wait_for_completion(>cmd_wait_comp);
cmd->se_tfo->release_cmd(cmd);
@@ -2642,9 +2642,11 @@ void target_sess_cmd_list_set_waiting(struct se_session 
*se_sess)
list_for_each_entry(se_cmd, _sess->sess_wait_list, se_cmd_list) {
rc = kref_get_unless_zero(_cmd->cmd_kref);
if (rc) {
-   se_cmd->cmd_wait_set = 1;
spin_lock(_cmd->t_state_lock);
-   se_cmd->transport_state |= CMD_T_FABRIC_STOP;
+   if (!(se_cmd->transport_state & CMD_T_FABRIC_STOP)) {
+   se_cmd->cmd_wait_set = 1;
+   se_cmd->transport_state |= CMD_T_FABRIC_STOP;
+   }
spin_unlock(_cmd->t_state_lock);
}
}
@@ -2664,24 +2666,26 @@ void target_wait_for_sess_cmds(struct se_session 
*se_sess)
 
list_for_each_entry_safe(se_cmd, tmp_cmd,
_sess->sess_wait_list, se_cmd_list) {
+   int cmd_wait;
pr_debug("Waiting for se_cmd: %p t_state: %d, fabric state:"
" %d\n", se_cmd, se_cmd->t_state,
se_cmd->se_tfo->get_cmd_state(se_cmd));
 
spin_lock_irqsave(_cmd->t_state_lock, flags);
tas = (se_cmd->transport_state & CMD_T_TAS);
+   cmd_wait = se_cmd->cmd_wait_set;
spin_unlock_irqrestore(_cmd->t_state_lock, flags);
-
if (!target_put_sess_cmd(se_cmd)) {
if (tas)
target_put_sess_cmd(se_cmd);
}
-
+   if (!cmd_wait)
+   continue;
wait_for_completion(_cmd->cmd_wait_comp);
pr_debug("After cmd_wait_comp: se_cmd: %p t_state: %d"
" fabric state: %d\n", se_cmd, se_cmd->t_state,
se_cmd->se_tfo->get_cmd_state(se_cmd));
-
+   se_cmd->cmd_wait_set = 0;
se_cmd->se_tfo->release_cmd(se_cmd);
}
 
-- 
1.8.5.6

--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html