ping?

OGAWA Hirofumi <hirof...@mail.parknet.co.jp> writes:

> Current abort operation has race.
>
>     xhci_handle_command_timeout()
>       xhci_abort_cmd_ring()
>         xhci_write_64(CMD_RING_ABORT)
>         xhci_handshake(5s)
>         do {
>           check CMD_RING_RUNNING
>             udelay(1)
>                                          ...
>                                          COMP_CMD_ABORT event
>                                          COMP_CMD_STOP event
>                                          xhci_handle_stopped_cmd_ring()
>                                            restart cmd_ring
>                                              CMD_RING_RUNNING become 1 again
>         } while ()
>           return -ETIMEDOUT
>         xhci_write_64(CMD_RING_ABORT)
>         /* can abort random command */
>
> To do abort operation correctly, we have to wait both of COMP_CMD_STOP
> event and negation of CMD_RING_RUNNING.
>
> But like above, while timeout handler is waiting negation of
> CMD_RING_RUNNING, event handler can restart cmd_ring. So timeout
> handler never be notice negation of CMD_RING_RUNNING, and retry of
> CMD_RING_ABORT can abort random command (BTW, I guess retry of
> CMD_RING_ABORT was workaround of this race).
>
> To fix this race, this moves xhci_handle_stopped_cmd_ring() to
> xhci_abort_cmd_ring().  And timeout handler waits COMP_CMD_STOP event.
>
> At this point, timeout handler is owner of cmd_ring, and safely
> restart cmd_ring by using xhci_handle_stopped_cmd_ring().
>
> [FWIW, as bonus, this way would be easily extend to add CMD_RING_PAUSE
> operation]
>
> Signed-off-by: OGAWA Hirofumi <hirof...@mail.parknet.co.jp>
> ---
>
>  drivers/usb/host/xhci-mem.c  |    1 
>  drivers/usb/host/xhci-ring.c |  163 
> ++++++++++++++++++++++--------------------
>  drivers/usb/host/xhci.h      |    1 
>  3 files changed, 88 insertions(+), 77 deletions(-)
>
> diff -puN drivers/usb/host/xhci-mem.c~xhci-fix-abort-race2 
> drivers/usb/host/xhci-mem.c
> --- xhci/drivers/usb/host/xhci-mem.c~xhci-fix-abort-race2     2016-11-16 
> 18:38:52.551146764 +0900
> +++ xhci-hirofumi/drivers/usb/host/xhci-mem.c 2016-11-17 01:04:11.402014445 
> +0900
> @@ -2344,6 +2344,7 @@ int xhci_mem_init(struct xhci_hcd *xhci,
>  
>       /* init command timeout work */
>       INIT_DELAYED_WORK(&xhci->cmd_timer, xhci_handle_command_timeout);
> +     init_completion(&xhci->cmd_ring_stop_completion);
>  
>       page_size = readl(&xhci->op_regs->page_size);
>       xhci_dbg_trace(xhci, trace_xhci_dbg_init,
> diff -puN drivers/usb/host/xhci-ring.c~xhci-fix-abort-race2 
> drivers/usb/host/xhci-ring.c
> --- xhci/drivers/usb/host/xhci-ring.c~xhci-fix-abort-race2    2016-11-16 
> 18:38:52.552146764 +0900
> +++ xhci-hirofumi/drivers/usb/host/xhci-ring.c        2016-11-17 
> 01:04:06.393018485 +0900
> @@ -284,6 +284,60 @@ static bool xhci_mod_cmd_timer(struct xh
>       return mod_delayed_work(system_wq, &xhci->cmd_timer, delay);
>  }
>  
> +static struct xhci_command *xhci_next_queued_cmd(struct xhci_hcd *xhci)
> +{
> +     return list_first_entry_or_null(&xhci->cmd_list, struct xhci_command,
> +                                     cmd_list);
> +}
> +
> +/*
> + * Turn all commands on command ring with status set to "aborted" to no-op 
> trbs.
> + * If there are other commands waiting then restart the ring and kick the 
> timer.
> + * This must be called with command ring stopped and xhci->lock held.
> + */
> +static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci,
> +                                      struct xhci_command *cur_cmd)
> +{
> +     struct xhci_command *i_cmd;
> +     u32 cycle_state;
> +
> +     /* Turn all aborted commands in list to no-ops, then restart */
> +     list_for_each_entry(i_cmd, &xhci->cmd_list, cmd_list) {
> +
> +             if (i_cmd->status != COMP_CMD_ABORT)
> +                     continue;
> +
> +             i_cmd->status = COMP_CMD_STOP;
> +
> +             xhci_dbg(xhci, "Turn aborted command %p to no-op\n",
> +                      i_cmd->command_trb);
> +             /* get cycle state from the original cmd trb */
> +             cycle_state = le32_to_cpu(
> +                     i_cmd->command_trb->generic.field[3]) & TRB_CYCLE;
> +             /* modify the command trb to no-op command */
> +             i_cmd->command_trb->generic.field[0] = 0;
> +             i_cmd->command_trb->generic.field[1] = 0;
> +             i_cmd->command_trb->generic.field[2] = 0;
> +             i_cmd->command_trb->generic.field[3] = cpu_to_le32(
> +                     TRB_TYPE(TRB_CMD_NOOP) | cycle_state);
> +
> +             /*
> +              * caller waiting for completion is called when command
> +              *  completion event is received for these no-op commands
> +              */
> +     }
> +
> +     xhci->cmd_ring_state = CMD_RING_STATE_RUNNING;
> +
> +     /* ring command ring doorbell to restart the command ring */
> +     if ((xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) &&
> +         !(xhci->xhc_state & XHCI_STATE_DYING)) {
> +             xhci->current_cmd = cur_cmd;
> +             xhci_mod_cmd_timer(xhci, XHCI_CMD_DEFAULT_TIMEOUT);
> +             xhci_ring_cmd_db(xhci);
> +     }
> +}
> +
>  static int xhci_abort_cmd_ring(struct xhci_hcd *xhci)
>  {
>       u64 temp_64;
> @@ -291,16 +345,9 @@ static int xhci_abort_cmd_ring(struct xh
>  
>       xhci_dbg(xhci, "Abort command ring\n");
>  
> -     temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring);
> -     xhci->cmd_ring_state = CMD_RING_STATE_ABORTED;
> +     reinit_completion(&xhci->cmd_ring_stop_completion);
>  
> -     /*
> -      * Writing the CMD_RING_ABORT bit should cause a cmd completion event,
> -      * however on some host hw the CMD_RING_RUNNING bit is correctly cleared
> -      * but the completion event in never sent. Use the cmd timeout timer to
> -      * handle those cases. Use twice the time to cover the bit polling retry
> -      */
> -     xhci_mod_cmd_timer(xhci, 2 * XHCI_CMD_DEFAULT_TIMEOUT);
> +     temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring);
>       xhci_write_64(xhci, temp_64 | CMD_RING_ABORT,
>                       &xhci->op_regs->cmd_ring);
>  
> @@ -320,15 +367,30 @@ static int xhci_abort_cmd_ring(struct xh
>               udelay(1000);
>               ret = xhci_handshake(&xhci->op_regs->cmd_ring,
>                                    CMD_RING_RUNNING, 0, 3 * 1000 * 1000);
> -             if (ret == 0)
> -                     return 0;
> +             if (ret < 0) {
> +                     xhci_err(xhci, "Stopped the command ring failed, "
> +                              "maybe the host is dead\n");
> +                     xhci->xhc_state |= XHCI_STATE_DYING;
> +                     xhci_halt(xhci);
> +                     return -ESHUTDOWN;
> +             }
> +     }
>  
> -             xhci_err(xhci, "Stopped the command ring failed, "
> -                             "maybe the host is dead\n");
> -             cancel_delayed_work(&xhci->cmd_timer);
> -             xhci->xhc_state |= XHCI_STATE_DYING;
> -             xhci_halt(xhci);
> -             return -ESHUTDOWN;
> +     /*
> +      * Writing the CMD_RING_ABORT bit should cause a cmd completion event,
> +      * however on some host hw the CMD_RING_RUNNING bit is correctly cleared
> +      * but the completion event in never sent. Wait 2 secs (arbitrary
> +      * number) to handle those cases after negation of CMD_RING_RUNNING.
> +      */
> +     if (!wait_for_completion_timeout(&xhci->cmd_ring_stop_completion,
> +                                      2 * HZ)) {
> +             xhci_dbg(xhci, "No stop event for abort, ring start fail?\n");
> +             xhci_cleanup_command_queue(xhci);
> +     } else {
> +             unsigned long flags;
> +             spin_lock_irqsave(&xhci->lock, flags);
> +             xhci_handle_stopped_cmd_ring(xhci, xhci_next_queued_cmd(xhci));
> +             spin_unlock_irqrestore(&xhci->lock, flags);
>       }
>  
>       return 0;
> @@ -1212,79 +1274,26 @@ void xhci_cleanup_command_queue(struct x
>               xhci_complete_del_and_free_cmd(cur_cmd, COMP_CMD_ABORT);
>  }
>  
> -/*
> - * Turn all commands on command ring with status set to "aborted" to no-op 
> trbs.
> - * If there are other commands waiting then restart the ring and kick the 
> timer.
> - * This must be called with command ring stopped and xhci->lock held.
> - */
> -static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci,
> -                                      struct xhci_command *cur_cmd)
> -{
> -     struct xhci_command *i_cmd, *tmp_cmd;
> -     u32 cycle_state;
> -
> -     /* Turn all aborted commands in list to no-ops, then restart */
> -     list_for_each_entry_safe(i_cmd, tmp_cmd, &xhci->cmd_list,
> -                              cmd_list) {
> -
> -             if (i_cmd->status != COMP_CMD_ABORT)
> -                     continue;
> -
> -             i_cmd->status = COMP_CMD_STOP;
> -
> -             xhci_dbg(xhci, "Turn aborted command %p to no-op\n",
> -                      i_cmd->command_trb);
> -             /* get cycle state from the original cmd trb */
> -             cycle_state = le32_to_cpu(
> -                     i_cmd->command_trb->generic.field[3]) & TRB_CYCLE;
> -             /* modify the command trb to no-op command */
> -             i_cmd->command_trb->generic.field[0] = 0;
> -             i_cmd->command_trb->generic.field[1] = 0;
> -             i_cmd->command_trb->generic.field[2] = 0;
> -             i_cmd->command_trb->generic.field[3] = cpu_to_le32(
> -                     TRB_TYPE(TRB_CMD_NOOP) | cycle_state);
> -
> -             /*
> -              * caller waiting for completion is called when command
> -              *  completion event is received for these no-op commands
> -              */
> -     }
> -
> -     xhci->cmd_ring_state = CMD_RING_STATE_RUNNING;
> -
> -     /* ring command ring doorbell to restart the command ring */
> -     if ((xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) &&
> -         !(xhci->xhc_state & XHCI_STATE_DYING)) {
> -             xhci->current_cmd = cur_cmd;
> -             xhci_mod_cmd_timer(xhci, XHCI_CMD_DEFAULT_TIMEOUT);
> -             xhci_ring_cmd_db(xhci);
> -     }
> -     return;
> -}
> -
> -
>  void xhci_handle_command_timeout(struct work_struct *work)
>  {
>       struct xhci_hcd *xhci;
>       int ret;
>       unsigned long flags;
>       u64 hw_ring_state;
> -     bool second_timeout = false;
>  
>       xhci = container_of(to_delayed_work(work), struct xhci_hcd, cmd_timer);
>  
>       /* mark this command to be cancelled */
>       spin_lock_irqsave(&xhci->lock, flags);
> -     if (xhci->current_cmd) {
> -             if (xhci->current_cmd->status == COMP_CMD_ABORT)
> -                     second_timeout = true;
> +     if (xhci->current_cmd)
>               xhci->current_cmd->status = COMP_CMD_ABORT;
> -     }
>  
>       /* Make sure command ring is running before aborting it */
>       hw_ring_state = xhci_read_64(xhci, &xhci->op_regs->cmd_ring);
>       if ((xhci->cmd_ring_state & CMD_RING_STATE_RUNNING) &&
>           (hw_ring_state & CMD_RING_RUNNING))  {
> +             /* Prevent new doorbell, and start command abort */
> +             xhci->cmd_ring_state = CMD_RING_STATE_ABORTED;
>               spin_unlock_irqrestore(&xhci->lock, flags);
>               xhci_dbg(xhci, "Command timeout\n");
>               ret = xhci_abort_cmd_ring(xhci);
> @@ -1297,10 +1306,10 @@ void xhci_handle_command_timeout(struct
>               return;
>       }
>  
> -     /* command ring failed to restart, or host removed. Bail out */
> -     if (second_timeout || xhci->xhc_state & XHCI_STATE_REMOVING) {
> +     /* host removed. Bail out */
> +     if (xhci->xhc_state & XHCI_STATE_REMOVING) {
>               spin_unlock_irqrestore(&xhci->lock, flags);
> -             xhci_dbg(xhci, "command timed out twice, ring start fail?\n");
> +             xhci_dbg(xhci, "host removed, ring start fail?\n");
>               xhci_cleanup_command_queue(xhci);
>               return;
>       }
> @@ -1347,7 +1356,7 @@ static void handle_cmd_completion(struct
>  
>       /* If CMD ring stopped we own the trbs between enqueue and dequeue */
>       if (cmd_comp_code == COMP_CMD_STOP) {
> -             xhci_handle_stopped_cmd_ring(xhci, cmd);
> +             complete_all(&xhci->cmd_ring_stop_completion);
>               return;
>       }
>  
> diff -puN drivers/usb/host/xhci.h~xhci-fix-abort-race2 drivers/usb/host/xhci.h
> --- xhci/drivers/usb/host/xhci.h~xhci-fix-abort-race2 2016-11-16 
> 18:38:52.552146764 +0900
> +++ xhci-hirofumi/drivers/usb/host/xhci.h     2016-11-17 01:04:06.393018485 
> +0900
> @@ -1574,6 +1574,7 @@ struct xhci_hcd {
>       struct list_head        cmd_list;
>       unsigned int            cmd_ring_reserved_trbs;
>       struct delayed_work     cmd_timer;
> +     struct completion       cmd_ring_stop_completion;
>       struct xhci_command     *current_cmd;
>       struct xhci_ring        *event_ring;
>       struct xhci_erst        erst;
> _

-- 
OGAWA Hirofumi <hirof...@mail.parknet.co.jp>
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to