This is great, but the real question is: why does the IO get jammed?
On Fri, Dec 24, 2010 at 04:09:23PM +1000, David Gwynne wrote:
> i can reliably produce a situation where an io on a disk attached
> to mpii(4) never completes. this implements timeouts on scsi io so
> we can recover from this situation.
>
> ok?
>
> Index: mpii.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/mpii.c,v
> retrieving revision 1.35
> diff -u -p -r1.35 mpii.c
> --- mpii.c 23 Aug 2010 00:53:36 -0000 1.35
> +++ mpii.c 24 Dec 2010 06:04:38 -0000
> @@ -1757,7 +1757,8 @@ struct mpii_ccb {
> volatile enum {
> MPII_CCB_FREE,
> MPII_CCB_READY,
> - MPII_CCB_QUEUED
> + MPII_CCB_QUEUED,
> + MPII_CCB_TIMEOUT
> } ccb_state;
>
> void (*ccb_done)(struct mpii_ccb *);
> @@ -1822,6 +1823,15 @@ struct mpii_softc {
> struct mpii_ccb_list sc_ccb_free;
> struct mutex sc_ccb_free_mtx;
>
> + struct mutex sc_ccb_mtx;
> + /*
> + * this protects the ccb state and list entry
> + * between mpii_scsi_cmd and scsidone.
> + */
> +
> + struct mpii_ccb_list sc_ccb_tmos;
> + struct scsi_iohandler sc_ccb_tmo_handler;
> +
> struct scsi_iopool sc_iopool;
>
> struct mpii_dmamem *sc_requests;
> @@ -1894,6 +1904,10 @@ int mpii_alloc_queues(struct mpii_softc
> void mpii_push_reply(struct mpii_softc *, struct mpii_rcb *);
> void mpii_push_replies(struct mpii_softc *);
>
> +void mpii_scsi_cmd_tmo(void *);
> +void mpii_scsi_cmd_tmo_handler(void *, void *);
> +void mpii_scsi_cmd_tmo_done(struct mpii_ccb *);
> +
> int mpii_alloc_dev(struct mpii_softc *);
> int mpii_insert_dev(struct mpii_softc *, struct mpii_device *);
> int mpii_remove_dev(struct mpii_softc *, struct mpii_device *);
> @@ -4035,7 +4049,11 @@ mpii_alloc_ccbs(struct mpii_softc *sc)
> int i;
>
> SLIST_INIT(&sc->sc_ccb_free);
> + SLIST_INIT(&sc->sc_ccb_tmos);
> mtx_init(&sc->sc_ccb_free_mtx, IPL_BIO);
> + mtx_init(&sc->sc_ccb_mtx, IPL_BIO);
> + scsi_ioh_set(&sc->sc_ccb_tmo_handler, &sc->sc_iopool,
> + mpii_scsi_cmd_tmo_handler, sc);
>
> sc->sc_ccbs = malloc(sizeof(*ccb) * (sc->sc_request_depth-1),
> M_DEVBUF, M_NOWAIT | M_ZERO);
> @@ -4448,6 +4466,7 @@ mpii_scsi_cmd(struct scsi_xfer *xs)
> DNPRINTF(MPII_D_CMD, "%s: Offset0: 0x%02x\n", DEVNAME(sc),
> io->sgl_offset0);
>
> + timeout_set(&xs->stimeout, mpii_scsi_cmd_tmo, ccb);
> if (xs->flags & SCSI_POLL) {
> if (mpii_poll(sc, ccb) != 0) {
> xs->error = XS_DRIVER_STUFFUP;
> @@ -4459,10 +4478,66 @@ mpii_scsi_cmd(struct scsi_xfer *xs)
> DNPRINTF(MPII_D_CMD, "%s: mpii_scsi_cmd(): opcode: %02x "
> "datalen: %d\n", DEVNAME(sc), xs->cmd->opcode, xs->datalen);
>
> + timeout_add_msec(&xs->stimeout, xs->timeout);
> mpii_start(sc, ccb);
> }
>
> void
> +mpii_scsi_cmd_tmo(void *xccb)
> +{
> + struct mpii_ccb *ccb = xccb;
> + struct mpii_softc *sc = ccb->ccb_sc;
> +
> + printf("%s: mpii_scsi_cmd_tmo\n", DEVNAME(sc));
> +
> + mtx_enter(&sc->sc_ccb_mtx);
> + if (ccb->ccb_state == MPII_CCB_QUEUED) {
> + ccb->ccb_state = MPII_CCB_TIMEOUT;
> + SLIST_INSERT_HEAD(&sc->sc_ccb_tmos, ccb, ccb_link);
> + }
> + mtx_leave(&sc->sc_ccb_mtx);
> +
> + scsi_ioh_add(&sc->sc_ccb_tmo_handler);
> +}
> +
> +void
> +mpii_scsi_cmd_tmo_handler(void *cookie, void *io)
> +{
> + struct mpii_softc *sc = cookie;
> + struct mpii_ccb *tccb = io;
> + struct mpii_ccb *ccb;
> + struct mpii_msg_scsi_task_request *stq;
> +
> + mtx_enter(&sc->sc_ccb_mtx);
> + ccb = SLIST_FIRST(&sc->sc_ccb_tmos);
> + if (ccb != NULL) {
> + SLIST_REMOVE_HEAD(&sc->sc_ccb_tmos, ccb_link);
> + ccb->ccb_state = MPII_CCB_QUEUED;
> + }
> + /* should remove any other ccbs for the same dev handle */
> + mtx_leave(&sc->sc_ccb_mtx);
> +
> + if (ccb == NULL) {
> + scsi_io_put(&sc->sc_iopool, tccb);
> + return;
> + }
> +
> + stq = tccb->ccb_cmd;
> + stq->function = MPII_FUNCTION_SCSI_TASK_MGMT;
> + stq->task_type = MPII_SCSI_TASK_TARGET_RESET;
> + stq->dev_handle = htole16(ccb->ccb_dev_handle);
> +
> + tccb->ccb_done = mpii_scsi_cmd_tmo_done;
> + mpii_start(sc, tccb);
> +}
> +
> +void
> +mpii_scsi_cmd_tmo_done(struct mpii_ccb *tccb)
> +{
> + mpii_scsi_cmd_tmo_handler(tccb->ccb_sc, tccb);
> +}
> +
> +void
> mpii_scsi_cmd_done(struct mpii_ccb *ccb)
> {
> struct mpii_msg_scsi_io_error *sie;
> @@ -4470,6 +4545,14 @@ mpii_scsi_cmd_done(struct mpii_ccb *ccb)
> struct scsi_xfer *xs = ccb->ccb_cookie;
> struct mpii_ccb_bundle *mcb = ccb->ccb_cmd;
> bus_dmamap_t dmap = ccb->ccb_dmamap;
> +
> + timeout_del(&xs->stimeout);
> + mtx_enter(&sc->sc_ccb_mtx);
> + if (ccb->ccb_state == MPII_CCB_TIMEOUT)
> + SLIST_REMOVE(&sc->sc_ccb_tmos, ccb, mpii_ccb, ccb_link);
> +
> + ccb->ccb_state = MPII_CCB_READY;
> + mtx_leave(&sc->sc_ccb_mtx);
>
> if (xs->datalen != 0) {
> bus_dmamap_sync(sc->sc_dmat, dmap, 0, dmap->dm_mapsize,