Chris Wilson <ch...@chris-wilson.co.uk> writes:

> During reset, we try and stop the active ring. This has the consequence
> that we often clobber the RING registers within the context image. When
> we find an active request, we update the context image to rerun that
> request (if it was guilty, we replace the hanging user payload with
> NOPs). However, we were ignoring an active context if the request had
> completed, with the consequence that the next submission on that context
> would start with RING_HEAD==0 and not the tail of the previous request,
> causing all requests still in the ring to be rerun. Rare, but
> occasionally seen within CI where we would spot that the context seqno
> would reverse and complain that we were retiring an incomplete request.
>
>     <0> [412.390350]   <idle>-0       3d.s2 408373352us : __i915_request_submit: rcs0 fence 1e95b:3640 -> current 3638
>     <0> [412.390350]   <idle>-0       3d.s2 408373353us : __i915_request_submit: rcs0 fence 1e95b:3642 -> current 3638
>     <0> [412.390350]   <idle>-0       3d.s2 408373354us : __i915_request_submit: rcs0 fence 1e95b:3644 -> current 3638
>     <0> [412.390350]   <idle>-0       3d.s2 408373354us : __i915_request_submit: rcs0 fence 1e95b:3646 -> current 3638
>     <0> [412.390350]   <idle>-0       3d.s2 408373356us : __execlists_submission_tasklet: rcs0 in[0]:  ctx=2.1, fence 1e95b:3646 (current 3638), prio=4
>     <0> [412.390350] i915_sel-4613    0.... 408373374us : __i915_request_commit: rcs0 fence 1e95b:3648
>     <0> [412.390350] i915_sel-4613    0d..1 408373377us : process_csb: rcs0 cs-irq head=2, tail=3
>     <0> [412.390350] i915_sel-4613    0d..1 408373377us : process_csb: rcs0 csb[3]: status=0x00000001:0x00000000, active=0x1
>     <0> [412.390350] i915_sel-4613    0d..1 408373378us : __i915_request_submit: rcs0 fence 1e95b:3648 -> current 3638
>     <0> [412.390350]   <idle>-0       3..s1 408373378us : execlists_submission_tasklet: rcs0 awake?=1, active=5
>     <0> [412.390350] i915_sel-4613    0d..1 408373379us : __execlists_submission_tasklet: rcs0 in[0]:  ctx=2.2, fence 1e95b:3648 (current 3638), prio=4
>     <0> [412.390350] i915_sel-4613    0.... 408373381us : i915_reset_engine: rcs0 flags=4
>     <0> [412.390350] i915_sel-4613    0.... 408373382us : execlists_reset_prepare: rcs0: depth<-0
>     <0> [412.390350]   <idle>-0       3d.s2 408373390us : process_csb: rcs0 cs-irq head=3, tail=4
>     <0> [412.390350]   <idle>-0       3d.s2 408373390us : process_csb: rcs0 csb[4]: status=0x00008002:0x00000002, active=0x1
>     <0> [412.390350]   <idle>-0       3d.s2 408373390us : process_csb: rcs0 out[0]: ctx=2.2, fence 1e95b:3648 (current 3640), prio=4
>     <0> [412.390350] i915_sel-4613    0.... 408373401us : intel_engine_stop_cs: rcs0
>     <0> [412.390350] i915_sel-4613    0d..1 408373402us : process_csb: rcs0 cs-irq head=4, tail=4
>     <0> [412.390350] i915_sel-4613    0.... 408373403us : intel_gpu_reset: engine_mask=1
>     <0> [412.390350] i915_sel-4613    0d..1 408373408us : execlists_cancel_port_requests: rcs0:port0 fence 1e95b:3648, (current 3648)
>     <0> [412.390350] i915_sel-4613    0.... 408373442us : intel_engine_cancel_stop_cs: rcs0
>     <0> [412.390350] i915_sel-4613    0.... 408373442us : execlists_reset_finish: rcs0: depth->0
>     <0> [412.390350] ksoftirq-26      3..s. 408373442us : execlists_submission_tasklet: rcs0 awake?=1, active=0
>     <0> [412.390350] ksoftirq-26      3d.s1 408373443us : process_csb: rcs0 cs-irq head=5, tail=5
>     <0> [412.390350] i915_sel-4613    0.... 408373475us : i915_request_retire: rcs0 fence 1e95b:3640, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373476us : i915_request_retire: __retire_engine_request(rcs0) fence 1e95b:3640, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373494us : __i915_request_commit: rcs0 fence 1e95b:3650
>     <0> [412.390350] i915_sel-4613    0d..1 408373496us : process_csb: rcs0 cs-irq head=5, tail=5
>     <0> [412.390350] i915_sel-4613    0d..1 408373496us : __i915_request_submit: rcs0 fence 1e95b:3650 -> current 3648
>     <0> [412.390350] i915_sel-4613    0d..1 408373498us : __execlists_submission_tasklet: rcs0 in[0]:  ctx=2.1, fence 1e95b:3650 (current 3648), prio=6
>     <0> [412.390350] i915_sel-4613    0.... 408373500us : i915_request_retire_upto: rcs0 fence 1e95b:3648, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373500us : i915_request_retire: rcs0 fence 1e95b:3642, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373501us : i915_request_retire: __retire_engine_request(rcs0) fence 1e95b:3642, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373514us : i915_request_retire: rcs0 fence 1e95b:3644, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373515us : i915_request_retire: __retire_engine_request(rcs0) fence 1e95b:3644, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373527us : i915_request_retire: rcs0 fence 1e95b:3646, current 3640
>     <0> [412.390350]   <idle>-0       3..s1 408373569us : execlists_submission_tasklet: rcs0 awake?=1, active=1
>     <0> [412.390350]   <idle>-0       3d.s2 408373569us : process_csb: rcs0 cs-irq head=5, tail=1
>     <0> [412.390350]   <idle>-0       3d.s2 408373570us : process_csb: rcs0 csb[0]: status=0x00000001:0x00000000, active=0x1
>     <0> [412.390350]   <idle>-0       3d.s2 408373570us : process_csb: rcs0 csb[1]: status=0x00000018:0x00000002, active=0x5
>     <0> [412.390350]   <idle>-0       3d.s2 408373570us : process_csb: rcs0 out[0]: ctx=2.1, fence 1e95b:3650 (current 3650), prio=6
>     <0> [412.390350]   <idle>-0       3d.s2 408373571us : process_csb: rcs0 completed ctx=2
>     <0> [412.390350] i915_sel-4613    0.... 408373621us : i915_request_retire: i915_request_retire:253 GEM_BUG_ON(!i915_request_completed(request))
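
The seqno reversal is visible above: "current" jumps from 3648 back to
3640 while retiring fence 1e95b:3646, and the completion check that
then trips is, as far as I remember, just a seqno comparison against
the breadcrumb in the HWSP (a sketch from memory, not this exact tree):

    /* sketch: retirement asserts the breadcrumb has already passed */
    static inline bool i915_request_completed(const struct i915_request *rq)
    {
            if (i915_request_signaled(rq))
                    return true;

            return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno);
    }
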
>
> v2: Fixup the cancellation path to drain the CSB and reset the pointers.
>
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 239 +++++++++++++++++--------------
>  1 file changed, 132 insertions(+), 107 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index f096bc7bbe35..b020bc59b233 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -893,96 +893,6 @@ invalidate_csb_entries(const u32 *first, const u32 *last)
>       clflush((void *)last);
>  }
>  
> -static void reset_csb_pointers(struct intel_engine_execlists *execlists)
> -{
> -     const unsigned int reset_value = GEN8_CSB_ENTRIES - 1;
> -
> -     /*
> -      * After a reset, the HW starts writing into CSB entry [0]. We
> -      * therefore have to set our HEAD pointer back one entry so that
> -      * the *first* entry we check is entry 0. To complicate this further,
> -      * as we don't wait for the first interrupt after reset, we have to
> -      * fake the HW write to point back to the last entry so that our
> -      * inline comparison of our cached head position against the last HW
> -      * write works even before the first interrupt.
> -      */
> -     execlists->csb_head = reset_value;
> -     WRITE_ONCE(*execlists->csb_write, reset_value);
> -
> -     invalidate_csb_entries(&execlists->csb_status[0],
> -                            &execlists->csb_status[GEN8_CSB_ENTRIES - 1]);
> -}
> -
> -static void nop_submission_tasklet(unsigned long data)
> -{
> -     /* The driver is wedged; don't process any more events. */
> -}
> -
> -static void execlists_cancel_requests(struct intel_engine_cs *engine)
> -{
> -     struct intel_engine_execlists * const execlists = &engine->execlists;
> -     struct i915_request *rq, *rn;
> -     struct rb_node *rb;
> -     unsigned long flags;
> -
> -     GEM_TRACE("%s\n", engine->name);
> -
> -     /*
> -      * Before we call engine->cancel_requests(), we should have exclusive
> -      * access to the submission state. This is arranged for us by the
> -      * caller disabling the interrupt generation, the tasklet and other
> -      * threads that may then access the same state, giving us a free hand
> -      * to reset state. However, we still need to let lockdep be aware that
> -      * we know this state may be accessed in hardirq context, so we
> -      * disable the irq around this manipulation and we want to keep
> -      * the spinlock focused on its duties and not accidentally conflate
> -      * coverage to the submission's irq state. (Similarly, although we
> -      * shouldn't need to disable irq around the manipulation of the
> -      * submission's irq state, we also wish to remind ourselves that
> -      * it is irq state.)
> -      */
> -     spin_lock_irqsave(&engine->timeline.lock, flags);
> -
> -     /* Cancel the requests on the HW and clear the ELSP tracker. */
> -     execlists_cancel_port_requests(execlists);
> -     execlists_user_end(execlists);

Cancelling the ports and clearing all active in the new __execlists_reset() seems to cover this now.
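
For reference, my condensed reading of how the new helper covers it
(abridged from the patch below plus the unquoted surrounding context,
so treat the exact ordering as a sketch):

    /* __execlists_reset(), abridged */
    process_csb(engine);               /* drain pending events */
    reset_csb_pointers(execlists);     /* rewind CSB head/write */
    if (port_isset(execlists->port)) {
            ce = port_request(execlists->port)->hw_context;
            execlists_cancel_port_requests(execlists); /* clear ELSP */
            /* ... unwind and fix up ce's ring/image ... */
    }
    execlists_clear_all_active(execlists);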

> -
> -     /* Mark all executing requests as skipped. */
> -     list_for_each_entry(rq, &engine->timeline.requests, link) {
> -             if (!i915_request_signaled(rq))
> -                     dma_fence_set_error(&rq->fence, -EIO);
> -
> -             i915_request_mark_complete(rq);
> -     }
> -
> -     /* Flush the queued requests to the timeline list (for retiring). */
> -     while ((rb = rb_first_cached(&execlists->queue))) {
> -             struct i915_priolist *p = to_priolist(rb);
> -             int i;
> -
> -             priolist_for_each_request_consume(rq, rn, p, i) {
> -                     list_del_init(&rq->sched.link);
> -                     __i915_request_submit(rq);
> -                     dma_fence_set_error(&rq->fence, -EIO);
> -                     i915_request_mark_complete(rq);
> -             }
> -
> -             rb_erase_cached(&p->node, &execlists->queue);
> -             i915_priolist_free(p);
> -     }
> -
> -     /* Remaining _unready_ requests will be nop'ed when submitted */
> -
> -     execlists->queue_priority_hint = INT_MIN;
> -     execlists->queue = RB_ROOT_CACHED;
> -     GEM_BUG_ON(port_isset(execlists->port));
> -
> -     GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
> -     execlists->tasklet.func = nop_submission_tasklet;
> -
> -     spin_unlock_irqrestore(&engine->timeline.lock, flags);
> -}
> -
>  static inline bool
>  reset_in_progress(const struct intel_engine_execlists *execlists)
>  {
> @@ -1904,7 +1814,6 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
>  
>       /* And flush any current direct submission. */
>       spin_lock_irqsave(&engine->timeline.lock, flags);
> -     process_csb(engine); /* drain preemption events */
>       spin_unlock_irqrestore(&engine->timeline.lock, flags);
>  }
>  
> @@ -1925,14 +1834,47 @@ static bool lrc_regs_ok(const struct i915_request *rq)
>       return true;
>  }
>  
> -static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
> +static void reset_csb_pointers(struct intel_engine_execlists *execlists)
> +{
> +     const unsigned int reset_value = GEN8_CSB_ENTRIES - 1;
> +
> +     /*
> +      * After a reset, the HW starts writing into CSB entry [0]. We
> +      * therefore have to set our HEAD pointer back one entry so that
> +      * the *first* entry we check is entry 0. To complicate this further,
> +      * as we don't wait for the first interrupt after reset, we have to
> +      * fake the HW write to point back to the last entry so that our
> +      * inline comparison of our cached head position against the last HW
> +      * write works even before the first interrupt.
> +      */
> +     execlists->csb_head = reset_value;
> +     WRITE_ONCE(*execlists->csb_write, reset_value);
> +
> +     invalidate_csb_entries(&execlists->csb_status[0],
> +                            &execlists->csb_status[GEN8_CSB_ENTRIES - 1]);
> +}
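
The trick this comment describes relies on the consumer only advancing
while its cached head trails the last HW write; paraphrasing
process_csb() (a sketch, not verbatim from this tree):

    /* sketch: faking *csb_write == csb_head after reset keeps this
     * a no-op until the HW really writes entry 0 */
    head = execlists->csb_head;
    tail = READ_ONCE(*execlists->csb_write);
    while (head != tail) {
            if (++head == GEN8_CSB_ENTRIES)
                    head = 0;
            /* ... consume execlists->csb_status[head] ... */
    }
    execlists->csb_head = head;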
> +
> +static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  {
>       struct intel_engine_execlists * const execlists = &engine->execlists;
> +     struct intel_context *ce;
>       struct i915_request *rq;
> -     unsigned long flags;
>       u32 *regs;
>  
> -     spin_lock_irqsave(&engine->timeline.lock, flags);
> +     process_csb(engine); /* drain preemption events */
> +
> +     /* Following the reset, we need to reload the CSB read/write pointers */
> +     reset_csb_pointers(&engine->execlists);
> +
> +     /*
> +      * Save the currently executing context, even if we completed
> +      * its request, it was still running at the time of the
> +      * reset and will have been clobbered.
> +      */
> +     if (!port_isset(execlists->port))
> +             goto out_clear;
> +
> +     ce = port_request(execlists->port)->hw_context;
>  
>       /*
>        * Catch up with any missed context-switch interrupts.
> @@ -1947,12 +1889,13 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  
>       /* Push back any incomplete requests for replay after the reset. */
>       rq = __unwind_incomplete_requests(engine);
> -
> -     /* Following the reset, we need to reload the CSB read/write pointers */
> -     reset_csb_pointers(&engine->execlists);
> -
>       if (!rq)
> -             goto out_unlock;
> +             goto out_replay;
> +
> +     if (rq->hw_context != ce) { /* caught just before a CS event */
> +             rq = NULL;

So ce's request was complete but its context-switch event had not yet been processed; there is nothing of ce's left to replay, only its ring head to restore.

> +             goto out_replay;
> +     }
>  
>       /*
>        * If this request hasn't started yet, e.g. it is waiting on a
> @@ -1967,7 +1910,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>        * perfectly and we do not need to flag the result as being erroneous.
>        */
>       if (!i915_request_started(rq) && lrc_regs_ok(rq))
> -             goto out_unlock;
> +             goto out_replay;
>  
>       /*
>        * If the request was innocent, we leave the request in the ELSP
> @@ -1982,7 +1925,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>        */
>       i915_reset_request(rq, stalled);
>       if (!stalled && lrc_regs_ok(rq))
> -             goto out_unlock;
> +             goto out_replay;
>  
>       /*
>        * We want a simple context + ring to execute the breadcrumb update.
> @@ -1992,21 +1935,103 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>        * future request will be after userspace has had the opportunity
>        * to recreate its own state.
>        */
> -     regs = rq->hw_context->lrc_reg_state;
> +     regs = ce->lrc_reg_state;

Yes, this looks much saner. We only play with the one context that is guaranteed to have been active.

>       if (engine->pinned_default_state) {
>               memcpy(regs, /* skip restoring the vanilla PPHWSP */
>                      engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
>                      engine->context_size - PAGE_SIZE);
>       }
> +     execlists_init_reg_state(regs, ce, engine, ce->ring);
>  
>       /* Rerun the request; its payload has been neutered (if guilty). */
> -     rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
> -     intel_ring_update_space(rq->ring);
> +out_replay:
> +     ce->ring->head =
> +             rq ? intel_ring_wrap(ce->ring, rq->head) : ce->ring->tail;

The ce and rq rings should be the same when rq is set. I guess you had
a reason to keep it as ce, perhaps because ce is the known culprit.

The rest is mostly code movement and I can't poke holes in it.

Reviewed-by: Mika Kuoppala <mika.kuopp...@linux.intel.com>


> +     intel_ring_update_space(ce->ring);
> +     __execlists_update_reg_state(ce, engine);
> +
> +out_clear:
> +     execlists_clear_all_active(execlists);
> +}
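
For completeness, __execlists_update_reg_state() then copies the
rewound ring state back into the context image; from memory it is
roughly (treat the exact offsets as a sketch):

    /* sketch: refresh the context image from the (rewound) ring */
    regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(ring->vma);
    regs[CTX_RING_HEAD + 1] = ring->head;
    regs[CTX_RING_TAIL + 1] = ring->tail;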
> +
> +static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
> +{
> +     unsigned long flags;
> +
> +     GEM_TRACE("%s\n", engine->name);
> +
> +     spin_lock_irqsave(&engine->timeline.lock, flags);
> +
> +     __execlists_reset(engine, stalled);
> +
> +     spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +}
> +
> +static void nop_submission_tasklet(unsigned long data)
> +{
> +     /* The driver is wedged; don't process any more events. */
> +}
> +
> +static void execlists_cancel_requests(struct intel_engine_cs *engine)
> +{
> +     struct intel_engine_execlists * const execlists = &engine->execlists;
> +     struct i915_request *rq, *rn;
> +     struct rb_node *rb;
> +     unsigned long flags;
>  
> -     execlists_init_reg_state(regs, rq->hw_context, engine, rq->ring);
> -     __execlists_update_reg_state(rq->hw_context, engine);
> +     GEM_TRACE("%s\n", engine->name);
> +
> +     /*
> +      * Before we call engine->cancel_requests(), we should have exclusive
> +      * access to the submission state. This is arranged for us by the
> +      * caller disabling the interrupt generation, the tasklet and other
> +      * threads that may then access the same state, giving us a free hand
> +      * to reset state. However, we still need to let lockdep be aware that
> +      * we know this state may be accessed in hardirq context, so we
> +      * disable the irq around this manipulation and we want to keep
> +      * the spinlock focused on its duties and not accidentally conflate
> +      * coverage to the submission's irq state. (Similarly, although we
> +      * shouldn't need to disable irq around the manipulation of the
> +      * submission's irq state, we also wish to remind ourselves that
> +      * it is irq state.)
> +      */
> +     spin_lock_irqsave(&engine->timeline.lock, flags);
> +
> +     __execlists_reset(engine, true);
> +
> +     /* Mark all executing requests as skipped. */
> +     list_for_each_entry(rq, &engine->timeline.requests, link) {
> +             if (!i915_request_signaled(rq))
> +                     dma_fence_set_error(&rq->fence, -EIO);
> +
> +             i915_request_mark_complete(rq);
> +     }
> +
> +     /* Flush the queued requests to the timeline list (for retiring). */
> +     while ((rb = rb_first_cached(&execlists->queue))) {
> +             struct i915_priolist *p = to_priolist(rb);
> +             int i;
> +
> +             priolist_for_each_request_consume(rq, rn, p, i) {
> +                     list_del_init(&rq->sched.link);
> +                     __i915_request_submit(rq);
> +                     dma_fence_set_error(&rq->fence, -EIO);
> +                     i915_request_mark_complete(rq);
> +             }
> +
> +             rb_erase_cached(&p->node, &execlists->queue);
> +             i915_priolist_free(p);
> +     }
> +
> +     /* Remaining _unready_ requests will be nop'ed when submitted */
> +
> +     execlists->queue_priority_hint = INT_MIN;
> +     execlists->queue = RB_ROOT_CACHED;
> +     GEM_BUG_ON(port_isset(execlists->port));
> +
> +     GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
> +     execlists->tasklet.func = nop_submission_tasklet;
>  
> -out_unlock:
>       spin_unlock_irqrestore(&engine->timeline.lock, flags);
>  }
>  
> -- 
> 2.20.1