Chris Wilson <ch...@chris-wilson.co.uk> writes:

> Check that we can reset the GPU and continue executing from the next
> request.
>
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/intel_hangcheck.c             |   4 +
>  .../gpu/drm/i915/selftests/i915_live_selftests.h   |   1 +
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c   | 463 +++++++++++++++++++++
>  3 files changed, 468 insertions(+)
>  create mode 100644 drivers/gpu/drm/i915/selftests/intel_hangcheck.c
>
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index f05971f5586f..dce742243ba6 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -480,3 +480,7 @@ void intel_hangcheck_init(struct drm_i915_private *i915)
>  	INIT_DELAYED_WORK(&i915->gpu_error.hangcheck_work,
>  			  i915_hangcheck_elapsed);
>  }
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftests/intel_hangcheck.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> index 0c925f17b445..e6699c59f244 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> @@ -15,3 +15,4 @@ selftest(object, i915_gem_object_live_selftests)
>  selftest(coherency, i915_gem_coherency_live_selftests)
>  selftest(gtt, i915_gem_gtt_live_selftests)
>  selftest(context, i915_gem_context_live_selftests)
> +selftest(hangcheck, intel_hangcheck_live_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> new file mode 100644
> index 000000000000..d306890ba7eb
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -0,0 +1,463 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include "../i915_selftest.h"
> +
> +struct hang {
> +	struct drm_i915_private *i915;
> +	struct drm_i915_gem_object *hws;
> +	struct drm_i915_gem_object *obj;
> +	u32 *seqno;
> +	u32 *batch;
> +};
> +
> +static int hang_init(struct hang *h, struct drm_i915_private *i915)
> +{
> +	void *vaddr;
> +
> +	memset(h, 0, sizeof(*h));
> +	h->i915 = i915;
> +
> +	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
> +	if (IS_ERR(h->hws))
> +		return PTR_ERR(h->hws);
> +
> +	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
> +	if (IS_ERR(h->obj)) {
> +		i915_gem_object_put(h->hws);
> +		return PTR_ERR(h->obj);
> +	}
> +
> +	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
> +	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
> +	if (IS_ERR(vaddr)) {
> +		i915_gem_object_put(h->hws);
> +		i915_gem_object_put(h->obj);
> +		return PTR_ERR(vaddr);
> +	}
> +	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
> +
> +	vaddr = i915_gem_object_pin_map(h->obj,
> +					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
> +	if (IS_ERR(vaddr)) {
> +		i915_gem_object_unpin_map(h->hws);
> +		i915_gem_object_put(h->hws);
> +		i915_gem_object_put(h->obj);
> +		return PTR_ERR(vaddr);
> +	}
> +	h->batch = vaddr;
> +
> +	return 0;
> +}
> +
> +static u64 hws_address(const struct i915_vma *hws,
> +		       const struct drm_i915_gem_request *rq)
> +{
> +	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);

fence.context is something unique returned by dma_fence_context_alloc(),
and we assume we don't collide within the scope of this test?
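If I read this together with hws_seqno() below correctly, each context owns
a single u32 slot in the HWS page, and two contexts can only alias if their
fence.context values are congruent modulo PAGE_SIZE/sizeof(u32). A sketch of
my understanding (my names, not the patch's):

	/* One u32 slot per fence context; with 4KiB pages the slots wrap
	 * every 1024 contexts, so a collision needs two contexts that far
	 * apart to be alive (and in flight here) at the same time.
	 */
	#define NSLOTS (PAGE_SIZE / sizeof(u32))

	static u32 *hws_slot(u32 *hws_page, u64 fence_context)
	{
		/* same slot as offset_in_page(sizeof(u32) * fence_context) */
		return &hws_page[fence_context % NSLOTS];
	}

So for a test using only a handful of contexts this looks safe, but perhaps
worth a comment.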
> +}
> +
> +static int emit_recurse_batch(struct hang *h,
> +			      struct drm_i915_gem_request *rq)
> +{
> +	struct drm_i915_private *i915 = h->i915;
> +	struct i915_address_space *vm = rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
> +	struct i915_vma *hws, *vma;
> +	u32 *batch;
> +	int err;
> +
> +	vma = i915_vma_instance(h->obj, vm, NULL);
> +	if (IS_ERR(vma))
> +		return PTR_ERR(vma);
> +
> +	hws = i915_vma_instance(h->hws, vm, NULL);
> +	if (IS_ERR(hws))
> +		return PTR_ERR(hws);
> +
> +	err = i915_vma_pin(vma, 0, 0, PIN_USER);
> +	if (err)
> +		return err;
> +
> +	err = i915_vma_pin(hws, 0, 0, PIN_USER);
> +	if (err)
> +		goto unpin_vma;
> +
> +	i915_vma_move_to_active(vma, rq, 0);
> +	i915_vma_move_to_active(hws, rq, 0);
> +
> +	batch = h->batch;
> +	if (INTEL_GEN(i915) >= 8) {
> +		*batch++ = MI_STORE_DWORD_IMM_GEN4;
> +		*batch++ = lower_32_bits(hws_address(hws, rq));
> +		*batch++ = upper_32_bits(hws_address(hws, rq));
> +		*batch++ = rq->fence.seqno;
> +		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
> +		*batch++ = lower_32_bits(vma->node.start);
> +		*batch++ = upper_32_bits(vma->node.start);
> +	} else if (INTEL_GEN(i915) >= 6) {
> +		*batch++ = MI_STORE_DWORD_IMM_GEN4;
> +		*batch++ = 0;
> +		*batch++ = lower_32_bits(hws_address(hws, rq));
> +		*batch++ = rq->fence.seqno;
> +		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
> +		*batch++ = lower_32_bits(vma->node.start);
> +	} else if (INTEL_GEN(i915) >= 4) {
> +		*batch++ = MI_STORE_DWORD_IMM_GEN4;
> +		*batch++ = 0;
> +		*batch++ = lower_32_bits(hws_address(hws, rq));
> +		*batch++ = rq->fence.seqno;
> +		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
> +		*batch++ = lower_32_bits(vma->node.start);
> +	} else {
> +		*batch++ = MI_STORE_DWORD_IMM;
> +		*batch++ = lower_32_bits(hws_address(hws, rq));
> +		*batch++ = rq->fence.seqno;
> +		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
> +		*batch++ = lower_32_bits(vma->node.start);
> +	}
> +
> +	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
> +
> +	i915_vma_unpin(hws);
> +unpin_vma:
> +	i915_vma_unpin(vma);
> +	return err;
> +}
> +
> +static struct drm_i915_gem_request *
> +hang_create_request(struct hang *h,
> +		    struct intel_engine_cs *engine,
> +		    struct i915_gem_context *ctx)
> +{
> +	struct drm_i915_gem_request *rq;
> +	int err;
> +
> +	if (i915_gem_object_is_active(h->obj)) {
> +		struct drm_i915_gem_object *obj;
> +		void *vaddr;
> +
> +		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
> +		if (IS_ERR(obj))
> +			return ERR_CAST(obj);
> +
> +		vaddr = i915_gem_object_pin_map(obj,
> +						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
> +		if (IS_ERR(vaddr)) {
> +			i915_gem_object_put(obj);
> +			return ERR_CAST(vaddr);
> +		}
> +
> +		i915_gem_object_unpin_map(h->obj);
> +		__i915_gem_object_release_unless_active(h->obj);
> +
> +		h->obj = obj;
> +		h->batch = vaddr;

This whole block confuses me. Is it there for the reset queue test, in
case something went wrong with the previous request?
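In case it helps, my reading of why a fresh page is wanted here: the
previous hanging batch may still be spinning on the GPU, so emitting new
commands into h->obj would rewrite instructions the hardware is currently
executing. The only in-place write the test ever does to a batch is the
terminator, as in hang_fini() below:

	*h->batch = MI_BATCH_BUFFER_END; /* stop a still-spinning batch */

i.e. a live batch is only ever rewritten to end it, never to reuse it. If
that is the intent, a comment to that effect would help.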
> +	}
> +
> +	rq = i915_gem_request_alloc(engine, ctx);
> +	if (IS_ERR(rq))
> +		return rq;
> +
> +	err = emit_recurse_batch(h, rq);
> +	if (err) {
> +		__i915_add_request(rq, false);
> +		return ERR_PTR(err);
> +	}
> +
> +	return rq;
> +}
> +
> +static u32 hws_seqno(const struct hang *h,
> +		     const struct drm_i915_gem_request *rq)
> +{
> +	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
> +}
> +
> +static void hang_fini(struct hang *h)
> +{
> +	*h->batch = MI_BATCH_BUFFER_END;
> +
> +	i915_gem_object_unpin_map(h->obj);
> +	__i915_gem_object_release_unless_active(h->obj);
> +
> +	i915_gem_object_unpin_map(h->hws);
> +	__i915_gem_object_release_unless_active(h->hws);
> +}
> +
> +static int igt_hang_sanitycheck(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct drm_i915_gem_request *rq;
> +	struct hang h;
> +	int err;
> +
> +	/* Basic check that we can execute our hanging batch */
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	err = hang_init(&h, i915);
> +	if (err)
> +		goto unlock;
> +
> +	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto fini;
> +	}
> +
> +	i915_gem_request_get(rq);
> +
> +	*h.batch = MI_BATCH_BUFFER_END;
> +	__i915_add_request(rq, true);
> +
> +	i915_wait_request(rq, I915_WAIT_LOCKED, MAX_SCHEDULE_TIMEOUT);
> +	i915_gem_request_put(rq);
> +
> +fini:
> +	hang_fini(&h);
> +unlock:
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	return err;
> +}
> +
> +static int igt_global_reset(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	unsigned int reset_count;
> +	int err = 0;
> +
> +	/* Check that we can issue a global GPU reset */
> +
> +	if (!intel_has_gpu_reset(i915))
> +		return 0;
> +
> +	set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	reset_count = i915_reset_count(&i915->gpu_error);
> +
> +	i915_reset(i915);
> +
> +	if (i915_reset_count(&i915->gpu_error) == reset_count) {
> +		pr_err("No GPU reset recorded!\n");
> +		err = -EINVAL;
> +	}
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
> +	if (i915_terminally_wedged(&i915->gpu_error))
> +		err = -EIO;
> +
> +	return err;
> +}
> +
> +static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
> +{
> +	u32 reset_count;
> +
> +	rq->engine->hangcheck.stalled = true;
> +	rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);
> +
> +	reset_count = i915_reset_count(&rq->i915->gpu_error);
> +
> +	set_bit(I915_RESET_IN_PROGRESS, &rq->i915->gpu_error.flags);
> +	wake_up_all(&rq->i915->gpu_error.wait_queue);
> +
> +	return reset_count;
> +}
> +
> +static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
> +{
> +	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
> +					       rq->fence.seqno),
> +			     10) &&
> +		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
> +					    rq->fence.seqno),
> +			  1000));
> +}
> +
> +static int igt_wait_reset(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct drm_i915_gem_request *rq;
> +	unsigned int reset_count;
> +	struct hang h;
> +	long timeout;
> +	int err;
> +
> +	/* Check that we detect a stuck waiter and issue a reset */
> +
> +	if (!intel_has_gpu_reset(i915))
> +		return 0;
> +
> +	set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);

I noticed that you do this early. In this test fake_hangcheck would do it
for you as well, but I suspect you want it set here to gain exclusive
access from this point on?
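That would match my (possibly stale) recollection of the helper in
i915_drv.h that the rest of the driver uses to back off while a reset is
pending:

	static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
	{
		return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
	}

i.e. setting the bit up front keeps the real hangcheck and any concurrent
users away from the reset machinery until this test has run its course.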
-Mika

> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	err = hang_init(&h, i915);
> +	if (err)
> +		goto unlock;
> +
> +	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto fini;
> +	}
> +
> +	i915_gem_request_get(rq);
> +	__i915_add_request(rq, true);
> +
> +	if (!wait_for_hang(&h, rq)) {
> +		pr_err("Failed to start request %x\n", rq->fence.seqno);
> +		err = -EIO;
> +		goto fini;
> +	}
> +
> +	reset_count = fake_hangcheck(rq);
> +
> +	timeout = i915_wait_request(rq, I915_WAIT_LOCKED, 10);
> +	if (timeout < 0) {
> +		pr_err("i915_wait_request failed on a stuck request: err=%ld\n",
> +		       timeout);
> +		err = timeout;
> +		goto fini;
> +	}
> +	GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
> +
> +	if (i915_reset_count(&i915->gpu_error) == reset_count) {
> +		pr_err("No GPU reset recorded!\n");
> +		err = -EINVAL;
> +		goto fini;
> +	}
> +
> +fini:
> +	hang_fini(&h);
> +unlock:
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	if (i915_terminally_wedged(&i915->gpu_error))
> +		return -EIO;
> +
> +	return err;
> +}
> +
> +static int igt_reset_queue(void *arg)
> +{
> +	IGT_TIMEOUT(end_time);
> +	struct drm_i915_private *i915 = arg;
> +	struct drm_i915_gem_request *prev;
> +	unsigned int count;
> +	struct hang h;
> +	int err;
> +
> +	/* Check that we replay pending requests following a hang */
> +
> +	if (!intel_has_gpu_reset(i915))
> +		return 0;
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	err = hang_init(&h, i915);
> +	if (err)
> +		goto unlock;
> +
> +	prev = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
> +	if (IS_ERR(prev)) {
> +		err = PTR_ERR(prev);
> +		goto fini;
> +	}
> +
> +	i915_gem_request_get(prev);
> +	__i915_add_request(prev, true);
> +
> +	count = 0;
> +	do {
> +		struct drm_i915_gem_request *rq;
> +		unsigned int reset_count;
> +
> +		rq = hang_create_request(&h, i915->engine[RCS],
> +					 i915->kernel_context);
> +		if (IS_ERR(rq)) {
> +			err = PTR_ERR(rq);
> +			goto fini;
> +		}
> +
> +		i915_gem_request_get(rq);
> +		__i915_add_request(rq, true);
> +
> +		if (!wait_for_hang(&h, prev)) {
> +			pr_err("Failed to start request %x\n",
> +			       prev->fence.seqno);
> +			err = -EIO;
> +			goto fini;
> +		}
> +
> +		reset_count = fake_hangcheck(prev);
> +
> +		i915_reset(i915);
> +
> +		GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
> +		if (prev->fence.error != -EIO) {
> +			pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
> +			       prev->fence.error);
> +			err = -EINVAL;
> +			goto fini;
> +		}
> +
> +		if (rq->fence.error) {
> +			pr_err("Fence error status not zero [%d] after unrelated reset\n",
> +			       rq->fence.error);
> +			err = -EINVAL;
> +			goto fini;
> +		}
> +
> +		if (i915_reset_count(&i915->gpu_error) == reset_count) {
> +			pr_err("No GPU reset recorded!\n");
> +			err = -EINVAL;
> +			goto fini;
> +		}
> +
> +		i915_gem_request_put(prev);
> +		prev = rq;
> +		count++;
> +	} while (time_before(jiffies, end_time));
> +	pr_info("Completed %d resets\n", count);
> +	i915_gem_request_put(prev);
> +
> +fini:
> +	hang_fini(&h);
> +unlock:
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	if (i915_terminally_wedged(&i915->gpu_error))
> +		return -EIO;
> +
> +	return err;
> +}
> +
> +int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(igt_hang_sanitycheck),
> +		SUBTEST(igt_global_reset),
> +		SUBTEST(igt_wait_reset),
> +		SUBTEST(igt_reset_queue),
> +	};
> +	return i915_subtests(tests, i915);
> +}
> --
> 2.11.0
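One more note for anyone else reading along: by my reading of
emit_recurse_batch(), the gen8+ variant of the hanging batch decodes to a
seqno breadcrumb followed by an unconditional branch back to itself,
roughly:

	MI_STORE_DWORD_IMM_GEN4		; write rq->fence.seqno ...
	<hws address, lo/hi dwords>	; ... into this context's HWS slot
	<rq->fence.seqno>
	MI_BATCH_BUFFER_START		; then jump back to the top of
	<batch address, lo/hi dwords>	; this same batch => spin forever

so wait_for_hang() can poll the HWS slot to confirm the batch has really
started executing, and the request can only complete once the first dword
is rewritten to MI_BATCH_BUFFER_END or the GPU is reset.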