Chris Wilson <ch...@chris-wilson.co.uk> writes:

> We call i915_gem_reset_prepare_engine() during reset and then upon
> wedging if the reset fails. Unfortunately, kthread_park and similar do
> not support being called recursively and so we must count the number of
> times we prepare for reset and only actually prepare on the outermost
> layer. (Similarly for finish on unwinding the onion.)
>
> [   87.705581] WARNING: CPU: 2 PID: 1377 at kernel/kthread.c:505 
> kthread_park+0x55/0x60
> [   87.705583] Modules linked in: snd_hda_codec_hdmi snd_hda_codec_realtek 
> snd_hda_codec_generic i915 x86_pkg_temp_thermal intel_powerclamp coretemp 
> crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_intel snd_hda_codec 
> snd_hwdep snd_hda_core snd_pcm broadcom bcm_phy_lib tg3 mei_me prime_numbers 
> mei lpc_ich
> [   87.705618] CPU: 2 PID: 1377 Comm: gem_eio Tainted: G     U            
> 4.17.0-rc5-CI-CI_DRM_4177+ #1
> [   87.705620] Hardware name: Dell Inc. XPS 8300  /0Y2MRG, BIOS A06 10/17/2011
> [   87.705622] RIP: 0010:kthread_park+0x55/0x60
> [   87.705624] RSP: 0018:ffffc9000051bac0 EFLAGS: 00010202
> [   87.705627] RAX: 0000000000000004 RBX: ffff88021ca13de8 RCX: 
> 0000000000000001
> [   87.705629] RDX: 0000000080000001 RSI: ffffffff821228a9 RDI: 
> ffff88020e8f0040
> [   87.705630] RBP: ffff880215937670 R08: 00000000bae32d65 R09: 
> 0000000000000000
> [   87.705632] R10: 0000000000000000 R11: 0000000000000000 R12: 
> ffff8802159376b0
> [   87.705634] R13: ffff880215937670 R14: ffff880215930000 R15: 
> ffffffffa01c8d60
> [   87.705636] FS:  00007f0c32061980(0000) GS:ffff88022fa80000(0000) 
> knlGS:0000000000000000
> [   87.705637] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   87.705639] CR2: 00007f0c32094000 CR3: 000000021a0d4004 CR4: 
> 00000000000606e0
> [   87.705641] Call Trace:
> [   87.705668]  i915_gem_reset_prepare_engine+0x1d/0xa0 [i915]
> [   87.705694]  i915_gem_set_wedged+0x7b/0x1e0 [i915]
> [   87.705699]  ? __drm_printfn_info+0x20/0x20
> [   87.705722]  i915_reset+0x14a/0x290 [i915]
> [   87.705743]  i915_reset_device+0x1fb/0x290 [i915]
> [   87.705767]  ? __intel_get_crtc_scanline+0x1c0/0x1c0 [i915]
> [   87.705772]  ? work_on_cpu_safe+0x50/0x50
> [   87.705798]  i915_handle_error+0x207/0x4a0 [i915]
> [   87.705810]  ? __might_fault+0x39/0x90
> [   87.705835]  i915_wedged_set+0x7f/0xc0 [i915]
> [   87.705841]  simple_attr_write+0xb0/0xd0
> [   87.705847]  full_proxy_write+0x51/0x80
> [   87.705852]  __vfs_write+0x31/0x160
> [   87.705857]  ? rcu_read_lock_sched_held+0x6f/0x80
> [   87.705860]  ? rcu_sync_lockdep_assert+0x29/0x50
> [   87.705862]  ? __sb_start_write+0x152/0x1f0
> [   87.705864]  ? __sb_start_write+0x168/0x1f0
> [   87.705868]  vfs_write+0xbd/0x1a0
> [   87.705872]  ksys_write+0x50/0xc0
> [   87.705877]  do_syscall_64+0x55/0x190
> [   87.705880]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [   87.705882] RIP: 0033:0x7f0c315df281
> [   87.705884] RSP: 002b:00007ffc9c990328 EFLAGS: 00000246 ORIG_RAX: 
> 0000000000000001
> [   87.705887] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 
> 00007f0c315df281
> [   87.705889] RDX: 0000000000000002 RSI: 000055a5e23ef276 RDI: 
> 0000000000000047
> [   87.705890] RBP: 00007ffc9c990350 R08: 0000000000000000 R09: 
> 0000000000000034
> [   87.705892] R10: 0000000000000000 R11: 0000000000000246 R12: 
> 000055a5e23ebc50
> [   87.705894] R13: 00007ffc9c990dc0 R14: 0000000000000000 R15: 
> 0000000000000000
> [   87.705902] Code: 00 31 ed 48 39 c7 74 0e e8 79 db 00 00 48 8d 7b 18 e8 a0 
> 05 88 00 89 e8 5b 5d c3 0f 0b bd da ff ff ff 89 e8 5b 5d c3 0f 0b eb b7 <0f> 
> 0b bd f0 ff ff ff eb e2 66 90 41 57 41 56 49 c7 c6 f4 ff ff
>
> References: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() 
> completion issue")
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         | 52 ++++++++++++++++++-------
>  drivers/gpu/drm/i915/intel_engine_cs.c  |  1 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  3 ++
>  3 files changed, 43 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 0a2070112b66..b169b630bf78 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2997,14 +2997,11 @@ i915_gem_find_active_request(struct intel_engine_cs 
> *engine)
>       return active;
>  }
>  
> -/*
> - * Ensure irq handler finishes, and not run again.
> - * Also return the active request so that we only search for it once.
> - */
> -struct i915_request *
> -i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> +static void __i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
>  {
> -     struct i915_request *request = NULL;
> +     mutex_lock(&engine->reset_lock);
> +     if (engine->reset_depth++)
> +             goto unlock;
>  
>       /*
>        * During the reset sequence, we must prevent the engine from
> @@ -3057,6 +3054,38 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs 
> *engine)
>       if (engine->i915->guc.preempt_wq)
>               flush_workqueue(engine->i915->guc.preempt_wq);
>  
> +unlock:
> +     mutex_unlock(&engine->reset_lock);
> +}
> +
> +static void __i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
> +{
> +     mutex_lock(&engine->reset_lock);
> +
> +     GEM_BUG_ON(engine->reset_depth);

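This assertion is inverted: it should be GEM_BUG_ON(!engine->reset_depth), since the depth must be non-zero here before it is decremented below.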

-Mika

> +     if (--engine->reset_depth)
> +             goto unlock;
> +
> +     tasklet_enable(&engine->execlists.tasklet);
> +     kthread_unpark(engine->breadcrumbs.signaler);
> +
> +     intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> +
> +unlock:
> +     mutex_unlock(&engine->reset_lock);
> +}
> +
> +/*
> + * Ensure irq handler finishes, and not run again.
> + * Also return the active request so that we only search for it once.
> + */
> +struct i915_request *
> +i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> +{
> +     struct i915_request *request = NULL;
> +
> +     __i915_gem_reset_prepare_engine(engine);
> +
>       if (engine->irq_seqno_barrier)
>               engine->irq_seqno_barrier(engine);
>  
> @@ -3265,10 +3294,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv,
>  
>  void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
>  {
> -     tasklet_enable(&engine->execlists.tasklet);
> -     kthread_unpark(engine->breadcrumbs.signaler);
> -
> -     intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> +     __i915_gem_reset_finish_engine(engine);
>  }
>  
>  void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
> @@ -3332,7 +3358,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>        * for which we haven't set the fence error to EIO yet).
>        */
>       for_each_engine(engine, i915, id) {
> -             i915_gem_reset_prepare_engine(engine);
> +             __i915_gem_reset_prepare_engine(engine);
>  
>               engine->submit_request = nop_submit_request;
>               engine->schedule = NULL;
> @@ -3380,7 +3406,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>                                              
> intel_engine_last_submit(engine));
>               spin_unlock_irqrestore(&engine->timeline.lock, flags);
>  
> -             i915_gem_reset_finish_engine(engine);
> +             __i915_gem_reset_finish_engine(engine);
>       }
>  
>       GEM_TRACE("end\n");
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
> b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 6bfd7e3ed152..0de489da514e 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -306,6 +306,7 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
>       /* Nothing to do here, execute in order of dependencies */
>       engine->schedule = NULL;
>  
> +     mutex_init(&engine->reset_lock);
>       seqlock_init(&engine->stats.lock);
>  
>       ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
> b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 010750e8ee44..ec2b359f3e8b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -566,6 +566,9 @@ struct intel_engine_cs {
>  
>       struct intel_engine_hangcheck hangcheck;
>  
> +     struct mutex reset_lock;
> +     unsigned int reset_depth;
> +
>  #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
>  #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
>  #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
> -- 
> 2.17.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to