This is purely a timing issue: sometimes the job is freed
before the job is done.
To fix this, move the 'dma_fence_cb' callback from the
job (struct drm_sched_job) to the scheduler fence
(struct drm_sched_fence), so that the fence callback no longer
has to go through the job to reach the scheduler fence.

BUG: kernel NULL pointer dereference, address: 0000000000000088
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] PREEMPT SMP NOPTI
 CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1
 Arvind : [dma_fence_default_wait _START] timeout = -1
 Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018
 RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched]
 Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0
 RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087
 RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018
 RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000
 RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
 R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708
 FS:  0000000000000000(0000) GS:ffff8a48a0a80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0
 Call Trace:
  <IRQ>
  drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
  dma_fence_signal_timestamp_locked+0x7e/0x110
  dma_fence_signal+0x31/0x60
  amdgpu_fence_process+0xc4/0x140 [amdgpu]
  gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu]
  amdgpu_irq_dispatch+0xb7/0x210 [amdgpu]
  amdgpu_ih_process+0x86/0x100 [amdgpu]
  amdgpu_irq_handler+0x24/0x60 [amdgpu]
  __handle_irq_event_percpu+0x4b/0x190
  handle_irq_event_percpu+0x15/0x50
  handle_irq_event+0x39/0x60
  handle_edge_irq+0xaf/0x210
  __common_interrupt+0x6e/0x110
  common_interrupt+0xc1/0xe0
  </IRQ>
  <TASK>

Signed-off-by: Arvind Yadav <arvind.ya...@amd.com>
---

Changes in v2: Move the 'dma_fence_cb' callback from the
job (struct drm_sched_job) to the scheduler fence
(struct drm_sched_fence) instead of adding a NULL check for s_fence.

---
 drivers/gpu/drm/scheduler/sched_main.c | 23 +++++++++++------------
 include/drm/gpu_scheduler.h            |  6 ++++--
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index 4cc59bae38dd..62d8eca05b92 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -253,13 +253,12 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
 
 /**
  * drm_sched_job_done - complete a job
- * @s_job: pointer to the job which is done
+ * @s_fence: pointer to the fence of a done job
  *
  * Finish the job's fence and wake up the worker thread.
  */
-static void drm_sched_job_done(struct drm_sched_job *s_job)
+static void drm_sched_job_done(struct drm_sched_fence *s_fence)
 {
-       struct drm_sched_fence *s_fence = s_job->s_fence;
        struct drm_gpu_scheduler *sched = s_fence->sched;
 
        atomic_dec(&sched->hw_rq_count);
@@ -280,9 +279,9 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
  */
 static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
 {
-       struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, 
cb);
+       struct drm_sched_fence *s_fence = container_of(cb, struct 
drm_sched_fence, cb);
 
-       drm_sched_job_done(s_job);
+       drm_sched_job_done(s_fence);
 }
 
 /**
@@ -506,7 +505,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct 
drm_sched_job *bad)
                                         list) {
                if (s_job->s_fence->parent &&
                    dma_fence_remove_callback(s_job->s_fence->parent,
-                                             &s_job->cb)) {
+                                             &s_job->s_fence->cb)) {
                        dma_fence_put(s_job->s_fence->parent);
                        s_job->s_fence->parent = NULL;
                        atomic_dec(&sched->hw_rq_count);
@@ -576,15 +575,15 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, 
bool full_recovery)
                        continue;
 
                if (fence) {
-                       r = dma_fence_add_callback(fence, &s_job->cb,
+                       r = dma_fence_add_callback(fence, &s_job->s_fence->cb,
                                                   drm_sched_job_done_cb);
                        if (r == -ENOENT)
-                               drm_sched_job_done(s_job);
+                               drm_sched_job_done(s_job->s_fence);
                        else if (r)
                                DRM_DEV_ERROR(sched->dev, "fence add callback 
failed (%d)\n",
                                          r);
                } else
-                       drm_sched_job_done(s_job);
+                       drm_sched_job_done(s_job->s_fence);
        }
 
        if (full_recovery) {
@@ -1053,10 +1052,10 @@ static int drm_sched_main(void *param)
                        /* Drop for original kref_init of the fence */
                        dma_fence_put(fence);
 
-                       r = dma_fence_add_callback(fence, &sched_job->cb,
+                       r = dma_fence_add_callback(fence, &s_fence->cb,
                                                   drm_sched_job_done_cb);
                        if (r == -ENOENT)
-                               drm_sched_job_done(sched_job);
+                               drm_sched_job_done(s_fence);
                        else if (r)
                                DRM_DEV_ERROR(sched->dev, "fence add callback 
failed (%d)\n",
                                          r);
@@ -1064,7 +1063,7 @@ static int drm_sched_main(void *param)
                        if (IS_ERR(fence))
                                dma_fence_set_error(&s_fence->finished, 
PTR_ERR(fence));
 
-                       drm_sched_job_done(sched_job);
+                       drm_sched_job_done(s_fence);
                }
 
                wake_up(&sched->job_scheduled);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 1f7d9dd1a444..9a96d49dfd75 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -281,6 +281,10 @@ struct drm_sched_fence {
          * @owner: job owner for debugging
          */
        void                            *owner;
+       /**
+        * @cb: the callback registered on this fence's parent fence.
+        */
+       struct dma_fence_cb cb;
 };
 
 struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
@@ -300,7 +304,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence 
*f);
  *         be scheduled further.
  * @s_priority: the priority of the job.
  * @entity: the entity to which this job belongs.
- * @cb: the callback for the parent fence in s_fence.
  *
  * A job is created by the driver using drm_sched_job_init(), and
  * should call drm_sched_entity_push_job() once it wants the scheduler
@@ -325,7 +328,6 @@ struct drm_sched_job {
        atomic_t                        karma;
        enum drm_sched_priority         s_priority;
        struct drm_sched_entity         *entity;
-       struct dma_fence_cb             cb;
        /**
         * @dependencies:
         *
-- 
2.25.1

Reply via email to