sched: Waiting for pending jobs to complete in scheduler kill

Matthew Brost Mon, 11 Sep 2023 19:17:52 -0700

Wait for pending jobs to be complete before signaling queued jobs. This
ensures dma-fence signaling order correct and also ensures the entity is
not running on the hardware after drm_sched_entity_flush or
drm_sched_entity_fini returns.


Signed-off-by: Matthew Brost <matthew.br...@intel.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
 drivers/gpu/drm/scheduler/sched_entity.c    |  7 ++-
 drivers/gpu/drm/scheduler/sched_main.c      | 50 ++++++++++++++++++---
 include/drm/gpu_scheduler.h                 | 18 ++++++++
 4 files changed, 70 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index fb5dad687168..7835c0da65c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1873,7 +1873,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct 
amdgpu_ring *ring)
        list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
                if (dma_fence_is_signaled(&s_job->s_fence->finished)) {
                        /* remove job from ring_mirror_list */
-                       list_del_init(&s_job->list);
+                       drm_sched_remove_pending_job(s_job);
                        sched->ops->free_job(s_job);
                        continue;
                }
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c
index 1dec97caaba3..37557fbb96d0 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -104,9 +104,11 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
        }
 
        init_completion(&entity->entity_idle);
+       init_completion(&entity->jobs_done);
 
-       /* We start in an idle state. */
+       /* We start in an idle and jobs done state. */
        complete_all(&entity->entity_idle);
+       complete_all(&entity->jobs_done);
 
        spin_lock_init(&entity->rq_lock);
        spsc_queue_init(&entity->job_queue);
@@ -256,6 +258,9 @@ static void drm_sched_entity_kill(struct drm_sched_entity 
*entity)
        /* Make sure this entity is not used by the scheduler at the moment */
        wait_for_completion(&entity->entity_idle);
 
+       /* Make sure all pending jobs are done */
+       wait_for_completion(&entity->jobs_done);
+
        /* The entity is guaranteed to not be used by the scheduler */
        prev = rcu_dereference_check(entity->last_scheduled, true);
        dma_fence_get(prev);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index 689fb6686e01..ed6f5680793a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -510,12 +510,52 @@ void drm_sched_resume_timeout(struct drm_gpu_scheduler 
*sched,
 }
 EXPORT_SYMBOL(drm_sched_resume_timeout);
 
+/**
+ * drm_sched_add_pending_job - Add pending job to scheduler
+ *
+ * @job: scheduler job to add
+ * @tail: add to tail of pending list
+ */
+void drm_sched_add_pending_job(struct drm_sched_job *job, bool tail)
+{
+       struct drm_gpu_scheduler *sched = job->sched;
+       struct drm_sched_entity *entity = job->entity;
+
+       lockdep_assert_held(&sched->job_list_lock);
+
+       if (tail)
+               list_add_tail(&job->list, &sched->pending_list);
+       else
+               list_add(&job->list, &sched->pending_list);
+       if (!entity->pending_job_count++)
+               reinit_completion(&entity->jobs_done);
+}
+EXPORT_SYMBOL(drm_sched_add_pending_job);
+
+/**
+ * drm_sched_remove_pending_job - Remove pending job from` scheduler
+ *
+ * @job: scheduler job to remove
+ */
+void drm_sched_remove_pending_job(struct drm_sched_job *job)
+{
+       struct drm_gpu_scheduler *sched = job->sched;
+       struct drm_sched_entity *entity = job->entity;
+
+       lockdep_assert_held(&sched->job_list_lock);
+
+       list_del_init(&job->list);
+       if (!--entity->pending_job_count)
+               complete_all(&entity->jobs_done);
+}
+EXPORT_SYMBOL(drm_sched_remove_pending_job);
+
 static void drm_sched_job_begin(struct drm_sched_job *s_job)
 {
        struct drm_gpu_scheduler *sched = s_job->sched;
 
        spin_lock(&sched->job_list_lock);
-       list_add_tail(&s_job->list, &sched->pending_list);
+       drm_sched_add_pending_job(s_job, true);
        spin_unlock(&sched->job_list_lock);
 }
 
@@ -538,7 +578,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
                 * drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
                 * is parked at which point it's safe.
                 */
-               list_del_init(&job->list);
+               drm_sched_remove_pending_job(job);
                spin_unlock(&sched->job_list_lock);
 
                status = job->sched->ops->timedout_job(job);
@@ -589,7 +629,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct 
drm_sched_job *bad)
                 * Add at the head of the queue to reflect it was the earliest
                 * job extracted.
                 */
-               list_add(&bad->list, &sched->pending_list);
+               drm_sched_add_pending_job(bad, false);
 
        /*
         * Iterate the job list from later to  earlier one and either deactive
@@ -611,7 +651,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct 
drm_sched_job *bad)
                         * Locking here is for concurrent resume timeout
                         */
                        spin_lock(&sched->job_list_lock);
-                       list_del_init(&s_job->list);
+                       drm_sched_remove_pending_job(s_job);
                        spin_unlock(&sched->job_list_lock);
 
                        /*
@@ -1066,7 +1106,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 
        if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
                /* remove job from pending_list */
-               list_del_init(&job->list);
+               drm_sched_remove_pending_job(job);
 
                /* cancel this job's TO timer */
                cancel_delayed_work(&sched->work_tdr);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index b7b818cd81b6..7c628f36fe78 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -233,6 +233,21 @@ struct drm_sched_entity {
         */
        struct completion               entity_idle;
 
+       /**
+        * @pending_job_count:
+        *
+        * Number of pending jobs.
+        */
+       unsigned int                    pending_job_count;
+
+       /**
+        * @jobs_done:
+        *
+        * Signals when entity has no pending jobs, used to sequence entity
+        * cleanup in drm_sched_entity_fini().
+        */
+       struct completion               jobs_done;
+
        /**
         * @oldest_job_waiting:
         *
@@ -656,4 +671,7 @@ struct drm_gpu_scheduler *
 drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
                     unsigned int num_sched_list);
 
+void drm_sched_add_pending_job(struct drm_sched_job *job, bool tail);
+void drm_sched_remove_pending_job(struct drm_sched_job *job);
+
 #endif
-- 
2.34.1

[PATCH v3 11/13] drm/sched: Waiting for pending jobs to complete in scheduler kill

Reply via email to