Re: [PATCH v3 1/2] drm/sched: Refactor ring mirror list handling.
Am 17.12.18 um 17:57 schrieb Grodzovsky, Andrey: > > On 12/17/2018 10:27 AM, Christian König wrote: >> Am 10.12.18 um 22:43 schrieb Andrey Grodzovsky: >>> Decauple sched threads stop and start and ring mirror >>> list handling from the policy of what to do about the >>> guilty jobs. >>> When stoppping the sched thread and detaching sched fences >>> from non signaled HW fenes wait for all signaled HW fences >>> to complete before rerunning the jobs. >>> >>> v2: Fix resubmission of guilty job into HW after refactoring. >>> >>> Suggested-by: Christian Koenig >>> Signed-off-by: Andrey Grodzovsky >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>> drivers/gpu/drm/etnaviv/etnaviv_sched.c | 8 +-- >>> drivers/gpu/drm/scheduler/sched_main.c | 110 >>> ++--- >>> drivers/gpu/drm/v3d/v3d_sched.c | 11 +-- >>> include/drm/gpu_scheduler.h | 10 ++- >>> 5 files changed, 95 insertions(+), 61 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index ef36cc5..42111d5 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -3292,17 +3292,16 @@ static int >>> amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, >>> /* block all schedulers and reset given job's ring */ >>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >>> struct amdgpu_ring *ring = adev->rings[i]; >>> + bool park_only = job && job->base.sched != &ring->sched; >>> if (!ring || !ring->sched.thread) >>> continue; >>> - kthread_park(ring->sched.thread); >>> + drm_sched_stop(&ring->sched, job ? &job->base : NULL, >>> park_only); >>> - if (job && job->base.sched != &ring->sched) >>> + if (park_only) >>> continue; >>> - drm_sched_hw_job_reset(&ring->sched, job ? 
&job->base : >>> NULL); >>> - >>> /* after all hw jobs are reset, hw fence is meaningless, so >>> force_completion */ >>> amdgpu_fence_driver_force_completion(ring); >>> } >>> @@ -3445,6 +3444,7 @@ static void >>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev, >>> struct amdgpu_job *job) >>> { >>> int i; >>> + bool unpark_only; >>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >>> struct amdgpu_ring *ring = adev->rings[i]; >>> @@ -3456,10 +3456,13 @@ static void >>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev, >>> * or all rings (in the case @job is NULL) >>> * after above amdgpu_reset accomplished >>> */ >>> - if ((!job || job->base.sched == &ring->sched) && >>> !adev->asic_reset_res) >>> - drm_sched_job_recovery(&ring->sched); >>> + unpark_only = (job && job->base.sched != &ring->sched) || >>> + adev->asic_reset_res; >>> + >>> + if (!unpark_only) >>> + drm_sched_resubmit_jobs(&ring->sched); >>> - kthread_unpark(ring->sched.thread); >>> + drm_sched_start(&ring->sched, unpark_only); >>> } >>> if (!amdgpu_device_has_dc_support(adev)) { >>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >>> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >>> index 49a6763..fab3b51 100644 >>> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >>> @@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct >>> drm_sched_job *sched_job) >>> } >>> /* block scheduler */ >>> - kthread_park(gpu->sched.thread); >>> - drm_sched_hw_job_reset(&gpu->sched, sched_job); >>> + drm_sched_stop(&gpu->sched, sched_job, false); >>> /* get the GPU back into the init state */ >>> etnaviv_core_dump(gpu); >>> etnaviv_gpu_recover_hang(gpu); >>> + drm_sched_resubmit_jobs(&gpu->sched); >>> + >>> /* restart scheduler after GPU is usable again */ >>> - drm_sched_job_recovery(&gpu->sched); >>> - kthread_unpark(gpu->sched.thread); >>> + drm_sched_start(&gpu->sched); >>> } >>> static void etnaviv_sched_free_job(struct drm_sched_job *sched_job) 
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c >>> b/drivers/gpu/drm/scheduler/sched_main.c >>> index dbb6906..cdf95e2 100644 >>> --- a/drivers/gpu/drm/scheduler/sched_main.c >>> +++ b/drivers/gpu/drm/scheduler/sched_main.c >>> @@ -60,8 +60,6 @@ >>> static void drm_sched_process_job(struct dma_fence *f, struct >>> dma_fence_cb *cb); >>> -static void drm_sched_expel_job_unlocked(struct drm_sched_job >>> *s_job); >>> - >>> /** >>> * drm_sched_rq_init - initialize a given run queue struct >>> * >>> @@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct >>> work_struct *work) >>> * @bad: bad scheduler job >>> * >>> */ >>> -void drm_sched_hw_job_reset(struct drm_gpu_
Re: [PATCH v3 1/2] drm/sched: Refactor ring mirror list handling.
On 12/17/2018 10:27 AM, Christian König wrote: > Am 10.12.18 um 22:43 schrieb Andrey Grodzovsky: >> Decauple sched threads stop and start and ring mirror >> list handling from the policy of what to do about the >> guilty jobs. >> When stoppping the sched thread and detaching sched fences >> from non signaled HW fenes wait for all signaled HW fences >> to complete before rerunning the jobs. >> >> v2: Fix resubmission of guilty job into HW after refactoring. >> >> Suggested-by: Christian Koenig >> Signed-off-by: Andrey Grodzovsky >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >> drivers/gpu/drm/etnaviv/etnaviv_sched.c | 8 +-- >> drivers/gpu/drm/scheduler/sched_main.c | 110 >> ++--- >> drivers/gpu/drm/v3d/v3d_sched.c | 11 +-- >> include/drm/gpu_scheduler.h | 10 ++- >> 5 files changed, 95 insertions(+), 61 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index ef36cc5..42111d5 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -3292,17 +3292,16 @@ static int >> amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, >> /* block all schedulers and reset given job's ring */ >> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> struct amdgpu_ring *ring = adev->rings[i]; >> + bool park_only = job && job->base.sched != &ring->sched; >> if (!ring || !ring->sched.thread) >> continue; >> - kthread_park(ring->sched.thread); >> + drm_sched_stop(&ring->sched, job ? &job->base : NULL, >> park_only); >> - if (job && job->base.sched != &ring->sched) >> + if (park_only) >> continue; >> - drm_sched_hw_job_reset(&ring->sched, job ? 
&job->base : >> NULL); >> - >> /* after all hw jobs are reset, hw fence is meaningless, so >> force_completion */ >> amdgpu_fence_driver_force_completion(ring); >> } >> @@ -3445,6 +3444,7 @@ static void >> amdgpu_device_post_asic_reset(struct amdgpu_device *adev, >> struct amdgpu_job *job) >> { >> int i; >> + bool unpark_only; >> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> struct amdgpu_ring *ring = adev->rings[i]; >> @@ -3456,10 +3456,13 @@ static void >> amdgpu_device_post_asic_reset(struct amdgpu_device *adev, >> * or all rings (in the case @job is NULL) >> * after above amdgpu_reset accomplished >> */ >> - if ((!job || job->base.sched == &ring->sched) && >> !adev->asic_reset_res) >> - drm_sched_job_recovery(&ring->sched); >> + unpark_only = (job && job->base.sched != &ring->sched) || >> + adev->asic_reset_res; >> + >> + if (!unpark_only) >> + drm_sched_resubmit_jobs(&ring->sched); >> - kthread_unpark(ring->sched.thread); >> + drm_sched_start(&ring->sched, unpark_only); >> } >> if (!amdgpu_device_has_dc_support(adev)) { >> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> index 49a6763..fab3b51 100644 >> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> @@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct >> drm_sched_job *sched_job) >> } >> /* block scheduler */ >> - kthread_park(gpu->sched.thread); >> - drm_sched_hw_job_reset(&gpu->sched, sched_job); >> + drm_sched_stop(&gpu->sched, sched_job, false); >> /* get the GPU back into the init state */ >> etnaviv_core_dump(gpu); >> etnaviv_gpu_recover_hang(gpu); >> + drm_sched_resubmit_jobs(&gpu->sched); >> + >> /* restart scheduler after GPU is usable again */ >> - drm_sched_job_recovery(&gpu->sched); >> - kthread_unpark(gpu->sched.thread); >> + drm_sched_start(&gpu->sched); >> } >> static void etnaviv_sched_free_job(struct drm_sched_job *sched_job) >> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
>> b/drivers/gpu/drm/scheduler/sched_main.c >> index dbb6906..cdf95e2 100644 >> --- a/drivers/gpu/drm/scheduler/sched_main.c >> +++ b/drivers/gpu/drm/scheduler/sched_main.c >> @@ -60,8 +60,6 @@ >> static void drm_sched_process_job(struct dma_fence *f, struct >> dma_fence_cb *cb); >> -static void drm_sched_expel_job_unlocked(struct drm_sched_job >> *s_job); >> - >> /** >> * drm_sched_rq_init - initialize a given run queue struct >> * >> @@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct >> work_struct *work) >> * @bad: bad scheduler job >> * >> */ >> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct >> drm_sched_job *bad) >> +void drm_sched_stop(struct drm_gpu_scheduler *sched, struct >> drm_sched_job *bad, >> + bool park_only) >> { >> struct drm_s
Re: [PATCH v3 1/2] drm/sched: Refactor ring mirror list handling.
Am 10.12.18 um 22:43 schrieb Andrey Grodzovsky: Decouple sched threads stop and start and ring mirror list handling from the policy of what to do about the guilty jobs. When stopping the sched thread and detaching sched fences from non signaled HW fences wait for all signaled HW fences to complete before rerunning the jobs. v2: Fix resubmission of guilty job into HW after refactoring. Suggested-by: Christian Koenig Signed-off-by: Andrey Grodzovsky --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- drivers/gpu/drm/etnaviv/etnaviv_sched.c| 8 +-- drivers/gpu/drm/scheduler/sched_main.c | 110 ++--- drivers/gpu/drm/v3d/v3d_sched.c| 11 +-- include/drm/gpu_scheduler.h| 10 ++- 5 files changed, 95 insertions(+), 61 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ef36cc5..42111d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3292,17 +3292,16 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; + bool park_only = job && job->base.sched != &ring->sched; if (!ring || !ring->sched.thread) continue; - kthread_park(ring->sched.thread); + drm_sched_stop(&ring->sched, job ? &job->base : NULL, park_only); - if (job && job->base.sched != &ring->sched) + if (park_only) continue; - drm_sched_hw_job_reset(&ring->sched, job ? 
&job->base : NULL); - /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } @@ -3445,6 +3444,7 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev, struct amdgpu_job *job) { int i; + bool unpark_only; for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; @@ -3456,10 +3456,13 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev, * or all rings (in the case @job is NULL) * after above amdgpu_reset accomplished */ - if ((!job || job->base.sched == &ring->sched) && !adev->asic_reset_res) - drm_sched_job_recovery(&ring->sched); + unpark_only = (job && job->base.sched != &ring->sched) || + adev->asic_reset_res; + + if (!unpark_only) + drm_sched_resubmit_jobs(&ring->sched); - kthread_unpark(ring->sched.thread); + drm_sched_start(&ring->sched, unpark_only); } if (!amdgpu_device_has_dc_support(adev)) { diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index 49a6763..fab3b51 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job) } /* block scheduler */ - kthread_park(gpu->sched.thread); - drm_sched_hw_job_reset(&gpu->sched, sched_job); + drm_sched_stop(&gpu->sched, sched_job, false); /* get the GPU back into the init state */ etnaviv_core_dump(gpu); etnaviv_gpu_recover_hang(gpu); + drm_sched_resubmit_jobs(&gpu->sched); + /* restart scheduler after GPU is usable again */ - drm_sched_job_recovery(&gpu->sched); - kthread_unpark(gpu->sched.thread); + drm_sched_start(&gpu->sched); } static void etnaviv_sched_free_job(struct drm_sched_job *sched_job) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index dbb6906..cdf95e2 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ 
-60,8 +60,6 @@ static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); -static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job); - /** * drm_sched_rq_init - initialize a given run queue struct * @@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct work_struct *work) * @bad: bad scheduler job * */ -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) +void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad, + bool park_only) { struct drm_sched_job *s_job; struct drm_sched_entity *entity, *tmp; unsigned long flags; + struct list_head wait_list; int i; + kthread_park(sched->thread); + if (park_only) + ret
[PATCH v3 1/2] drm/sched: Refactor ring mirror list handling.
Decouple sched threads stop and start and ring mirror list handling from the policy of what to do about the guilty jobs. When stopping the sched thread and detaching sched fences from non signaled HW fences wait for all signaled HW fences to complete before rerunning the jobs. v2: Fix resubmission of guilty job into HW after refactoring. Suggested-by: Christian Koenig Signed-off-by: Andrey Grodzovsky --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- drivers/gpu/drm/etnaviv/etnaviv_sched.c| 8 +-- drivers/gpu/drm/scheduler/sched_main.c | 110 ++--- drivers/gpu/drm/v3d/v3d_sched.c| 11 +-- include/drm/gpu_scheduler.h| 10 ++- 5 files changed, 95 insertions(+), 61 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ef36cc5..42111d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3292,17 +3292,16 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; + bool park_only = job && job->base.sched != &ring->sched; if (!ring || !ring->sched.thread) continue; - kthread_park(ring->sched.thread); + drm_sched_stop(&ring->sched, job ? &job->base : NULL, park_only); - if (job && job->base.sched != &ring->sched) + if (park_only) continue; - drm_sched_hw_job_reset(&ring->sched, job ? 
&job->base : NULL); - /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } @@ -3445,6 +3444,7 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev, struct amdgpu_job *job) { int i; + bool unpark_only; for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; @@ -3456,10 +3456,13 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev, * or all rings (in the case @job is NULL) * after above amdgpu_reset accomplished */ - if ((!job || job->base.sched == &ring->sched) && !adev->asic_reset_res) - drm_sched_job_recovery(&ring->sched); + unpark_only = (job && job->base.sched != &ring->sched) || + adev->asic_reset_res; + + if (!unpark_only) + drm_sched_resubmit_jobs(&ring->sched); - kthread_unpark(ring->sched.thread); + drm_sched_start(&ring->sched, unpark_only); } if (!amdgpu_device_has_dc_support(adev)) { diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index 49a6763..fab3b51 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job) } /* block scheduler */ - kthread_park(gpu->sched.thread); - drm_sched_hw_job_reset(&gpu->sched, sched_job); + drm_sched_stop(&gpu->sched, sched_job, false); /* get the GPU back into the init state */ etnaviv_core_dump(gpu); etnaviv_gpu_recover_hang(gpu); + drm_sched_resubmit_jobs(&gpu->sched); + /* restart scheduler after GPU is usable again */ - drm_sched_job_recovery(&gpu->sched); - kthread_unpark(gpu->sched.thread); + drm_sched_start(&gpu->sched); } static void etnaviv_sched_free_job(struct drm_sched_job *sched_job) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index dbb6906..cdf95e2 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ 
-60,8 +60,6 @@ static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); -static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job); - /** * drm_sched_rq_init - initialize a given run queue struct * @@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct work_struct *work) * @bad: bad scheduler job * */ -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) +void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad, + bool park_only) { struct drm_sched_job *s_job; struct drm_sched_entity *entity, *tmp; unsigned long flags; + struct list_head wait_list; int i; + kthread_park(sched->thread); + if (park_only) +