[PATCH] drm/amd/scheduler:fix duplicate operation in entity fini
no need to manually cleanup job and fence after sched_fence_finish(), they are auto handled by the callback Change-Id: I9da26064c9e73c178949663bed1e490539e95e41 Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 39b6205..f9f3daa 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -230,11 +230,10 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched, kthread_unpark(sched->thread); while (kfifo_out(&entity->job_queue, &job, sizeof(job))) { struct amd_sched_fence *s_fence = job->s_fence; + amd_sched_fence_scheduled(s_fence); dma_fence_set_error(&s_fence->finished, -ESRCH); amd_sched_fence_finished(s_fence); - dma_fence_put(&s_fence->finished); - sched->ops->free_job(job); } } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/5] drm/amdgpu:add hang_limit for sched(v2)
since gpu_scheduler source domain cannot access amdgpu variable so need create the hang_limit membewr for sched, and it can refer it for the upcoming GPU RESET patches v2: make hang_limit a parameter of sched_init() Change-Id: I977ae2717e55a8b87c59e58a288bffc3b458b653 Signed-off-by: Monk Liu Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 +- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 6 +- drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 2167dac..f43319c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -446,7 +446,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, timeout = MAX_SCHEDULE_TIMEOUT; } r = amd_sched_init(&ring->sched, &amdgpu_sched_ops, - num_hw_submission, + num_hw_submission, amdgpu_job_hang_limit, timeout, ring->name); if (r) { DRM_ERROR("Failed to create scheduler on ring %s.\n", diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 1bbbce2..369f15d 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -676,13 +676,17 @@ static int amd_sched_main(void *param) */ int amd_sched_init(struct amd_gpu_scheduler *sched, const struct amd_sched_backend_ops *ops, - unsigned hw_submission, long timeout, const char *name) + unsigned hw_submission, + unsigned hang_limit, + long timeout, + const char *name) { int i; sched->ops = ops; sched->hw_submission_limit = hw_submission; sched->name = name; sched->timeout = timeout; + sched->hang_limit = hang_limit; for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++) amd_sched_rq_init(&sched->sched_rq[i]); diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h index 3f75b45..3ea75a2 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h @@ -144,11 +144,12 @@ struct amd_gpu_scheduler { struct task_struct *thread; struct list_headring_mirror_list; spinlock_t job_list_lock; + int hang_limit; }; int amd_sched_init(struct amd_gpu_scheduler *sched, const struct amd_sched_backend_ops *ops, - uint32_t hw_submission, long timeout, const char *name); + uint32_t hw_submission, unsigned hang_limit, long timeout, const char *name); void amd_sched_fini(struct amd_gpu_scheduler *sched); int amd_sched_entity_init(struct amd_gpu_scheduler *sched, -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 4/5] drm/amdgpu:skip job for guilty ctx in parser_init
Change-Id: I44019f6475b1eaaba55633cf5f8bb84284f19a2c Signed-off-by: Monk Liu Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index a760b6e..1072fc1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -90,6 +90,12 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) goto free_chunk; } + /* skip guilty context job */ + if (atomic_read(&p->ctx->guilty) == 1) { + ret = -ECANCELED; + goto free_chunk; + } + mutex_lock(&p->ctx->lock); /* get chunks */ -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/5] drm/amd/scheduler:introduce guilty pointer member
this member will be used later, it will points to the real var inside of context and CS_SUBMIT & gpu schdduler can decide if skip a job depends on context->guilty or *entity->guilty Change-Id: I411e117a01c54286db0765fd2f6bf9d3bda01a3b Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 2 +- drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 2 +- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 3 ++- drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 3 ++- 10 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index c184468..bb5a46a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -90,7 +90,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, continue; r = amd_sched_entity_init(&ring->sched, &ctx->rings[i].entity, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r) goto failed; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index dcdfb8d..af65cb0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -110,7 +110,7 @@ static int amdgpu_ttm_global_init(struct amdgpu_device *adev) ring = adev->mman.buffer_funcs_ring; rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_KERNEL]; r = amd_sched_entity_init(&ring->sched, &adev->mman.entity, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r) { DRM_ERROR("Failed setting up TTM BO move run queue.\n"); goto error_entity; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index e8bd50c..6604771 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -232,7 +232,7 @@ int amdgpu_uvd_sw_init(struct amdgpu_device *adev) ring = &adev->uvd.ring; rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL]; r = amd_sched_entity_init(&ring->sched, &adev->uvd.entity, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r != 0) { DRM_ERROR("Failed setting up UVD run queue.\n"); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index b46280c..60a8fb6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -176,7 +176,7 @@ int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned long size) ring = &adev->vce.ring[0]; rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL]; r = amd_sched_entity_init(&ring->sched, &adev->vce.entity, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r != 0) { DRM_ERROR("Failed setting up VCE run queue.\n"); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 041e012..96df21c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -106,7 +106,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev) ring = &adev->vcn.ring_dec; rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL]; r = amd_sched_entity_init(&ring->sched, &adev->vcn.entity_dec, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r != 0) { DRM_ERROR("Failed setting up VCN dec run queue.\n"); return r; @@ -115,7 +115,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev) ring = &adev->vcn.ring_enc[0]; rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL]; r = amd_sched_entity_init(&ring->sched, &adev->vcn.entity_enc, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r != 0) { DRM_ERROR("Failed setting up VCN enc run queue.\n"); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ef8b7a9..3e1959e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -263
[PATCH 5/5] drm/amdgpu:cleanup job reset routine(v2)
merge the setting guilty on context into this function to avoid implement extra routine. v2: go through entity list and compare the fence_ctx before operate on the entity, otherwise the entity may be just a wild pointer Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540 Signed-off-by: Monk Liu Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 4 ++-- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 31 ++- drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 2 +- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index dae1e5b..a07544d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) amd_sched_job_kickout(&job->base); /* only do job_reset on the hang ring if @job not NULL */ - amd_sched_hw_job_reset(&ring->sched); + amd_sched_hw_job_reset(&ring->sched, NULL); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); @@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) if (!ring || !ring->sched.thread) continue; kthread_park(ring->sched.thread); - amd_sched_hw_job_reset(&ring->sched); + amd_sched_hw_job_reset(&ring->sched, NULL); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 97fd255..39b6205 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -443,9 +443,18 @@ static void amd_sched_job_timedout(struct work_struct *work) job->sched->ops->timedout_job(job); } -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched) +static void amd_sched_set_guilty(struct amd_sched_job *s_job) +{ + if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit) + if (s_job->s_entity->guilty) + atomic_set(s_job->s_entity->guilty, 1); +} + +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) { struct amd_sched_job *s_job; + struct amd_sched_entity *entity, *tmp; + int i;; spin_lock(&sched->job_list_lock); list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) { @@ -458,6 +467,26 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched) } } spin_unlock(&sched->job_list_lock); + + if (bad) { + bool found = false; + + for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) { + struct amd_sched_rq *rq = &sched->sched_rq[i]; + + spin_lock(&rq->lock); + list_for_each_entry_safe(entity, tmp, &rq->entities, list) { + if (bad->s_fence->scheduled.context == entity->fence_context) { + found = true; + amd_sched_set_guilty(bad); + break; + } + } + spin_unlock(&rq->lock); + if (found) + break; + } + } } void amd_sched_job_kickout(struct amd_sched_job *s_job) diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h index a05994c..be75172 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h @@ -174,7 +174,7 @@ int amd_sched_job_init(struct amd_sched_job *job, struct amd_gpu_scheduler *sched, struct amd_sched_entity *entity, void *owner); -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched); +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *job); void amd_sched_job_recovery(struct amd_gpu_scheduler *sched); bool amd_sched_dependency_optimized(struct dma_fence* fence, struct amd_sched_entity *entity); -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 3/5] drm/amdgpu:pass ctx->guilty address to entity init
this way the real interested guilty is connected to entity->guilty pointer, and we can use entity->pointer later in gpu recovery procedure Change-Id: I09b392ebfc59254795e4fbd5816abd3d94a95853 Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 774edc1..6a4178b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -744,6 +744,7 @@ struct amdgpu_ctx { enum amd_sched_priority init_priority; enum amd_sched_priority override_priority; struct mutexlock; + atomic_tguilty; }; struct amdgpu_ctx_mgr { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index bb5a46a..1bf4cdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -90,7 +90,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, continue; r = amd_sched_entity_init(&ring->sched, &ctx->rings[i].entity, - rq, amdgpu_sched_jobs, NULL); + rq, amdgpu_sched_jobs, &ctx->guilty); if (r) goto failed; } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu/psp: fix compile warnings on buffer address print
drivers/gpu/drm/amd/amdgpu//psp_v3_1.c:389:13: warning: format ‘%x’ expects argument of type ‘unsigned int’, but argument 4 has type ‘struct psp_gfx_rb_frame *’ [-Wformat=] DRM_ERROR("ring_buffer_start = %x; ring_buffer_end = %x; write_frame = %x\n", ^ ./include/drm/drmP.h:178:36: note: in definition of macro ‘DRM_ERROR’ drm_printk(KERN_ERR, DRM_UT_NONE, fmt, ##__VA_ARGS__) ^ warning: format ‘%x’ expects argument of type ‘unsigned int’, but argument 5 has type ‘struct psp_gfx_rb_frame *’ [-Wformat=] DRM_ERROR("ring_buffer_start = %x; ring_buffer_end = %x; write_frame = %x\n", ^ ./include/drm/drmP.h:178:36: note: in definition of macro ‘DRM_ERROR’ drm_printk(KERN_ERR, DRM_UT_NONE, fmt, ##__VA_ARGS__) ^ warning: format ‘%x’ expects argument of type ‘unsigned int’, but argument 6 has type ‘struct psp_gfx_rb_frame *’ [-Wformat=] DRM_ERROR("ring_buffer_start = %x; ring_buffer_end = %x; write_frame = %x\n", ^ ./include/drm/drmP.h:178:36: note: in definition of macro ‘DRM_ERROR’ drm_printk(KERN_ERR, DRM_UT_NONE, fmt, ##__VA_ARGS__) Change-Id: Ie807eba4594d588561a1d4427ccb2286cc5f5065 Signed-off-by: Evan Quan Reported-by: Rex Zhu --- drivers/gpu/drm/amd/amdgpu/psp_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/psp_v3_1.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c index 26e10ab..6b56123 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c @@ -274,7 +274,7 @@ int psp_v10_0_cmd_submit(struct psp_context *psp, write_frame = ring_buffer_start + (psp_write_ptr_reg / rb_frame_size_dw); /* Check invalid write_frame ptr address */ if ((write_frame < ring_buffer_start) || (ring_buffer_end < write_frame)) { - DRM_ERROR("ring_buffer_start = %x; ring_buffer_end = %x; write_frame = %x\n", + DRM_ERROR("ring_buffer_start = %p; ring_buffer_end = %p; write_frame = %p\n", ring_buffer_start, ring_buffer_end, write_frame); diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c index d217050..7008569 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c @@ -386,7 +386,7 @@ int psp_v3_1_cmd_submit(struct psp_context *psp, write_frame = ring_buffer_start + (psp_write_ptr_reg / rb_frame_size_dw); /* Check invalid write_frame ptr address */ if ((write_frame < ring_buffer_start) || (ring_buffer_end < write_frame)) { - DRM_ERROR("ring_buffer_start = %x; ring_buffer_end = %x; write_frame = %x\n", + DRM_ERROR("ring_buffer_start = %p; ring_buffer_end = %p; write_frame = %p\n", ring_buffer_start, ring_buffer_end, write_frame); -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 1/5] drm/amdgpu:cleanup job reset routine
Sorry, this patch is invalide, please ignore this one, will send out v2 for review -Original Message- From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf Of Monk Liu Sent: 2017年10月23日 13:34 To: amd-gfx@lists.freedesktop.org Cc: Liu, Monk Subject: [PATCH 1/5] drm/amdgpu:cleanup job reset routine merge the setting guilty on context into this function to avoid implement extra routine. Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540 Signed-off-by: Monk Liu Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 4 ++-- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 13 - drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index dae1e5b..a07544d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) amd_sched_job_kickout(&job->base); /* only do job_reset on the hang ring if @job not NULL */ - amd_sched_hw_job_reset(&ring->sched); + amd_sched_hw_job_reset(&ring->sched, NULL); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); @@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) if (!ring || !ring->sched.thread) continue; kthread_park(ring->sched.thread); - amd_sched_hw_job_reset(&ring->sched); + amd_sched_hw_job_reset(&ring->sched, NULL); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index e4d3b4e..322b6c1 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -442,7 +442,14 @@ static void amd_sched_job_timedout(struct work_struct *work) job->sched->ops->timedout_job(job); } -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched) +static void amd_sched_set_guilty(struct amd_sched_job *s_job) { + if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit) + if (s_job->s_entity->guilty) + atomic_set(s_job->s_entity->guilty, 1); } + +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct +amd_sched_job *bad) { struct amd_sched_job *s_job; @@ -455,6 +462,10 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched) s_job->s_fence->parent = NULL; atomic_dec(&sched->hw_rq_count); } + + if (bad && bad == s_job) { + amd_sched_set_guilty(s_job); + } } spin_unlock(&sched->job_list_lock); } diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h index 52c8e54..d26e778 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h @@ -170,7 +170,7 @@ int amd_sched_job_init(struct amd_sched_job *job, struct amd_gpu_scheduler *sched, struct amd_sched_entity *entity, void *owner); -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched); +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct +amd_sched_job *job); void amd_sched_job_recovery(struct amd_gpu_scheduler *sched); bool amd_sched_dependency_optimized(struct dma_fence* fence, struct amd_sched_entity *entity); -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 4/5] drm/amdgpu:pass ctx->guilty address to entity init
this way the real interested guilty is connected to entity->guilty pointer, and we can use entity->pointer later in gpu recovery procedure Change-Id: I09b392ebfc59254795e4fbd5816abd3d94a95853 Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 774edc1..6a4178b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -744,6 +744,7 @@ struct amdgpu_ctx { enum amd_sched_priority init_priority; enum amd_sched_priority override_priority; struct mutexlock; + atomic_tguilty; }; struct amdgpu_ctx_mgr { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index bb5a46a..1bf4cdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -90,7 +90,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, continue; r = amd_sched_entity_init(&ring->sched, &ctx->rings[i].entity, - rq, amdgpu_sched_jobs, NULL); + rq, amdgpu_sched_jobs, &ctx->guilty); if (r) goto failed; } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 3/5] drm/amd/scheduler:introduce guilty pointer member
this member will be used later, it will points to the real var inside of context and CS_SUBMIT & gpu schdduler can decide if skip a job depends on context->guilty or *entity->guilty Change-Id: I411e117a01c54286db0765fd2f6bf9d3bda01a3b Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 2 +- drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 2 +- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 3 ++- drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 3 ++- 10 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index c184468..bb5a46a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -90,7 +90,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, continue; r = amd_sched_entity_init(&ring->sched, &ctx->rings[i].entity, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r) goto failed; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index dcdfb8d..af65cb0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -110,7 +110,7 @@ static int amdgpu_ttm_global_init(struct amdgpu_device *adev) ring = adev->mman.buffer_funcs_ring; rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_KERNEL]; r = amd_sched_entity_init(&ring->sched, &adev->mman.entity, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r) { DRM_ERROR("Failed setting up TTM BO move run queue.\n"); goto error_entity; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index e8bd50c..6604771 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -232,7 +232,7 @@ int amdgpu_uvd_sw_init(struct amdgpu_device *adev) ring = &adev->uvd.ring; rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL]; r = amd_sched_entity_init(&ring->sched, &adev->uvd.entity, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r != 0) { DRM_ERROR("Failed setting up UVD run queue.\n"); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index b46280c..60a8fb6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -176,7 +176,7 @@ int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned long size) ring = &adev->vce.ring[0]; rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL]; r = amd_sched_entity_init(&ring->sched, &adev->vce.entity, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r != 0) { DRM_ERROR("Failed setting up VCE run queue.\n"); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 041e012..96df21c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -106,7 +106,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev) ring = &adev->vcn.ring_dec; rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL]; r = amd_sched_entity_init(&ring->sched, &adev->vcn.entity_dec, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r != 0) { DRM_ERROR("Failed setting up VCN dec run queue.\n"); return r; @@ -115,7 +115,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev) ring = &adev->vcn.ring_enc[0]; rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL]; r = amd_sched_entity_init(&ring->sched, &adev->vcn.entity_enc, - rq, amdgpu_sched_jobs); + rq, amdgpu_sched_jobs, NULL); if (r != 0) { DRM_ERROR("Failed setting up VCN enc run queue.\n"); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ef8b7a9..3e1959e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -263
[PATCH 5/5] drm/amdgpu:skip job for guilty ctx in parser_init
Change-Id: I44019f6475b1eaaba55633cf5f8bb84284f19a2c Signed-off-by: Monk Liu Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index f7fceb6..1d5cbe7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -90,6 +90,12 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) goto free_chunk; } + /* skip guilty context job */ + if (atomic_read(&p->ctx->guilty) == 1) { + ret = -ECANCELED; + goto free_chunk; + } + mutex_lock(&p->ctx->lock); /* get chunks */ -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/5] drm/amdgpu:cleanup job reset routine
merge the setting guilty on context into this function to avoid implement extra routine. Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540 Signed-off-by: Monk Liu Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 4 ++-- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 13 - drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index dae1e5b..a07544d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) amd_sched_job_kickout(&job->base); /* only do job_reset on the hang ring if @job not NULL */ - amd_sched_hw_job_reset(&ring->sched); + amd_sched_hw_job_reset(&ring->sched, NULL); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); @@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) if (!ring || !ring->sched.thread) continue; kthread_park(ring->sched.thread); - amd_sched_hw_job_reset(&ring->sched); + amd_sched_hw_job_reset(&ring->sched, NULL); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index e4d3b4e..322b6c1 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -442,7 +442,14 @@ static void amd_sched_job_timedout(struct work_struct *work) job->sched->ops->timedout_job(job); } -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched) +static void amd_sched_set_guilty(struct amd_sched_job *s_job) +{ + if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit) + if (s_job->s_entity->guilty) + atomic_set(s_job->s_entity->guilty, 1); +} + +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) { struct amd_sched_job *s_job; @@ -455,6 +462,10 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched) s_job->s_fence->parent = NULL; atomic_dec(&sched->hw_rq_count); } + + if (bad && bad == s_job) { + amd_sched_set_guilty(s_job); + } } spin_unlock(&sched->job_list_lock); } diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h index 52c8e54..d26e778 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h @@ -170,7 +170,7 @@ int amd_sched_job_init(struct amd_sched_job *job, struct amd_gpu_scheduler *sched, struct amd_sched_entity *entity, void *owner); -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched); +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *job); void amd_sched_job_recovery(struct amd_gpu_scheduler *sched); bool amd_sched_dependency_optimized(struct dma_fence* fence, struct amd_sched_entity *entity); -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/5] drm/amdgpu:add hang_limit for sched(v2)
since gpu_scheduler source domain cannot access amdgpu variable so need create the hang_limit membewr for sched, and it can refer it for the upcoming GPU RESET patches v2: make hang_limit a parameter of sched_init() Change-Id: I977ae2717e55a8b87c59e58a288bffc3b458b653 Signed-off-by: Monk Liu Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 +- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 6 +- drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 2167dac..f43319c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -446,7 +446,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, timeout = MAX_SCHEDULE_TIMEOUT; } r = amd_sched_init(&ring->sched, &amdgpu_sched_ops, - num_hw_submission, + num_hw_submission, amdgpu_job_hang_limit, timeout, ring->name); if (r) { DRM_ERROR("Failed to create scheduler on ring %s.\n", diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 322b6c1..dfda486 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -686,13 +686,17 @@ static int amd_sched_main(void *param) */ int amd_sched_init(struct amd_gpu_scheduler *sched, const struct amd_sched_backend_ops *ops, - unsigned hw_submission, long timeout, const char *name) + unsigned hw_submission, + unsigned hang_limit, + long timeout, + const char *name) { int i; sched->ops = ops; sched->hw_submission_limit = hw_submission; sched->name = name; sched->timeout = timeout; + sched->hang_limit = hang_limit; for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++) amd_sched_rq_init(&sched->sched_rq[i]); diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h index d26e778..774b5da 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h @@ -142,11 +142,12 @@ struct amd_gpu_scheduler { struct task_struct *thread; struct list_headring_mirror_list; spinlock_t job_list_lock; + int hang_limit; }; int amd_sched_init(struct amd_gpu_scheduler *sched, const struct amd_sched_backend_ops *ops, - uint32_t hw_submission, long timeout, const char *name); + uint32_t hw_submission, unsigned hang_limit, long timeout, const char *name); void amd_sched_fini(struct amd_gpu_scheduler *sched); int amd_sched_entity_init(struct amd_gpu_scheduler *sched, -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 3/5] drm/amdgpu:implement guilty pointer
Don't set this directly, make it a parameter of amd_sched_entity_init(). And please split up the patches differently. First all changes to the scheduler, including the guilty marking. Then the changes to amdgpu. [ML] that way the first patch will break kernel from compiling because you change the interface of entity_init without changing all place Calling this interface I suggest just use one patch -Original Message- From: Christian König [mailto:ckoenig.leichtzumer...@gmail.com] Sent: 2017年10月20日 17:09 To: Liu, Monk ; amd-gfx@lists.freedesktop.org Subject: Re: [PATCH 3/5] drm/amdgpu:implement guilty pointer Am 20.10.2017 um 05:33 schrieb Monk Liu: > for user context there will be a guilty pointer in entity that points > to the guilty member of its context, thus we cant track if a given > entity is not from kernel ctx and if it is already marked guilty. > > Change-Id: Ie0a30830d89f52a6c4a514e67206140698a46367 > Signed-off-by: Monk Liu > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 1 + > drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 + > 3 files changed, 3 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 774edc1..6a4178b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -744,6 +744,7 @@ struct amdgpu_ctx { > enum amd_sched_priority init_priority; > enum amd_sched_priority override_priority; > struct mutexlock; > + atomic_tguilty; > }; > > struct amdgpu_ctx_mgr { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > index c184468..d822e95 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > @@ -93,6 +93,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, > rq, amdgpu_sched_jobs); > if (r) > goto failed; > + ctx->rings[i].entity.guilty = &ctx->guilty; Don't set this directly, make it a parameter of amd_sched_entity_init(). And please split up the patches differently. First all changes to the scheduler, including the guilty marking. Then the changes to amdgpu. Regards, Christian. > } > > r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr); diff --git > a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > index 1dac7bc..2d59fc5 100644 > --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > @@ -50,6 +50,7 @@ struct amd_sched_entity { > > struct dma_fence*dependency; > struct dma_fence_cb cb; > + atomic_t*guilty; /* points to ctx's guilty */ > }; > > /** ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler.
If the deadlock issue could be solved I don't see why we give up kfifo and switch to SPSC .. -Original Message- From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf Of Andrey Grodzovsky Sent: 2017年10月20日 21:32 To: amd-gfx@lists.freedesktop.org Cc: Grodzovsky, Andrey ; Koenig, Christian Subject: [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler. It is intended to sabstitute the bounded fifo we are currently using. Signed-off-by: Andrey Grodzovsky --- drivers/gpu/drm/amd/scheduler/spsc_queue.h | 120 + 1 file changed, 120 insertions(+) create mode 100644 drivers/gpu/drm/amd/scheduler/spsc_queue.h diff --git a/drivers/gpu/drm/amd/scheduler/spsc_queue.h b/drivers/gpu/drm/amd/scheduler/spsc_queue.h new file mode 100644 index 000..a3394f1 --- /dev/null +++ b/drivers/gpu/drm/amd/scheduler/spsc_queue.h @@ -0,0 +1,120 @@ +/* + * Copyright 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person +obtaining a + * copy of this software and associated documentation files (the +"Software"), + * to deal in the Software without restriction, including without +limitation + * the rights to use, copy, modify, merge, publish, distribute, +sublicense, + * and/or sell copies of the Software, and to permit persons to whom +the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be +included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, +DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef AMD_SCHEDULER_SPSC_QUEUE_H_ +#define AMD_SCHEDULER_SPSC_QUEUE_H_ + +#include + +/** SPSC lockless queue */ + +struct spsc_node { + + /* Stores spsc_node* */ + struct spsc_node *next; +}; + +struct spsc_queue { + +struct spsc_node *head; + + /* atomic pointer to struct spsc_node* */ + atomic_long_t tail; + + atomic_t job_count; +}; + +static inline void spsc_queue_init(struct spsc_queue *queue) { + queue->head = NULL; + atomic_long_set(&queue->tail, (long)&queue->head); + atomic_set(&queue->job_count, 0); +} + +static inline struct spsc_node *spsc_queue_peek(struct spsc_queue +*queue) { + return queue->head; +} + +static inline int spsc_queue_count(struct spsc_queue *queue) { + return atomic_read(&queue->job_count); } + +static inline bool spsc_queue_push(struct spsc_queue *queue, struct +spsc_node *node) { + struct spsc_node **tail; + + node->next = NULL; + + preempt_disable(); + + tail = (struct spsc_node **)atomic_long_xchg(&queue->tail, (long)&node->next); + WRITE_ONCE(*tail, node); + atomic_inc(&queue->job_count); + + /* +* In case of first element verify new node will be visible to the consumer +* thread when we ping the kernel thread that there is new work to do. +*/ + smp_wmb(); + + preempt_enable(); + + return tail == &queue->head; +} + + +static inline struct spsc_node *spsc_queue_pop(struct spsc_queue +*queue) { + struct spsc_node *next, *node; + + /* Verify reading from memory and not the cache */ + smp_rmb(); + + node = READ_ONCE(queue->head); + + if (!node) + return NULL; + + next = READ_ONCE(node->next); + WRITE_ONCE(queue->head, next); + + if (unlikely(!next)) { + /* slowpath for the last element in the queue */ + + if (atomic_long_cmpxchg(&queue->tail, + (long)&node->next,(long) &queue->head) != (long)&node->next) { + /* Updating tail failed wait for new next to appear */ + do { + smp_rmb(); + }while (unlikely(!(queue->head = READ_ONCE(node->next; + } + } + + atomic_dec(&queue->job_count); + return node; +} + + + +#endif /* AMD_SCHEDULER_SPSC_QUEUE_H_ */ -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
Why not use a more simple way ? Like moving ttm_eu_fence_buffer_objects() to before amd_sched_entity_push_job() ? That could solve the deadlock from your description And the push order is already guaranteed by context->mutex (which is also a patch from you) BR Monk -Original Message- From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf Of Andrey Grodzovsky Sent: 2017年10月20日 21:32 To: amd-gfx@lists.freedesktop.org Cc: Grodzovsky, Andrey ; Koenig, Christian Subject: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset. Switch from kfifo to SPSC queue. Bug: Kfifo is limited at size, during GPU reset it would fill up to limit and the pushing thread (producer) would wait for the scheduler worker to consume the items in the fifo while holding reservation lock on a BO. The gpu reset thread on the other hand blocks the scheduler during reset. Before it unblocks the sceduler it might want to recover VRAM and so will try to reserve the same BO the producer thread is already holding creating a deadlock. Fix: Switch from kfifo to SPSC queue which is unlimited in size. Signed-off-by: Andrey Grodzovsky --- drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h | 4 +- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 51 ++--- drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 4 +- 3 files changed, 26 insertions(+), 33 deletions(-) diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h index 8bd3810..86838a8 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h +++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h @@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job, __entry->id = sched_job->id; __entry->fence = &sched_job->s_fence->finished; __entry->name = sched_job->sched->name; - __entry->job_count = kfifo_len( - &sched_job->s_entity->job_queue) / sizeof(sched_job); + __entry->job_count = spsc_queue_count( + &sched_job->s_entity->job_queue); __entry->hw_job_count = atomic_read( &sched_job->sched->hw_rq_count); ), diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 1bbbce2..0c9cdc0 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -28,9 +28,14 @@ #include #include "gpu_scheduler.h" +#include "spsc_queue.h" + #define CREATE_TRACE_POINTS #include "gpu_sched_trace.h" +#define to_amd_sched_job(sched_job)\ + container_of((sched_job), struct amd_sched_job, queue_node) + static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity); static void amd_sched_wakeup(struct amd_gpu_scheduler *sched); static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); @@ -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched, struct amd_sched_rq *rq, uint32_t jobs) { - int r; - if (!(sched && entity && rq)) return -EINVAL; @@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched, spin_lock_init(&entity->rq_lock); spin_lock_init(&entity->queue_lock); - r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL); - if (r) - return r; + spsc_queue_init(&entity->job_queue); atomic_set(&entity->fence_seq, 0); entity->fence_context = dma_fence_context_alloc(2); @@ -170,7 +171,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched, static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity) { rmb(); - if (kfifo_is_empty(&entity->job_queue)) + if (spsc_queue_peek(&entity->job_queue) == NULL) return true; return false; @@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity) */ static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity) { - if (kfifo_is_empty(&entity->job_queue)) + if (spsc_queue_peek(&entity->job_queue) == NULL) return false; if (ACCESS_ONCE(entity->dependency)) @@ -227,7 +228,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched, */ kthread_park(sched->thread); kthread_unpark(sched->thread); - while (kfifo_out(&entity->job_queue, &job, sizeof(job))) { + while ((job = to_amd_sched_job(spsc_queue_pop(&entity->job_queue +{ struct amd_sched_fence *s_fence = job->s_fence; amd_sched_fence_scheduled(s_fence); d
[PATCH] drm/ttm:fix memory leak due to individualize
Change-Id: I6d06b81b8b894775e55e495e96f3c999b83cf2be Signed-off-by: Monk Liu --- drivers/gpu/drm/ttm/ttm_bo.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 97125cb..3c9c0d9 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -460,8 +460,11 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo) if (reservation_object_test_signaled_rcu(&bo->ttm_resv, true)) { ttm_bo_del_from_lru(bo); spin_unlock(&glob->lru_lock); - if (bo->resv != &bo->ttm_resv) + if (bo->resv != &bo->ttm_resv) { reservation_object_unlock(&bo->ttm_resv); + reservation_object_fini(&bo->ttm_resv); + } + ttm_bo_cleanup_memtype_use(bo); return; } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [pull] amdgpu dc drm-next-4.15-dc
Am 22.10.2017 23:48, schrieb Dieter Nützel: Am 21.10.2017 23:22, schrieb Alex Deucher: Hi Dave, Last batch of new stuff for DC. Highlights: - Fix some memory leaks - S3 fixes - Hotplug fixes - Fix some CX multi-display issues - MST fixes - DML updates from the hw team - Various code cleanups - Misc bug fixes Now this tree has the same fan regression as 'amd-staging-drm-next' startet with 0944c350c8eddf4064e7abb881dd245032fdfa23. Look here: [amd-staging-drm-next] regression - no fan info (sensors) on RX580 https://lists.freedesktop.org/archives/amd-gfx/2017-October/014065.html Second: KDE's greeter 'kdm_greet' (login screen went into dpms) and KDE's 'screen blank' (energy saving / dpms off) never came back. All I can do is a clean reboot. So I have to disable all 'dpms'. But I could attach gdb remotely on it. 'kdm_greet' hang in 'poll'. Nothing alarming in 'dmesg' and 'Xorg.0.log'. (Both available taken from 'amd-staging-drm-next' if needed). Hello Alex and Rex, I've found good hint from Jan (randomsalad) on phoronix for the 'screen blank' (monitor standby mode): https://www.phoronix.com/forums/forum/phoronix/latest-phoronix-articles/984483-amdgpu-dc-gets-a-final-batch-of-changes-before-linux-4-15?p=984555#post984555 My Reply: https://www.phoronix.com/forums/forum/phoronix/latest-phoronix-articles/984483-amdgpu-dc-gets-a-final-batch-of-changes-before-linux-4-15?p=984581#post984581 I can swear, that I could 'return' one time (the first time, maybe due to only warm reboot) on 'drm-next-4.15-dc-wip' directly from within KDE session with replugging the video cable, but for all later tests on both kernels I have to blindly switching back to login screen (kdm_greet) and then replugging the video cable. For me these regression started with 'amd-staging-drm-next' much earlier than with the latest commit. Dieter ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [amd-staging-drm-next] compilation error with latest commit #1b006d838f78
Am 17.10.2017 16:59, schrieb Michel Dänzer: On 17/10/17 04:53 PM, Harry Wentland wrote: On 2017-10-17 10:47 AM, Michel Dänzer wrote: On 13/10/17 09:22 PM, Harry Wentland wrote: On 2017-10-12 08:22 PM, Dieter Nützel wrote: next (regression) compilation error: drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc_resource.c: In function ‘resource_map_pool_resources’: drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc_resource.c:1688:14: error: implicit declaration of function ‘acquire_first_split_pipe’; did you mean ‘acquire_first_free_pipe’? [-Werror=implicit-function-declaration] pipe_idx = acquire_first_split_pipe(&context->res_ctx, pool, stream); ^~~~ acquire_first_free_pipe It is wrongly (?) guarded behind: #if defined(CONFIG_DRM_AMD_DC_DCN1_0) static int acquire_first_split_pipe( struct resource_context *res_ctx, const struct resource_pool *pool, struct dc_stream_state *stream) [snip] Sent and merged a patch. This function only makes sense for DCN and shouldn't be called otherwise. Thanks for reporting this. I gotta make sure to build without the DCN flag in the future to avoid this. Would it be possible to drop options like DRM_AMD_DC_FBC and DRM_AMD_DC_DCN1_0 from amd-staging-drm-next (but especially upstream), and just always compile the code? DRM_AMD_DC_FBC should be pretty stable by now and can probably be dropped. I'll check with Roma who implemented it. Cool, thanks. I'm running my RX580 under 'amd-staging-drm-next' and 'drm-next-4.15-dc-wip' with it and don't see any problems so far. Greetings, Dieter ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [pull] amdgpu dc drm-next-4.15-dc
Am 21.10.2017 23:22, schrieb Alex Deucher: Hi Dave, Last batch of new stuff for DC. Highlights: - Fix some memory leaks - S3 fixes - Hotplug fixes - Fix some CX multi-display issues - MST fixes - DML updates from the hw team - Various code cleanups - Misc bug fixes Now this tree has the same fan regression as 'amd-staging-drm-next' startet with 0944c350c8eddf4064e7abb881dd245032fdfa23. Look here: [amd-staging-drm-next] regression - no fan info (sensors) on RX580 https://lists.freedesktop.org/archives/amd-gfx/2017-October/014065.html Second: KDE's greeter 'kdm_greet' (login screen went into dpms) and KDE's 'screen blank' (energy saving / dpms off) never came back. All I can do is a clean reboot. So I have to disable all 'dpms'. But I could attach gdb remotely on it. 'kdm_greet' hang in 'poll'. Nothing alarming in 'dmesg' and 'Xorg.0.log'. (Both available taken from 'amd-staging-drm-next' if needed). Cheers, Dieter ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH v4] drm/amd/powerplay: Remove unnecessary cast on void pointer
Hi Harsha, [auto build test ERROR on v4.14-rc3] [cannot apply to drm/drm-next next-20171018] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Harsha-Sharma/drm-amd-powerplay-Remove-unnecessary-cast-on-void-pointer/20171017-011426 config: x86_64-allmodconfig (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: # save the attached .config to linux build tree make ARCH=x86_64 All errors (new ones prefixed by >>): drivers/gpu/drm/amd/amdgpu/../powerplay/hwmgr/smu7_hwmgr.c: In function 'smu7_get_pp_table_entry_callback_func_v1': >> drivers/gpu/drm/amd/amdgpu/../powerplay/hwmgr/smu7_hwmgr.c:2954:47: error: >> initialization from incompatible pointer type >> [-Werror=incompatible-pointer-types] struct smu7_power_state *smu7_power_state = &(power_state->hardware); ^ cc1: some warnings being treated as errors vim +2954 drivers/gpu/drm/amd/amdgpu/../powerplay/hwmgr/smu7_hwmgr.c 2948 2949 static int smu7_get_pp_table_entry_callback_func_v1(struct pp_hwmgr *hwmgr, 2950 void *state, struct pp_power_state *power_state, 2951 void *pp_table, uint32_t classification_flag) 2952 { 2953 struct smu7_hwmgr *data = hwmgr->backend; > 2954 struct smu7_power_state *smu7_power_state = > &(power_state->hardware); 2955 struct smu7_performance_level *performance_level; 2956 ATOM_Tonga_State *state_entry = state; 2957 ATOM_Tonga_POWERPLAYTABLE *powerplay_table = pp_table; 2958 PPTable_Generic_SubTable_Header *sclk_dep_table = 2959 (PPTable_Generic_SubTable_Header *) 2960 (((unsigned long)powerplay_table) + 2961 le16_to_cpu(powerplay_table->usSclkDependencyTableOffset)); 2962 2963 ATOM_Tonga_MCLK_Dependency_Table *mclk_dep_table = 2964 (ATOM_Tonga_MCLK_Dependency_Table *) 2965 (((unsigned long)powerplay_table) + 2966 le16_to_cpu(powerplay_table->usMclkDependencyTableOffset)); 2967 2968 /* The following fields are not initialized here: id orderedList allStatesList */ 2969 power_state->classification.ui_label = 2970 (le16_to_cpu(state_entry->usClassification) & 2971 ATOM_PPLIB_CLASSIFICATION_UI_MASK) >> 2972 ATOM_PPLIB_CLASSIFICATION_UI_SHIFT; 2973 power_state->classification.flags = classification_flag; 2974 /* NOTE: There is a classification2 flag in BIOS that is not being used right now */ 2975 2976 power_state->classification.temporary_state = false; 2977 power_state->classification.to_be_deleted = false; 2978 2979 power_state->validation.disallowOnDC = 2980 (0 != (le32_to_cpu(state_entry->ulCapsAndSettings) & 2981 ATOM_Tonga_DISALLOW_ON_DC)); 2982 2983 power_state->pcie.lanes = 0; 2984 2985 power_state->display.disableFrameModulation = false; 2986 power_state->display.limitRefreshrate = false; 2987 power_state->display.enableVariBright = 2988 (0 != (le32_to_cpu(state_entry->ulCapsAndSettings) & 2989 ATOM_Tonga_ENABLE_VARIBRIGHT)); 2990 2991 power_state->validation.supportedPowerLevels = 0; 2992 power_state->uvd_clocks.VCLK = 0; 2993 power_state->uvd_clocks.DCLK = 0; 2994 power_state->temperatures.min = 0; 2995 power_state->temperatures.max = 0; 2996 2997 performance_level = &(smu7_power_state->performance_levels 2998 [smu7_power_state->performance_level_count++]); 2999 3000 PP_ASSERT_WITH_CODE( 3001 (smu7_power_state->performance_level_count < smum_get_mac_definition(hwmgr->smumgr, SMU_MAX_LEVELS_GRAPHICS)), 3002 "Performance levels exceeds SMC limit!", 3003 return -EINVAL); 3004 3005 PP_ASSERT_WITH_CODE( 3006 (smu7_power_state->performance_level_count <= 3007 hwmgr->platform_descriptor.hardwareActivityPerformanceLevels), 3008 "Performance levels exceeds Driver limit!", 3009 return -EINVAL); 3010 3011 /* Performance levels are arranged from low to high. */ 3012 performance_level->memory_clock = mclk_dep_table->entries 3013 [state_entry->ucMemoryClockIndexLow].ulMclk; 3014