[PATCH 1/5] drm/amdgpu:add hang_limit for sched(v2)

2017-10-22 Thread Monk Liu
since the gpu_scheduler code cannot access amdgpu variables directly,
create a hang_limit member for the scheduler; the upcoming GPU reset
patches can then refer to it

v2:
make hang_limit a parameter of sched_init()
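For illustration, a minimal, hedged sketch of how the limit is meant to flow
from a driver-side knob into the scheduler. amdgpu_job_hang_limit is assumed
to be an int module parameter on the amdgpu side; the declaration below is
illustrative only, the init call mirrors the diff:

#include <linux/module.h>

/* Assumed driver-side knob (illustrative declaration). */
int amdgpu_job_hang_limit;
module_param_named(job_hang_limit, amdgpu_job_hang_limit, int, 0444);
MODULE_PARM_DESC(job_hang_limit,
		 "how many times a job can time out before its owner is marked guilty (default 0)");

/* The ring init path then simply forwards it into the scheduler: */
r = amd_sched_init(&ring->sched, &amdgpu_sched_ops,
		   num_hw_submission, amdgpu_job_hang_limit,
		   timeout, ring->name);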

Change-Id: I977ae2717e55a8b87c59e58a288bffc3b458b653
Signed-off-by: Monk Liu 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 6 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 3 ++-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 2167dac..f43319c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -446,7 +446,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
timeout = MAX_SCHEDULE_TIMEOUT;
}
r = amd_sched_init(&ring->sched, &amdgpu_sched_ops,
-  num_hw_submission,
+  num_hw_submission, amdgpu_job_hang_limit,
   timeout, ring->name);
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 1bbbce2..369f15d 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -676,13 +676,17 @@ static int amd_sched_main(void *param)
 */
 int amd_sched_init(struct amd_gpu_scheduler *sched,
   const struct amd_sched_backend_ops *ops,
-  unsigned hw_submission, long timeout, const char *name)
+  unsigned hw_submission,
+  unsigned hang_limit,
+  long timeout,
+  const char *name)
 {
int i;
sched->ops = ops;
sched->hw_submission_limit = hw_submission;
sched->name = name;
sched->timeout = timeout;
+   sched->hang_limit = hang_limit;
for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++)
amd_sched_rq_init(&sched->sched_rq[i]);
 
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 3f75b45..3ea75a2 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -144,11 +144,12 @@ struct amd_gpu_scheduler {
struct task_struct  *thread;
struct list_headring_mirror_list;
spinlock_t  job_list_lock;
+   int hang_limit;
 };
 
 int amd_sched_init(struct amd_gpu_scheduler *sched,
   const struct amd_sched_backend_ops *ops,
-  uint32_t hw_submission, long timeout, const char *name);
+  uint32_t hw_submission, unsigned hang_limit, long timeout, 
const char *name);
 void amd_sched_fini(struct amd_gpu_scheduler *sched);
 
 int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
-- 
2.7.4



[PATCH 4/5] drm/amdgpu:skip job for guilty ctx in parser_init

2017-10-22 Thread Monk Liu
Change-Id: I44019f6475b1eaaba55633cf5f8bb84284f19a2c
Signed-off-by: Monk Liu 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index a760b6e..1072fc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -90,6 +90,12 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, 
void *data)
goto free_chunk;
}
 
+   /* skip guilty context job */
+   if (atomic_read(&p->ctx->guilty) == 1) {
+   ret = -ECANCELED;
+   goto free_chunk;
+   }
+
mutex_lock(&p->ctx->lock);
 
/* get chunks */
-- 
2.7.4



[PATCH 2/5] drm/amd/scheduler:introduce guilty pointer member

2017-10-22 Thread Monk Liu
this member will be used later; it will point to the real variable
inside the context, so CS_SUBMIT and the GPU scheduler can decide
whether to skip a job based on context->guilty or *entity->guilty
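To make the intent concrete, a hedged sketch of the flow this series is
building; the struct fragments are abbreviated and only the guilty members
come from these patches, the surrounding code is illustrative:

/* amdgpu side: the context owns the flag. */
struct amdgpu_ctx {
	/* ... */
	atomic_t guilty;                /* raised once this context hangs the GPU */
};

/* scheduler side: the entity only keeps a pointer to its owner's flag. */
struct amd_sched_entity {
	/* ... */
	atomic_t *guilty;               /* may be NULL for kernel entities */
};

/* GPU reset path: mark the offender through the entity. */
if (entity->guilty)
	atomic_set(entity->guilty, 1);

/* CS submit path: refuse further jobs from a guilty context. */
if (atomic_read(&ctx->guilty))
	return -ECANCELED;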

Change-Id: I411e117a01c54286db0765fd2f6bf9d3bda01a3b
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 2 +-
 drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 2 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 3 ++-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 3 ++-
 10 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c184468..bb5a46a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -90,7 +90,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
continue;
 
r = amd_sched_entity_init(&ring->sched, &ctx->rings[i].entity,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r)
goto failed;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index dcdfb8d..af65cb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -110,7 +110,7 @@ static int amdgpu_ttm_global_init(struct amdgpu_device 
*adev)
ring = adev->mman.buffer_funcs_ring;
rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_KERNEL];
r = amd_sched_entity_init(&ring->sched, &adev->mman.entity,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r) {
DRM_ERROR("Failed setting up TTM BO move run queue.\n");
goto error_entity;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index e8bd50c..6604771 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -232,7 +232,7 @@ int amdgpu_uvd_sw_init(struct amdgpu_device *adev)
ring = &adev->uvd.ring;
rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL];
r = amd_sched_entity_init(&ring->sched, &adev->uvd.entity,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r != 0) {
DRM_ERROR("Failed setting up UVD run queue.\n");
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index b46280c..60a8fb6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
@@ -176,7 +176,7 @@ int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned 
long size)
ring = &adev->vce.ring[0];
rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL];
r = amd_sched_entity_init(&ring->sched, &adev->vce.entity,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r != 0) {
DRM_ERROR("Failed setting up VCE run queue.\n");
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 041e012..96df21c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -106,7 +106,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
ring = &adev->vcn.ring_dec;
rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL];
r = amd_sched_entity_init(&ring->sched, &adev->vcn.entity_dec,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r != 0) {
DRM_ERROR("Failed setting up VCN dec run queue.\n");
return r;
@@ -115,7 +115,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
ring = &adev->vcn.ring_enc[0];
rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL];
r = amd_sched_entity_init(&ring->sched, &adev->vcn.entity_enc,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r != 0) {
DRM_ERROR("Failed setting up VCN enc run queue.\n");
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ef8b7a9..3e1959e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2637,7 +2637,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
ring = 

[PATCH 5/5] drm/amdgpu:cleanup job reset routine(v2)

2017-10-22 Thread Monk Liu
merge setting the guilty flag on the context into this function
to avoid implementing an extra routine.

v2:
walk the entity list and compare the fence context before operating
on the entity; otherwise the entity may be a wild pointer

Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540
Signed-off-by: Monk Liu 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  4 ++--
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 31 ++-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +-
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index dae1e5b..a07544d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, 
struct amdgpu_job *job)
amd_sched_job_kickout(&job->base);
 
/* only do job_reset on the hang ring if @job not NULL */
-   amd_sched_hw_job_reset(&ring->sched);
+   amd_sched_hw_job_reset(&ring->sched, NULL);
 
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
@@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
if (!ring || !ring->sched.thread)
continue;
kthread_park(ring->sched.thread);
-   amd_sched_hw_job_reset(&ring->sched);
+   amd_sched_hw_job_reset(&ring->sched, NULL);
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 97fd255..39b6205 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -443,9 +443,18 @@ static void amd_sched_job_timedout(struct work_struct 
*work)
job->sched->ops->timedout_job(job);
 }
 
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
+static void amd_sched_set_guilty(struct amd_sched_job *s_job)
+{
+   if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
+   if (s_job->s_entity->guilty)
+   atomic_set(s_job->s_entity->guilty, 1);
+}
+
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct 
amd_sched_job *bad)
 {
struct amd_sched_job *s_job;
+   struct amd_sched_entity *entity, *tmp;
+   int i;
 
spin_lock(&sched->job_list_lock);
list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
@@ -458,6 +467,26 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler 
*sched)
}
}
spin_unlock(&sched->job_list_lock);
+
+   if (bad) {
+   bool found = false;
+
+   for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; 
i++ ) {
+   struct amd_sched_rq *rq = &sched->sched_rq[i];
+
+   spin_lock(&rq->lock);
+   list_for_each_entry_safe(entity, tmp, &rq->entities, 
list) {
+   if (bad->s_fence->scheduled.context == 
entity->fence_context) {
+   found = true;
+   amd_sched_set_guilty(bad);
+   break;
+   }
+   }
+   spin_unlock(&rq->lock);
+   if (found)
+   break;
+   }
+   }
 }
 
 void amd_sched_job_kickout(struct amd_sched_job *s_job)
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index a05994c..be75172 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -174,7 +174,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
   struct amd_gpu_scheduler *sched,
   struct amd_sched_entity *entity,
   void *owner);
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct 
amd_sched_job *job);
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
 bool amd_sched_dependency_optimized(struct dma_fence* fence,
struct amd_sched_entity *entity);
-- 
2.7.4



[PATCH 3/5] drm/amdgpu:pass ctx->guilty address to entity init

2017-10-22 Thread Monk Liu
this way the guilty flag we are actually interested in is connected to
the entity->guilty pointer, and entity->guilty can be used later in the
GPU recovery procedure

Change-Id: I09b392ebfc59254795e4fbd5816abd3d94a95853
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 774edc1..6a4178b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -744,6 +744,7 @@ struct amdgpu_ctx {
enum amd_sched_priority init_priority;
enum amd_sched_priority override_priority;
struct mutexlock;
+   atomic_tguilty;
 };
 
 struct amdgpu_ctx_mgr {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index bb5a46a..1bf4cdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -90,7 +90,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
continue;
 
r = amd_sched_entity_init(&ring->sched, &ctx->rings[i].entity,
- rq, amdgpu_sched_jobs, NULL);
+ rq, amdgpu_sched_jobs, &ctx->guilty);
if (r)
goto failed;
}
-- 
2.7.4



[PATCH] drm/amdgpu/psp: fix compile warnings on buffer address print

2017-10-22 Thread Evan Quan
drivers/gpu/drm/amd/amdgpu//psp_v3_1.c:389:13:
warning: format ‘%x’ expects argument of type ‘unsigned int’, but
argument 4 has type ‘struct psp_gfx_rb_frame *’ [-Wformat=]
   DRM_ERROR("ring_buffer_start = %x; ring_buffer_end = %x; write_frame
= %x\n",
 ^
./include/drm/drmP.h:178:36: note: in definition of macro ‘DRM_ERROR’
  drm_printk(KERN_ERR, DRM_UT_NONE, fmt, ##__VA_ARGS__)
^
warning: format ‘%x’ expects argument of type ‘unsigned int’, but
argument 5 has type ‘struct psp_gfx_rb_frame *’ [-Wformat=]
   DRM_ERROR("ring_buffer_start = %x; ring_buffer_end = %x; write_frame
= %x\n",
 ^
./include/drm/drmP.h:178:36: note: in definition of macro ‘DRM_ERROR’
  drm_printk(KERN_ERR, DRM_UT_NONE, fmt, ##__VA_ARGS__)
^
warning: format ‘%x’ expects argument of type ‘unsigned int’, but
argument 6 has type ‘struct psp_gfx_rb_frame *’ [-Wformat=]
   DRM_ERROR("ring_buffer_start = %x; ring_buffer_end = %x; write_frame
= %x\n",
 ^
./include/drm/drmP.h:178:36: note: in definition of macro ‘DRM_ERROR’
  drm_printk(KERN_ERR, DRM_UT_NONE, fmt, ##__VA_ARGS__)

Change-Id: Ie807eba4594d588561a1d4427ccb2286cc5f5065
Signed-off-by: Evan Quan 
Reported-by: Rex Zhu 
---
 drivers/gpu/drm/amd/amdgpu/psp_v10_0.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/psp_v3_1.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c
index 26e10ab..6b56123 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c
@@ -274,7 +274,7 @@ int psp_v10_0_cmd_submit(struct psp_context *psp,
write_frame = ring_buffer_start + (psp_write_ptr_reg / 
rb_frame_size_dw);
/* Check invalid write_frame ptr address */
if ((write_frame < ring_buffer_start) || (ring_buffer_end < 
write_frame)) {
-   DRM_ERROR("ring_buffer_start = %x; ring_buffer_end = %x; 
write_frame = %x\n",
+   DRM_ERROR("ring_buffer_start = %p; ring_buffer_end = %p; 
write_frame = %p\n",
ring_buffer_start,
ring_buffer_end,
write_frame);
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c
index d217050..7008569 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c
@@ -386,7 +386,7 @@ int psp_v3_1_cmd_submit(struct psp_context *psp,
write_frame = ring_buffer_start + (psp_write_ptr_reg / 
rb_frame_size_dw);
/* Check invalid write_frame ptr address */
if ((write_frame < ring_buffer_start) || (ring_buffer_end < 
write_frame)) {
-   DRM_ERROR("ring_buffer_start = %x; ring_buffer_end = %x; 
write_frame = %x\n",
+   DRM_ERROR("ring_buffer_start = %p; ring_buffer_end = %p; 
write_frame = %p\n",
ring_buffer_start,
ring_buffer_end,
write_frame);
-- 
2.7.4



RE: [PATCH 1/5] drm/amdgpu:cleanup job reset routine

2017-10-22 Thread Liu, Monk
Sorry, this patch is invalid, please ignore it; I will send out v2 for
review.

-Original Message-
From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf Of Monk 
Liu
Sent: October 23, 2017 13:34
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Monk 
Subject: [PATCH 1/5] drm/amdgpu:cleanup job reset routine

merge setting the guilty flag on the context into this function to avoid
implementing an extra routine.

Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540
Signed-off-by: Monk Liu 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  4 ++--
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 13 -
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +-
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index dae1e5b..a07544d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, 
struct amdgpu_job *job)
amd_sched_job_kickout(&job->base);
 
/* only do job_reset on the hang ring if @job not NULL */
-   amd_sched_hw_job_reset(&ring->sched);
+   amd_sched_hw_job_reset(&ring->sched, NULL);
 
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
@@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
if (!ring || !ring->sched.thread)
continue;
kthread_park(ring->sched.thread);
-   amd_sched_hw_job_reset(&ring->sched);
+   amd_sched_hw_job_reset(&ring->sched, NULL);
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index e4d3b4e..322b6c1 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -442,7 +442,14 @@ static void amd_sched_job_timedout(struct work_struct 
*work)
job->sched->ops->timedout_job(job);
 }
 
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
+static void amd_sched_set_guilty(struct amd_sched_job *s_job) {
+   if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
+   if (s_job->s_entity->guilty)
+   atomic_set(s_job->s_entity->guilty, 1); }
+
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct 
+amd_sched_job *bad)
 {
struct amd_sched_job *s_job;
 
@@ -455,6 +462,10 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler 
*sched)
s_job->s_fence->parent = NULL;
atomic_dec(&sched->hw_rq_count);
}
+
+   if (bad && bad == s_job) {
+   amd_sched_set_guilty(s_job);
+   }
}
spin_unlock(&sched->job_list_lock);
 }
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 52c8e54..d26e778 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -170,7 +170,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
   struct amd_gpu_scheduler *sched,
   struct amd_sched_entity *entity,
   void *owner);
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct 
+amd_sched_job *job);
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);  bool 
amd_sched_dependency_optimized(struct dma_fence* fence,
struct amd_sched_entity *entity);
--
2.7.4



[PATCH 4/5] drm/amdgpu:pass ctx->guilty address to entity init

2017-10-22 Thread Monk Liu
this way the guilty flag we are actually interested in is connected to
the entity->guilty pointer, and entity->guilty can be used later in the
GPU recovery procedure

Change-Id: I09b392ebfc59254795e4fbd5816abd3d94a95853
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 774edc1..6a4178b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -744,6 +744,7 @@ struct amdgpu_ctx {
enum amd_sched_priority init_priority;
enum amd_sched_priority override_priority;
struct mutexlock;
+   atomic_tguilty;
 };
 
 struct amdgpu_ctx_mgr {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index bb5a46a..1bf4cdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -90,7 +90,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
continue;
 
r = amd_sched_entity_init(&ring->sched, &ctx->rings[i].entity,
- rq, amdgpu_sched_jobs, NULL);
+ rq, amdgpu_sched_jobs, &ctx->guilty);
if (r)
goto failed;
}
-- 
2.7.4



[PATCH 3/5] drm/amd/scheduler:introduce guilty pointer member

2017-10-22 Thread Monk Liu
this member will be used later; it will point to the real variable
inside the context, so CS_SUBMIT and the GPU scheduler can decide
whether to skip a job based on context->guilty or *entity->guilty

Change-Id: I411e117a01c54286db0765fd2f6bf9d3bda01a3b
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 2 +-
 drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 2 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 3 ++-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 3 ++-
 10 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c184468..bb5a46a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -90,7 +90,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
continue;
 
r = amd_sched_entity_init(&ring->sched, &ctx->rings[i].entity,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r)
goto failed;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index dcdfb8d..af65cb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -110,7 +110,7 @@ static int amdgpu_ttm_global_init(struct amdgpu_device 
*adev)
ring = adev->mman.buffer_funcs_ring;
rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_KERNEL];
r = amd_sched_entity_init(&ring->sched, &adev->mman.entity,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r) {
DRM_ERROR("Failed setting up TTM BO move run queue.\n");
goto error_entity;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index e8bd50c..6604771 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -232,7 +232,7 @@ int amdgpu_uvd_sw_init(struct amdgpu_device *adev)
ring = &adev->uvd.ring;
rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL];
r = amd_sched_entity_init(&ring->sched, &adev->uvd.entity,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r != 0) {
DRM_ERROR("Failed setting up UVD run queue.\n");
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index b46280c..60a8fb6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
@@ -176,7 +176,7 @@ int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned 
long size)
ring = &adev->vce.ring[0];
rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL];
r = amd_sched_entity_init(&ring->sched, &adev->vce.entity,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r != 0) {
DRM_ERROR("Failed setting up VCE run queue.\n");
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 041e012..96df21c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -106,7 +106,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
ring = &adev->vcn.ring_dec;
rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL];
r = amd_sched_entity_init(&ring->sched, &adev->vcn.entity_dec,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r != 0) {
DRM_ERROR("Failed setting up VCN dec run queue.\n");
return r;
@@ -115,7 +115,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
ring = &adev->vcn.ring_enc[0];
rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL];
r = amd_sched_entity_init(&ring->sched, &adev->vcn.entity_enc,
- rq, amdgpu_sched_jobs);
+ rq, amdgpu_sched_jobs, NULL);
if (r != 0) {
DRM_ERROR("Failed setting up VCN enc run queue.\n");
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ef8b7a9..3e1959e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2637,7 +2637,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
ring = 

[PATCH 5/5] drm/amdgpu:skip job for guilty ctx in parser_init

2017-10-22 Thread Monk Liu
Change-Id: I44019f6475b1eaaba55633cf5f8bb84284f19a2c
Signed-off-by: Monk Liu 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index f7fceb6..1d5cbe7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -90,6 +90,12 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, 
void *data)
goto free_chunk;
}
 
+   /* skip guilty context job */
+   if (atomic_read(&p->ctx->guilty) == 1) {
+   ret = -ECANCELED;
+   goto free_chunk;
+   }
+
mutex_lock(&p->ctx->lock);
 
/* get chunks */
-- 
2.7.4



[PATCH 1/5] drm/amdgpu:cleanup job reset routine

2017-10-22 Thread Monk Liu
merge setting the guilty flag on the context into this function
to avoid implementing an extra routine.

Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540
Signed-off-by: Monk Liu 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  4 ++--
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 13 -
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +-
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index dae1e5b..a07544d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, 
struct amdgpu_job *job)
amd_sched_job_kickout(&job->base);
 
/* only do job_reset on the hang ring if @job not NULL */
-   amd_sched_hw_job_reset(&ring->sched);
+   amd_sched_hw_job_reset(&ring->sched, NULL);
 
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
@@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
if (!ring || !ring->sched.thread)
continue;
kthread_park(ring->sched.thread);
-   amd_sched_hw_job_reset(&ring->sched);
+   amd_sched_hw_job_reset(&ring->sched, NULL);
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index e4d3b4e..322b6c1 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -442,7 +442,14 @@ static void amd_sched_job_timedout(struct work_struct 
*work)
job->sched->ops->timedout_job(job);
 }
 
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
+static void amd_sched_set_guilty(struct amd_sched_job *s_job)
+{
+   if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
+   if (s_job->s_entity->guilty)
+   atomic_set(s_job->s_entity->guilty, 1);
+}
+
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct 
amd_sched_job *bad)
 {
struct amd_sched_job *s_job;
 
@@ -455,6 +462,10 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler 
*sched)
s_job->s_fence->parent = NULL;
atomic_dec(&sched->hw_rq_count);
}
+
+   if (bad && bad == s_job) {
+   amd_sched_set_guilty(s_job);
+   }
}
spin_unlock(&sched->job_list_lock);
 }
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 52c8e54..d26e778 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -170,7 +170,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
   struct amd_gpu_scheduler *sched,
   struct amd_sched_entity *entity,
   void *owner);
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct 
amd_sched_job *job);
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
 bool amd_sched_dependency_optimized(struct dma_fence* fence,
struct amd_sched_entity *entity);
-- 
2.7.4



[PATCH 2/5] drm/amdgpu:add hang_limit for sched(v2)

2017-10-22 Thread Monk Liu
since the gpu_scheduler code cannot access amdgpu variables directly,
create a hang_limit member for the scheduler; the upcoming GPU reset
patches can then refer to it

v2:
make hang_limit a parameter of sched_init()

Change-Id: I977ae2717e55a8b87c59e58a288bffc3b458b653
Signed-off-by: Monk Liu 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 6 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 3 ++-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 2167dac..f43319c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -446,7 +446,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
timeout = MAX_SCHEDULE_TIMEOUT;
}
r = amd_sched_init(&ring->sched, &amdgpu_sched_ops,
-  num_hw_submission,
+  num_hw_submission, amdgpu_job_hang_limit,
   timeout, ring->name);
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 322b6c1..dfda486 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -686,13 +686,17 @@ static int amd_sched_main(void *param)
 */
 int amd_sched_init(struct amd_gpu_scheduler *sched,
   const struct amd_sched_backend_ops *ops,
-  unsigned hw_submission, long timeout, const char *name)
+  unsigned hw_submission,
+  unsigned hang_limit,
+  long timeout,
+  const char *name)
 {
int i;
sched->ops = ops;
sched->hw_submission_limit = hw_submission;
sched->name = name;
sched->timeout = timeout;
+   sched->hang_limit = hang_limit;
for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++)
amd_sched_rq_init(&sched->sched_rq[i]);
 
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index d26e778..774b5da 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -142,11 +142,12 @@ struct amd_gpu_scheduler {
struct task_struct  *thread;
struct list_headring_mirror_list;
spinlock_t  job_list_lock;
+   int hang_limit;
 };
 
 int amd_sched_init(struct amd_gpu_scheduler *sched,
   const struct amd_sched_backend_ops *ops,
-  uint32_t hw_submission, long timeout, const char *name);
+  uint32_t hw_submission, unsigned hang_limit, long timeout, 
const char *name);
 void amd_sched_fini(struct amd_gpu_scheduler *sched);
 
 int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
-- 
2.7.4



RE: [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler.

2017-10-22 Thread Liu, Monk
If the deadlock issue could be solved otherwise, I don't see why we should
give up the kfifo and switch to SPSC.

-Original Message-
From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf Of 
Andrey Grodzovsky
Sent: October 20, 2017 21:32
To: amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey ; Koenig, Christian 

Subject: [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler.

It is intended to substitute the bounded fifo we are currently using.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/scheduler/spsc_queue.h | 120 +
 1 file changed, 120 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/scheduler/spsc_queue.h

diff --git a/drivers/gpu/drm/amd/scheduler/spsc_queue.h 
b/drivers/gpu/drm/amd/scheduler/spsc_queue.h
new file mode 100644
index 000..a3394f1
--- /dev/null
+++ b/drivers/gpu/drm/amd/scheduler/spsc_queue.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person 
+obtaining a
+ * copy of this software and associated documentation files (the 
+"Software"),
+ * to deal in the Software without restriction, including without 
+limitation
+ * the rights to use, copy, modify, merge, publish, distribute, 
+sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom 
+the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT 
+SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
+DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef AMD_SCHEDULER_SPSC_QUEUE_H_
+#define AMD_SCHEDULER_SPSC_QUEUE_H_
+
+#include 
+
+/** SPSC lockless queue */
+
+struct spsc_node {
+
+   /* Stores spsc_node* */
+   struct spsc_node *next;
+};
+
+struct spsc_queue {
+
+struct spsc_node *head;
+
+   /* atomic pointer to struct spsc_node* */
+   atomic_long_t tail;
+
+   atomic_t job_count;
+};
+
+static inline void spsc_queue_init(struct spsc_queue *queue) {
+   queue->head = NULL;
+   atomic_long_set(&queue->tail, (long)&queue->head);
+   atomic_set(&queue->job_count, 0);
+}
+
+static inline struct spsc_node *spsc_queue_peek(struct spsc_queue 
+*queue) {
+   return queue->head;
+}
+
+static inline int spsc_queue_count(struct spsc_queue *queue) {
+   return atomic_read(&queue->job_count); }
+
+static inline bool spsc_queue_push(struct spsc_queue *queue, struct 
+spsc_node *node) {
+   struct spsc_node **tail;
+
+   node->next = NULL;
+
+   preempt_disable();
+
+   tail = (struct spsc_node **)atomic_long_xchg(&queue->tail, 
(long)&node->next);
+   WRITE_ONCE(*tail, node);
+   atomic_inc(&queue->job_count);
+
+   /*
+* In case of first element verify new node will be visible to the 
consumer
+* thread when we ping the kernel thread that there is new work to do.
+*/
+   smp_wmb();
+
+   preempt_enable();
+
+   return tail == &queue->head;
+}
+
+
+static inline struct spsc_node *spsc_queue_pop(struct spsc_queue 
+*queue) {
+   struct spsc_node *next, *node;
+
+   /* Verify reading from memory and not the cache */
+   smp_rmb();
+
+   node = READ_ONCE(queue->head);
+
+   if (!node)
+   return NULL;
+
+   next = READ_ONCE(node->next);
+   WRITE_ONCE(queue->head, next);
+
+   if (unlikely(!next)) {
+   /* slowpath for the last element in the queue */
+
+   if (atomic_long_cmpxchg(&queue->tail,
+   (long)&node->next, (long)&queue->head) != 
(long)&node->next) {
+   /* Updating tail failed wait for new next to appear */
+   do {
+   smp_rmb();
+   } while (unlikely(!(queue->head = 
READ_ONCE(node->next))));
+   }
+   }
+
+   atomic_dec(&queue->job_count);
+   return node;
+}
+
+
+
+#endif /* AMD_SCHEDULER_SPSC_QUEUE_H_ */
--
2.7.4
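A hedged usage sketch of the queue as posted, with one producer thread
pushing and one consumer thread popping; the job struct, function names and
the wakeup are illustrative only:

#include <linux/kernel.h>
#include "spsc_queue.h"

struct my_job {
	struct spsc_node node;          /* embedded queue link */
	int id;
};

static struct spsc_queue queue;         /* spsc_queue_init(&queue) once at setup */

/* producer context, e.g. the submitting ioctl */
static void my_push(struct my_job *job)
{
	/* push returns true when the queue was empty before, i.e. the
	 * consumer may be idle and should be woken up here */
	if (spsc_queue_push(&queue, &job->node))
		; /* wake the consumer */
}

/* consumer context, e.g. the scheduler kthread */
static struct my_job *my_pop(void)
{
	struct spsc_node *n = spsc_queue_pop(&queue);

	return n ? container_of(n, struct my_job, node) : NULL;
}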



RE: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.

2017-10-22 Thread Liu, Monk
Why not use a simpler way?

Like moving ttm_eu_fence_buffer_objects() to before amd_sched_entity_push_job()?
That could solve the deadlock from your description.

And the push order is already guaranteed by context->mutex (which is also a
patch from you).
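A hedged sketch of what that reordering would look like in amdgpu_cs_submit()
(illustrative only, not a posted patch; only the two calls in question are
shown):

/* Release the reserved BOs before handing the job to the scheduler, so
 * the CS thread no longer sleeps on a full kfifo while still holding the
 * reservation.
 */
ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);

amd_sched_entity_push_job(&job->base);   /* may block on a full fifo */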


BR Monk

-Original Message-
From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf Of 
Andrey Grodzovsky
Sent: October 20, 2017 21:32
To: amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey ; Koenig, Christian 

Subject: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.

Switch from kfifo to SPSC queue.

Bug:
The kfifo is limited in size; during GPU reset it can fill up to its limit,
and the pushing thread (producer) then waits for the scheduler worker to
consume items from the fifo while holding the reservation lock on a BO. The
GPU reset thread, on the other hand, blocks the scheduler during reset.
Before it unblocks the scheduler it might want to recover VRAM, and so will
try to reserve the same BO the producer thread is already holding, creating
a deadlock.

Fix:
Switch from the kfifo to an SPSC queue, which is unlimited in size.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h |  4 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c   | 51 ++---
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   |  4 +-
 3 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h 
b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
index 8bd3810..86838a8 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
@@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
   __entry->id = sched_job->id;
   __entry->fence = _job->s_fence->finished;
   __entry->name = sched_job->sched->name;
-  __entry->job_count = kfifo_len(
-  &sched_job->s_entity->job_queue) / 
sizeof(sched_job);
+  __entry->job_count = spsc_queue_count(
+  &sched_job->s_entity->job_queue);
   __entry->hw_job_count = atomic_read(
   &sched_job->sched->hw_rq_count);
   ),
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 1bbbce2..0c9cdc0 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -28,9 +28,14 @@
 #include 
 #include "gpu_scheduler.h"
 
+#include "spsc_queue.h"
+
 #define CREATE_TRACE_POINTS
 #include "gpu_sched_trace.h"
 
+#define to_amd_sched_job(sched_job)\
+   container_of((sched_job), struct amd_sched_job, queue_node)
+
 static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);  
static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);  static void 
amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); @@ -123,8 
+128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
  struct amd_sched_rq *rq,
  uint32_t jobs)
 {
-   int r;
-
if (!(sched && entity && rq))
return -EINVAL;
 
@@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
 
spin_lock_init(&entity->rq_lock);
spin_lock_init(&entity->queue_lock);
-   r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
-   if (r)
-   return r;
+   spsc_queue_init(&entity->job_queue);
 
atomic_set(&entity->fence_seq, 0);
entity->fence_context = dma_fence_context_alloc(2); @@ -170,7 +171,7 @@ 
static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,  
static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)  {
rmb();
-   if (kfifo_is_empty(&entity->job_queue))
+   if (spsc_queue_peek(&entity->job_queue) == NULL)
return true;
 
return false;
@@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct 
amd_sched_entity *entity)
  */
 static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)  {
-   if (kfifo_is_empty(&entity->job_queue))
+   if (spsc_queue_peek(&entity->job_queue) == NULL)
return false;
 
if (ACCESS_ONCE(entity->dependency))
@@ -227,7 +228,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
 */
kthread_park(sched->thread);
kthread_unpark(sched->thread);
-   while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
+   while ((job = 
+   to_amd_sched_job(spsc_queue_pop(&entity->job_queue)))) {
struct amd_sched_fence *s_fence = job->s_fence;
amd_sched_fence_scheduled(s_fence);

[PATCH] drm/ttm:fix memory leak due to individualize

2017-10-22 Thread Monk Liu
Change-Id: I6d06b81b8b894775e55e495e96f3c999b83cf2be
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/ttm/ttm_bo.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 97125cb..3c9c0d9 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -460,8 +460,11 @@ static void ttm_bo_cleanup_refs_or_queue(struct 
ttm_buffer_object *bo)
if (reservation_object_test_signaled_rcu(&bo->ttm_resv, true)) {
ttm_bo_del_from_lru(bo);
spin_unlock(&glob->lru_lock);
-   if (bo->resv != &bo->ttm_resv)
+   if (bo->resv != &bo->ttm_resv) {
reservation_object_unlock(&bo->ttm_resv);
+   reservation_object_fini(&bo->ttm_resv);
+   }
+
ttm_bo_cleanup_memtype_use(bo);
return;
}
-- 
2.7.4
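For context, a hedged reading of the leak (not spelled out in the original
mail): when the reservation object is individualized, bo->ttm_resv gets its
own fence storage, and the fast-path cleanup above has to tear that copy down
again. A minimal sketch of the pairing the patch restores; where exactly
bo->ttm_resv gets initialized is an assumption here:

/* individualize path (assumed): the BO gets a private copy */
reservation_object_init(&bo->ttm_resv);
/* ... fences copied into bo->ttm_resv ... */

/* fast-path cleanup: finalize the copy or its fence storage leaks */
if (bo->resv != &bo->ttm_resv) {
	reservation_object_unlock(&bo->ttm_resv);
	reservation_object_fini(&bo->ttm_resv);
}
ttm_bo_cleanup_memtype_use(bo);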



Re: [pull] amdgpu dc drm-next-4.15-dc

2017-10-22 Thread Dieter Nützel

On 22.10.2017 23:48, Dieter Nützel wrote:

On 21.10.2017 23:22, Alex Deucher wrote:

Hi Dave,

Last batch of new stuff for DC. Highlights:
- Fix some memory leaks
- S3 fixes
- Hotplug fixes
- Fix some CX multi-display issues
- MST fixes
- DML updates from the hw team
- Various code cleanups
- Misc bug fixes


Now this tree has the same fan regression as 'amd-staging-drm-next',
started with 0944c350c8eddf4064e7abb881dd245032fdfa23.

Look here:
[amd-staging-drm-next] regression - no fan info (sensors) on RX580
https://lists.freedesktop.org/archives/amd-gfx/2017-October/014065.html

Second:
KDE's greeter 'kdm_greet' (after the login screen went into DPMS) and KDE's
'screen blank' (energy saving / DPMS off) never come back. All I can
do is a clean reboot, so I have to disable DPMS entirely.
But I could attach gdb remotely on it.
'kdm_greet' hang in 'poll'.
Nothing alarming in 'dmesg' or 'Xorg.0.log'. (Both are available, taken
from 'amd-staging-drm-next', if needed.)


Hello Alex and Rex,

I've found a good hint from Jan (randomsalad) on Phoronix for the 'screen 
blank' (monitor standby mode) issue:

https://www.phoronix.com/forums/forum/phoronix/latest-phoronix-articles/984483-amdgpu-dc-gets-a-final-batch-of-changes-before-linux-4-15?p=984555#post984555

My Reply:
https://www.phoronix.com/forums/forum/phoronix/latest-phoronix-articles/984483-amdgpu-dc-gets-a-final-batch-of-changes-before-linux-4-15?p=984581#post984581

I can swear that I could 'return' once (the first time, maybe because it was
only a warm reboot) on 'drm-next-4.15-dc-wip' directly from within the KDE
session by replugging the video cable, but for all later tests on both
kernels I have to blindly switch back to the login screen (kdm_greet) and
then replug the video cable.


For me this regression started in 'amd-staging-drm-next' much earlier
than the latest commit.


Dieter


Re: [amd-staging-drm-next] compilation error with latest commit #1b006d838f78

2017-10-22 Thread Dieter Nützel

On 17.10.2017 16:59, Michel Dänzer wrote:

On 17/10/17 04:53 PM, Harry Wentland wrote:

On 2017-10-17 10:47 AM, Michel Dänzer wrote:

On 13/10/17 09:22 PM, Harry Wentland wrote:

On 2017-10-12 08:22 PM, Dieter Nützel wrote:


next (regression) compilation error:

drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc_resource.c: In 
function ‘resource_map_pool_resources’:
drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc_resource.c:1688:14: 
error: implicit declaration of function ‘acquire_first_split_pipe’; 
did you mean ‘acquire_first_free_pipe’? 
[-Werror=implicit-function-declaration]
   pipe_idx = acquire_first_split_pipe(>res_ctx, pool, 
stream);

  ^~~~
  acquire_first_free_pipe


It is wrongly (?) guarded behind:

#if defined(CONFIG_DRM_AMD_DC_DCN1_0)
static int acquire_first_split_pipe(
    struct resource_context *res_ctx,
    const struct resource_pool *pool,
    struct dc_stream_state *stream)
[snip]



Sent and merged a patch. This function only makes sense for DCN and 
shouldn't
be called otherwise. Thanks for reporting this. I gotta make sure to 
build

without the DCN flag in the future to avoid this.


Would it be possible to drop options like DRM_AMD_DC_FBC and
DRM_AMD_DC_DCN1_0 from amd-staging-drm-next (but especially 
upstream),

and just always compile the code?


DRM_AMD_DC_FBC should be pretty stable by now and can probably be 
dropped.

I'll check with Roma who implemented it.


Cool, thanks.


I'm running my RX580 under 'amd-staging-drm-next' and 
'drm-next-4.15-dc-wip' with it and don't see any problems so far.


Greetings,
Dieter


Re: [PATCH v4] drm/amd/powerplay: Remove unnecessary cast on void pointer

2017-10-22 Thread kbuild test robot
Hi Harsha,

[auto build test ERROR on v4.14-rc3]
[cannot apply to drm/drm-next next-20171018]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Harsha-Sharma/drm-amd-powerplay-Remove-unnecessary-cast-on-void-pointer/20171017-011426
config: x86_64-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

All errors (new ones prefixed by >>):

   drivers/gpu/drm/amd/amdgpu/../powerplay/hwmgr/smu7_hwmgr.c: In function 
'smu7_get_pp_table_entry_callback_func_v1':
>> drivers/gpu/drm/amd/amdgpu/../powerplay/hwmgr/smu7_hwmgr.c:2954:47: error: 
>> initialization from incompatible pointer type 
>> [-Werror=incompatible-pointer-types]
 struct smu7_power_state  *smu7_power_state = &(power_state->hardware);
  ^
   cc1: some warnings being treated as errors

vim +2954 drivers/gpu/drm/amd/amdgpu/../powerplay/hwmgr/smu7_hwmgr.c

  2948  
  2949  static int smu7_get_pp_table_entry_callback_func_v1(struct pp_hwmgr 
*hwmgr,
  2950  void *state, struct pp_power_state *power_state,
  2951  void *pp_table, uint32_t classification_flag)
  2952  {
  2953  struct smu7_hwmgr *data = hwmgr->backend;
> 2954  struct smu7_power_state  *smu7_power_state = 
> &(power_state->hardware);
  2955  struct smu7_performance_level *performance_level;
  2956  ATOM_Tonga_State *state_entry = state;
  2957  ATOM_Tonga_POWERPLAYTABLE *powerplay_table = pp_table;
  2958  PPTable_Generic_SubTable_Header *sclk_dep_table =
  2959  (PPTable_Generic_SubTable_Header *)
  2960  (((unsigned long)powerplay_table) +
  2961  
le16_to_cpu(powerplay_table->usSclkDependencyTableOffset));
  2962  
  2963  ATOM_Tonga_MCLK_Dependency_Table *mclk_dep_table =
  2964  (ATOM_Tonga_MCLK_Dependency_Table *)
  2965  (((unsigned long)powerplay_table) +
  2966  
le16_to_cpu(powerplay_table->usMclkDependencyTableOffset));
  2967  
  2968  /* The following fields are not initialized here: id 
orderedList allStatesList */
  2969  power_state->classification.ui_label =
  2970  (le16_to_cpu(state_entry->usClassification) &
  2971  ATOM_PPLIB_CLASSIFICATION_UI_MASK) >>
  2972  ATOM_PPLIB_CLASSIFICATION_UI_SHIFT;
  2973  power_state->classification.flags = classification_flag;
  2974  /* NOTE: There is a classification2 flag in BIOS that is not 
being used right now */
  2975  
  2976  power_state->classification.temporary_state = false;
  2977  power_state->classification.to_be_deleted = false;
  2978  
  2979  power_state->validation.disallowOnDC =
  2980  (0 != 
(le32_to_cpu(state_entry->ulCapsAndSettings) &
  2981  ATOM_Tonga_DISALLOW_ON_DC));
  2982  
  2983  power_state->pcie.lanes = 0;
  2984  
  2985  power_state->display.disableFrameModulation = false;
  2986  power_state->display.limitRefreshrate = false;
  2987  power_state->display.enableVariBright =
  2988  (0 != 
(le32_to_cpu(state_entry->ulCapsAndSettings) &
  2989  ATOM_Tonga_ENABLE_VARIBRIGHT));
  2990  
  2991  power_state->validation.supportedPowerLevels = 0;
  2992  power_state->uvd_clocks.VCLK = 0;
  2993  power_state->uvd_clocks.DCLK = 0;
  2994  power_state->temperatures.min = 0;
  2995  power_state->temperatures.max = 0;
  2996  
  2997  performance_level = &(smu7_power_state->performance_levels
  2998  [smu7_power_state->performance_level_count++]);
  2999  
  3000  PP_ASSERT_WITH_CODE(
  3001  (smu7_power_state->performance_level_count < 
smum_get_mac_definition(hwmgr->smumgr, SMU_MAX_LEVELS_GRAPHICS)),
  3002  "Performance levels exceeds SMC limit!",
  3003  return -EINVAL);
  3004  
  3005  PP_ASSERT_WITH_CODE(
  3006  (smu7_power_state->performance_level_count <=
  3007  
hwmgr->platform_descriptor.hardwareActivityPerformanceLevels),
  3008  "Performance levels exceeds Driver limit!",
  3009  return -EINVAL);
  3010  
  3011  /* Performance levels are arranged from low to high. */
  3012  performance_level->memory_clock = mclk_dep_table->entries
  3013  [state_entry->ucMemoryClockIndexLow].ulMclk;
  3014