Re: [PATCH 5/5] drm/amdgpu: cleanup job reset routine (v2)

2017-10-23 Thread Christian König

On 23.10.2017 at 07:46, Monk Liu wrote:

merge setting the guilty flag on the context into this function
to avoid implementing an extra routine.

v2:
go through the entity list and compare the fence_ctx
before operating on the entity; otherwise the entity
may be just a wild pointer.

Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540
Signed-off-by: Monk Liu 
Reviewed-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  4 ++--
  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 31 ++-
  drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +-
  3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index dae1e5b..a07544d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
amd_sched_job_kickout(&job->base);
  
		/* only do job_reset on the hang ring if @job not NULL */
-   amd_sched_hw_job_reset(&ring->sched);
+   amd_sched_hw_job_reset(&ring->sched, NULL);

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
amdgpu_fence_driver_force_completion(ring);
@@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
if (!ring || !ring->sched.thread)
continue;
kthread_park(ring->sched.thread);
-   amd_sched_hw_job_reset(&ring->sched);
+   amd_sched_hw_job_reset(&ring->sched, NULL);
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
amdgpu_fence_driver_force_completion(ring);
}
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 97fd255..39b6205 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -443,9 +443,18 @@ static void amd_sched_job_timedout(struct work_struct *work)
job->sched->ops->timedout_job(job);
  }
  
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
+static void amd_sched_set_guilty(struct amd_sched_job *s_job)
+{
+   if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
+   if (s_job->s_entity->guilty)
+   atomic_set(s_job->s_entity->guilty, 1);
+}
+
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
  {
struct amd_sched_job *s_job;
+   struct amd_sched_entity *entity, *tmp;
+   int i;
  
	spin_lock(&sched->job_list_lock);
list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
@@ -458,6 +467,26 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
}
}
spin_unlock(&sched->job_list_lock);
+
+   if (bad) {
+   bool found = false;
+
+   for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++) {
+   struct amd_sched_rq *rq = &sched->sched_rq[i];
+
+   spin_lock(&rq->lock);
+   list_for_each_entry_safe(entity, tmp, &rq->entities, list) {


No need for the _safe version here since we don't modify the rq list.
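
E.g. the plain iterator is enough (a minimal sketch, reusing the names
from the patch):

	list_for_each_entry(entity, &rq->entities, list) {
		if (bad->s_fence->scheduled.context == entity->fence_context) {
			found = true;
			amd_sched_set_guilty(bad);
			break;
		}
	}

With that the tmp variable can be dropped as well.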


+   if (bad->s_fence->scheduled.context == entity->fence_context) {
+   found = true;
+   amd_sched_set_guilty(bad);


I would prefer if we gave the entity as a separate parameter here. This
way we can set job->entity to NULL or even remove the field entirely.
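
Something like this, as a rough sketch (the exact signature is just a
suggestion):

	static void amd_sched_set_guilty(struct amd_sched_job *s_job,
					 struct amd_sched_entity *s_entity)
	{
		if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
			if (s_entity->guilty)
				atomic_set(s_entity->guilty, 1);
	}

The call site above would then pass the entity it just matched, e.g.
amd_sched_set_guilty(bad, entity);.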


But that can be changed later on as well. Either way the series is
Reviewed-by: Christian König.


Thanks,
Christian.


+   break;
+   }
+   }
+   spin_unlock(&rq->lock);
+   if (found)
+   break;
+   }
+   }
  }
  
  void amd_sched_job_kickout(struct amd_sched_job *s_job)

diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index a05994c..be75172 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -174,7 +174,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
   struct amd_gpu_scheduler *sched,
   struct amd_sched_entity *entity,
   void *owner);
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *job);
  void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
  bool amd_sched_dependency_optimized(struct dma_fence* fence,
struct amd_sched_entity *entity);

Re: [PATCH 5/5] drm/amdgpu: cleanup job reset routine (v2)

2017-10-23 Thread Chunming Zhou

The series is Reviewed-by: Chunming Zhou 


On 2017-10-23 13:46, Monk Liu wrote:

merge setting the guilty flag on the context into this function
to avoid implementing an extra routine.

v2:
go through the entity list and compare the fence_ctx
before operating on the entity; otherwise the entity
may be just a wild pointer.

Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540
Signed-off-by: Monk Liu 
Reviewed-by: Christian König 
[snip]




[PATCH 5/5] drm/amdgpu: cleanup job reset routine (v2)

2017-10-22 Thread Monk Liu
merge setting the guilty flag on the context into this function
to avoid implementing an extra routine.

v2:
go through the entity list and compare the fence_ctx
before operating on the entity; otherwise the entity
may be just a wild pointer.

Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540
Signed-off-by: Monk Liu 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  4 ++--
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 31 ++-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +-
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index dae1e5b..a07544d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
amd_sched_job_kickout(&job->base);
 
/* only do job_reset on the hang ring if @job not NULL */
-   amd_sched_hw_job_reset(&ring->sched);
+   amd_sched_hw_job_reset(&ring->sched, NULL);
 
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
amdgpu_fence_driver_force_completion(ring);
@@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
if (!ring || !ring->sched.thread)
continue;
kthread_park(ring->sched.thread);
-   amd_sched_hw_job_reset(&ring->sched);
+   amd_sched_hw_job_reset(&ring->sched, NULL);
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
amdgpu_fence_driver_force_completion(ring);
}
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 97fd255..39b6205 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -443,9 +443,18 @@ static void amd_sched_job_timedout(struct work_struct *work)
job->sched->ops->timedout_job(job);
 }
 
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
+static void amd_sched_set_guilty(struct amd_sched_job *s_job)
+{
+   if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
+   if (s_job->s_entity->guilty)
+   atomic_set(s_job->s_entity->guilty, 1);
+}
+
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
 {
struct amd_sched_job *s_job;
+   struct amd_sched_entity *entity, *tmp;
+   int i;
 
spin_lock(&sched->job_list_lock);
list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
@@ -458,6 +467,26 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
}
}
spin_unlock(&sched->job_list_lock);
+
+   if (bad) {
+   bool found = false;
+
+   for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++) {
+   struct amd_sched_rq *rq = &sched->sched_rq[i];
+
+   spin_lock(&rq->lock);
+   list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
+   if (bad->s_fence->scheduled.context == entity->fence_context) {
+   found = true;
+   amd_sched_set_guilty(bad);
+   break;
+   }
+   }
+   spin_unlock(&rq->lock);
+   if (found)
+   break;
+   }
+   }
 }
 
 void amd_sched_job_kickout(struct amd_sched_job *s_job)
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index a05994c..be75172 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -174,7 +174,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
   struct amd_gpu_scheduler *sched,
   struct amd_sched_entity *entity,
   void *owner);
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *job);
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
 bool amd_sched_dependency_optimized(struct dma_fence* fence,
struct amd_sched_entity *entity);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx