Re: [PATCH v3 1/2] drm/sched: Refactor ring mirror list handling.

2018-12-17 Thread Koenig, Christian
Am 17.12.18 um 17:57 schrieb Grodzovsky, Andrey:
>
> On 12/17/2018 10:27 AM, Christian König wrote:
>> Am 10.12.18 um 22:43 schrieb Andrey Grodzovsky:
>>> Decouple sched threads stop and start and ring mirror
>>> list handling from the policy of what to do about the
>>> guilty jobs.
>>> When stopping the sched thread and detaching sched fences
>>> from non-signaled HW fences, wait for all signaled HW fences
>>> to complete before rerunning the jobs.
>>>
>>> v2: Fix resubmission of guilty job into HW after refactoring.
>>>
>>> Suggested-by: Christian Koenig 
>>> Signed-off-by: Andrey Grodzovsky 
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  17 +++--
>>>    drivers/gpu/drm/etnaviv/etnaviv_sched.c    |   8 +--
>>>    drivers/gpu/drm/scheduler/sched_main.c | 110
>>> ++---
>>>    drivers/gpu/drm/v3d/v3d_sched.c    |  11 +--
>>>    include/drm/gpu_scheduler.h    |  10 ++-
>>>    5 files changed, 95 insertions(+), 61 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index ef36cc5..42111d5 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3292,17 +3292,16 @@ static int
>>> amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>    /* block all schedulers and reset given job's ring */
>>>    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>    struct amdgpu_ring *ring = adev->rings[i];
>>> +    bool park_only = job && job->base.sched != &ring->sched;
>>>      if (!ring || !ring->sched.thread)
>>>    continue;
>>>    -    kthread_park(ring->sched.thread);
>>> +    drm_sched_stop(&ring->sched, job ? &job->base : NULL,
>>> park_only);
>>>    -    if (job && job->base.sched != &ring->sched)
>>> +    if (park_only)
>>>    continue;
>>>    -    drm_sched_hw_job_reset(&ring->sched, job ? &job->base :
>>> NULL);
>>> -
>>>    /* after all hw jobs are reset, hw fence is meaningless, so
>>> force_completion */
>>>    amdgpu_fence_driver_force_completion(ring);
>>>    }
>>> @@ -3445,6 +3444,7 @@ static void
>>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
>>>      struct amdgpu_job *job)
>>>    {
>>>    int i;
>>> +    bool unpark_only;
>>>      for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>    struct amdgpu_ring *ring = adev->rings[i];
>>> @@ -3456,10 +3456,13 @@ static void
>>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
>>>     * or all rings (in the case @job is NULL)
>>>     * after above amdgpu_reset accomplished
>>>     */
>>> -    if ((!job || job->base.sched == &ring->sched) &&
>>> !adev->asic_reset_res)
>>> -    drm_sched_job_recovery(&ring->sched);
>>> +    unpark_only = (job && job->base.sched != &ring->sched) ||
>>> +   adev->asic_reset_res;
>>> +
>>> +    if (!unpark_only)
>>> +    drm_sched_resubmit_jobs(&ring->sched);
>>>    -    kthread_unpark(ring->sched.thread);
>>> +    drm_sched_start(&ring->sched, unpark_only);
>>>    }
>>>      if (!amdgpu_device_has_dc_support(adev)) {
>>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>> index 49a6763..fab3b51 100644
>>> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>> @@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct
>>> drm_sched_job *sched_job)
>>>    }
>>>      /* block scheduler */
>>> -    kthread_park(gpu->sched.thread);
>>> -    drm_sched_hw_job_reset(&gpu->sched, sched_job);
>>> +    drm_sched_stop(&gpu->sched, sched_job, false);
>>>      /* get the GPU back into the init state */
>>>    etnaviv_core_dump(gpu);
>>>    etnaviv_gpu_recover_hang(gpu);
>>>    +    drm_sched_resubmit_jobs(&gpu->sched);
>>> +
>>>    /* restart scheduler after GPU is usable again */
>>> -    drm_sched_job_recovery(&gpu->sched);
>>> -    kthread_unpark(gpu->sched.thread);
>>> +    drm_sched_start(&gpu->sched);
>>>    }
>>>      static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>> index dbb6906..cdf95e2 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -60,8 +60,6 @@
>>>      static void drm_sched_process_job(struct dma_fence *f, struct
>>> dma_fence_cb *cb);
>>>    -static void drm_sched_expel_job_unlocked(struct drm_sched_job
>>> *s_job);
>>> -
>>>    /**
>>>     * drm_sched_rq_init - initialize a given run queue struct
>>>     *
>>> @@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct
>>> work_struct *work)
>>>     * @bad: bad scheduler job
>>>     *
>>>     */
>>> -void drm_sched_hw_job_reset(struct drm_gpu_

Re: [PATCH v3 1/2] drm/sched: Refactor ring mirror list handling.

2018-12-17 Thread Grodzovsky, Andrey


On 12/17/2018 10:27 AM, Christian König wrote:
> Am 10.12.18 um 22:43 schrieb Andrey Grodzovsky:
>> Decouple sched threads stop and start and ring mirror
>> list handling from the policy of what to do about the
>> guilty jobs.
>> When stopping the sched thread and detaching sched fences
>> from non-signaled HW fences, wait for all signaled HW fences
>> to complete before rerunning the jobs.
>>
>> v2: Fix resubmission of guilty job into HW after refactoring.
>>
>> Suggested-by: Christian Koenig 
>> Signed-off-by: Andrey Grodzovsky 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  17 +++--
>>   drivers/gpu/drm/etnaviv/etnaviv_sched.c    |   8 +--
>>   drivers/gpu/drm/scheduler/sched_main.c | 110 
>> ++---
>>   drivers/gpu/drm/v3d/v3d_sched.c    |  11 +--
>>   include/drm/gpu_scheduler.h    |  10 ++-
>>   5 files changed, 95 insertions(+), 61 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index ef36cc5..42111d5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3292,17 +3292,16 @@ static int 
>> amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>   /* block all schedulers and reset given job's ring */
>>   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>   struct amdgpu_ring *ring = adev->rings[i];
>> +    bool park_only = job && job->base.sched != &ring->sched;
>>     if (!ring || !ring->sched.thread)
>>   continue;
>>   -    kthread_park(ring->sched.thread);
>> +    drm_sched_stop(&ring->sched, job ? &job->base : NULL, 
>> park_only);
>>   -    if (job && job->base.sched != &ring->sched)
>> +    if (park_only)
>>   continue;
>>   -    drm_sched_hw_job_reset(&ring->sched, job ? &job->base : 
>> NULL);
>> -
>>   /* after all hw jobs are reset, hw fence is meaningless, so 
>> force_completion */
>>   amdgpu_fence_driver_force_completion(ring);
>>   }
>> @@ -3445,6 +3444,7 @@ static void 
>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
>>     struct amdgpu_job *job)
>>   {
>>   int i;
>> +    bool unpark_only;
>>     for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>   struct amdgpu_ring *ring = adev->rings[i];
>> @@ -3456,10 +3456,13 @@ static void 
>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
>>    * or all rings (in the case @job is NULL)
>>    * after above amdgpu_reset accomplished
>>    */
>> -    if ((!job || job->base.sched == &ring->sched) && 
>> !adev->asic_reset_res)
>> -    drm_sched_job_recovery(&ring->sched);
>> +    unpark_only = (job && job->base.sched != &ring->sched) ||
>> +   adev->asic_reset_res;
>> +
>> +    if (!unpark_only)
>> +    drm_sched_resubmit_jobs(&ring->sched);
>>   -    kthread_unpark(ring->sched.thread);
>> +    drm_sched_start(&ring->sched, unpark_only);
>>   }
>>     if (!amdgpu_device_has_dc_support(adev)) {
>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
>> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> index 49a6763..fab3b51 100644
>> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> @@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct 
>> drm_sched_job *sched_job)
>>   }
>>     /* block scheduler */
>> -    kthread_park(gpu->sched.thread);
>> -    drm_sched_hw_job_reset(&gpu->sched, sched_job);
>> +    drm_sched_stop(&gpu->sched, sched_job, false);
>>     /* get the GPU back into the init state */
>>   etnaviv_core_dump(gpu);
>>   etnaviv_gpu_recover_hang(gpu);
>>   +    drm_sched_resubmit_jobs(&gpu->sched);
>> +
>>   /* restart scheduler after GPU is usable again */
>> -    drm_sched_job_recovery(&gpu->sched);
>> -    kthread_unpark(gpu->sched.thread);
>> +    drm_sched_start(&gpu->sched);
>>   }
>>     static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
>> b/drivers/gpu/drm/scheduler/sched_main.c
>> index dbb6906..cdf95e2 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -60,8 +60,6 @@
>>     static void drm_sched_process_job(struct dma_fence *f, struct 
>> dma_fence_cb *cb);
>>   -static void drm_sched_expel_job_unlocked(struct drm_sched_job 
>> *s_job);
>> -
>>   /**
>>    * drm_sched_rq_init - initialize a given run queue struct
>>    *
>> @@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct 
>> work_struct *work)
>>    * @bad: bad scheduler job
>>    *
>>    */
>> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct 
>> drm_sched_job *bad)
>> +void drm_sched_stop(struct drm_gpu_scheduler *sched, struct 
>> drm_sched_job *bad,
>> +    bool park_only)
>>   {
>>   struct drm_s

Re: [PATCH v3 1/2] drm/sched: Refactor ring mirror list handling.

2018-12-17 Thread Christian König

Am 10.12.18 um 22:43 schrieb Andrey Grodzovsky:

Decouple sched threads stop and start and ring mirror
list handling from the policy of what to do about the
guilty jobs.
When stopping the sched thread and detaching sched fences
from non-signaled HW fences, wait for all signaled HW fences
to complete before rerunning the jobs.

v2: Fix resubmission of guilty job into HW after refactoring.

Suggested-by: Christian Koenig 
Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  17 +++--
  drivers/gpu/drm/etnaviv/etnaviv_sched.c|   8 +--
  drivers/gpu/drm/scheduler/sched_main.c | 110 ++---
  drivers/gpu/drm/v3d/v3d_sched.c|  11 +--
  include/drm/gpu_scheduler.h|  10 ++-
  5 files changed, 95 insertions(+), 61 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ef36cc5..42111d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3292,17 +3292,16 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
+   bool park_only = job && job->base.sched != &ring->sched;
  
  		if (!ring || !ring->sched.thread)

continue;
  
-		kthread_park(ring->sched.thread);

+   drm_sched_stop(&ring->sched, job ? &job->base : NULL, 
park_only);
  
-		if (job && job->base.sched != &ring->sched)

+   if (park_only)
continue;
  
-		drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL);

-
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
@@ -3445,6 +3444,7 @@ static void amdgpu_device_post_asic_reset(struct 
amdgpu_device *adev,
  struct amdgpu_job *job)
  {
int i;
+   bool unpark_only;
  
  	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

struct amdgpu_ring *ring = adev->rings[i];
@@ -3456,10 +3456,13 @@ static void amdgpu_device_post_asic_reset(struct 
amdgpu_device *adev,
 * or all rings (in the case @job is NULL)
 * after above amdgpu_reset accomplished
 */
-   if ((!job || job->base.sched == &ring->sched) && 
!adev->asic_reset_res)
-   drm_sched_job_recovery(&ring->sched);
+   unpark_only = (job && job->base.sched != &ring->sched) ||
+  adev->asic_reset_res;
+
+   if (!unpark_only)
+   drm_sched_resubmit_jobs(&ring->sched);
  
-		kthread_unpark(ring->sched.thread);

+   drm_sched_start(&ring->sched, unpark_only);
}
  
  	if (!amdgpu_device_has_dc_support(adev)) {

diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 49a6763..fab3b51 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct 
drm_sched_job *sched_job)
}
  
  	/* block scheduler */

-   kthread_park(gpu->sched.thread);
-   drm_sched_hw_job_reset(&gpu->sched, sched_job);
+   drm_sched_stop(&gpu->sched, sched_job, false);
  
  	/* get the GPU back into the init state */

etnaviv_core_dump(gpu);
etnaviv_gpu_recover_hang(gpu);
  
+	drm_sched_resubmit_jobs(&gpu->sched);

+
/* restart scheduler after GPU is usable again */
-   drm_sched_job_recovery(&gpu->sched);
-   kthread_unpark(gpu->sched.thread);
+   drm_sched_start(&gpu->sched);
  }
  
  static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index dbb6906..cdf95e2 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -60,8 +60,6 @@
  
  static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
  
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job);

-
  /**
   * drm_sched_rq_init - initialize a given run queue struct
   *
@@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct work_struct 
*work)
   * @bad: bad scheduler job
   *
   */
-void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct 
drm_sched_job *bad)
+void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad,
+   bool park_only)
  {
struct drm_sched_job *s_job;
struct drm_sched_entity *entity, *tmp;
unsigned long flags;
+   struct list_head wait_list;
int i;
  
+	kthread_park(sched->thread);

+   if (park_only)
+   ret

[PATCH v3 1/2] drm/sched: Refactor ring mirror list handling.

2018-12-10 Thread Andrey Grodzovsky
Decouple sched threads stop and start and ring mirror
list handling from the policy of what to do about the
guilty jobs.
When stopping the sched thread and detaching sched fences
from non-signaled HW fences, wait for all signaled HW fences
to complete before rerunning the jobs.

v2: Fix resubmission of guilty job into HW after refactoring.

Suggested-by: Christian Koenig 
Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  17 +++--
 drivers/gpu/drm/etnaviv/etnaviv_sched.c|   8 +--
 drivers/gpu/drm/scheduler/sched_main.c | 110 ++---
 drivers/gpu/drm/v3d/v3d_sched.c|  11 +--
 include/drm/gpu_scheduler.h|  10 ++-
 5 files changed, 95 insertions(+), 61 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ef36cc5..42111d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3292,17 +3292,16 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
+   bool park_only = job && job->base.sched != &ring->sched;
 
if (!ring || !ring->sched.thread)
continue;
 
-   kthread_park(ring->sched.thread);
+   drm_sched_stop(&ring->sched, job ? &job->base : NULL, 
park_only);
 
-   if (job && job->base.sched != &ring->sched)
+   if (park_only)
continue;
 
-   drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL);
-
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
@@ -3445,6 +3444,7 @@ static void amdgpu_device_post_asic_reset(struct 
amdgpu_device *adev,
  struct amdgpu_job *job)
 {
int i;
+   bool unpark_only;
 
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
@@ -3456,10 +3456,13 @@ static void amdgpu_device_post_asic_reset(struct 
amdgpu_device *adev,
 * or all rings (in the case @job is NULL)
 * after above amdgpu_reset accomplished
 */
-   if ((!job || job->base.sched == &ring->sched) && 
!adev->asic_reset_res)
-   drm_sched_job_recovery(&ring->sched);
+   unpark_only = (job && job->base.sched != &ring->sched) ||
+  adev->asic_reset_res;
+
+   if (!unpark_only)
+   drm_sched_resubmit_jobs(&ring->sched);
 
-   kthread_unpark(ring->sched.thread);
+   drm_sched_start(&ring->sched, unpark_only);
}
 
if (!amdgpu_device_has_dc_support(adev)) {
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 49a6763..fab3b51 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct 
drm_sched_job *sched_job)
}
 
/* block scheduler */
-   kthread_park(gpu->sched.thread);
-   drm_sched_hw_job_reset(&gpu->sched, sched_job);
+   drm_sched_stop(&gpu->sched, sched_job, false);
 
/* get the GPU back into the init state */
etnaviv_core_dump(gpu);
etnaviv_gpu_recover_hang(gpu);
 
+   drm_sched_resubmit_jobs(&gpu->sched);
+
/* restart scheduler after GPU is usable again */
-   drm_sched_job_recovery(&gpu->sched);
-   kthread_unpark(gpu->sched.thread);
+   drm_sched_start(&gpu->sched);
 }
 
 static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index dbb6906..cdf95e2 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -60,8 +60,6 @@
 
 static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb 
*cb);
 
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job);
-
 /**
  * drm_sched_rq_init - initialize a given run queue struct
  *
@@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct work_struct 
*work)
  * @bad: bad scheduler job
  *
  */
-void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct 
drm_sched_job *bad)
+void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad,
+   bool park_only)
 {
struct drm_sched_job *s_job;
struct drm_sched_entity *entity, *tmp;
unsigned long flags;
+   struct list_head wait_list;
int i;
 
+   kthread_park(sched->thread);
+   if (park_only)
+