Re: [PATCH 4/5] drm/scheduler: Job timeout handler returns status (v2)

2020-12-07 Thread Luben Tuikov
On 2020-12-04 3:13 a.m., Christian König wrote:
> Thinking more about that I came to the conclusion that the whole 
> approach here isn't correct.
> 
> See even when the job has been completed or canceled we still want to 
> restart the timer.
> 
> The reason for this is that the timer is then not restarted for the 
> current job, but for the next job in the queue.

Got it. I'll make that change in patch 5/5, as this patch, 4/5,
only changes the timeout callback's return type from void to an enum and
doesn't affect behaviour.
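
(For reference, the change this patch makes to the callback signature amounts to
roughly the following in struct drm_sched_backend_ops; a sketch of the intent,
not the verbatim hunk:)

-	void (*timedout_job)(struct drm_sched_job *sched_job);
+	enum drm_task_status (*timedout_job)(struct drm_sched_job *sched_job);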

> The only valid reason to not restart the timer is that the whole device 
> was hot plugged and we return -ENODEV here. E.g. what Andrey has been 
> working on.

Yes, perhaps something like DRM_TASK_STATUS_ENODEV.
We can add it now or later when Andrey adds his
hotplug/unplug patches.
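
(For context, the enum this patch introduces in include/drm/gpu_scheduler.h is
roughly the following; DRM_TASK_STATUS_ENODEV is only the value suggested above
and is not part of this patch:)

enum drm_task_status {
	DRM_TASK_STATUS_COMPLETE,
	DRM_TASK_STATUS_ALIVE,
	/* Hypothetical: suggested above for the hot-unplug case, to be
	 * added together with Andrey's hotplug work.
	 */
	DRM_TASK_STATUS_ENODEV,
};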

Regards,
Luben

> 
> Regards,
> Christian.
> 
> Am 04.12.20 um 04:17 schrieb Luben Tuikov:
>> The driver's job timeout handler now returns
>> status indicating back to the DRM layer whether
>> the task (job) was successfully aborted or whether
>> more time should be given to the task to complete.
>>
>> Default behaviour, as of this patch, is preserved,
>> except in the obvious-by-comment case in the Panfrost
>> driver, as documented below.
>>
>> All drivers which make use of the
>> drm_sched_backend_ops' .timedout_job() callback
>> have been updated accordingly and now return the
>> would've-been default value of
>> DRM_TASK_STATUS_ALIVE to restart the task's
>> timeout timer--this is the old behaviour, and
>> is preserved by this patch.
>>
>> In the case of the Panfrost driver, its timedout
>> callback correctly first checks if the job had
>> completed in due time and if so, it now returns
>> DRM_TASK_STATUS_COMPLETE to notify the DRM layer
>> that the task can be moved to the done list, to be
>> freed later. In the other two subsequent checks,
>> the value of DRM_TASK_STATUS_ALIVE is returned, as
>> per the default behaviour.
>>
>> More involved per-driver solutions can follow
>> in subsequent patches.
>>
>> Signed-off-by: Luben Tuikov 
>> Reported-by: kernel test robot 
>>
>> Cc: Alexander Deucher 
>> Cc: Andrey Grodzovsky 
>> Cc: Christian König 
>> Cc: Daniel Vetter 
>> Cc: Lucas Stach 
>> Cc: Russell King 
>> Cc: Christian Gmeiner 
>> Cc: Qiang Yu 
>> Cc: Rob Herring 
>> Cc: Tomeu Vizoso 
>> Cc: Steven Price 
>> Cc: Alyssa Rosenzweig 
>> Cc: Eric Anholt 
>>
>> v2: Use enum as the status of a driver's job
>>  timeout callback method.
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
>>   drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++-
>>   drivers/gpu/drm/lima/lima_sched.c   |  4 +++-
>>   drivers/gpu/drm/panfrost/panfrost_job.c |  9 ---
>>   drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
>>   drivers/gpu/drm/v3d/v3d_sched.c | 32 +
>>   include/drm/gpu_scheduler.h | 20 +---
>>   7 files changed, 57 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index ff48101bab55..a111326cbdde 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -28,7 +28,7 @@
>>   #include "amdgpu.h"
>>   #include "amdgpu_trace.h"
>>   
>> -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>> +static enum drm_task_status amdgpu_job_timedout(struct drm_sched_job *s_job)
>>   {
>>  struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>  struct amdgpu_job *job = to_amdgpu_job(s_job);
>> @@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct drm_sched_job 
>> *s_job)
>>  amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) 
>> {
>>  DRM_ERROR("ring %s timeout, but soft recovered\n",
>>s_job->sched->name);
>> -return;
>> +return DRM_TASK_STATUS_ALIVE;
>>  }
>>   
>>  amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>> @@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct drm_sched_job 
>> *s_job)
>>   
>>  if (amdgpu_device_should_recover_gpu(ring->adev)) {
>>  amdgpu_device_gpu_recover(ring->adev, job);
>> +return DRM_TASK_STATUS_ALIVE;
>>  } else {
>>  drm_sched_suspend_timeout(&ring->sched);
>>  if (amdgpu_sriov_vf(adev))
>>  adev->virt.tdr_debug = true;
>> +return DRM_TASK_STATUS_ALIVE;
>>  }
>>   }
>>   
>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
>> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> index cd46c882269c..c49516942328 100644
>> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> @@ -82,7 +82,8 @@ static struct dma_fence *etnaviv_sched_run_job(struct 
>> drm_sched_job *sched_job)
>>  return fence;
>>   }
>>   
>> -static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
>> +static enum drm_task_status 

Re: [PATCH 4/5] drm/scheduler: Job timeout handler returns status (v2)

2020-12-07 Thread Andrey Grodzovsky


On 12/7/20 2:19 PM, Christian König wrote:

Am 07.12.20 um 20:09 schrieb Andrey Grodzovsky:


On 12/7/20 1:04 PM, Christian König wrote:

Am 07.12.20 um 17:00 schrieb Andrey Grodzovsky:


On 12/7/20 6:13 AM, Christian König wrote:

Am 04.12.20 um 16:10 schrieb Andrey Grodzovsky:


On 12/4/20 3:13 AM, Christian König wrote:
Thinking more about that I came to the conclusion that the whole 
approach here isn't correct.


See even when the job has been completed or canceled we still want to 
restart the timer.


The reason for this is that the timer is then not restarted for the 
current job, but for the next job in the queue.


The only valid reason to not restart the timer is that the whole device 
was hot plugged and we return -ENODEV here. E.g. what Andrey has been 
working on.



We discussed this with Luben offline a few days ago, but came to the
conclusion that for the next job the timer restart in drm_sched_job_begin
should do the work, no?


Nope, drm_sched_job_begin() pushes the job to the hardware and starts the 
timeout in case the hardware was idle before.



drm_sched_job_begin only adds the job to the ring mirror list and rearms the
timer; I don't see how it is related to whether the HW was idle before?


It doesn't rearm the timer. It initially starts the timer when the hardware 
is idle.



It schedules delayed work for the timer task if the ring mirror list is not empty.
Am I missing something?



Ok, let me explain from the beginning.

drm_sched_start_timeout() initially starts the timer; it does NOT rearm it!
When the timer is already running it doesn't have any effect at all.


In the sense that delayed work cannot be enqueued while another instance is still
in the queue, I agree.

I forgot about this in the context of drm_sched_start_timeout.
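
(For reference, drm_sched_start_timeout() in drivers/gpu/drm/scheduler/sched_main.c
looked roughly like this at the time; the key point is that schedule_delayed_work()
is a no-op when the work is already pending, so the timer is started, never rearmed:)

static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
{
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    !list_empty(&sched->ring_mirror_list))
		/* Does nothing if work_tdr is already pending. */
		schedule_delayed_work(&sched->work_tdr, sched->timeout);
}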




When a job completes drm_sched_get_cleanup_job() cancels the timer, frees the 
job and then starts a new timer for the engine.
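
(A simplified sketch of the flow described here, based on
drivers/gpu/drm/scheduler/sched_main.c of this era; not the verbatim upstream
function:)

static struct drm_sched_job *
drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *job;

	/* Don't free a job while the timeout worker may still be running. */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    !cancel_delayed_work(&sched->work_tdr))
		return NULL;

	spin_lock(&sched->job_list_lock);
	job = list_first_entry_or_null(&sched->ring_mirror_list,
				       struct drm_sched_job, node);
	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
		/* The job is done; hand it back to the caller for freeing. */
		list_del_init(&job->node);
	} else {
		job = NULL;
		/* Queue the timeout for the next job in the list. */
		drm_sched_start_timeout(sched);
	}
	spin_unlock(&sched->job_list_lock);

	return job;
}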


When a timeout happens the job is either canceled or given some extra time by
putting it back on the pending list.


When the job is canceled the timer must be restarted for the next job, because 
drm_sched_job_begin() was already called long ago.



Now I get it. The next job might have called (and probably did) drm_sched_job_begin
while the previous timer work (the currently executing one)
was still in the workqueue, so we cannot count on it to actually have
restarted the timer, and so we must do it ourselves.
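
(For reference, drm_sched_job_begin() looked roughly like this at the time; it only
adds the job to the ring mirror list and starts the timeout if one isn't already
pending, which is also what the rename suggestion below is about:)

static void drm_sched_job_begin(struct drm_sched_job *s_job)
{
	struct drm_gpu_scheduler *sched = s_job->sched;

	spin_lock(&sched->job_list_lock);
	list_add_tail(&s_job->node, &sched->ring_mirror_list);
	drm_sched_start_timeout(sched);
	spin_unlock(&sched->job_list_lock);
}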






When the job gets some extra time we should also restart the timer.



Same as above.

Thanks for clarifying this.

Andrey




The only case when the timer should not be restarted is when the device was 
hotplugged and is completely gone now.


I think the right approach to stop this messing with the ring mirror list is 
to avoid using the job altogether for recovery.


What we should do instead is to put the recovery information on the scheduler 
fence, because that is the object which stays alive after pushing the job to 
the hardware.






Christian.



Andrey




Christian.



Andrey




The function should probably be renamed to drm_sched_job_pushed() because 
it doesn't begin the execution in any way.


Christian.








Andrey




Regards,
Christian.

Am 04.12.20 um 04:17 schrieb Luben Tuikov:

The driver's job timeout handler now returns
status indicating back to the DRM layer whether
the task (job) was successfully aborted or whether
more time should be given to the task to complete.

Default behaviour as of this patch, is preserved,
except in obvious-by-comment case in the Panfrost
driver, as documented below.

All drivers which make use of the
drm_sched_backend_ops' .timedout_job() callback
have been accordingly renamed and return the
would've-been default value of
DRM_TASK_STATUS_ALIVE to restart the task's
timeout timer--this is the old behaviour, and
is preserved by this patch.

In the case of the Panfrost driver, its timedout
callback correctly first checks if the job had
completed in due time and if so, it now returns
DRM_TASK_STATUS_COMPLETE to notify the DRM layer
that the task can be moved to the done list, to be
freed later. In the other two subsequent checks,
the value of DRM_TASK_STATUS_ALIVE is returned, as
per the default behaviour.

More involved per-driver solutions can follow
in subsequent patches.

Signed-off-by: Luben Tuikov 
Reported-by: kernel test robot 

Cc: Alexander Deucher 
Cc: Andrey Grodzovsky 
Cc: Christian König 
Cc: Daniel Vetter 
Cc: Lucas Stach 
Cc: Russell King 
Cc: Christian Gmeiner 
Cc: Qiang Yu 
Cc: Rob Herring 
Cc: Tomeu Vizoso 
Cc: Steven Price 
Cc: Alyssa Rosenzweig 
Cc: Eric Anholt 

v2: Use enum as the status of a driver's job
 timeout callback method.
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
  drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++-
  drivers/gpu/drm/lima/lima_sched.c   |  4 +++-
  drivers/gpu/drm/panfrost/panfrost_job.c |  9 ---
  drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
  drivers/gpu/drm/v3d/v3d_sched.c | 32 

Re: [PATCH 4/5] drm/scheduler: Job timeout handler returns status (v2)

2020-12-07 Thread Christian König

Am 07.12.20 um 20:09 schrieb Andrey Grodzovsky:


On 12/7/20 1:04 PM, Christian König wrote:

Am 07.12.20 um 17:00 schrieb Andrey Grodzovsky:


On 12/7/20 6:13 AM, Christian König wrote:

Am 04.12.20 um 16:10 schrieb Andrey Grodzovsky:


On 12/4/20 3:13 AM, Christian König wrote:
Thinking more about that I came to the conclusion that the whole 
approach here isn't correct.


See even when the job has been completed or canceled we still 
want to restart the timer.


The reason for this is that the timer is then not restarted for 
the current job, but for the next job in the queue.


The only valid reason to not restart the timer is that the whole 
device was hot plugged and we return -ENODEV here. E.g. what 
Andrey has been working on.



We discussed this with Luben offline a few days ago, but came to the
conclusion that for the next job the timer restart in
drm_sched_job_begin should do the work, no?


Nope, drm_sched_job_begin() pushes the job to the hardware and 
starts the timeout in case the hardware was idle before.



drm_sched_job_begin only adds the job to the ring mirror list and rearms
the timer; I don't see how it is related to whether the HW was idle
before?


It doesn't rearm the timer. It initially starts the timer when the 
hardware is idle.



It schedules delayed work for the timer task if the ring mirror list is not
empty. Am I missing something?



Ok, let me explain from the beginning.

drm_sched_start_timeout() initially starts the timer; it does NOT rearm
it! When the timer is already running it doesn't have any effect at all.


When a job completes drm_sched_get_cleanup_job() cancels the timer, 
frees the job and then starts a new timer for the engine.


When a timeout happens the job is either canceled or given some extra
time by putting it back on the pending list.


When the job is canceled the timer must be restarted for the next job, 
because drm_sched_job_begin() was already called long ago.


When the job gets some extra time we should also restart the timer.

The only case when the timer should not be restarted is when the device 
was hotplugged and is completely gone now.


I think the right approach to stop this messing with the ring mirror 
list is to avoid using the job altogether for recovery.


What we should do instead is to put the recovery information on the 
scheduler fence, because that is the object which stays alive after 
pushing the job to the hardware.


Christian.



Andrey




Christian.



Andrey




The function should probably be renamed to drm_sched_job_pushed() 
because it doesn't begin the execution in any way.


Christian.








Andrey




Regards,
Christian.

Am 04.12.20 um 04:17 schrieb Luben Tuikov:

The driver's job timeout handler now returns
status indicating back to the DRM layer whether
the task (job) was successfully aborted or whether
more time should be given to the task to complete.

Default behaviour as of this patch, is preserved,
except in obvious-by-comment case in the Panfrost
driver, as documented below.

All drivers which make use of the
drm_sched_backend_ops' .timedout_job() callback
have been accordingly renamed and return the
would've-been default value of
DRM_TASK_STATUS_ALIVE to restart the task's
timeout timer--this is the old behaviour, and
is preserved by this patch.

In the case of the Panfrost driver, its timedout
callback correctly first checks if the job had
completed in due time and if so, it now returns
DRM_TASK_STATUS_COMPLETE to notify the DRM layer
that the task can be moved to the done list, to be
freed later. In the other two subsequent checks,
the value of DRM_TASK_STATUS_ALIVE is returned, as
per the default behaviour.

More involved per-driver solutions can follow
in subsequent patches.

Signed-off-by: Luben Tuikov 
Reported-by: kernel test robot 

Cc: Alexander Deucher 
Cc: Andrey Grodzovsky 
Cc: Christian König 
Cc: Daniel Vetter 
Cc: Lucas Stach 
Cc: Russell King 
Cc: Christian Gmeiner 
Cc: Qiang Yu 
Cc: Rob Herring 
Cc: Tomeu Vizoso 
Cc: Steven Price 
Cc: Alyssa Rosenzweig 
Cc: Eric Anholt 

v2: Use enum as the status of a driver's job
 timeout callback method.
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
  drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++-
  drivers/gpu/drm/lima/lima_sched.c   |  4 +++-
  drivers/gpu/drm/panfrost/panfrost_job.c |  9 ---
  drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
  drivers/gpu/drm/v3d/v3d_sched.c | 32 
+

  include/drm/gpu_scheduler.h | 20 +---
  7 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index ff48101bab55..a111326cbdde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -28,7 +28,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
+static enum drm_task_status 

Re: [PATCH 4/5] drm/scheduler: Job timeout handler returns status (v2)

2020-12-07 Thread Andrey Grodzovsky


On 12/7/20 1:04 PM, Christian König wrote:

Am 07.12.20 um 17:00 schrieb Andrey Grodzovsky:


On 12/7/20 6:13 AM, Christian König wrote:

Am 04.12.20 um 16:10 schrieb Andrey Grodzovsky:


On 12/4/20 3:13 AM, Christian König wrote:
Thinking more about that I came to the conclusion that the whole approach 
here isn't correct.


See even when the job has been completed or canceled we still want to 
restart the timer.


The reason for this is that the timer is then not restarted for the 
current job, but for the next job in the queue.


The only valid reason to not restart the timer is that the whole device 
was hot plugged and we return -ENODEV here. E.g. what Andrey has been 
working on.



We discussed this with Luben offline a few days ago, but came to the
conclusion that for the next job the timer restart in drm_sched_job_begin
should do the work, no?


Nope, drm_sched_job_begin() pushes the job to the hardware and starts the 
timeout in case the hardware was idle before.



drm_sched_job_begin only adds the job to the ring mirror list and rearms the
timer; I don't see how it is related to whether the HW was idle before?


It doesn't rearm the timer. It initially starts the timer when the hardware is 
idle.



It schedules delayed work for the timer task if the ring mirror list is not empty. Am I
missing something?


Andrey




Christian.



Andrey




The function should probably be renamed to drm_sched_job_pushed() because it 
doesn't begin the execution in any way.


Christian.








Andrey




Regards,
Christian.

Am 04.12.20 um 04:17 schrieb Luben Tuikov:

The driver's job timeout handler now returns
status indicating back to the DRM layer whether
the task (job) was successfully aborted or whether
more time should be given to the task to complete.

Default behaviour as of this patch, is preserved,
except in obvious-by-comment case in the Panfrost
driver, as documented below.

All drivers which make use of the
drm_sched_backend_ops' .timedout_job() callback
have been accordingly renamed and return the
would've-been default value of
DRM_TASK_STATUS_ALIVE to restart the task's
timeout timer--this is the old behaviour, and
is preserved by this patch.

In the case of the Panfrost driver, its timedout
callback correctly first checks if the job had
completed in due time and if so, it now returns
DRM_TASK_STATUS_COMPLETE to notify the DRM layer
that the task can be moved to the done list, to be
freed later. In the other two subsequent checks,
the value of DRM_TASK_STATUS_ALIVE is returned, as
per the default behaviour.

More involved per-driver solutions can follow
in subsequent patches.

Signed-off-by: Luben Tuikov 
Reported-by: kernel test robot 

Cc: Alexander Deucher 
Cc: Andrey Grodzovsky 
Cc: Christian König 
Cc: Daniel Vetter 
Cc: Lucas Stach 
Cc: Russell King 
Cc: Christian Gmeiner 
Cc: Qiang Yu 
Cc: Rob Herring 
Cc: Tomeu Vizoso 
Cc: Steven Price 
Cc: Alyssa Rosenzweig 
Cc: Eric Anholt 

v2: Use enum as the status of a driver's job
 timeout callback method.
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
  drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++-
  drivers/gpu/drm/lima/lima_sched.c   |  4 +++-
  drivers/gpu/drm/panfrost/panfrost_job.c |  9 ---
  drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
  drivers/gpu/drm/v3d/v3d_sched.c | 32 +
  include/drm/gpu_scheduler.h | 20 +---
  7 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index ff48101bab55..a111326cbdde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -28,7 +28,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
+static enum drm_task_status amdgpu_job_timedout(struct drm_sched_job 
*s_job)

  {
  struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
  struct amdgpu_job *job = to_amdgpu_job(s_job);
@@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct drm_sched_job 
*s_job)
  amdgpu_ring_soft_recovery(ring, job->vmid, 
s_job->s_fence->parent)) {

  DRM_ERROR("ring %s timeout, but soft recovered\n",
    s_job->sched->name);
-    return;
+    return DRM_TASK_STATUS_ALIVE;
  }
    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
@@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct drm_sched_job 
*s_job)

    if (amdgpu_device_should_recover_gpu(ring->adev)) {
  amdgpu_device_gpu_recover(ring->adev, job);
+    return DRM_TASK_STATUS_ALIVE;
  } else {
drm_sched_suspend_timeout(&ring->sched);
  if (amdgpu_sriov_vf(adev))
  adev->virt.tdr_debug = true;
+    return DRM_TASK_STATUS_ALIVE;
  }
  }
  diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c

index cd46c882269c..c49516942328 100644
--- 

Re: [PATCH 4/5] drm/scheduler: Job timeout handler returns status (v2)

2020-12-07 Thread Christian König

Am 07.12.20 um 17:00 schrieb Andrey Grodzovsky:


On 12/7/20 6:13 AM, Christian König wrote:

Am 04.12.20 um 16:10 schrieb Andrey Grodzovsky:


On 12/4/20 3:13 AM, Christian König wrote:
Thinking more about that I came to the conclusion that the whole 
approach here isn't correct.


See even when the job has been completed or canceled we still want 
to restart the timer.


The reason for this is that the timer is then not restarted for the 
current job, but for the next job in the queue.


The only valid reason to not restart the timer is that the whole 
device was hot plugged and we return -ENODEV here. E.g. what Andrey 
has been working on.



We discussed this with Luben offline a few days ago, but came to the
conclusion that for the next job the timer restart in
drm_sched_job_begin should do the work, no?


Nope, drm_sched_job_begin() pushes the job to the hardware and starts 
the timeout in case the hardware was idle before.



drm_sched_job_begin only adds the job to the ring mirror list and rearms
the timer; I don't see how it is related to whether the HW was idle
before?


It doesn't rearm the timer. It initially starts the timer when the 
hardware is idle.


Christian.



Andrey




The function should probably be renamed to drm_sched_job_pushed() 
because it doesn't begin the execution in any way.


Christian.








Andrey




Regards,
Christian.

Am 04.12.20 um 04:17 schrieb Luben Tuikov:

The driver's job timeout handler now returns
status indicating back to the DRM layer whether
the task (job) was successfully aborted or whether
more time should be given to the task to complete.

Default behaviour as of this patch, is preserved,
except in obvious-by-comment case in the Panfrost
driver, as documented below.

All drivers which make use of the
drm_sched_backend_ops' .timedout_job() callback
have been accordingly renamed and return the
would've-been default value of
DRM_TASK_STATUS_ALIVE to restart the task's
timeout timer--this is the old behaviour, and
is preserved by this patch.

In the case of the Panfrost driver, its timedout
callback correctly first checks if the job had
completed in due time and if so, it now returns
DRM_TASK_STATUS_COMPLETE to notify the DRM layer
that the task can be moved to the done list, to be
freed later. In the other two subsequent checks,
the value of DRM_TASK_STATUS_ALIVE is returned, as
per the default behaviour.

More involved per-driver solutions can follow
in subsequent patches.

Signed-off-by: Luben Tuikov 
Reported-by: kernel test robot 

Cc: Alexander Deucher 
Cc: Andrey Grodzovsky 
Cc: Christian König 
Cc: Daniel Vetter 
Cc: Lucas Stach 
Cc: Russell King 
Cc: Christian Gmeiner 
Cc: Qiang Yu 
Cc: Rob Herring 
Cc: Tomeu Vizoso 
Cc: Steven Price 
Cc: Alyssa Rosenzweig 
Cc: Eric Anholt 

v2: Use enum as the status of a driver's job
 timeout callback method.
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
  drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++-
  drivers/gpu/drm/lima/lima_sched.c   |  4 +++-
  drivers/gpu/drm/panfrost/panfrost_job.c |  9 ---
  drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
  drivers/gpu/drm/v3d/v3d_sched.c | 32 
+

  include/drm/gpu_scheduler.h | 20 +---
  7 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index ff48101bab55..a111326cbdde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -28,7 +28,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
+static enum drm_task_status amdgpu_job_timedout(struct 
drm_sched_job *s_job)

  {
  struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
  struct amdgpu_job *job = to_amdgpu_job(s_job);
@@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct 
drm_sched_job *s_job)
  amdgpu_ring_soft_recovery(ring, job->vmid, 
s_job->s_fence->parent)) {

  DRM_ERROR("ring %s timeout, but soft recovered\n",
    s_job->sched->name);
-    return;
+    return DRM_TASK_STATUS_ALIVE;
  }
    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
@@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct 
drm_sched_job *s_job)

    if (amdgpu_device_should_recover_gpu(ring->adev)) {
  amdgpu_device_gpu_recover(ring->adev, job);
+    return DRM_TASK_STATUS_ALIVE;
  } else {
  drm_sched_suspend_timeout(&ring->sched);
  if (amdgpu_sriov_vf(adev))
  adev->virt.tdr_debug = true;
+    return DRM_TASK_STATUS_ALIVE;
  }
  }
  diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c

index cd46c882269c..c49516942328 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -82,7 +82,8 @@ static struct dma_fence 

Re: [PATCH 4/5] drm/scheduler: Job timeout handler returns status (v2)

2020-12-07 Thread Andrey Grodzovsky


On 12/7/20 6:13 AM, Christian König wrote:

Am 04.12.20 um 16:10 schrieb Andrey Grodzovsky:


On 12/4/20 3:13 AM, Christian König wrote:
Thinking more about that I came to the conclusion that the whole approach 
here isn't correct.


See even when the job has been completed or canceled we still want to 
restart the timer.


The reason for this is that the timer is then not restarted for the current 
job, but for the next job in the queue.


The only valid reason to not restart the timer is that the whole device was 
hot plugged and we return -ENODEV here. E.g. what Andrey has been working on.



We discussed this with Luben offline a few days ago, but came to the conclusion
that for the next job the timer restart in drm_sched_job_begin should do the
work, no?


Nope, drm_sched_job_begin() pushes the job to the hardware and starts the 
timeout in case the hardware was idle before.



drm_sched_job_begin only adds the job to the ring mirror list and rearms the timer;
I don't see how it is related to whether the HW was idle before?


Andrey




The function should probably be renamed to drm_sched_job_pushed() because it 
doesn't begin the execution in any way.


Christian.








Andrey




Regards,
Christian.

Am 04.12.20 um 04:17 schrieb Luben Tuikov:

The driver's job timeout handler now returns
status indicating back to the DRM layer whether
the task (job) was successfully aborted or whether
more time should be given to the task to complete.

Default behaviour as of this patch, is preserved,
except in obvious-by-comment case in the Panfrost
driver, as documented below.

All drivers which make use of the
drm_sched_backend_ops' .timedout_job() callback
have been accordingly renamed and return the
would've-been default value of
DRM_TASK_STATUS_ALIVE to restart the task's
timeout timer--this is the old behaviour, and
is preserved by this patch.

In the case of the Panfrost driver, its timedout
callback correctly first checks if the job had
completed in due time and if so, it now returns
DRM_TASK_STATUS_COMPLETE to notify the DRM layer
that the task can be moved to the done list, to be
freed later. In the other two subsequent checks,
the value of DRM_TASK_STATUS_ALIVE is returned, as
per the default behaviour.

More involved per-driver solutions can follow
in subsequent patches.

Signed-off-by: Luben Tuikov 
Reported-by: kernel test robot 

Cc: Alexander Deucher 
Cc: Andrey Grodzovsky 
Cc: Christian König 
Cc: Daniel Vetter 
Cc: Lucas Stach 
Cc: Russell King 
Cc: Christian Gmeiner 
Cc: Qiang Yu 
Cc: Rob Herring 
Cc: Tomeu Vizoso 
Cc: Steven Price 
Cc: Alyssa Rosenzweig 
Cc: Eric Anholt 

v2: Use enum as the status of a driver's job
 timeout callback method.
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
  drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++-
  drivers/gpu/drm/lima/lima_sched.c   |  4 +++-
  drivers/gpu/drm/panfrost/panfrost_job.c |  9 ---
  drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
  drivers/gpu/drm/v3d/v3d_sched.c | 32 +
  include/drm/gpu_scheduler.h | 20 +---
  7 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index ff48101bab55..a111326cbdde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -28,7 +28,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
+static enum drm_task_status amdgpu_job_timedout(struct drm_sched_job *s_job)
  {
  struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
  struct amdgpu_job *job = to_amdgpu_job(s_job);
@@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
  amdgpu_ring_soft_recovery(ring, job->vmid, 
s_job->s_fence->parent)) {

  DRM_ERROR("ring %s timeout, but soft recovered\n",
    s_job->sched->name);
-    return;
+    return DRM_TASK_STATUS_ALIVE;
  }
    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
@@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct drm_sched_job 
*s_job)

    if (amdgpu_device_should_recover_gpu(ring->adev)) {
  amdgpu_device_gpu_recover(ring->adev, job);
+    return DRM_TASK_STATUS_ALIVE;
  } else {
  drm_sched_suspend_timeout(&ring->sched);
  if (amdgpu_sriov_vf(adev))
  adev->virt.tdr_debug = true;
+    return DRM_TASK_STATUS_ALIVE;
  }
  }
  diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c

index cd46c882269c..c49516942328 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -82,7 +82,8 @@ static struct dma_fence *etnaviv_sched_run_job(struct 
drm_sched_job *sched_job)

  return fence;
  }
  -static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
+static enum drm_task_status 

Re: [PATCH 4/5] drm/scheduler: Job timeout handler returns status (v2)

2020-12-07 Thread Christian König

Am 04.12.20 um 16:10 schrieb Andrey Grodzovsky:


On 12/4/20 3:13 AM, Christian König wrote:
Thinking more about that I came to the conclusion that the whole 
approach here isn't correct.


See even when the job has been completed or canceled we still want to 
restart the timer.


The reason for this is that the timer is then not restarted for the 
current job, but for the next job in the queue.


The only valid reason to not restart the timer is that the whole 
device was hot plugged and we return -ENODEV here. E.g. what Andrey 
has been working on.



We discussed this with Luben offline a few days ago, but came to the
conclusion that for the next job the timer restart in
drm_sched_job_begin should do the work, no?


Nope, drm_sched_job_begin() pushes the job to the hardware and starts 
the timeout in case the hardware was idle before.


The function should probably be renamed to drm_sched_job_pushed() 
because it doesn't begin the execution in any way.


Christian.



Andrey




Regards,
Christian.

Am 04.12.20 um 04:17 schrieb Luben Tuikov:

The driver's job timeout handler now returns
status indicating back to the DRM layer whether
the task (job) was successfully aborted or whether
more time should be given to the task to complete.

Default behaviour as of this patch, is preserved,
except in obvious-by-comment case in the Panfrost
driver, as documented below.

All drivers which make use of the
drm_sched_backend_ops' .timedout_job() callback
have been accordingly renamed and return the
would've-been default value of
DRM_TASK_STATUS_ALIVE to restart the task's
timeout timer--this is the old behaviour, and
is preserved by this patch.

In the case of the Panfrost driver, its timedout
callback correctly first checks if the job had
completed in due time and if so, it now returns
DRM_TASK_STATUS_COMPLETE to notify the DRM layer
that the task can be moved to the done list, to be
freed later. In the other two subsequent checks,
the value of DRM_TASK_STATUS_ALIVE is returned, as
per the default behaviour.

More involved per-driver solutions can follow
in subsequent patches.

Signed-off-by: Luben Tuikov 
Reported-by: kernel test robot 

Cc: Alexander Deucher 
Cc: Andrey Grodzovsky 
Cc: Christian König 
Cc: Daniel Vetter 
Cc: Lucas Stach 
Cc: Russell King 
Cc: Christian Gmeiner 
Cc: Qiang Yu 
Cc: Rob Herring 
Cc: Tomeu Vizoso 
Cc: Steven Price 
Cc: Alyssa Rosenzweig 
Cc: Eric Anholt 

v2: Use enum as the status of a driver's job
 timeout callback method.
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
  drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++-
  drivers/gpu/drm/lima/lima_sched.c   |  4 +++-
  drivers/gpu/drm/panfrost/panfrost_job.c |  9 ---
  drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
  drivers/gpu/drm/v3d/v3d_sched.c | 32 
+

  include/drm/gpu_scheduler.h | 20 +---
  7 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index ff48101bab55..a111326cbdde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -28,7 +28,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
+static enum drm_task_status amdgpu_job_timedout(struct 
drm_sched_job *s_job)

  {
  struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
  struct amdgpu_job *job = to_amdgpu_job(s_job);
@@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct 
drm_sched_job *s_job)
  amdgpu_ring_soft_recovery(ring, job->vmid, 
s_job->s_fence->parent)) {

  DRM_ERROR("ring %s timeout, but soft recovered\n",
    s_job->sched->name);
-    return;
+    return DRM_TASK_STATUS_ALIVE;
  }
    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
@@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct 
drm_sched_job *s_job)

    if (amdgpu_device_should_recover_gpu(ring->adev)) {
  amdgpu_device_gpu_recover(ring->adev, job);
+    return DRM_TASK_STATUS_ALIVE;
  } else {
  drm_sched_suspend_timeout(&ring->sched);
  if (amdgpu_sriov_vf(adev))
  adev->virt.tdr_debug = true;
+    return DRM_TASK_STATUS_ALIVE;
  }
  }
  diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c

index cd46c882269c..c49516942328 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -82,7 +82,8 @@ static struct dma_fence 
*etnaviv_sched_run_job(struct drm_sched_job *sched_job)

  return fence;
  }
  -static void etnaviv_sched_timedout_job(struct drm_sched_job 
*sched_job)
+static enum drm_task_status etnaviv_sched_timedout_job(struct 
drm_sched_job

+   *sched_job)
  {
  struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
  struct etnaviv_gpu *gpu = 

Re: [PATCH 4/5] drm/scheduler: Job timeout handler returns status (v2)

2020-12-04 Thread Andrey Grodzovsky


On 12/4/20 3:13 AM, Christian König wrote:
Thinking more about that I came to the conclusion that the whole approach here 
isn't correct.


See even when the job has been completed or canceled we still want to restart 
the timer.


The reason for this is that the timer is then not restarted for the current 
job, but for the next job in the queue.


The only valid reason to not restart the timer is that the whole device was 
hot plugged and we return -ENODEV here. E.g. what Andrey has been working on.



We discussed this with Luben offline a few days ago, but came to the conclusion
that for the next job the timer restart in drm_sched_job_begin should do the
work, no?


Andrey




Regards,
Christian.

Am 04.12.20 um 04:17 schrieb Luben Tuikov:

The driver's job timeout handler now returns
status indicating back to the DRM layer whether
the task (job) was successfully aborted or whether
more time should be given to the task to complete.

Default behaviour as of this patch, is preserved,
except in obvious-by-comment case in the Panfrost
driver, as documented below.

All drivers which make use of the
drm_sched_backend_ops' .timedout_job() callback
have been accordingly renamed and return the
would've-been default value of
DRM_TASK_STATUS_ALIVE to restart the task's
timeout timer--this is the old behaviour, and
is preserved by this patch.

In the case of the Panfrost driver, its timedout
callback correctly first checks if the job had
completed in due time and if so, it now returns
DRM_TASK_STATUS_COMPLETE to notify the DRM layer
that the task can be moved to the done list, to be
freed later. In the other two subsequent checks,
the value of DRM_TASK_STATUS_ALIVE is returned, as
per the default behaviour.

More involved per-driver solutions can follow
in subsequent patches.

Signed-off-by: Luben Tuikov 
Reported-by: kernel test robot 

Cc: Alexander Deucher 
Cc: Andrey Grodzovsky 
Cc: Christian König 
Cc: Daniel Vetter 
Cc: Lucas Stach 
Cc: Russell King 
Cc: Christian Gmeiner 
Cc: Qiang Yu 
Cc: Rob Herring 
Cc: Tomeu Vizoso 
Cc: Steven Price 
Cc: Alyssa Rosenzweig 
Cc: Eric Anholt 

v2: Use enum as the status of a driver's job
 timeout callback method.
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
  drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++-
  drivers/gpu/drm/lima/lima_sched.c   |  4 +++-
  drivers/gpu/drm/panfrost/panfrost_job.c |  9 ---
  drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
  drivers/gpu/drm/v3d/v3d_sched.c | 32 +
  include/drm/gpu_scheduler.h | 20 +---
  7 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index ff48101bab55..a111326cbdde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -28,7 +28,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
+static enum drm_task_status amdgpu_job_timedout(struct drm_sched_job *s_job)
  {
  struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
  struct amdgpu_job *job = to_amdgpu_job(s_job);
@@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
  amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
  DRM_ERROR("ring %s timeout, but soft recovered\n",
    s_job->sched->name);
-    return;
+    return DRM_TASK_STATUS_ALIVE;
  }
  amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
@@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
    if (amdgpu_device_should_recover_gpu(ring->adev)) {
  amdgpu_device_gpu_recover(ring->adev, job);
+    return DRM_TASK_STATUS_ALIVE;
  } else {
  drm_sched_suspend_timeout(&ring->sched);
  if (amdgpu_sriov_vf(adev))
  adev->virt.tdr_debug = true;
+    return DRM_TASK_STATUS_ALIVE;
  }
  }
  diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c

index cd46c882269c..c49516942328 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -82,7 +82,8 @@ static struct dma_fence *etnaviv_sched_run_job(struct 
drm_sched_job *sched_job)

  return fence;
  }
  -static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
+static enum drm_task_status etnaviv_sched_timedout_job(struct drm_sched_job
+   *sched_job)
  {
  struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
  struct etnaviv_gpu *gpu = submit->gpu;
@@ -120,9 +121,16 @@ static void etnaviv_sched_timedout_job(struct 
drm_sched_job *sched_job)

    drm_sched_resubmit_jobs(&gpu->sched);
  +    /* Tell the DRM scheduler that this task needs
+ * more time.
+ */
+    drm_sched_start(&gpu->sched, true);
+    return DRM_TASK_STATUS_ALIVE;
+
  out_no_timeout:
 

Re: [PATCH 4/5] drm/scheduler: Job timeout handler returns status (v2)

2020-12-04 Thread Christian König
Thinking more about that I came to the conclusion that the whole 
approach here isn't correct.


See even when the job has been completed or canceled we still want to 
restart the timer.


The reason for this is that the timer is then not restarted for the 
current job, but for the next job in the queue.


The only valid reason to not restart the timer is that the whole device 
was hot plugged and we return -ENODEV here. E.g. what Andrey has been 
working on.


Regards,
Christian.

Am 04.12.20 um 04:17 schrieb Luben Tuikov:

The driver's job timeout handler now returns
status indicating back to the DRM layer whether
the task (job) was successfully aborted or whether
more time should be given to the task to complete.

Default behaviour as of this patch, is preserved,
except in obvious-by-comment case in the Panfrost
driver, as documented below.

All drivers which make use of the
drm_sched_backend_ops' .timedout_job() callback
have been accordingly renamed and return the
would've-been default value of
DRM_TASK_STATUS_ALIVE to restart the task's
timeout timer--this is the old behaviour, and
is preserved by this patch.

In the case of the Panfrost driver, its timedout
callback correctly first checks if the job had
completed in due time and if so, it now returns
DRM_TASK_STATUS_COMPLETE to notify the DRM layer
that the task can be moved to the done list, to be
freed later. In the other two subsequent checks,
the value of DRM_TASK_STATUS_ALIVE is returned, as
per the default behaviour.

More involved per-driver solutions can follow
in subsequent patches.

Signed-off-by: Luben Tuikov 
Reported-by: kernel test robot 

Cc: Alexander Deucher 
Cc: Andrey Grodzovsky 
Cc: Christian König 
Cc: Daniel Vetter 
Cc: Lucas Stach 
Cc: Russell King 
Cc: Christian Gmeiner 
Cc: Qiang Yu 
Cc: Rob Herring 
Cc: Tomeu Vizoso 
Cc: Steven Price 
Cc: Alyssa Rosenzweig 
Cc: Eric Anholt 

v2: Use enum as the status of a driver's job
 timeout callback method.
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
  drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++-
  drivers/gpu/drm/lima/lima_sched.c   |  4 +++-
  drivers/gpu/drm/panfrost/panfrost_job.c |  9 ---
  drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
  drivers/gpu/drm/v3d/v3d_sched.c | 32 +
  include/drm/gpu_scheduler.h | 20 +---
  7 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index ff48101bab55..a111326cbdde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -28,7 +28,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  
-static void amdgpu_job_timedout(struct drm_sched_job *s_job)

+static enum drm_task_status amdgpu_job_timedout(struct drm_sched_job *s_job)
  {
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
@@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) 
{
DRM_ERROR("ring %s timeout, but soft recovered\n",
  s_job->sched->name);
-   return;
+   return DRM_TASK_STATUS_ALIVE;
}
  
  	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);

@@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
  
  	if (amdgpu_device_should_recover_gpu(ring->adev)) {

amdgpu_device_gpu_recover(ring->adev, job);
+   return DRM_TASK_STATUS_ALIVE;
} else {
drm_sched_suspend_timeout(&ring->sched);
if (amdgpu_sriov_vf(adev))
adev->virt.tdr_debug = true;
+   return DRM_TASK_STATUS_ALIVE;
}
  }
  
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c

index cd46c882269c..c49516942328 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -82,7 +82,8 @@ static struct dma_fence *etnaviv_sched_run_job(struct 
drm_sched_job *sched_job)
return fence;
  }
  
-static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)

+static enum drm_task_status etnaviv_sched_timedout_job(struct drm_sched_job
+  *sched_job)
  {
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
struct etnaviv_gpu *gpu = submit->gpu;
@@ -120,9 +121,16 @@ static void etnaviv_sched_timedout_job(struct 
drm_sched_job *sched_job)
  
  	drm_sched_resubmit_jobs(&gpu->sched);
  
+	/* Tell the DRM scheduler that this task needs

+* more time.
+*/
+   drm_sched_start(&gpu->sched, true);
+   return DRM_TASK_STATUS_ALIVE;
+
  out_no_timeout:
/* restart scheduler after GPU is usable again */
drm_sched_start(&gpu->sched, true);
+