RE: [PATCH v2 6/6] drm/amdgpu: reduce reset time

2022-07-29 Thread Lazar, Lijo
[AMD Official Use Only - General]

XGMI connected GPUs share the same reset domain software object. If the devices 
are not connected with each other, there is one domain per device.

Thanks,
Lijo

-Original Message-
From: Zhao, Victor  
Sent: Friday, July 29, 2022 2:58 PM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Deng, Emily ; Grodzovsky, Andrey 

Subject: RE: [PATCH v2 6/6] drm/amdgpu: reduce reset time

[AMD Official Use Only - General]

Hi Lijo,

Can you be more specific why other gpus can be affected? I don’t have a xgmi 
system on my side. I thought reset_domain is per device.


Thanks,
Victor



-Original Message-
From: Lazar, Lijo  
Sent: Friday, July 29, 2022 2:11 PM
To: Zhao, Victor ; amd-gfx@lists.freedesktop.org
Cc: Deng, Emily ; Grodzovsky, Andrey 

Subject: Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time



On 7/28/2022 4:00 PM, Victor Zhao wrote:
> In multi container use case, reset time is important, so skip ring 
> tests and cp halt wait during ip suspending for reset as they are 
> going to fail and cost more time on reset
> 
> v2: add a hang flag to indicate the reset comes from a job timeout, 
> skip ring test and cp halt wait in this case
> 
> Signed-off-by: Victor Zhao 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 11 +--
>   5 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 222d3d7ea076..c735a17c6afb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_rlc.h"
>   #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>   
>   /* delay 0.1 second to enable gfx off feature */
>   #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
> @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>   kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>  RESET_QUEUES, 0, 0);
>   
> - if (adev->gfx.kiq.ring.sched.ready)
> + if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && 
> +adev->reset_domain->hang))
>   r = amdgpu_ring_test_helper(kiq_ring);

On a system with multiple GPUs interconnected, this will affect other GPUs as 
well on which job was not really running. I guess your usecase here is device 
specific.

Thanks,
Lijo

>   spin_unlock(&adev->gfx.kiq.ring_lock);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 6c3e7290153f..bb40880a557f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
> drm_sched_job *s_job)
>   }
>   
>   memset(&ti, 0, sizeof(struct amdgpu_task_info));
> + adev->reset_domain->hang = true;
>   
>   if (amdgpu_gpu_recovery &&
>   amdgpu_ring_soft_recovery(ring, job->vmid, 
> s_job->s_fence->parent)) { @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat 
> amdgpu_job_timedout(struct drm_sched_job *s_job)
>   }
>   
>   exit:
> + adev->reset_domain->hang = false;
>   drm_dev_exit(idx);
>   return DRM_GPU_SCHED_STAT_NOMINAL;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 9da5ead50c90..b828fe773f50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -155,6 +155,7 @@ struct amdgpu_reset_domain 
> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>   atomic_set(&reset_domain->in_gpu_reset, 0);
>   atomic_set(&reset_domain->reset_res, 0);
>   init_rwsem(&reset_domain->sem);
> + reset_domain->hang = false;
>   
>   return reset_domain;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index cc4b2eeb24cf..29e324add552 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
>   struct rw_semaphore sem;
>   atomic_t in_gpu_reset;
>   atomic_t reset_res;
> + bool hang;
>   };
>   
>   
> diff --git a/drivers/gpu/d

RE: [PATCH v2 6/6] drm/amdgpu: reduce reset time

2022-07-29 Thread Zhao, Victor
[AMD Official Use Only - General]

Hi Lijo,

Can you be more specific why other gpus can be affected? I don’t have a xgmi 
system on my side. I thought reset_domain is per device.


Thanks,
Victor



-Original Message-
From: Lazar, Lijo  
Sent: Friday, July 29, 2022 2:11 PM
To: Zhao, Victor ; amd-gfx@lists.freedesktop.org
Cc: Deng, Emily ; Grodzovsky, Andrey 

Subject: Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time



On 7/28/2022 4:00 PM, Victor Zhao wrote:
> In multi container use case, reset time is important, so skip ring 
> tests and cp halt wait during ip suspending for reset as they are 
> going to fail and cost more time on reset
> 
> v2: add a hang flag to indicate the reset comes from a job timeout, 
> skip ring test and cp halt wait in this case
> 
> Signed-off-by: Victor Zhao 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 11 +--
>   5 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 222d3d7ea076..c735a17c6afb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_rlc.h"
>   #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>   
>   /* delay 0.1 second to enable gfx off feature */
>   #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
> @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>   kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>  RESET_QUEUES, 0, 0);
>   
> - if (adev->gfx.kiq.ring.sched.ready)
> + if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && 
> +adev->reset_domain->hang))
>   r = amdgpu_ring_test_helper(kiq_ring);

On a system with multiple GPUs interconnected, this will affect other GPUs as 
well on which job was not really running. I guess your usecase here is device 
specific.

Thanks,
Lijo

>   spin_unlock(&adev->gfx.kiq.ring_lock);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 6c3e7290153f..bb40880a557f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
> drm_sched_job *s_job)
>   }
>   
>   memset(&ti, 0, sizeof(struct amdgpu_task_info));
> + adev->reset_domain->hang = true;
>   
>   if (amdgpu_gpu_recovery &&
>   amdgpu_ring_soft_recovery(ring, job->vmid, 
> s_job->s_fence->parent)) { @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat 
> amdgpu_job_timedout(struct drm_sched_job *s_job)
>   }
>   
>   exit:
> + adev->reset_domain->hang = false;
>   drm_dev_exit(idx);
>   return DRM_GPU_SCHED_STAT_NOMINAL;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 9da5ead50c90..b828fe773f50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -155,6 +155,7 @@ struct amdgpu_reset_domain 
> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>   atomic_set(&reset_domain->in_gpu_reset, 0);
>   atomic_set(&reset_domain->reset_res, 0);
>   init_rwsem(&reset_domain->sem);
> + reset_domain->hang = false;
>   
>   return reset_domain;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index cc4b2eeb24cf..29e324add552 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
>   struct rw_semaphore sem;
>   atomic_t in_gpu_reset;
>   atomic_t reset_res;
> + bool hang;
>   };
>   
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index fafbad3cf08d..a384e04d916c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -29,6 +29,7 @@
>   #include "amdgpu.h"
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_psp.h"
> +#include "amdgpu_reset.h"
>   #include &q

Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time

2022-07-28 Thread Lazar, Lijo




On 7/28/2022 4:00 PM, Victor Zhao wrote:

In multi container use case, reset time is important, so skip ring
tests and cp halt wait during ip suspending for reset as they are
going to fail and cost more time on reset

v2: add a hang flag to indicate the reset comes from a job timeout,
skip ring test and cp halt wait in this case

Signed-off-by: Victor Zhao 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 11 +--
  5 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 222d3d7ea076..c735a17c6afb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -27,6 +27,7 @@
  #include "amdgpu_gfx.h"
  #include "amdgpu_rlc.h"
  #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
  
  /* delay 0.1 second to enable gfx off feature */

  #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
@@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
   RESET_QUEUES, 0, 0);
  
-	if (adev->gfx.kiq.ring.sched.ready)

+   if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && 
adev->reset_domain->hang))
r = amdgpu_ring_test_helper(kiq_ring);


On a system with multiple GPUs interconnected, this will affect other 
GPUs as well on which job was not really running. I guess your usecase 
here is device specific.


Thanks,
Lijo


spin_unlock(&adev->gfx.kiq.ring_lock);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index 6c3e7290153f..bb40880a557f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
}
  
  	memset(&ti, 0, sizeof(struct amdgpu_task_info));

+   adev->reset_domain->hang = true;
  
  	if (amdgpu_gpu_recovery &&

amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) 
{
@@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
}
  
  exit:

+   adev->reset_domain->hang = false;
drm_dev_exit(idx);
return DRM_GPU_SCHED_STAT_NOMINAL;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 9da5ead50c90..b828fe773f50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -155,6 +155,7 @@ struct amdgpu_reset_domain 
*amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
atomic_set(&reset_domain->in_gpu_reset, 0);
atomic_set(&reset_domain->reset_res, 0);
init_rwsem(&reset_domain->sem);
+   reset_domain->hang = false;
  
  	return reset_domain;

  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index cc4b2eeb24cf..29e324add552 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
struct rw_semaphore sem;
atomic_t in_gpu_reset;
atomic_t reset_res;
+   bool hang;
  };
  
  
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c

index fafbad3cf08d..a384e04d916c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -29,6 +29,7 @@
  #include "amdgpu.h"
  #include "amdgpu_gfx.h"
  #include "amdgpu_psp.h"
+#include "amdgpu_reset.h"
  #include "nv.h"
  #include "nvd.h"
  
@@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)

WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
}
  
+	if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable)

+   return 0;
+
for (i = 0; i < adev->usec_timeout; i++) {
if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
break;
@@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct 
amdgpu_device *adev)
for (i = 0; i < adev->gfx.num_gfx_rings; i++)
kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
   PREEMPT_QUEUES, 0, 0);
-
-   return amdgpu_ring_test_helper(kiq_ring);
+   if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang))
+   return amdgpu_ring_test_helper(kiq_ring);
+   else
+   return 0;
  }
  #endif
  
@@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle)
  
  		return 0;

}
+
gfx_v10_0_cp_enable(adev, false);
 

Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time

2022-07-28 Thread Andrey Grodzovsky



On 2022-07-28 06:30, Victor Zhao wrote:

In multi container use case, reset time is important, so skip ring
tests and cp halt wait during ip suspending for reset as they are
going to fail and cost more time on reset

v2: add a hang flag to indicate the reset comes from a job timeout,
skip ring test and cp halt wait in this case

Signed-off-by: Victor Zhao 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 11 +--
  5 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 222d3d7ea076..c735a17c6afb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -27,6 +27,7 @@
  #include "amdgpu_gfx.h"
  #include "amdgpu_rlc.h"
  #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
  
  /* delay 0.1 second to enable gfx off feature */

  #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
@@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
   RESET_QUEUES, 0, 0);
  
-	if (adev->gfx.kiq.ring.sched.ready)

+   if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && 
adev->reset_domain->hang))



I think it's enough to look at adev->reset_domain->hang and you can drop 
the amdgpu_in_reset check.



r = amdgpu_ring_test_helper(kiq_ring);
spin_unlock(&adev->gfx.kiq.ring_lock);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index 6c3e7290153f..bb40880a557f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
}
  
  	memset(&ti, 0, sizeof(struct amdgpu_task_info));

+   adev->reset_domain->hang = true;
  
  	if (amdgpu_gpu_recovery &&

amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) 
{
@@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
}
  
  exit:

+   adev->reset_domain->hang = false;
drm_dev_exit(idx);
return DRM_GPU_SCHED_STAT_NOMINAL;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 9da5ead50c90..b828fe773f50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -155,6 +155,7 @@ struct amdgpu_reset_domain 
*amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
atomic_set(&reset_domain->in_gpu_reset, 0);
atomic_set(&reset_domain->reset_res, 0);
init_rwsem(&reset_domain->sem);
+   reset_domain->hang = false;
  
  	return reset_domain;

  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index cc4b2eeb24cf..29e324add552 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
struct rw_semaphore sem;
atomic_t in_gpu_reset;
atomic_t reset_res;
+   bool hang;
  };
  
  
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c

index fafbad3cf08d..a384e04d916c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -29,6 +29,7 @@
  #include "amdgpu.h"
  #include "amdgpu_gfx.h"
  #include "amdgpu_psp.h"
+#include "amdgpu_reset.h"
  #include "nv.h"
  #include "nvd.h"
  
@@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)

WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
}
  
+	if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable)

+   return 0;
+



Same as above



for (i = 0; i < adev->usec_timeout; i++) {
if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
break;
@@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct 
amdgpu_device *adev)
for (i = 0; i < adev->gfx.num_gfx_rings; i++)
kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
   PREEMPT_QUEUES, 0, 0);
-
-   return amdgpu_ring_test_helper(kiq_ring);
+   if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang))



Same as above

Andrey



+   return amdgpu_ring_test_helper(kiq_ring);
+   else
+   return 0;
  }
  #endif
  
@@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle)
  
  		return 0;

}
+
gfx_v10_0_cp_enable(adev, false);
gfx_v10_0_enable_gui_idle_inter

[PATCH v2 6/6] drm/amdgpu: reduce reset time

2022-07-28 Thread Victor Zhao
In multi container use case, reset time is important, so skip ring
tests and cp halt wait during ip suspending for reset as they are
going to fail and cost more time on reset

v2: add a hang flag to indicate the reset comes from a job timeout,
skip ring test and cp halt wait in this case

Signed-off-by: Victor Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 11 +--
 5 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 222d3d7ea076..c735a17c6afb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -27,6 +27,7 @@
 #include "amdgpu_gfx.h"
 #include "amdgpu_rlc.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
 
 /* delay 0.1 second to enable gfx off feature */
 #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
@@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
   RESET_QUEUES, 0, 0);
 
-   if (adev->gfx.kiq.ring.sched.ready)
+   if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && 
adev->reset_domain->hang))
r = amdgpu_ring_test_helper(kiq_ring);
spin_unlock(&adev->gfx.kiq.ring_lock);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 6c3e7290153f..bb40880a557f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
}
 
memset(&ti, 0, sizeof(struct amdgpu_task_info));
+   adev->reset_domain->hang = true;
 
if (amdgpu_gpu_recovery &&
amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) 
{
@@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
}
 
 exit:
+   adev->reset_domain->hang = false;
drm_dev_exit(idx);
return DRM_GPU_SCHED_STAT_NOMINAL;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 9da5ead50c90..b828fe773f50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -155,6 +155,7 @@ struct amdgpu_reset_domain 
*amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
atomic_set(&reset_domain->in_gpu_reset, 0);
atomic_set(&reset_domain->reset_res, 0);
init_rwsem(&reset_domain->sem);
+   reset_domain->hang = false;
 
return reset_domain;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index cc4b2eeb24cf..29e324add552 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
struct rw_semaphore sem;
atomic_t in_gpu_reset;
atomic_t reset_res;
+   bool hang;
 };
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index fafbad3cf08d..a384e04d916c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -29,6 +29,7 @@
 #include "amdgpu.h"
 #include "amdgpu_gfx.h"
 #include "amdgpu_psp.h"
+#include "amdgpu_reset.h"
 #include "nv.h"
 #include "nvd.h"
 
@@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device 
*adev, bool enable)
WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
}
 
+   if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable)
+   return 0;
+
for (i = 0; i < adev->usec_timeout; i++) {
if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
break;
@@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct 
amdgpu_device *adev)
for (i = 0; i < adev->gfx.num_gfx_rings; i++)
kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
   PREEMPT_QUEUES, 0, 0);
-
-   return amdgpu_ring_test_helper(kiq_ring);
+   if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang))
+   return amdgpu_ring_test_helper(kiq_ring);
+   else
+   return 0;
 }
 #endif
 
@@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle)
 
return 0;
}
+
gfx_v10_0_cp_enable(adev, false);
gfx_v10_0_enable_gui_idle_interrupt(adev, false);
 
-- 
2.25.1