RE: [PATCH v2 6/6] drm/amdgpu: reduce reset time
[AMD Official Use Only - General] XGMI connected GPUs share the same reset domain software object. If the devices are not connected with each other, there is one domain per device. Thanks, Lijo -Original Message- From: Zhao, Victor Sent: Friday, July 29, 2022 2:58 PM To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org Cc: Deng, Emily ; Grodzovsky, Andrey Subject: RE: [PATCH v2 6/6] drm/amdgpu: reduce reset time [AMD Official Use Only - General] Hi Lijo, Can you be more specific why other gpus can be affected? I don’t have a xgmi system on my side. I thought reset_domain is per device. Thanks, Victor -Original Message- From: Lazar, Lijo Sent: Friday, July 29, 2022 2:11 PM To: Zhao, Victor ; amd-gfx@lists.freedesktop.org Cc: Deng, Emily ; Grodzovsky, Andrey Subject: Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time On 7/28/2022 4:00 PM, Victor Zhao wrote: > In multi container use case, reset time is important, so skip ring > tests and cp halt wait during ip suspending for reset as they are > going to fail and cost more time on reset > > v2: add a hang flag to indicate the reset comes from a job timeout, > skip ring test and cp halt wait in this case > > Signed-off-by: Victor Zhao > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 3 ++- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 11 +-- > 5 files changed, 15 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index 222d3d7ea076..c735a17c6afb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -27,6 +27,7 @@ > #include "amdgpu_gfx.h" > #include "amdgpu_rlc.h" > #include "amdgpu_ras.h" > +#include "amdgpu_reset.h" > > /* delay 0.1 second to enable gfx off feature */ > #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100) > @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev) > kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i], > RESET_QUEUES, 0, 0); > > - if (adev->gfx.kiq.ring.sched.ready) > + if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && > +adev->reset_domain->hang)) > r = amdgpu_ring_test_helper(kiq_ring); On a system with multiple GPUs interconnected, this will affect other GPUs as well on which job was not really running. I guess your usecase here is device specific. Thanks, Lijo > spin_unlock(&adev->gfx.kiq.ring_lock); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 6c3e7290153f..bb40880a557f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct > drm_sched_job *s_job) > } > > memset(&ti, 0, sizeof(struct amdgpu_task_info)); > + adev->reset_domain->hang = true; > > if (amdgpu_gpu_recovery && > amdgpu_ring_soft_recovery(ring, job->vmid, > s_job->s_fence->parent)) { @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat > amdgpu_job_timedout(struct drm_sched_job *s_job) > } > > exit: > + adev->reset_domain->hang = false; > drm_dev_exit(idx); > return DRM_GPU_SCHED_STAT_NOMINAL; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > index 9da5ead50c90..b828fe773f50 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > @@ -155,6 +155,7 @@ struct amdgpu_reset_domain > *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d > atomic_set(&reset_domain->in_gpu_reset, 0); > atomic_set(&reset_domain->reset_res, 0); > init_rwsem(&reset_domain->sem); > + reset_domain->hang = false; > > return reset_domain; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > index cc4b2eeb24cf..29e324add552 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > @@ -84,6 +84,7 @@ struct amdgpu_reset_domain { > struct rw_semaphore sem; > atomic_t in_gpu_reset; > atomic_t reset_res; > + bool hang; > }; > > > diff --git a/drivers/gpu/d
RE: [PATCH v2 6/6] drm/amdgpu: reduce reset time
[AMD Official Use Only - General] Hi Lijo, Can you be more specific why other gpus can be affected? I don’t have a xgmi system on my side. I thought reset_domain is per device. Thanks, Victor -Original Message- From: Lazar, Lijo Sent: Friday, July 29, 2022 2:11 PM To: Zhao, Victor ; amd-gfx@lists.freedesktop.org Cc: Deng, Emily ; Grodzovsky, Andrey Subject: Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time On 7/28/2022 4:00 PM, Victor Zhao wrote: > In multi container use case, reset time is important, so skip ring > tests and cp halt wait during ip suspending for reset as they are > going to fail and cost more time on reset > > v2: add a hang flag to indicate the reset comes from a job timeout, > skip ring test and cp halt wait in this case > > Signed-off-by: Victor Zhao > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 3 ++- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 11 +-- > 5 files changed, 15 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index 222d3d7ea076..c735a17c6afb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -27,6 +27,7 @@ > #include "amdgpu_gfx.h" > #include "amdgpu_rlc.h" > #include "amdgpu_ras.h" > +#include "amdgpu_reset.h" > > /* delay 0.1 second to enable gfx off feature */ > #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100) > @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev) > kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i], > RESET_QUEUES, 0, 0); > > - if (adev->gfx.kiq.ring.sched.ready) > + if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && > +adev->reset_domain->hang)) > r = amdgpu_ring_test_helper(kiq_ring); On a system with multiple GPUs interconnected, this will affect other GPUs as well on which job was not really running. I guess your usecase here is device specific. Thanks, Lijo > spin_unlock(&adev->gfx.kiq.ring_lock); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 6c3e7290153f..bb40880a557f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct > drm_sched_job *s_job) > } > > memset(&ti, 0, sizeof(struct amdgpu_task_info)); > + adev->reset_domain->hang = true; > > if (amdgpu_gpu_recovery && > amdgpu_ring_soft_recovery(ring, job->vmid, > s_job->s_fence->parent)) { @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat > amdgpu_job_timedout(struct drm_sched_job *s_job) > } > > exit: > + adev->reset_domain->hang = false; > drm_dev_exit(idx); > return DRM_GPU_SCHED_STAT_NOMINAL; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > index 9da5ead50c90..b828fe773f50 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > @@ -155,6 +155,7 @@ struct amdgpu_reset_domain > *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d > atomic_set(&reset_domain->in_gpu_reset, 0); > atomic_set(&reset_domain->reset_res, 0); > init_rwsem(&reset_domain->sem); > + reset_domain->hang = false; > > return reset_domain; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > index cc4b2eeb24cf..29e324add552 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > @@ -84,6 +84,7 @@ struct amdgpu_reset_domain { > struct rw_semaphore sem; > atomic_t in_gpu_reset; > atomic_t reset_res; > + bool hang; > }; > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index fafbad3cf08d..a384e04d916c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -29,6 +29,7 @@ > #include "amdgpu.h" > #include "amdgpu_gfx.h" > #include "amdgpu_psp.h" > +#include "amdgpu_reset.h" > #include &q
Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time
On 7/28/2022 4:00 PM, Victor Zhao wrote: In multi container use case, reset time is important, so skip ring tests and cp halt wait during ip suspending for reset as they are going to fail and cost more time on reset v2: add a hang flag to indicate the reset comes from a job timeout, skip ring test and cp halt wait in this case Signed-off-by: Victor Zhao --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 11 +-- 5 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 222d3d7ea076..c735a17c6afb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -27,6 +27,7 @@ #include "amdgpu_gfx.h" #include "amdgpu_rlc.h" #include "amdgpu_ras.h" +#include "amdgpu_reset.h" /* delay 0.1 second to enable gfx off feature */ #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100) @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev) kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i], RESET_QUEUES, 0, 0); - if (adev->gfx.kiq.ring.sched.ready) + if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && adev->reset_domain->hang)) r = amdgpu_ring_test_helper(kiq_ring); On a system with multiple GPUs interconnected, this will affect other GPUs as well on which job was not really running. I guess your usecase here is device specific. Thanks, Lijo spin_unlock(&adev->gfx.kiq.ring_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 6c3e7290153f..bb40880a557f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } memset(&ti, 0, sizeof(struct amdgpu_task_info)); + adev->reset_domain->hang = true; if (amdgpu_gpu_recovery && amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } exit: + adev->reset_domain->hang = false; drm_dev_exit(idx); return DRM_GPU_SCHED_STAT_NOMINAL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 9da5ead50c90..b828fe773f50 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -155,6 +155,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d atomic_set(&reset_domain->in_gpu_reset, 0); atomic_set(&reset_domain->reset_res, 0); init_rwsem(&reset_domain->sem); + reset_domain->hang = false; return reset_domain; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index cc4b2eeb24cf..29e324add552 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -84,6 +84,7 @@ struct amdgpu_reset_domain { struct rw_semaphore sem; atomic_t in_gpu_reset; atomic_t reset_res; + bool hang; }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index fafbad3cf08d..a384e04d916c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -29,6 +29,7 @@ #include "amdgpu.h" #include "amdgpu_gfx.h" #include "amdgpu_psp.h" +#include "amdgpu_reset.h" #include "nv.h" #include "nvd.h" @@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable) WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp); } + if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable) + return 0; + for (i = 0; i < adev->usec_timeout; i++) { if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0) break; @@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_gfx_rings; i++) kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i], PREEMPT_QUEUES, 0, 0); - - return amdgpu_ring_test_helper(kiq_ring); + if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang)) + return amdgpu_ring_test_helper(kiq_ring); + else + return 0; } #endif @@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle) return 0; } + gfx_v10_0_cp_enable(adev, false);
Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time
On 2022-07-28 06:30, Victor Zhao wrote: In multi container use case, reset time is important, so skip ring tests and cp halt wait during ip suspending for reset as they are going to fail and cost more time on reset v2: add a hang flag to indicate the reset comes from a job timeout, skip ring test and cp halt wait in this case Signed-off-by: Victor Zhao --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 11 +-- 5 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 222d3d7ea076..c735a17c6afb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -27,6 +27,7 @@ #include "amdgpu_gfx.h" #include "amdgpu_rlc.h" #include "amdgpu_ras.h" +#include "amdgpu_reset.h" /* delay 0.1 second to enable gfx off feature */ #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100) @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev) kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i], RESET_QUEUES, 0, 0); - if (adev->gfx.kiq.ring.sched.ready) + if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && adev->reset_domain->hang)) I think it's enough to look at adev->reset_domain->hang and you can drop the amdgpu_in_reset check. r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&adev->gfx.kiq.ring_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 6c3e7290153f..bb40880a557f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } memset(&ti, 0, sizeof(struct amdgpu_task_info)); + adev->reset_domain->hang = true; if (amdgpu_gpu_recovery && amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } exit: + adev->reset_domain->hang = false; drm_dev_exit(idx); return DRM_GPU_SCHED_STAT_NOMINAL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 9da5ead50c90..b828fe773f50 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -155,6 +155,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d atomic_set(&reset_domain->in_gpu_reset, 0); atomic_set(&reset_domain->reset_res, 0); init_rwsem(&reset_domain->sem); + reset_domain->hang = false; return reset_domain; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index cc4b2eeb24cf..29e324add552 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -84,6 +84,7 @@ struct amdgpu_reset_domain { struct rw_semaphore sem; atomic_t in_gpu_reset; atomic_t reset_res; + bool hang; }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index fafbad3cf08d..a384e04d916c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -29,6 +29,7 @@ #include "amdgpu.h" #include "amdgpu_gfx.h" #include "amdgpu_psp.h" +#include "amdgpu_reset.h" #include "nv.h" #include "nvd.h" @@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable) WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp); } + if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable) + return 0; + Same as above for (i = 0; i < adev->usec_timeout; i++) { if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0) break; @@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_gfx_rings; i++) kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i], PREEMPT_QUEUES, 0, 0); - - return amdgpu_ring_test_helper(kiq_ring); + if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang)) Same as above Andrey + return amdgpu_ring_test_helper(kiq_ring); + else + return 0; } #endif @@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle) return 0; } + gfx_v10_0_cp_enable(adev, false); gfx_v10_0_enable_gui_idle_inter
[PATCH v2 6/6] drm/amdgpu: reduce reset time
In multi container use case, reset time is important, so skip ring tests and cp halt wait during ip suspending for reset as they are going to fail and cost more time on reset v2: add a hang flag to indicate the reset comes from a job timeout, skip ring test and cp halt wait in this case Signed-off-by: Victor Zhao --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 11 +-- 5 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 222d3d7ea076..c735a17c6afb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -27,6 +27,7 @@ #include "amdgpu_gfx.h" #include "amdgpu_rlc.h" #include "amdgpu_ras.h" +#include "amdgpu_reset.h" /* delay 0.1 second to enable gfx off feature */ #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100) @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev) kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i], RESET_QUEUES, 0, 0); - if (adev->gfx.kiq.ring.sched.ready) + if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && adev->reset_domain->hang)) r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&adev->gfx.kiq.ring_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 6c3e7290153f..bb40880a557f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } memset(&ti, 0, sizeof(struct amdgpu_task_info)); + adev->reset_domain->hang = true; if (amdgpu_gpu_recovery && amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } exit: + adev->reset_domain->hang = false; drm_dev_exit(idx); return DRM_GPU_SCHED_STAT_NOMINAL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 9da5ead50c90..b828fe773f50 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -155,6 +155,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d atomic_set(&reset_domain->in_gpu_reset, 0); atomic_set(&reset_domain->reset_res, 0); init_rwsem(&reset_domain->sem); + reset_domain->hang = false; return reset_domain; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index cc4b2eeb24cf..29e324add552 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -84,6 +84,7 @@ struct amdgpu_reset_domain { struct rw_semaphore sem; atomic_t in_gpu_reset; atomic_t reset_res; + bool hang; }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index fafbad3cf08d..a384e04d916c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -29,6 +29,7 @@ #include "amdgpu.h" #include "amdgpu_gfx.h" #include "amdgpu_psp.h" +#include "amdgpu_reset.h" #include "nv.h" #include "nvd.h" @@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable) WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp); } + if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable) + return 0; + for (i = 0; i < adev->usec_timeout; i++) { if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0) break; @@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_gfx_rings; i++) kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i], PREEMPT_QUEUES, 0, 0); - - return amdgpu_ring_test_helper(kiq_ring); + if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang)) + return amdgpu_ring_test_helper(kiq_ring); + else + return 0; } #endif @@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle) return 0; } + gfx_v10_0_cp_enable(adev, false); gfx_v10_0_enable_gui_idle_interrupt(adev, false); -- 2.25.1