Re: [PATCH] drm/amdgpu: revert "fix system hang issue during GPU reset"

2020-08-13 Thread Matt Coffin
On 8/12/20 9:55 AM, Alex Deucher wrote:
> This also broke GPU overclocking.

The fix for the `pp_od_clk_voltage` interface was actually quite simple,
and the bug was limited to that function. The patch just messed up the
return value (which was supposed to be the # of bytes consumed). It was
just returning 0 on success, which caused the userspace client to keep
on trying to send the data.

I suck at email lists, and don't know how to link it here, but I
submitted a patch just now to repair it, if that's the only blocker on this.

Thanks for the work guys,
Matt
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: revert "fix system hang issue during GPU reset"

2020-08-12 Thread Alex Deucher
On Wed, Aug 12, 2020 at 11:54 AM Christian König
 wrote:
>
> The whole approach wasn't thought through till the end.
>
> We already had a reset lock like this in the past and it caused the same 
> problems like this one.
>
> Completely revert the patch for now and add individual trylock protection to 
> the hardware access functions as necessary.
>
> This reverts commit edad8312cbbf9a33c86873fc4093664f150dd5c1.
>
> Signed-off-by: Christian König 

This also broke GPU overclocking.

Acked-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   9 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  40 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|   2 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c |   2 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c |   2 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |   2 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |   7 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c|   4 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   |   4 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   |  14 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  57 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c   |   4 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |   6 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  14 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |   4 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c| 353 --
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   |   4 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   |   4 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c |   3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c  |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h  |   4 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  |  11 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h  |   3 +-
>  drivers/gpu/drm/amd/amdgpu/atom.c |   1 -
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c|  10 +-
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c |   6 +-
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  10 +-
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|   4 +-
>  drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c |   2 +-
>  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c |   2 +-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |   7 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |  13 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |  13 +-
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c |  16 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   4 -
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |   4 +-
>  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c|   2 +-
>  .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c|   2 +-
>  39 files changed, 184 insertions(+), 469 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1f9d97f61aa5..9c6fb38ce59d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -952,9 +952,9 @@ struct amdgpu_device {
> boolin_suspend;
> boolin_hibernate;
>
> -   atomic_tin_gpu_reset;
> +   boolin_gpu_reset;
> enum pp_mp1_state   mp1_state;
> -   struct rw_semaphore reset_sem;
> +   struct mutex  lock_reset;
> struct amdgpu_doorbell_index doorbell_index;
>
> struct mutexnotifier_lock;
> @@ -1269,9 +1269,4 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device 
> *adev)
> return adev->gmc.tmz_enabled;
>  }
>
> -static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
> -{
> -   return atomic_read(>in_gpu_reset) ? true : false;
> -}
> -
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 9738dccb1c2c..0effc1d46824 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -244,14 +244,11 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, 
> size_t size,
> if (cp_mqd_gfx9)
> bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;
>
> -   if (!down_read_trylock(>reset_sem))
> -   return -EIO;
> -
> r = amdgpu_bo_create(adev, , );
> if (r) {
> dev_err(adev->dev,
> "failed to allocate BO for amdkfd (%d)\n", r);
> -   goto err;
> +   return r;
> }
>
> /* map the buffer */
> @@ -286,7 +283,6 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, 
> size_t size,
>
> amdgpu_bo_unreserve(bo);
>
> -   up_read(>reset_sem);
> return 0;
>
>  allocate_mem_kmap_bo_failed:
> @@ -295,25 +291,19 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, 
> size_t size,
> 

[PATCH] drm/amdgpu: revert "fix system hang issue during GPU reset"

2020-08-12 Thread Christian König
The whole approach wasn't thought through till the end.

We already had a reset lock like this in the past and it caused the same 
problems like this one.

Completely revert the patch for now and add individual trylock protection to 
the hardware access functions as necessary.

This reverts commit edad8312cbbf9a33c86873fc4093664f150dd5c1.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   9 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  40 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|   2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c |   2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c |   2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |   2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |   7 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c|   4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   |   4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   |  14 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  57 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c   |   4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  14 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |   4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c| 353 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   |   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   |   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c  |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h  |   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  |  11 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h  |   3 +-
 drivers/gpu/drm/amd/amdgpu/atom.c |   1 -
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c|  10 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c |   6 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  10 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|   4 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c |   2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c |   2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |   7 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |  13 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |  13 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  16 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   4 -
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |   4 +-
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c|   2 +-
 .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c|   2 +-
 39 files changed, 184 insertions(+), 469 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 1f9d97f61aa5..9c6fb38ce59d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -952,9 +952,9 @@ struct amdgpu_device {
boolin_suspend;
boolin_hibernate;
 
-   atomic_tin_gpu_reset;
+   boolin_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct rw_semaphore reset_sem;
+   struct mutex  lock_reset;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
@@ -1269,9 +1269,4 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device 
*adev)
return adev->gmc.tmz_enabled;
 }
 
-static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
-{
-   return atomic_read(>in_gpu_reset) ? true : false;
-}
-
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 9738dccb1c2c..0effc1d46824 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -244,14 +244,11 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, 
size_t size,
if (cp_mqd_gfx9)
bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;
 
-   if (!down_read_trylock(>reset_sem))
-   return -EIO;
-
r = amdgpu_bo_create(adev, , );
if (r) {
dev_err(adev->dev,
"failed to allocate BO for amdkfd (%d)\n", r);
-   goto err;
+   return r;
}
 
/* map the buffer */
@@ -286,7 +283,6 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t 
size,
 
amdgpu_bo_unreserve(bo);
 
-   up_read(>reset_sem);
return 0;
 
 allocate_mem_kmap_bo_failed:
@@ -295,25 +291,19 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, 
size_t size,
amdgpu_bo_unreserve(bo);
 allocate_mem_reserve_bo_failed:
amdgpu_bo_unref();
-err:
-   up_read(>reset_sem);
+
return r;
 }
 
 void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
 {
-   struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
 
-