Re: [PATCH] drm/amdgpu: refine codes to avoid reentering GPU recovery
Am 19.08.20 um 11:34 schrieb Dennis Li: if other threads have holden the reset lock, recovery will fail to try_lock. Therefore we introduce atomic hive->in_reset and adev->in_gpu_reset, to avoid reentering GPU recovery. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 8ba389780001..0fba65efdb48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -952,7 +952,7 @@ struct amdgpu_device { boolin_suspend; boolin_hibernate; - boolin_gpu_reset; + atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; struct mutex lock_reset; struct amdgpu_doorbell_index doorbell_index; @@ -1270,4 +1270,8 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } +static inline bool amdgpu_in_reset(struct amdgpu_device *adev) +{ + return atomic_read(&adev->in_gpu_reset) ? true : false; Please drop the "? true : false" part. Apart from that looks good to me, but I'm wondering if a mutex wouldn't be better than an atomic here. Christian. +} #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 691c89705bcd..b872cdb0b705 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v10_compute_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; #if 0 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 0b7e78748540..832a200bb62f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, unsigned long flags, end_jiffies; int retry; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index ccd635b812b5..d0940121a6a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, int retry; struct vi_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 4cd851fc5c82..64fdb6a81c47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -554,7 +554,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v9_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 35fed75a4397..79b397800cbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_ poll_wait(file, &adev->autodump.gpu_hang, poll_table); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return POLLIN | POLLRDNORM | POLLWRNORM; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6573e1112462..78fd2c9a7b7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) if (adev->ip_blocks[i].status.hw == true) break; - if (adev->in_gpu_reset || adev->in_suspend) { + if (amdgpu_in_reset(adev) || adev->in_suspend) { r = adev->ip_blocks[i].version->funcs->resume(adev); if (r) { DRM_ERROR("resume of IP block <%s> failed %d\n", @@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) AMDGPU_RESET_MAGIC_NUM)) return true; - if (!adev->in_gpu_reset) + i
[PATCH] drm/amdgpu: refine codes to avoid reentering GPU recovery
if other threads have holden the reset lock, recovery will fail to try_lock. Therefore we introduce atomic hive->in_reset and adev->in_gpu_reset, to avoid reentering GPU recovery. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 8ba389780001..0fba65efdb48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -952,7 +952,7 @@ struct amdgpu_device { boolin_suspend; boolin_hibernate; - boolin_gpu_reset; + atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; struct mutex lock_reset; struct amdgpu_doorbell_index doorbell_index; @@ -1270,4 +1270,8 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } +static inline bool amdgpu_in_reset(struct amdgpu_device *adev) +{ + return atomic_read(&adev->in_gpu_reset) ? true : false; +} #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 691c89705bcd..b872cdb0b705 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v10_compute_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; #if 0 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 0b7e78748540..832a200bb62f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, unsigned long flags, end_jiffies; int retry; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index ccd635b812b5..d0940121a6a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, int retry; struct vi_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 4cd851fc5c82..64fdb6a81c47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -554,7 +554,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v9_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 35fed75a4397..79b397800cbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_ poll_wait(file, &adev->autodump.gpu_hang, poll_table); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return POLLIN | POLLRDNORM | POLLWRNORM; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6573e1112462..78fd2c9a7b7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) if (adev->ip_blocks[i].status.hw == true) break; - if (adev->in_gpu_reset || adev->in_suspend) { + if (amdgpu_in_reset(adev) || adev->in_suspend) { r = adev->ip_blocks[i].version->funcs->resume(adev); if (r) { DRM_ERROR("resume of IP block <%s> failed %d\n", @@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) AMDGPU_RESET_MAGIC_NUM)) return true; - if (!adev->in_gpu_reset) + if (!amdgpu_in_reset(adev)) return false; /* @@ -3053,6 +3053,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,