Re: [PATCH] drm/amdgpu: refine codes to avoid reentering GPU recovery

2020-08-19 Thread Christian König

Am 19.08.20 um 11:34 schrieb Dennis Li:

If other threads already hold the reset lock, recovery will
fail to try_lock. Therefore we introduce atomic hive->in_reset
and adev->in_gpu_reset, to avoid reentering GPU recovery.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 8ba389780001..0fba65efdb48 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -952,7 +952,7 @@ struct amdgpu_device {
bool in_suspend;
bool in_hibernate;
  
-	bool in_gpu_reset;

+   atomic_t in_gpu_reset;
enum pp_mp1_state   mp1_state;
struct mutex  lock_reset;
struct amdgpu_doorbell_index doorbell_index;
@@ -1270,4 +1270,8 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device 
*adev)
 return adev->gmc.tmz_enabled;
  }
  
+static inline bool amdgpu_in_reset(struct amdgpu_device *adev)

+{
+   return atomic_read(&adev->in_gpu_reset) ? true : false;


Please drop the "? true : false" part.

Apart from that looks good to me, but I'm wondering if a mutex wouldn't 
be better than an atomic here.


Christian.


+}
  #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..b872cdb0b705 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);
  
-	if (adev->in_gpu_reset)

+   if (amdgpu_in_reset(adev))
return -EIO;
  
  #if 0

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index 0b7e78748540..832a200bb62f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long flags, end_jiffies;
int retry;
  
-	if (adev->in_gpu_reset)

+   if (amdgpu_in_reset(adev))
return -EIO;
  
  	acquire_queue(kgd, pipe_id, queue_id);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index ccd635b812b5..d0940121a6a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
int retry;
struct vi_mqd *m = get_mqd(mqd);
  
-	if (adev->in_gpu_reset)

+   if (amdgpu_in_reset(adev))
return -EIO;
  
  	acquire_queue(kgd, pipe_id, queue_id);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 4cd851fc5c82..64fdb6a81c47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -554,7 +554,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v9_mqd *m = get_mqd(mqd);
  
-	if (adev->in_gpu_reset)

+   if (amdgpu_in_reset(adev))
return -EIO;
  
  	acquire_queue(kgd, pipe_id, queue_id);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 35fed75a4397..79b397800cbc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct 
file *file, struct poll_
  
  	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
  
-	if (adev->in_gpu_reset)

+   if (amdgpu_in_reset(adev))
return POLLIN | POLLRDNORM | POLLWRNORM;
  
  	return 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6573e1112462..78fd2c9a7b7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device 
*adev)
if (adev->ip_blocks[i].status.hw == true)
break;
  
-			if (adev->in_gpu_reset || adev->in_suspend) {

+   if (amdgpu_in_reset(adev) || adev->in_suspend) {
r = 
adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
DRM_ERROR("resume of IP block <%s> failed 
%d\n",
@@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct 
amdgpu_device *adev)
AMDGPU_RESET_MAGIC_NUM))
return true;
  
-	if (!adev->in_gpu_reset)

+   i

[PATCH] drm/amdgpu: refine codes to avoid reentering GPU recovery

2020-08-19 Thread Dennis Li
If other threads already hold the reset lock, recovery will
fail to try_lock. Therefore we introduce atomic hive->in_reset
and adev->in_gpu_reset, to avoid reentering GPU recovery.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 8ba389780001..0fba65efdb48 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -952,7 +952,7 @@ struct amdgpu_device {
bool in_suspend;
bool in_hibernate;
 
-   bool in_gpu_reset;
+   atomic_t in_gpu_reset;
enum pp_mp1_state   mp1_state;
struct mutex  lock_reset;
struct amdgpu_doorbell_index doorbell_index;
@@ -1270,4 +1270,8 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device 
*adev)
return adev->gmc.tmz_enabled;
 }
 
+static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
+{
+   return atomic_read(&adev->in_gpu_reset) ? true : false;
+}
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..b872cdb0b705 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
 #if 0
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index 0b7e78748540..832a200bb62f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long flags, end_jiffies;
int retry;
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index ccd635b812b5..d0940121a6a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
int retry;
struct vi_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 4cd851fc5c82..64fdb6a81c47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -554,7 +554,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v9_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 35fed75a4397..79b397800cbc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct 
file *file, struct poll_
 
poll_wait(file, &adev->autodump.gpu_hang, poll_table);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return POLLIN | POLLRDNORM | POLLWRNORM;
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6573e1112462..78fd2c9a7b7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device 
*adev)
if (adev->ip_blocks[i].status.hw == true)
break;
 
-   if (adev->in_gpu_reset || adev->in_suspend) {
+   if (amdgpu_in_reset(adev) || adev->in_suspend) {
r = 
adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
DRM_ERROR("resume of IP block <%s> 
failed %d\n",
@@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct 
amdgpu_device *adev)
AMDGPU_RESET_MAGIC_NUM))
return true;
 
-   if (!adev->in_gpu_reset)
+   if (!amdgpu_in_reset(adev))
return false;
 
/*
@@ -3053,6 +3053,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,