[PATCH] drm/amdkfd: use unmap all queues for poison consumption

2022-01-29 Thread Tao Zhou
Replace the queue reset for a specific PASID with unmapping all queues,
because resetting queues could break the CP scheduler.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e8bc28009c22..dca0b5fac1db 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -109,8 +109,7 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
 
switch (source_id) {
case SOC15_INTSRC_SQ_INTERRUPT_MSG:
-   if (dev->dqm->ops.reset_queues)
-   ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+   ret = kfd_process_vm_fault(dev->dqm, pasid);
break;
case SOC15_INTSRC_SDMA_ECC:
default:
-- 
2.17.1



[PATCH] drm/amdgpu: fix list add issue in vram reserve

2022-01-29 Thread Tao Zhou
The parameter order in the list_add_tail() call is incorrect, which causes
RAS reserved pages to be reused.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 7a2b487db57c..6c99ef700cc8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -281,7 +281,7 @@ int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr 
*mgr,
rsv->mm_node.size = size >> PAGE_SHIFT;
 
spin_lock(&mgr->lock);
-   list_add_tail(&mgr->reservations_pending, &rsv->node);
+   list_add_tail(&rsv->node, &mgr->reservations_pending);
amdgpu_vram_mgr_do_reserve(&mgr->manager);
spin_unlock(&mgr->lock);
 
-- 
2.17.1



RE: [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop

2022-01-29 Thread Zhou1, Tao
[AMD Official Use Only]

The series is:

Reviewed-by: Tao Zhou  -Original Message-
> From: Chai, Thomas 
> Sent: Sunday, January 30, 2022 3:12 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking
> ; Zhou1, Tao ; Clements,
> John ; Chai, Thomas 
> Subject: [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by
> infinite loop
> 
> 1. The infinite loop case only occurs on multiple cards support
>ras functions.
> 2. The explanation of root cause refer to commit 76641cbbf196
>("drm/amdgpu: Add judgement to avoid infinite loop").
> 3. Create new node to manage each unique ras instance to guarantee
>each device .ras_list is completely independent.
> 4. Fixes: commit 7a6b8ab3231b51 ("drm/amdgpu: Unify ras block
>interface for each ras block").
> 5. The soft locked logs are as follows:
> [  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G   OE
> 5.13.0-27-generic #29~20.04.1-Ubuntu
> [  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU,
> BIOS T20200717143848 07/17/2020 [  262.165698] Workqueue: events
> amdgpu_ras_do_recovery [amdgpu] [  262.165980] RIP:
> 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [  262.166239] Code: 68
> d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 89 e6 4c 
> 89
> ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 89 f5 48 83 e8 28 
> 48
> 39 d3 74 25 49 89 c6 49 8b 45 [  262.166243] RSP: 0018:ac908fa87d80
> EFLAGS: 0202 [  262.166247] RAX: c1394248 RBX: 91e4ab8d6e20
> RCX: c1394248 [  262.166249] RDX: 91e4aa356e20 RSI:
> 000e RDI: 91e4ab8c [  262.166252] RBP:
> ac908fa87da8 R08: 0007 R09: 0001
> [  262.166254] R10: 91e4930b64ec R11:  R12:
> 000e [  262.166256] R13: 91e4aa356df8 R14: c1394320
> R15: 0003 [  262.166258] FS:  ()
> GS:92238fb4() knlGS: [  262.166261] CS:  0010
> DS:  ES:  CR0: 80050033 [  262.166264] CR2:
> 0001004865d0 CR3: 00406d796000 CR4: 00350ee0
> [  262.166267] Call Trace:
> [  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu]
> [  262.166529]  ? psi_task_switch+0xd2/0x250 [  262.166537]  ?
> __switch_to+0x11d/0x460 [  262.166542]  ? __switch_to_asm+0x36/0x70
> [  262.166549]  process_one_work+0x220/0x3c0 [  262.166556]
> worker_thread+0x4d/0x3f0 [  262.166560]  ? process_one_work+0x3c0/0x3c0
> [  262.166563]  kthread+0x12b/0x150 [  262.166568]  ?
> set_kthread_struct+0x40/0x40 [  262.166571]  ret_from_fork+0x22/0x30
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++--
> -  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 --
>  2 files changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 9d7c778c1a2d..9b94c9c4960c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
>   "mca_iohc",
>  };
> 
> +struct amdgpu_ras_block_list {
> + /* ras block link */
> + struct list_head node;
> +
> + struct amdgpu_ras_block_object *ras_obj; };
> +
>  const char *get_ras_block_str(struct ras_common_if *ras_block)  {
>   if (!ras_block)
> @@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object
> *amdgpu_ras_get_ras_block(struct amdgpu_de
>   enum amdgpu_ras_block block,
> uint32_t sub_block_index)  {
>   int loop_cnt = 0;
> - struct amdgpu_ras_block_object *obj, *tmp;
> + struct amdgpu_ras_block_list *node, *tmp;
> + struct amdgpu_ras_block_object *obj;
> 
>   if (block >= AMDGPU_RAS_BLOCK__LAST)
>   return NULL;
> @@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object
> *amdgpu_ras_get_ras_block(struct amdgpu_de
>   if (!amdgpu_ras_is_supported(adev, block))
>   return NULL;
> 
> - list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
> + if (!node->ras_obj) {
> + dev_warn(adev->dev, "Warning: abnormal ras list
> node.\n");
> + continue;
> + }
> +
> + obj = node->ras_obj;
>   if (obj->ras_block_match) {
>   if (obj->ras_block_match(obj, block, sub_block_index)
> == 0)
>   return obj;
> @@ -2527,6 +2541,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
> 
>  int amdgpu_ras_fini(struct amdgpu_device *adev)  {
> + struct amdgpu_ras_block_list *ras_node, *tmp;
>   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> 
>   if (!adev->ras_enabled || !con)
> @@ -2545,6 +2560,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>   amdgpu_ras_set_context(adev, 

[PATCH V2 2/2] Revert "drm/amdgpu: Add judgement to avoid infinite loop"

2022-01-29 Thread yipechai
The commit 8583c8983f1b ("drm/amdgpu: Fixed the defect of
soft lock caused by infinite loop") had fixed this defect.

Revert workaround commit 76641cbbf196 ("drm/amdgpu: Add
judgement to avoid infinite loop").

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9b94c9c4960c..5558df3b21f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -886,7 +886,6 @@ static int amdgpu_ras_block_match_default(struct 
amdgpu_ras_block_object *block_
 static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct 
amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t 
sub_block_index)
 {
-   int loop_cnt = 0;
struct amdgpu_ras_block_list *node, *tmp;
struct amdgpu_ras_block_object *obj;
 
@@ -910,9 +909,6 @@ static struct amdgpu_ras_block_object 
*amdgpu_ras_get_ras_block(struct amdgpu_de
if (amdgpu_ras_block_match_default(obj, block) == 0)
return obj;
}
-
-   if (++loop_cnt >= AMDGPU_RAS_BLOCK__LAST)
-   break;
}
 
return NULL;
-- 
2.25.1



[PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop

2022-01-29 Thread yipechai
1. The infinite loop only occurs on systems with multiple cards that
   support RAS functions.
2. For an explanation of the root cause, refer to commit 76641cbbf196
   ("drm/amdgpu: Add judgement to avoid infinite loop").
3. Create new node to manage each unique ras instance to guarantee
   each device .ras_list is completely independent.
4. Fixes: commit 7a6b8ab3231b51 ("drm/amdgpu: Unify ras block
   interface for each ras block").
5. The soft locked logs are as follows:
[  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G   OE 
5.13.0-27-generic #29~20.04.1-Ubuntu
[  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, BIOS 
T20200717143848 07/17/2020
[  262.165698] Workqueue: events amdgpu_ras_do_recovery [amdgpu]
[  262.165980] RIP: 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu]
[  262.166239] Code: 68 d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 
32 44 89 fa 44 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 
28 4d 89 f5 48 83 e8 28 48 39 d3 74 25 49 89 c6 49 8b 45
[  262.166243] RSP: 0018:ac908fa87d80 EFLAGS: 0202
[  262.166247] RAX: c1394248 RBX: 91e4ab8d6e20 RCX: c1394248
[  262.166249] RDX: 91e4aa356e20 RSI: 000e RDI: 91e4ab8c
[  262.166252] RBP: ac908fa87da8 R08: 0007 R09: 0001
[  262.166254] R10: 91e4930b64ec R11:  R12: 000e
[  262.166256] R13: 91e4aa356df8 R14: c1394320 R15: 0003
[  262.166258] FS:  () GS:92238fb4() 
knlGS:
[  262.166261] CS:  0010 DS:  ES:  CR0: 80050033
[  262.166264] CR2: 0001004865d0 CR3: 00406d796000 CR4: 00350ee0
[  262.166267] Call Trace:
[  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu]
[  262.166529]  ? psi_task_switch+0xd2/0x250
[  262.166537]  ? __switch_to+0x11d/0x460
[  262.166542]  ? __switch_to_asm+0x36/0x70
[  262.166549]  process_one_work+0x220/0x3c0
[  262.166556]  worker_thread+0x4d/0x3f0
[  262.166560]  ? process_one_work+0x3c0/0x3c0
[  262.166563]  kthread+0x12b/0x150
[  262.166568]  ? set_kthread_struct+0x40/0x40
[  262.166571]  ret_from_fork+0x22/0x30

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 --
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9d7c778c1a2d..9b94c9c4960c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
"mca_iohc",
 };
 
+struct amdgpu_ras_block_list {
+   /* ras block link */
+   struct list_head node;
+
+   struct amdgpu_ras_block_object *ras_obj;
+};
+
 const char *get_ras_block_str(struct ras_common_if *ras_block)
 {
if (!ras_block)
@@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object 
*amdgpu_ras_get_ras_block(struct amdgpu_de
enum amdgpu_ras_block block, uint32_t 
sub_block_index)
 {
int loop_cnt = 0;
-   struct amdgpu_ras_block_object *obj, *tmp;
+   struct amdgpu_ras_block_list *node, *tmp;
+   struct amdgpu_ras_block_object *obj;
 
if (block >= AMDGPU_RAS_BLOCK__LAST)
return NULL;
@@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object 
*amdgpu_ras_get_ras_block(struct amdgpu_de
if (!amdgpu_ras_is_supported(adev, block))
return NULL;
 
-   list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
+   list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+   if (!node->ras_obj) {
+   dev_warn(adev->dev, "Warning: abnormal ras list 
node.\n");
+   continue;
+   }
+
+   obj = node->ras_obj;
if (obj->ras_block_match) {
if (obj->ras_block_match(obj, block, sub_block_index) 
== 0)
return obj;
@@ -2527,6 +2541,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 
 int amdgpu_ras_fini(struct amdgpu_device *adev)
 {
+   struct amdgpu_ras_block_list *ras_node, *tmp;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
if (!adev->ras_enabled || !con)
@@ -2545,6 +2560,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
amdgpu_ras_set_context(adev, NULL);
kfree(con);
 
+   /* Clear ras blocks from ras_list and free ras block list node */
+   list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
+   list_del(&ras_node->node);
+   kfree(ras_node);
+   }
+
return 0;
 }
 
@@ -2754,14 +2775,22 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj)
 {
+   struct 

RE: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop

2022-01-29 Thread Chai, Thomas
OK

-Original Message-
From: Zhou1, Tao  
Sent: Sunday, January 30, 2022 11:20 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused by 
infinite loop

[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Saturday, January 29, 2022 8:34 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused 
> by infinite loop
> 
> 1. The infinite loop case only occurs on multiple cards support
>ras functions.
> 2. The explanation of root cause refer to 76641cbbf196523b5752c6cf68f86.
> 3. Create new node to manage each unique ras instance to guarantee
>each device .ras_list is completely independent.
> 4. Fixes:7a6b8ab3231b511915cb94cac1debabf093.
> 5. The soft locked logs are as follows:
> [  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G   OE
> 5.13.0-27-generic #29~20.04.1-Ubuntu
> [  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, 
> BIOS T20200717143848 07/17/2020 [  262.165698] Workqueue: events 
> amdgpu_ras_do_recovery [amdgpu] [  262.165980] RIP:
> 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [  262.166239] Code: 
> 68
> d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 
> 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 
> 89 f5 48 83 e8 28 48
> 39 d3 74 25 49 89 c6 49 8b 45 [  262.166243] RSP: 
> 0018:ac908fa87d80
> EFLAGS: 0202 [  262.166247] RAX: c1394248 RBX: 
> 91e4ab8d6e20
> RCX: c1394248 [  262.166249] RDX: 91e4aa356e20 RSI:
> 000e RDI: 91e4ab8c [  262.166252] RBP:
> ac908fa87da8 R08: 0007 R09: 0001 [  
> 262.166254] R10: 91e4930b64ec R11:  R12:
> 000e [  262.166256] R13: 91e4aa356df8 R14: 
> c1394320
> R15: 0003 [  262.166258] FS:  ()
> GS:92238fb4() knlGS: [  262.166261] CS:  
> 0010
> DS:  ES:  CR0: 80050033 [  262.166264] CR2:
> 0001004865d0 CR3: 00406d796000 CR4: 00350ee0 [  
> 262.166267] Call Trace:
> [  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu] [  
> 262.166529]  ? psi_task_switch+0xd2/0x250 [  262.166537]  ?
> __switch_to+0x11d/0x460 [  262.166542]  ? __switch_to_asm+0x36/0x70 [  
> 262.166549]  process_one_work+0x220/0x3c0 [  262.166556]
> worker_thread+0x4d/0x3f0 [  262.166560]  ? 
> process_one_work+0x3c0/0x3c0 [  262.166563]  kthread+0x12b/0x150 [  
> 262.166568]  ?
> set_kthread_struct+0x40/0x40 [  262.166571]  ret_from_fork+0x22/0x30
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++--
> -  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 --
>  2 files changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 9d7c778c1a2d..b0aa67308c31 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
>   "mca_iohc",
>  };
> 
> +struct amdgpu_ras_block_list {
> + /* ras block link */
> + struct list_head node;
> +
> + struct amdgpu_ras_block_object *ras_obj; };
> +
>  const char *get_ras_block_str(struct ras_common_if *ras_block)  {
>   if (!ras_block)
> @@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object 
> *amdgpu_ras_get_ras_block(struct amdgpu_de
>   enum amdgpu_ras_block block,
> uint32_t sub_block_index)  {
>   int loop_cnt = 0;
> - struct amdgpu_ras_block_object *obj, *tmp;
> + struct amdgpu_ras_block_list *node, *tmp;
> + struct amdgpu_ras_block_object *obj;
> 
>   if (block >= AMDGPU_RAS_BLOCK__LAST)
>   return NULL;
> @@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object 
> *amdgpu_ras_get_ras_block(struct amdgpu_de
>   if (!amdgpu_ras_is_supported(adev, block))
>   return NULL;
> 
> - list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
> + if (!node->ras_obj) {
> + DRM_ERROR("Warning: abnormal ras list node");
[Tao]: dev_warn is recommended.

> + continue;
> + }
> +
> + obj = node->ras_obj;
>   if (obj->ras_block_match) {
>   if (obj->ras_block_match(obj, block, sub_block_index) 
> == 0)
>   return obj;
> @@ -2527,6 +2541,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device 
> *adev)
> 
>  int amdgpu_ras_fini(struct amdgpu_device *adev)  {
> + struct amdgpu_ras_block_list *ras_node, *tmp;
>   struct amdgpu_ras *con = 

RE: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop

2022-01-29 Thread Zhou1, Tao
[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Saturday, January 29, 2022 8:34 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking
> ; Zhou1, Tao ; Clements,
> John ; Chai, Thomas 
> Subject: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused by
> infinite loop
> 
> 1. The infinite loop case only occurs on multiple cards support
>ras functions.
> 2. The explanation of root cause refer to 76641cbbf196523b5752c6cf68f86.
> 3. Create new node to manage each unique ras instance to guarantee
>each device .ras_list is completely independent.
> 4. Fixes:7a6b8ab3231b511915cb94cac1debabf093.
> 5. The soft locked logs are as follows:
> [  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G   OE
> 5.13.0-27-generic #29~20.04.1-Ubuntu
> [  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU,
> BIOS T20200717143848 07/17/2020 [  262.165698] Workqueue: events
> amdgpu_ras_do_recovery [amdgpu] [  262.165980] RIP:
> 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [  262.166239] Code: 68
> d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 89 e6 4c 
> 89
> ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 89 f5 48 83 e8 28 
> 48
> 39 d3 74 25 49 89 c6 49 8b 45 [  262.166243] RSP: 0018:ac908fa87d80
> EFLAGS: 0202 [  262.166247] RAX: c1394248 RBX: 91e4ab8d6e20
> RCX: c1394248 [  262.166249] RDX: 91e4aa356e20 RSI:
> 000e RDI: 91e4ab8c [  262.166252] RBP:
> ac908fa87da8 R08: 0007 R09: 0001
> [  262.166254] R10: 91e4930b64ec R11:  R12:
> 000e [  262.166256] R13: 91e4aa356df8 R14: c1394320
> R15: 0003 [  262.166258] FS:  ()
> GS:92238fb4() knlGS: [  262.166261] CS:  0010
> DS:  ES:  CR0: 80050033 [  262.166264] CR2:
> 0001004865d0 CR3: 00406d796000 CR4: 00350ee0
> [  262.166267] Call Trace:
> [  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu]
> [  262.166529]  ? psi_task_switch+0xd2/0x250 [  262.166537]  ?
> __switch_to+0x11d/0x460 [  262.166542]  ? __switch_to_asm+0x36/0x70
> [  262.166549]  process_one_work+0x220/0x3c0 [  262.166556]
> worker_thread+0x4d/0x3f0 [  262.166560]  ? process_one_work+0x3c0/0x3c0
> [  262.166563]  kthread+0x12b/0x150 [  262.166568]  ?
> set_kthread_struct+0x40/0x40 [  262.166571]  ret_from_fork+0x22/0x30
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++--
> -  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 --
>  2 files changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 9d7c778c1a2d..b0aa67308c31 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
>   "mca_iohc",
>  };
> 
> +struct amdgpu_ras_block_list {
> + /* ras block link */
> + struct list_head node;
> +
> + struct amdgpu_ras_block_object *ras_obj; };
> +
>  const char *get_ras_block_str(struct ras_common_if *ras_block)  {
>   if (!ras_block)
> @@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object
> *amdgpu_ras_get_ras_block(struct amdgpu_de
>   enum amdgpu_ras_block block,
> uint32_t sub_block_index)  {
>   int loop_cnt = 0;
> - struct amdgpu_ras_block_object *obj, *tmp;
> + struct amdgpu_ras_block_list *node, *tmp;
> + struct amdgpu_ras_block_object *obj;
> 
>   if (block >= AMDGPU_RAS_BLOCK__LAST)
>   return NULL;
> @@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object
> *amdgpu_ras_get_ras_block(struct amdgpu_de
>   if (!amdgpu_ras_is_supported(adev, block))
>   return NULL;
> 
> - list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
> + if (!node->ras_obj) {
> + DRM_ERROR("Warning: abnormal ras list node");
[Tao]: dev_warn is recommended.

> + continue;
> + }
> +
> + obj = node->ras_obj;
>   if (obj->ras_block_match) {
>   if (obj->ras_block_match(obj, block, sub_block_index)
> == 0)
>   return obj;
> @@ -2527,6 +2541,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
> 
>  int amdgpu_ras_fini(struct amdgpu_device *adev)  {
> + struct amdgpu_ras_block_list *ras_node, *tmp;
>   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> 
>   if (!adev->ras_enabled || !con)
> @@ -2545,6 +2560,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>   amdgpu_ras_set_context(adev, NULL);
>   kfree(con);
> 
> + /* Clear ras blocks from ras_list and free ras block list node */
> + 

RE: [PATCH] drm/amdgpu: Fix uninitialized variable use warning

2022-01-29 Thread Chen, Guchun
[Public]

Reviewed-by: Guchun Chen 

Hi @Lijo Lazar,

Can you please submit your patch to drm-next soon? This indeed fixes the
regression caused by the RLC indirect register access related patches.

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Friday, January 28, 2022 2:40 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; kernel test robot 
; Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: Fix uninitialized variable use warning

Fix uninitialized variable use
warning: variable 'reg_access_ctrl' is uninitialized when used here 
[-Wuninitialized]
 scratch_reg0 = (void __iomem *)adev->rmmio + 4 * 
reg_access_ctrl->scratch_reg0;

Fixes: 51263163eb3f("drm/amdgpu: add helper for rlcg indirect reg
access")

Reported-by: kernel test robot 
Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 80c25176c993..c13765218919 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -875,6 +875,7 @@ static u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device 
*adev, u32 offset, u32 v
return 0;
}
 
+   reg_access_ctrl = &adev->gfx.rlc.reg_access_ctrl;
scratch_reg0 = (void __iomem *)adev->rmmio + 4 * 
reg_access_ctrl->scratch_reg0;
scratch_reg1 = (void __iomem *)adev->rmmio + 4 * 
reg_access_ctrl->scratch_reg1;
scratch_reg2 = (void __iomem *)adev->rmmio + 4 * 
reg_access_ctrl->scratch_reg2;
-- 
2.25.1


[PATCH 2/2] Revert "drm/amdgpu: Add judgement to avoid infinite loop"

2022-01-29 Thread yipechai
The commit 8583c8983f1b ("drm/amdgpu: Fixed the defect of
soft lock caused by infinite loop") had fixed this defect.

Revert workaround commit 76641cbbf196 ("drm/amdgpu: Add
judgement to avoid infinite loop").

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b0aa67308c31..5a43a220e9fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -886,7 +886,6 @@ static int amdgpu_ras_block_match_default(struct 
amdgpu_ras_block_object *block_
 static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct 
amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t 
sub_block_index)
 {
-   int loop_cnt = 0;
struct amdgpu_ras_block_list *node, *tmp;
struct amdgpu_ras_block_object *obj;
 
@@ -910,9 +909,6 @@ static struct amdgpu_ras_block_object 
*amdgpu_ras_get_ras_block(struct amdgpu_de
if (amdgpu_ras_block_match_default(obj, block) == 0)
return obj;
}
-
-   if (++loop_cnt >= AMDGPU_RAS_BLOCK__LAST)
-   break;
}
 
return NULL;
-- 
2.25.1



[PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop

2022-01-29 Thread yipechai
1. The infinite loop case only occurs on multiple cards support
   ras functions.
2. The explanation of root cause refer to 76641cbbf196523b5752c6cf68f86.
3. Create new node to manage each unique ras instance to guarantee
   each device .ras_list is completely independent.
4. Fixes:7a6b8ab3231b511915cb94cac1debabf093.
5. The soft locked logs are as follows:
[  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G   OE 
5.13.0-27-generic #29~20.04.1-Ubuntu
[  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, BIOS 
T20200717143848 07/17/2020
[  262.165698] Workqueue: events amdgpu_ras_do_recovery [amdgpu]
[  262.165980] RIP: 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu]
[  262.166239] Code: 68 d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 
32 44 89 fa 44 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 
28 4d 89 f5 48 83 e8 28 48 39 d3 74 25 49 89 c6 49 8b 45
[  262.166243] RSP: 0018:ac908fa87d80 EFLAGS: 0202
[  262.166247] RAX: c1394248 RBX: 91e4ab8d6e20 RCX: c1394248
[  262.166249] RDX: 91e4aa356e20 RSI: 000e RDI: 91e4ab8c
[  262.166252] RBP: ac908fa87da8 R08: 0007 R09: 0001
[  262.166254] R10: 91e4930b64ec R11:  R12: 000e
[  262.166256] R13: 91e4aa356df8 R14: c1394320 R15: 0003
[  262.166258] FS:  () GS:92238fb4() 
knlGS:
[  262.166261] CS:  0010 DS:  ES:  CR0: 80050033
[  262.166264] CR2: 0001004865d0 CR3: 00406d796000 CR4: 00350ee0
[  262.166267] Call Trace:
[  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu]
[  262.166529]  ? psi_task_switch+0xd2/0x250
[  262.166537]  ? __switch_to+0x11d/0x460
[  262.166542]  ? __switch_to_asm+0x36/0x70
[  262.166549]  process_one_work+0x220/0x3c0
[  262.166556]  worker_thread+0x4d/0x3f0
[  262.166560]  ? process_one_work+0x3c0/0x3c0
[  262.166563]  kthread+0x12b/0x150
[  262.166568]  ? set_kthread_struct+0x40/0x40
[  262.166571]  ret_from_fork+0x22/0x30

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 --
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9d7c778c1a2d..b0aa67308c31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
"mca_iohc",
 };
 
+struct amdgpu_ras_block_list {
+   /* ras block link */
+   struct list_head node;
+
+   struct amdgpu_ras_block_object *ras_obj;
+};
+
 const char *get_ras_block_str(struct ras_common_if *ras_block)
 {
if (!ras_block)
@@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object 
*amdgpu_ras_get_ras_block(struct amdgpu_de
enum amdgpu_ras_block block, uint32_t 
sub_block_index)
 {
int loop_cnt = 0;
-   struct amdgpu_ras_block_object *obj, *tmp;
+   struct amdgpu_ras_block_list *node, *tmp;
+   struct amdgpu_ras_block_object *obj;
 
if (block >= AMDGPU_RAS_BLOCK__LAST)
return NULL;
@@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object 
*amdgpu_ras_get_ras_block(struct amdgpu_de
if (!amdgpu_ras_is_supported(adev, block))
return NULL;
 
-   list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
+   list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+   if (!node->ras_obj) {
+   DRM_ERROR("Warning: abnormal ras list node");
+   continue;
+   }
+
+   obj = node->ras_obj;
if (obj->ras_block_match) {
if (obj->ras_block_match(obj, block, sub_block_index) 
== 0)
return obj;
@@ -2527,6 +2541,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 
 int amdgpu_ras_fini(struct amdgpu_device *adev)
 {
+   struct amdgpu_ras_block_list *ras_node, *tmp;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
if (!adev->ras_enabled || !con)
@@ -2545,6 +2560,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
amdgpu_ras_set_context(adev, NULL);
kfree(con);
 
+   /* Clear ras blocks from ras_list and free ras block list node */
+   list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
+   list_del(&ras_node->node);
+   kfree(ras_node);
+   }
+
return 0;
 }
 
@@ -2754,14 +2775,22 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj)
 {
+   struct amdgpu_ras_block_list *ras_node;
if (!adev || !ras_block_obj)
return -EINVAL;
 

Re: [PATCH RESEND] drm/amd/display: Force link_rate as LINK_RATE_RBR2 for 2018 15" Apple Retina panels

2022-01-29 Thread Aditya Garg


> On 28-Jan-2022, at 8:33 PM, Harry Wentland  wrote:
> 
> I think either leaving the 2017 quirk in its original place or moving it down 
> works. I don't have a strong preference.
I’d better leave it in the original place then
> 
> With the comment style addressed this patch is
Sending a v2 now with this issue addressed.
> Reviewed-by: Harry Wentland 
> 
> Harry




[PATCH 1/2] drm/amd/amdgpu/amdgpu_uvd: Fix forgotten unmap buffer object

2022-01-29 Thread zhanglianjie
After the buffer object is successfully mapped, call amdgpu_bo_kunmap before 
the function returns.

Signed-off-by: zhanglianjie 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index 6f8de11a17f1..9cc23b220537 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -834,6 +834,7 @@ static int amdgpu_uvd_cs_msg(struct amdgpu_uvd_cs_ctx *ctx,
handle = msg[2];

if (handle == 0) {
+   amdgpu_bo_kunmap(bo);
DRM_ERROR("Invalid UVD handle!\n");
return -EINVAL;
}
@@ -892,6 +893,7 @@ static int amdgpu_uvd_cs_msg(struct amdgpu_uvd_cs_ctx *ctx,
DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
}

+   amdgpu_bo_kunmap(bo);
return -EINVAL;
 }

--
2.20.1






[PATCH -next] drm/amd/display: clean up some inconsistent indenting

2022-01-29 Thread Yang Li
Eliminate the following smatch warning:
drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c:2246
dp_perform_8b_10b_link_training() warn: inconsistent indenting

Reported-by: Abaci Robot 
Signed-off-by: Yang Li 
---
 drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c 
b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
index daaec3164875..34ffcd5bb1d7 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
@@ -2243,11 +2243,11 @@ static enum link_training_result 
dp_perform_8b_10b_link_training(
 
if (status == LINK_TRAINING_SUCCESS) {
status = perform_clock_recovery_sequence(link, link_res, 
lt_settings, DPRX);
-   if (status == LINK_TRAINING_SUCCESS) {
-   status = perform_channel_equalization_sequence(link,
-   link_res,
-   lt_settings,
-   DPRX);
+   if (status == LINK_TRAINING_SUCCESS) {
+   status = perform_channel_equalization_sequence(link,
+  link_res,
+  
lt_settings,
+  DPRX);
}
}
 
-- 
2.20.1.7.g153144c



[PATCH v2] drm/amd/display: Force link_rate as LINK_RATE_RBR2 for 2018 15" Apple Retina panels

2022-01-29 Thread Aditya Garg
From: Aun-Ali Zaidi 
 
The eDP link rate reported by the DP_MAX_LINK_RATE dpcd register (0xa) is
contradictory to the highest rate supported reported by
EDID (0xc = LINK_RATE_RBR2). The effects of this compounded with commit
'4a8ca46bae8a ("drm/amd/display: Default max bpc to 16 for eDP")' results
in no display modes being found and a dark panel.

For now, simply force the maximum supported link rate for the eDP attached
2018 15" Apple Retina panels.

Additionally, we must also check the firmware revision since the device ID
reported by the DPCD is identical to that of the more capable 16,1,
incorrectly quirking it. We also use said firmware check to quirk the
refreshed 15,1 models with Vega graphics as they use a slightly newer
firmware version.

Tested-by: Aun-Ali Zaidi 
Reviewed-by: Harry Wentland 
Signed-off-by: Aun-Ali Zaidi 
Signed-off-by: Aditya Garg 
---
v2 :- Use C styled comments
 .../gpu/drm/amd/display/dc/core/dc_link_dp.c  | 20 +++
 1 file changed, 20 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c 
b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
index 05e216524..086f7ee2c 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
@@ -5597,6 +5597,26 @@ static bool retrieve_link_cap(struct dc_link *link)
dp_hw_fw_revision.ieee_fw_rev,
sizeof(dp_hw_fw_revision.ieee_fw_rev));
 
+   /* Quirk for Apple MBP 2018 15" Retina panels: wrong DP_MAX_LINK_RATE */
+   {
+   uint8_t str_mbp_2018[] = { 101, 68, 21, 103, 98, 97 };
+   uint8_t fwrev_mbp_2018[] = { 7, 4 };
+   uint8_t fwrev_mbp_2018_vega[] = { 8, 4 };
+
+   /* We also check for the firmware revision as 16,1 models have 
an
+* identical device id and are incorrectly quirked otherwise.
+*/
+   if ((link->dpcd_caps.sink_dev_id == 0x0010fa) &&
+   !memcmp(link->dpcd_caps.sink_dev_id_str, str_mbp_2018,
+sizeof(str_mbp_2018)) &&
+   (!memcmp(link->dpcd_caps.sink_fw_revision, fwrev_mbp_2018,
+sizeof(fwrev_mbp_2018)) ||
+   !memcmp(link->dpcd_caps.sink_fw_revision, 
fwrev_mbp_2018_vega,
+sizeof(fwrev_mbp_2018_vega)))) {
+   link->reported_link_cap.link_rate = LINK_RATE_RBR2;
+   }
+   }
+
memset(&link->dpcd_caps.dsc_caps, '\0',
sizeof(link->dpcd_caps.dsc_caps));
memset(&link->dpcd_caps.fec_cap, '\0', sizeof(link->dpcd_caps.fec_cap));
-- 
2.25.1




[PATCH 2/2] drm/radeon/uvd: Fix forgotten unmap buffer objects

2022-01-29 Thread zhanglianjie
After the buffer object is successfully mapped, call radeon_bo_kunmap before 
the function returns.

Signed-off-by: zhanglianjie 

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c
index 377f9cdb5b53..c5482f7793db 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -497,6 +497,7 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
handle = msg[2];

if (handle == 0) {
+   radeon_bo_kunmap(bo);
DRM_ERROR("Invalid UVD handle!\n");
return -EINVAL;
}
@@ -559,11 +560,10 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
return 0;

default:
-
DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
-   return -EINVAL;
}

+   radeon_bo_kunmap(bo);
BUG();
return -EINVAL;
 }
--
2.20.1





RE: [PATCH] drm/amdgpu: Add judgement to avoid infinite loop

2022-01-29 Thread Chai, Thomas
There is a solution for this defect; I am debugging the modifications. 

-Original Message-
From: Zhou1, Tao  
Sent: Saturday, January 29, 2022 3:54 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH] drm/amdgpu: Add judgement to avoid infinite loop

[AMD Official Use Only]

As a quick workaround, I agree with the solution. But regarding the root cause, 
the list is still messed up.
Can we make ras_list a global variable shared across all cards, and add a 
list-empty check (or a flag indicating whether the ras block is already 
registered) before the list add, to avoid redundant registration?

Regards,
Tao

> -Original Message-
> From: Chai, Thomas 
> Sent: Saturday, January 29, 2022 11:53 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH] drm/amdgpu: Add judgement to avoid infinite loop
> 
> 1. The infinite loop causing soft lock occurs on multiple amdgpu cards
>supporting ras feature.
> 2. This is a workaround patch. It is valid for multiple amdgpu cards of the
>same type.
> 3. The root cause is that each GPU card device has a separate .ras_list
>link header, but the instance and linked list node of each ras block
>are unique. When each device is initialized, each ras instance will
>repeatedly add link node to the device every time. In this way, only
>the .ras_list of the last initialized device is completely correct.
>The .ras_list->prev and .ras_list->next of a device initialized
>earlier can still point to the correct ras instance, but the prev
>pointer and next pointer of that ras instance both point to
>the last initialized device's .ras_list instead of the original
>.ras_list. When list_for_each_entry_safe searches for
>non-existent ras nodes on devices other than the last device, the
>last ras instance's next pointer is never equal to the
>starting .ras_list, so the loop cannot terminate and the
>program enters an infinite loop.
>  BTW: Since the data and initialization process of each card are the same,
>   the link list between ras instances will not be destroyed every time
>   the device is initialized.
>  4. The soft locked logs are as follows:
> [  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G   OE
> 5.13.0-27-generic #29~20.04.1-Ubuntu
> [  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, 
> BIOS T20200717143848 07/17/2020 [  262.165698] Workqueue: events 
> amdgpu_ras_do_recovery [amdgpu] [  262.165980] RIP:
> 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [  262.166239] Code: 
> 68
> d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 
> 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 
> 89 f5 48 83 e8 28 48
> 39 d3 74 25 49 89 c6 49 8b 45 [  262.166243] RSP: 
> 0018:ac908fa87d80
> EFLAGS: 0202 [  262.166247] RAX: c1394248 RBX: 
> 91e4ab8d6e20
> RCX: c1394248 [  262.166249] RDX: 91e4aa356e20 RSI:
> 000e RDI: 91e4ab8c [  262.166252] RBP:
> ac908fa87da8 R08: 0007 R09: 0001 [  
> 262.166254] R10: 91e4930b64ec R11:  R12:
> 000e [  262.166256] R13: 91e4aa356df8 R14: 
> c1394320
> R15: 0003 [  262.166258] FS:  ()
> GS:92238fb4() knlGS: [  262.166261] CS:  
> 0010
> DS:  ES:  CR0: 80050033 [  262.166264] CR2:
> 0001004865d0 CR3: 00406d796000 CR4: 00350ee0 [  
> 262.166267] Call Trace:
> [  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu] [  
> 262.166529]  ? psi_task_switch+0xd2/0x250 [  262.166537]  ?
> __switch_to+0x11d/0x460 [  262.166542]  ? __switch_to_asm+0x36/0x70 [  
> 262.166549]  process_one_work+0x220/0x3c0 [  262.166556]
> worker_thread+0x4d/0x3f0 [  262.166560]  ? 
> process_one_work+0x3c0/0x3c0 [  262.166563]  kthread+0x12b/0x150 [  
> 262.166568]  ?
> set_kthread_struct+0x40/0x40 [  262.166571]  ret_from_fork+0x22/0x30
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index d4e07d0acb66..3d533ef0783d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -884,6 +884,7 @@ static int amdgpu_ras_block_match_default(struct
> amdgpu_ras_block_object *block_  static struct amdgpu_ras_block_object 
> *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
>   enum amdgpu_ras_block block,
> uint32_t sub_block_index)  {
> + int loop_cnt = 0;
>   struct amdgpu_ras_block_object *obj, *tmp;
> 
>   if (block >= AMDGPU_RAS_BLOCK__LAST) @@ -900,6 +901,9 @@ static 
> struct