RE: [PATCH] drm/amdkfd: add sdma poison consumption handling

2021-06-04 Thread Li, Dennis
[AMD Official Use Only]

This patch looks good to me.

Reviewed-by: Dennis Li

-Original Message-
From: Zhang, Hawking  
Sent: Friday, June 4, 2021 12:58 PM
To: amd-gfx@lists.freedesktop.org; Li, Dennis ; Deucher, 
Alexander ; Kuehling, Felix 
Cc: Zhang, Hawking 
Subject: [PATCH] drm/amdkfd: add sdma poison consumption handling

Follow the same apporach as GFX to handle SDMA poison consumption. Send SIGBUS 
to application when receives SDMA_ECC interrupt and issue gpu reset either mode 
2 or mode 1 to get the engine back

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 7 ++-
 drivers/gpu/drm/amd/amdkfd/soc15_int.h  | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 02dd12774261..2e2b616c1bb7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -309,8 +309,13 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
   client_id == SOC15_IH_CLIENTID_SDMA5 ||
   client_id == SOC15_IH_CLIENTID_SDMA6 ||
   client_id == SOC15_IH_CLIENTID_SDMA7) {
-   if (source_id == SOC15_INTSRC_SDMA_TRAP)
+   if (source_id == SOC15_INTSRC_SDMA_TRAP) {
kfd_signal_event_interrupt(pasid, context_id0 & 
0xfff, 28);
+   } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
+   kfd_signal_poison_consumed_event(dev, pasid);
+   amdgpu_amdkfd_gpu_reset(dev->kgd);
+   return;
+   }
} else if (client_id == SOC15_IH_CLIENTID_VMC ||
   client_id == SOC15_IH_CLIENTID_VMC1 ||
   client_id == SOC15_IH_CLIENTID_UTCL2) { diff --git 
a/drivers/gpu/drm/amd/amdkfd/soc15_int.h 
b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
index 0bc0b25cb410..daf3c44547d3 100644
--- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h
+++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
@@ -30,6 +30,7 @@
 #define SOC15_INTSRC_SQ_INTERRUPT_MSG  239
 #define SOC15_INTSRC_VMC_FAULT 0
 #define SOC15_INTSRC_SDMA_TRAP 224
+#define SOC15_INTSRC_SDMA_ECC  220
 
 
 #define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff)
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 6/7] drm/amdgpu: enable ras error count query and reset for HDP

2021-04-29 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

This patch looks good to me.

Reviewed-by: Dennis Li 

-Original Message-
From: Hawking Zhang  
Sent: Thursday, April 29, 2021 2:26 PM
To: Deucher, Alexander ; Li, Dennis 
; Clements, John ; 
amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: [PATCH 6/7] drm/amdgpu: enable ras error count query and reset for HDP

add hdp block ras error query and reset support in amdgpu ras error count query 
and reset interface

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  4 
 drivers/gpu/drm/amd/amdgpu/soc15.c  |  3 ---
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ae9fb20..984e827 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -890,6 +890,11 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
adev->gmc.xgmi.ras_funcs->query_ras_error_count)
adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, 
&err_data);
break;
+   case AMDGPU_RAS_BLOCK__HDP:
+   if (adev->hdp.ras_funcs &&
+   adev->hdp.ras_funcs->query_ras_error_count)
+   adev->hdp.ras_funcs->query_ras_error_count(adev, 
&err_data);
+   break;
default:
break;
}
@@ -967,6 +972,11 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
if (adev->sdma.funcs->reset_ras_error_count)
adev->sdma.funcs->reset_ras_error_count(adev);
break;
+   case AMDGPU_RAS_BLOCK__HDP:
+   if (adev->hdp.ras_funcs &&
+   adev->hdp.ras_funcs->reset_ras_error_count)
+   adev->hdp.ras_funcs->reset_ras_error_count(adev);
+   break;
default:
break;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 8e0cab5..3daf806 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1276,6 +1276,10 @@ static int gmc_v9_0_late_init(void *handle)
adev->mmhub.ras_funcs->reset_ras_error_count)
adev->mmhub.ras_funcs->reset_ras_error_count(adev);
 
+   if (adev->hdp.ras_funcs &&
+   adev->hdp.ras_funcs->reset_ras_error_count)
+   adev->hdp.ras_funcs->reset_ras_error_count(adev);
+
r = amdgpu_gmc_ras_late_init(adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index d80e12b..28e9f6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1521,9 +1521,6 @@ static int soc15_common_late_init(void *handle)
if (amdgpu_sriov_vf(adev))
xgpu_ai_mailbox_get_irq(adev);
 
-   if (adev->hdp.funcs->reset_ras_error_count)
-   adev->hdp.funcs->reset_ras_error_count(adev);
-
if (adev->nbio.ras_funcs &&
adev->nbio.ras_funcs->ras_late_init)
r = adev->nbio.ras_funcs->ras_late_init(adev);
--
2.7.4
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 7/7] drm/amdgpu: retired reset_ras_error_count from hdp callbacks

2021-04-29 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

This patch looks good to me.

Reviewed-by: Dennis Li 

-Original Message-
From: Hawking Zhang  
Sent: Thursday, April 29, 2021 2:26 PM
To: Deucher, Alexander ; Li, Dennis 
; Clements, John ; 
amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: [PATCH 7/7] drm/amdgpu: retired reset_ras_error_count from hdp 
callbacks

It was moved to hdp ras callbacks

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h | 1 -
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c   | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
index ba6f272..7ec99d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
@@ -35,7 +35,6 @@ struct amdgpu_hdp_funcs {
void (*flush_hdp)(struct amdgpu_device *adev, struct amdgpu_ring *ring);
void (*invalidate_hdp)(struct amdgpu_device *adev,
   struct amdgpu_ring *ring);
-   void (*reset_ras_error_count)(struct amdgpu_device *adev);
void (*update_clock_gating)(struct amdgpu_device *adev, bool enable);
void (*get_clock_gating_state)(struct amdgpu_device *adev, u32 *flags);
void (*init_registers)(struct amdgpu_device *adev); diff --git 
a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
index 330c0f0..74b90cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
@@ -159,7 +159,6 @@ const struct amdgpu_hdp_ras_funcs hdp_v4_0_ras_funcs = {  
const struct amdgpu_hdp_funcs hdp_v4_0_funcs = {
.flush_hdp = hdp_v4_0_flush_hdp,
.invalidate_hdp = hdp_v4_0_invalidate_hdp,
-   .reset_ras_error_count = hdp_v4_0_reset_ras_error_count,
.update_clock_gating = hdp_v4_0_update_clock_gating,
.get_clock_gating_state = hdp_v4_0_get_clockgating_state,
.init_registers = hdp_v4_0_init_registers,
--
2.7.4
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 5/7] drm/amdgpu: init/fini hdp v4_0 ras

2021-04-29 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

This patch looks good to me.

Reviewed-by: Dennis Li 

-Original Message-
From: Hawking Zhang  
Sent: Thursday, April 29, 2021 2:26 PM
To: Deucher, Alexander ; Li, Dennis 
; Clements, John ; 
amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: [PATCH 5/7] drm/amdgpu: init/fini hdp v4_0 ras

invoke hdp v4_0 ras init in gmc late_init phase while ras fini in gmc sw_fini 
phase

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index dfa67c2..697ab26 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -455,6 +455,13 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
return r;
}
 
+   if (adev->hdp.ras_funcs &&
+   adev->hdp.ras_funcs->ras_late_init) {
+   r = adev->hdp.ras_funcs->ras_late_init(adev);
+   if (r)
+   return r;
+   }
+
return 0;
 }
 
@@ -471,6 +478,10 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
if (adev->gmc.xgmi.ras_funcs &&
adev->gmc.xgmi.ras_funcs->ras_fini)
adev->gmc.xgmi.ras_funcs->ras_fini(adev);
+
+   if (adev->hdp.ras_funcs &&
+   adev->hdp.ras_funcs->ras_fini)
+   adev->hdp.ras_funcs->ras_fini(adev);
 }
 
/*
--
2.7.4
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 4/7] drm/amdgpu: initialize hdp v4_0 ras functions

2021-04-29 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

This patch looks good to me.

Reviewed-by: Dennis Li 

-Original Message-
From: Hawking Zhang  
Sent: Thursday, April 29, 2021 2:26 PM
To: Deucher, Alexander ; Li, Dennis 
; Clements, John ; 
amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: [PATCH 4/7] drm/amdgpu: initialize hdp v4_0 ras functions

hdp v4_0 support ras features

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 4da8b3d..8e0cab5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -53,6 +53,7 @@
 #include "mmhub_v1_7.h"
 #include "umc_v6_1.h"
 #include "umc_v6_0.h"
+#include "hdp_v4_0.h"
 
 #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
 
@@ -1210,6 +1211,11 @@ static void gmc_v9_0_set_gfxhub_funcs(struct 
amdgpu_device *adev)
adev->gfxhub.funcs = &gfxhub_v1_0_funcs;  }
 
+static void gmc_v9_0_set_hdp_ras_funcs(struct amdgpu_device *adev) {
+   adev->hdp.ras_funcs = &hdp_v4_0_ras_funcs; }
+
 static int gmc_v9_0_early_init(void *handle)  {
struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -1230,6 
+1236,7 @@ static int gmc_v9_0_early_init(void *handle)
gmc_v9_0_set_mmhub_funcs(adev);
gmc_v9_0_set_mmhub_ras_funcs(adev);
gmc_v9_0_set_gfxhub_funcs(adev);
+   gmc_v9_0_set_hdp_ras_funcs(adev);
 
adev->gmc.shared_aperture_start = 0x2000ULL;
adev->gmc.shared_aperture_end =
--
2.7.4
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 3/7] drm/amdgpu: implement hdp v4_0 ras functions

2021-04-29 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

This patch looks good to me.

Reviewed-by: Dennis Li 

-Original Message-
From: Hawking Zhang  
Sent: Thursday, April 29, 2021 2:26 PM
To: Deucher, Alexander ; Li, Dennis 
; Clements, John ; 
amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: [PATCH 3/7] drm/amdgpu: implement hdp v4_0 ras functions

implement hdp v4_0 ras functions, including ras init/fini, 
query/reset_error_counter

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c | 30 --  
drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h |  1 +
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
index edbd35d..330c0f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
@@ -59,12 +59,31 @@ static void hdp_v4_0_invalidate_hdp(struct amdgpu_device 
*adev,
HDP, 0, mmHDP_READ_CACHE_INVALIDATE), 1);  }
 
+static void hdp_v4_0_query_ras_error_count(struct amdgpu_device *adev,
+  void *ras_error_status)
+{
+   struct ras_err_data *err_data = (struct ras_err_data 
+*)ras_error_status;
+
+   err_data->ue_count = 0;
+   err_data->ce_count = 0;
+
+   if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP))
+   return;
+
+   /* HDP SRAM errors are uncorrectable ones (i.e. fatal errors) */
+   err_data->ue_count += RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT); };
+
 static void hdp_v4_0_reset_ras_error_count(struct amdgpu_device *adev)  {
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP))
return;
-   /*read back hdp ras counter to reset it to 0 */
-   RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT);
+
+   if (adev->asic_type >= CHIP_ALDEBARAN)
+   WREG32_SOC15(HDP, 0, mmHDP_EDC_CNT, 0);
+   else
+   /*read back hdp ras counter to reset it to 0 */
+   RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT);
 }
 
 static void hdp_v4_0_update_clock_gating(struct amdgpu_device *adev, @@ -130,6 
+149,13 @@ static void hdp_v4_0_init_registers(struct amdgpu_device *adev)
WREG32_SOC15(HDP, 0, mmHDP_NONSURFACE_BASE_HI, (adev->gmc.vram_start >> 
40));  }
 
+const struct amdgpu_hdp_ras_funcs hdp_v4_0_ras_funcs = {
+   .ras_late_init = amdgpu_hdp_ras_late_init,
+   .ras_fini = amdgpu_hdp_ras_fini,
+   .query_ras_error_count = hdp_v4_0_query_ras_error_count,
+   .reset_ras_error_count = hdp_v4_0_reset_ras_error_count, };
+
 const struct amdgpu_hdp_funcs hdp_v4_0_funcs = {
.flush_hdp = hdp_v4_0_flush_hdp,
.invalidate_hdp = hdp_v4_0_invalidate_hdp, diff --git 
a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h
index d1e6399..dc3a1b8 100644
--- a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h
@@ -27,5 +27,6 @@
 #include "soc15_common.h"
 
 extern const struct amdgpu_hdp_funcs hdp_v4_0_funcs;
+extern const struct amdgpu_hdp_ras_funcs hdp_v4_0_ras_funcs;
 
 #endif
--
2.7.4
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 2/7] drm/amdgpu: add helpers for hdp ras init/fini

2021-04-29 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

>>+ r = amdgpu_ras_late_init(adev, adev->hdp.ras_if,
>>+  &fs_info, &ih_info);
>>+ if (r || !amdgpu_ras_is_supported(adev, adev->hdp.ras_if->block)) {
>>+ kfree(adev->hdp.ras_if);
>>+ adev->hdp.ras_if = NULL;
>>+ }

It is better to move amdgpu_ras_is_supported more early, to avoid redundant 
memory allocation when HDP doesn't support RAS. Except  this, it looks good to 
me.

Reviewed-by: Dennis Li 

-Original Message-
From: Hawking Zhang  
Sent: Thursday, April 29, 2021 2:26 PM
To: Deucher, Alexander ; Li, Dennis 
; Clements, John ; 
amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: [PATCH 2/7] drm/amdgpu: add helpers for hdp ras init/fini

hdp ras init/fini are common functions that can be shared among hdp generations

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c | 69 + 
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h |  2 +
 3 files changed, 72 insertions(+), 1 deletion(-)  create mode 100644 
drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index ee85e8a..418e674 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -56,7 +56,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_gmc.o amdgpu_mmhub.o amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o 
amdgpu_vm_cpu.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
-   amdgpu_fw_attestation.o amdgpu_securedisplay.o
+   amdgpu_fw_attestation.o amdgpu_securedisplay.o amdgpu_hdp.o
 
 amdgpu-$(CONFIG_PERF_EVENTS) += amdgpu_pmu.o
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
new file mode 100644
index 000..1d50d53
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person 
+obtaining a
+ * copy of this software and associated documentation files (the 
+"Software"),
+ * to deal in the Software without restriction, including without 
+limitation
+ * the rights to use, copy, modify, merge, publish, distribute, 
+sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom 
+the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT 
+SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
+DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu.h"
+#include "amdgpu_ras.h"
+
+int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev) {
+   int r;
+   struct ras_ih_if ih_info = {
+   .cb = NULL,
+   };
+   struct ras_fs_if fs_info = {
+   .sysfs_name = "hdp_err_count",
+   };
+
+   if (!adev->hdp.ras_if) {
+   adev->hdp.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
+   if (!adev->hdp.ras_if)
+   return -ENOMEM;
+   adev->hdp.ras_if->block = AMDGPU_RAS_BLOCK__HDP;
+   adev->hdp.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->hdp.ras_if->sub_block_index = 0;
+   strcpy(adev->hdp.ras_if->name, "hdp");
+   }
+   ih_info.head = fs_info.head = *adev->hdp.ras_if;
+   r = amdgpu_ras_late_init(adev, adev->hdp.ras_if,
+&fs_info, &ih_info);
+   if (r || !amdgpu_ras_is_supported(adev, adev->hdp.ras_if->block)) {
+   kfree(adev->hdp.ras_if);
+   adev->hdp.ras_if = NULL;
+   }

It is better to move amdgpu_ras_is_supported more early, to avoid redundant 
memory allocation when HDP doesn't support RAS. 

+
+   return r;
+}
+
+void amdgpu_hdp_ras_fini(struct amdgpu_device *adev) {
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP) &&
+   adev->hdp.ras_if) {
+   struct ras_common_if *ras_if = adev->hdp.ras_if;
+   struct ras_ih_if ih_info = {

RE: [PATCH] drm/amdgpu: disable gfx ras by default in aldebaran

2021-04-22 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Looks good to me.

Reviewed-by: Dennis Li 

Best  Regards
Dennis Li
-Original Message-
From: Hawking Zhang  
Sent: Thursday, April 22, 2021 10:06 PM
To: amd-gfx@lists.freedesktop.org; Li, Dennis 
Cc: Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: disable gfx ras by default in aldebaran

aldebaran gfx ras is still under development

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bb0d027..f62873f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2066,8 +2066,7 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev,
} else {
/* driver only manages a few IP blocks RAS feature
 * when GPU is connected cpu through XGMI */
-   *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX |
-   1 << AMDGPU_RAS_BLOCK__SDMA |
+   *hw_supported |= (1 << AMDGPU_RAS_BLOCK__SDMA |
1 << AMDGPU_RAS_BLOCK__MMHUB);
}
 
-- 
2.7.4
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: correct default gfx wdt timeout setting

2021-04-15 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Dennis Li 

-Original Message-
From: Zhang, Hawking  
Sent: Friday, April 16, 2021 2:46 PM
To: amd-gfx@lists.freedesktop.org; Li, Dennis 
Cc: Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: correct default gfx wdt timeout setting

When gfx wdt was configured to fatal_disable, the timeout period should be 
configured to 0x0 (timeout
disabled)

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index f856a2e82f94..7d9b954bc0c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -189,7 +189,7 @@ uint amdgpu_ras_mask = 0x;  int 
amdgpu_bad_page_threshold = -1;  struct amdgpu_watchdog_timer 
amdgpu_watchdog_timer = {
.timeout_fatal_disable = false,
-   .period = 0x23, /* default to max. timeout = 1 << 0x23 cycles */
+   .period = 0x0, /* default to 0x0 (timeout disable) */
 };
 
 /**
@@ -566,7 +566,7 @@ module_param_named(timeout_fatal_disable, 
amdgpu_watchdog_timer.timeout_fatal_di
  * DOC: timeout_period (uint)
  * Modify the watchdog timeout max_cycles as (1 << period)
  */
-MODULE_PARM_DESC(timeout_period, "watchdog timeout period (1 to 0x23(default), 
timeout maxCycles = (1 << period)");
+MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0 = timeout 
+disabled, 1 ~ 0x23 = timeout maxcycles = (1 << period)");
 module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644);
 
 /**
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 0/4] Refine GPU recovery sequence to enhance its stability

2021-04-13 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian and Andrey,
  We maybe try to implement "wait" callback function of dma_fence_ops, when 
GPU reset or unplug happen, make this callback return - ENODEV, to notify the 
caller device lost. 

 * Must return -ERESTARTSYS if the wait is intr = true and the wait was
 * interrupted, and remaining jiffies if fence has signaled, or 0 if 
wait
 * timed out. Can also return other error values on custom 
implementations,
 * which should be treated as if the fence is signaled. For example a 
hardware
 * lockup could be reported like that.
 *
 * This callback is optional.
 */
signed long (*wait)(struct dma_fence *fence,
bool intr, signed long timeout);

Best Regards
Dennis Li
-Original Message-
From: Christian König  
Sent: Tuesday, April 13, 2021 3:10 PM
To: Grodzovsky, Andrey ; Koenig, Christian 
; Li, Dennis ; 
amd-gfx@lists.freedesktop.org; Deucher, Alexander ; 
Kuehling, Felix ; Zhang, Hawking 
; Daniel Vetter 
Subject: Re: [PATCH 0/4] Refine GPU recovery sequence to enhance its stability

Am 12.04.21 um 22:01 schrieb Andrey Grodzovsky:
>
> On 2021-04-12 3:18 p.m., Christian König wrote:
>> Am 12.04.21 um 21:12 schrieb Andrey Grodzovsky:
>>> [SNIP]
>>>>>
>>>>> So what's the right approach ? How we guarantee that when running 
>>>>> amdgpu_fence_driver_force_completion we will signal all the HW 
>>>>> fences and not racing against some more fences insertion into that 
>>>>> array ?
>>>>>
>>>>
>>>> Well I would still say the best approach would be to insert this 
>>>> between the front end and the backend and not rely on signaling 
>>>> fences while holding the device srcu.
>>>
>>>
>>> My question is, even now, when we run 
>>> amdgpu_fence_driver_fini_hw->amdgpu_fence_wait_empty or 
>>> amdgpu_fence_driver_fini_hw->amdgpu_fence_driver_force_completion,
>>> what there prevents a race with another fence being at the same time 
>>> emitted and inserted into the fence array ? Looks like nothing.
>>>
>>
>> Each ring can only be used by one thread at the same time, this 
>> includes emitting fences as well as other stuff.
>>
>> During GPU reset we make sure nobody writes to the rings by stopping 
>> the scheduler and taking the GPU reset lock (so that nobody else can 
>> start the scheduler again).
>
>
> What about direct submissions not through scheduler - 
> amdgpu_job_submit_direct, I don't see how this is protected.

Those only happen during startup and GPU reset.

>>
>>>>
>>>> BTW: Could it be that the device SRCU protects more than one device 
>>>> and we deadlock because of this?
>>>
>>>
>>> I haven't actually experienced any deadlock until now but, yes, 
>>> drm_unplug_srcu is defined as static in drm_drv.c and so in the 
>>> presence  of multiple devices from same or different drivers we in 
>>> fact are dependent on all their critical sections i guess.
>>>
>>
>> Shit, yeah the devil is a squirrel. So for A+I laptops we actually 
>> need to sync that up with Daniel and the rest of the i915 guys.
>>
>> IIRC we could actually have an amdgpu device in a docking station 
>> which needs hotplug and the driver might depend on waiting for the
>> i915 driver as well.
>
>
> Can't we propose a patch to make drm_unplug_srcu per drm_device ? I 
> don't see why it has to be global and not per device thing.

I'm really wondering the same thing for quite a while now.

Adding Daniel as well, maybe he knows why the drm_unplug_srcu is global.

Regards,
Christian.

>
> Andrey
>
>
>>
>> Christian.
>>
>>> Andrey
>>>
>>>
>>>>
>>>> Christian.
>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>     /* Past this point no more fence are submitted to HW ring 
>>>>>>>>> and hence we can safely call force signal on all that are 
>>>>>>>>> currently there.
>>>>>>>>>  * Any subsequently created  HW fences will be returned 
>>>>>>>>> signaled with an error code right away
>>>>>>>&

RE: [PATCH Review 1/1] drm/amdgpu: support sdma error injection

2021-04-01 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Reivewed-by: Dennis Li 

-Original Message-
From: Stanley.Yang  
Sent: Thursday, April 1, 2021 7:14 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 
; Li, Dennis ; Yang, Stanley 

Subject: [PATCH Review 1/1] drm/amdgpu: support sdma error injection

Signed-off-by: Stanley.Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0e16683876aa..d9d292c79cfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -927,6 +927,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
ret = -EINVAL;
break;
case AMDGPU_RAS_BLOCK__UMC:
+   case AMDGPU_RAS_BLOCK__SDMA:
case AMDGPU_RAS_BLOCK__MMHUB:
case AMDGPU_RAS_BLOCK__PCIE_BIF:
ret = psp_ras_trigger_error(&adev->psp, &block_info);
-- 
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 1/2] drm/amd/pm: drop redundant and unneeded BACO APIs

2021-03-19 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Evan,
  It is better to replace all nv_asic_supports_baco callings with 
amdgpu_dpm_is_baco_supported, and remove nv_asic_supports_baco function. 

>>> static bool nv_asic_supports_baco(struct amdgpu_device *adev)  {
>>> -   struct smu_context *smu = &adev->smu;
>>>-
>>>-if (smu_baco_is_support(smu))
>>>-return true;
>>>-else
>>>-return false;
>>>+return amdgpu_dpm_is_baco_supported(adev);
>>>}

Best Regards
Dennis Li
-Original Message-
From: amd-gfx  On Behalf Of Evan Quan
Sent: Friday, March 19, 2021 5:12 PM
To: amd-gfx@lists.freedesktop.org
Cc: Quan, Evan 
Subject: [PATCH 1/2] drm/amd/pm: drop redundant and unneeded BACO APIs

Use other APIs which are with the same functionality but much more clean.

Change-Id: I5e9e0ab5d39b49b02434f18e12392b13931396be
Signed-off-by: Evan Quan 
---
 drivers/gpu/drm/amd/amdgpu/nv.c   | 20 +
 drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h   |  9 ---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 95 ---
 3 files changed, 3 insertions(+), 121 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c 
index e9cc3201054f..2670ae00c2e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -517,19 +517,12 @@ static int nv_asic_mode2_reset(struct amdgpu_device *adev)
 
 static bool nv_asic_supports_baco(struct amdgpu_device *adev)  {
-   struct smu_context *smu = &adev->smu;
-
-   if (smu_baco_is_support(smu))
-   return true;
-   else
-   return false;
+   return amdgpu_dpm_is_baco_supported(adev);
 }
 
 static enum amd_reset_method
 nv_asic_reset_method(struct amdgpu_device *adev)  {
-   struct smu_context *smu = &adev->smu;
-
if (amdgpu_reset_method == AMD_RESET_METHOD_MODE1 ||
amdgpu_reset_method == AMD_RESET_METHOD_MODE2 ||
amdgpu_reset_method == AMD_RESET_METHOD_BACO || @@ -548,7 +541,7 @@ 
nv_asic_reset_method(struct amdgpu_device *adev)
case CHIP_DIMGREY_CAVEFISH:
return AMD_RESET_METHOD_MODE1;
default:
-   if (smu_baco_is_support(smu))
+   if (amdgpu_dpm_is_baco_supported(adev))
return AMD_RESET_METHOD_BACO;
else
return AMD_RESET_METHOD_MODE1;
@@ -558,7 +551,6 @@ nv_asic_reset_method(struct amdgpu_device *adev)  static 
int nv_asic_reset(struct amdgpu_device *adev)  {
int ret = 0;
-   struct smu_context *smu = &adev->smu;
 
switch (nv_asic_reset_method(adev)) {
case AMD_RESET_METHOD_PCI:
@@ -567,13 +559,7 @@ static int nv_asic_reset(struct amdgpu_device *adev)
break;
case AMD_RESET_METHOD_BACO:
dev_info(adev->dev, "BACO reset\n");
-
-   ret = smu_baco_enter(smu);
-   if (ret)
-   return ret;
-   ret = smu_baco_exit(smu);
-   if (ret)
-   return ret;
+   ret = amdgpu_dpm_baco_reset(adev);
break;
case AMD_RESET_METHOD_MODE2:
dev_info(adev->dev, "MODE2 reset\n"); diff --git 
a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index 517f333fbc4b..02675155028d 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -1285,15 +1285,6 @@ int smu_get_power_limit(struct smu_context *smu,
uint32_t *limit,
enum smu_ppt_limit_level limit_level);
 
-int smu_set_azalia_d3_pme(struct smu_context *smu);
-
-bool smu_baco_is_support(struct smu_context *smu);
-
-int smu_baco_get_state(struct smu_context *smu, enum smu_baco_state *state);
-
-int smu_baco_enter(struct smu_context *smu); -int smu_baco_exit(struct 
smu_context *smu);
-
 bool smu_mode1_reset_is_support(struct smu_context *smu);  bool 
smu_mode2_reset_is_support(struct smu_context *smu);  int 
smu_mode1_reset(struct smu_context *smu); diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 4120d28f782b..1bb0c0966e3d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2682,48 +2682,6 @@ static int smu_set_xgmi_pstate(void *handle,
return ret;
 }
 
-int smu_set_azalia_d3_pme(struct smu_context *smu) -{
-   int ret = 0;
-
-   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
-   return -EOPNOTSUPP;
-
-   mutex_lock(&smu->mutex);
-
-   if (smu->ppt_funcs->set_azalia_d3_pme)
-   ret = smu->ppt_funcs->set_azalia_d3_pme(smu);
-
-   mutex_unlock(&smu->mutex);
-
-   return ret;
-}
-
-/*
- * On system suspending or resetting, the dpm_enabled
- * flag will be cleared. So that those SMU services which
- * are not supported will be gated.
- *
- * However, the baco/mod

RE: [PATCH] drm/amdgpu: Fix the page fault issue in amdgpu_irq_fini

2021-03-19 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Emily,
  What about refine struct amdgpu_irq_src with refcount? Your change could 
fix this issue, but it is unreadable. 

Best Regards
Dennis Li
-Original Message-
From: amd-gfx  On Behalf Of Deng, Emily
Sent: Friday, March 19, 2021 9:38 AM
To: Christian König ; 
amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu: Fix the page fault issue in amdgpu_irq_fini

[AMD Official Use Only - Internal Distribution Only]

[AMD Official Use Only - Internal Distribution Only]

>-Original Message-
>From: Christian König 
>Sent: Thursday, March 18, 2021 7:52 PM
>To: Deng, Emily ; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the page fault issue in 
>amdgpu_irq_fini
>
>Am 18.03.21 um 12:48 schrieb Emily Deng:
>> For some source, it will be shared by some client ID and source ID.
>> To fix the page fault issue, set all those to null.
>>
>> Signed-off-by: Emily Deng 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 16 +---
>>   1 file changed, 13 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> index af026109421a..623b1ac6231d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> @@ -359,7 +359,7 @@ int amdgpu_irq_init(struct amdgpu_device *adev)
>>*/
>>   void amdgpu_irq_fini(struct amdgpu_device *adev)
>>   {
>> -unsigned i, j;
>> +unsigned i, j, m, n;
>>
>>   if (adev->irq.installed) {
>>   drm_irq_uninstall(adev_to_drm(adev));
>> @@ -380,12 +380,22 @@ void amdgpu_irq_fini(struct amdgpu_device
>*adev)
>>   if (!src)
>>   continue;
>>
>> -kfree(src->enabled_types);
>> +if (src->enabled_types)
>> +kfree(src->enabled_types);
>
>A NULL check before kfree() is unecessary and will be complained about 
>by the static checkers.
Sorry, will remove this.
>
>> +
>>   src->enabled_types = NULL;
>> +
>
>Unrelated white space change.
Sorry, will remove this also.
>
>>   if (src->data) {
>>   kfree(src->data);
>>   kfree(src);
>> -adev->irq.client[i].sources[j] = NULL;
>> +}
>> +
>> +for (m = 0; m < AMDGPU_IRQ_CLIENTID_MAX; ++m) { if 
>> +(!adev->irq.client[m].sources) continue; for (n = 0; n < 
>> +AMDGPU_MAX_IRQ_SRC_ID;
>++n)
>> +if (adev->irq.client[m].sources[n] ==
>src)
>> +adev->irq.client[m].sources[n]
>= NULL;
>
>Hui what? The memory you set to NULL here is freed on the line below.
>
>Accessing it after that would be illegal, so why do you want to set it to NULL?
[Emily] It is in the loop "for (j = 0; j < AMDGPU_MAX_IRQ_SRC_ID; ++j) {", 
shouldn't have been freed in this loop. Only set " 
adev->irq.client[i].sources[j] = NULL;" is not enough, as it maybe have other 
client ID and src ID will share the same src. Also need to set those to NULL.
>
>Christian.
>
>>   }
>>   }
>>   kfree(adev->irq.client[i].sources);

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7CDennis.Li%40amd.com%7C3e94ebf6f1d34e31898e08d8ea77b613%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637517147175478166%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=m2sQW4Ncv40K97wxOgC%2BSFiT8yhy6996E%2FR%2FMWLoh64%3D&reserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 0/4] Refine GPU recovery sequence to enhance its stability

2021-03-18 Thread Li, Dennis
>>> The GPU reset doesn't complete the fences we wait for. It only completes 
>>> the hardware fences as part of the reset.
>>> So waiting for a fence while holding the reset lock is illegal and needs to 
>>> be avoided.
I understood your concern. It is more complex for DRM GFX, therefore I abandon 
adding lock protection for DRM ioctls now. Maybe we can try to add all kernel  
dma_fence waiting in a list, and signal all in recovery threads. Do you have 
same concern for compute cases?

>>> Lockdep also complains about this when it is used correctly. The only 
>>> reason it doesn't complain here is because you use an atomic+wait_event 
>>> instead of a locking primitive.
Agree. This approach will escape the monitor of lockdep.  Its goal is to block 
other threads when GPU recovery thread start. But I couldn’t find a better 
method to solve this problem. Do you have some suggestion?

Best Regards
Dennis Li

From: Koenig, Christian 
Sent: Thursday, March 18, 2021 4:59 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Zhang, Hawking 
Subject: AW: [PATCH 0/4] Refine GPU recovery sequence to enhance its stability

Exactly that's what you don't seem to understand.

The GPU reset doesn't complete the fences we wait for. It only completes the 
hardware fences as part of the reset.

So waiting for a fence while holding the reset lock is illegal and needs to be 
avoided.

Lockdep also complains about this when it is used correctly. The only reason it 
doesn't complain here is because you use an atomic+wait_event instead of a 
locking primitive.

Regards,
Christian.


Von: Li, Dennis mailto:dennis...@amd.com>>
Gesendet: Donnerstag, 18. März 2021 09:28
An: Koenig, Christian 
mailto:christian.koe...@amd.com>>; 
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
mailto:amd-gfx@lists.freedesktop.org>>; Deucher, 
Alexander mailto:alexander.deuc...@amd.com>>; 
Kuehling, Felix mailto:felix.kuehl...@amd.com>>; Zhang, 
Hawking mailto:hawking.zh...@amd.com>>
Betreff: RE: [PATCH 0/4] Refine GPU recovery sequence to enhance its stability

>>> Those two steps need to be exchanged or otherwise it is possible that new 
>>> delayed work items etc are started before the lock is taken.
What about adding check for adev->in_gpu_reset in work item? If exchange the 
two steps, it maybe introduce the deadlock.  For example, the user thread hold 
the read lock and waiting for the fence, if recovery thread try to hold write 
lock and then complete fences, in this case, recovery thread will always be 
blocked.

Best Regards
Dennis Li
-Original Message-
From: Koenig, Christian 
mailto:christian.koe...@amd.com>>
Sent: Thursday, March 18, 2021 3:54 PM
To: Li, Dennis mailto:dennis...@amd.com>>; 
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; Deucher, 
Alexander mailto:alexander.deuc...@amd.com>>; 
Kuehling, Felix mailto:felix.kuehl...@amd.com>>; Zhang, 
Hawking mailto:hawking.zh...@amd.com>>
Subject: Re: [PATCH 0/4] Refine GPU recovery sequence to enhance its stability

Am 18.03.21 um 08:23 schrieb Dennis Li:
> We have defined two variables in_gpu_reset and reset_sem in adev object. The 
> atomic type variable in_gpu_reset is used to avoid recovery thread reenter 
> and make lower functions return more earlier when recovery start, but 
> couldn't block recovery thread when it access hardware. The r/w semaphore 
> reset_sem is used to solve these synchronization issues between recovery 
> thread and other threads.
>
> The original solution locked registers' access in lower functions, which will 
> introduce following issues:
>
> 1) many lower functions are used in both recovery thread and others. Firstly 
> we must harvest these functions, it is easy to miss someones. Secondly these 
> functions need select which lock (read lock or write lock) will be used, 
> according to the thread it is running in. If the thread context isn't 
> considered, the added lock will easily introduce deadlock. Besides that, in 
> most time, developer easily forget to add locks for new functions.
>
> 2) performance drop. More lower functions are more frequently called.
>
> 3) easily introduce false positive lockdep complaint, because write lock has 
> big range in recovery thread, but low level functions will hold read lock may 
> be protected by other locks in other threads.
>
> Therefore the new solution will try to add lock protection for ioctls of kfd. 
> Its goal is that there are no threads except for recovery thread or its 
> children (for xgmi) to access hardware when doing GPU reset and resume. So 
> refine recovery thread as the following:
>
> Step

RE: [PATCH 0/4] Refine GPU recovery sequence to enhance its stability

2021-03-18 Thread Li, Dennis
>>> Those two steps need to be exchanged or otherwise it is possible that new 
>>> delayed work items etc are started before the lock is taken.
What about adding check for adev->in_gpu_reset in work item? If exchange the 
two steps, it maybe introduce the deadlock.  For example, the user thread hold 
the read lock and waiting for the fence, if recovery thread try to hold write 
lock and then complete fences, in this case, recovery thread will always be 
blocked. 

Best Regards
Dennis Li
-Original Message-
From: Koenig, Christian  
Sent: Thursday, March 18, 2021 3:54 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Zhang, Hawking 
Subject: Re: [PATCH 0/4] Refine GPU recovery sequence to enhance its stability

Am 18.03.21 um 08:23 schrieb Dennis Li:
> We have defined two variables in_gpu_reset and reset_sem in adev object. The 
> atomic type variable in_gpu_reset is used to avoid recovery thread reenter 
> and make lower functions return more earlier when recovery start, but 
> couldn't block recovery thread when it access hardware. The r/w semaphore 
> reset_sem is used to solve these synchronization issues between recovery 
> thread and other threads.
>
> The original solution locked registers' access in lower functions, which will 
> introduce following issues:
>
> 1) many lower functions are used in both recovery thread and others. Firstly 
> we must harvest these functions, it is easy to miss someones. Secondly these 
> functions need select which lock (read lock or write lock) will be used, 
> according to the thread it is running in. If the thread context isn't 
> considered, the added lock will easily introduce deadlock. Besides that, in 
> most time, developer easily forget to add locks for new functions.
>
> 2) performance drop. More lower functions are more frequently called.
>
> 3) easily introduce false positive lockdep complaint, because write lock has 
> big range in recovery thread, but low level functions will hold read lock may 
> be protected by other locks in other threads.
>
> Therefore the new solution will try to add lock protection for ioctls of kfd. 
> Its goal is that there are no threads except for recovery thread or its 
> children (for xgmi) to access hardware when doing GPU reset and resume. So 
> refine recovery thread as the following:
>
> Step 0: atomic_cmpxchg(&adev->in_gpu_reset, 0, 1)
> 1). if failed, it means system had a recovery thread running, current 
> thread exit directly;
> 2). if success, enter recovery thread;
>
> Step 1: cancel all delay works, stop drm schedule, complete all unreceived 
> fences and so on. It try to stop or pause other threads.
>
> Step 2: call down_write(&adev->reset_sem) to hold write lock, which will 
> block recovery thread until other threads release read locks.

Those two steps need to be exchanged or otherwise it is possible that new 
delayed work items etc are started before the lock is taken.

Just to make it clear until this is fixed the whole patch set is a NAK.

Regards,
Christian.

>
> Step 3: normally, there is only recovery threads running to access hardware, 
> it is safe to do gpu reset now.
>
> Step 4: do post gpu reset, such as call all ips' resume functions;
>
> Step 5: atomic set adev->in_gpu_reset as 0, wake up other threads and release 
> write lock. Recovery thread exit normally.
>
> Other threads call the amdgpu_read_lock to synchronize with recovery thread. 
> If it finds that in_gpu_reset is 1, it should release read lock if it has 
> holden one, and then blocks itself to wait for recovery finished event. If 
> thread successfully hold read lock and in_gpu_reset is 0, it continues. It 
> will exit normally or be stopped by recovery thread in step 1.
>
> Dennis Li (4):
>drm/amdgpu: remove reset lock from low level functions
>drm/amdgpu: refine the GPU recovery sequence
>drm/amdgpu: instead of using down/up_read directly
>drm/amdkfd: add reset lock protection for kfd entry functions
>
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   6 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   |  14 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 173 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c|   8 -
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|   4 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |   9 +-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |   5 +-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |   5 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 172 -
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   3 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   4 +
>   .../amd/amdkfd/kfd_process_queue_manager.c|  17 ++
>   12 files changed, 345 insertions(+), 75 deletions(-)
>

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: capture invalid hardware access v2

2021-03-10 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,

>> We can only complete the low level hardware fences, but not the scheduler 
>> fences since those need to wait for the resubmission of the jobs and with it 
>> finishing the GPU reset.
Correct. I run a deadlock test and captured this case today.  Are there chances 
to let driver complete scheduler fences? If decide to do GPU reset, the pending 
rendering job in fact should be recreated because of vram lost. Currently, user 
application will recreate context when if driver report VRAM lost.

-Original Message-
From: Christian König 
Sent: Wednesday, March 10, 2021 9:03 PM
To: Li, Dennis ; Grodzovsky, Andrey 
; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: capture invalid hardware access v2

> I assumed that pre-reset functions could complete all fences

And exactly that's incorrect.

We can only complete the low level hardware fences, but not the scheduler 
fences since those need to wait for the resubmission of the jobs and with it 
finishing the GPU reset.

Regards,
Christian.

Am 10.03.21 um 07:40 schrieb Li, Dennis:
> [AMD Official Use Only - Internal Distribution Only]
>
>>>> But how this will help if TDR thread will start after you both took read 
>>>> lock and checked that adev->in_gpu_reset is false ? Since TDR  now takes 
>>>> write lock only after suspending HW and waiting for all fences there is 
>>>> nothing that prevents both threads (e.g IOCTL and TDR) to access registers 
>>>> concurrently.
> When read thread both took read lock and checked that adev->in_gpu_reset is 
> false, it means that recovery thread still not start, and read thread will 
> continue, it will exit normally, or exit because of fence completed by 
> pre-reset functions. I assumed that pre-reset functions could complete all 
> fences, which could make related threads exit and release read locks. About 
> this, Christian has different thinking, I am trying to understand his 
> concern.  TDR will be blocked until all read locks are released.
>
> Best Regards
> Dennis Li
> -Original Message-
> From: Grodzovsky, Andrey 
> Sent: Wednesday, March 10, 2021 1:42 PM
> To: Li, Dennis ; Christian König
> ; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: capture invalid hardware access v2
>
> But how this will help if TDR thread will start after you both took read lock 
> and checked that adev->in_gpu_reset is false ? Since TDR  now takes write 
> lock only after suspending HW and waiting for all fences there is nothing 
> that prevents both threads (e.g IOCTL and TDR) to access registers 
> concurrently.
>
> Andrey
>
> On 2021-03-09 9:59 p.m., Li, Dennis wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Hi, Andrey,
>>>>> Is the problem here that HW is suspended while some other threads that 
>>>>> rely on the read side lock still access HW ? Mostly what I am thinking 
>>>>> about are IOCTls - we can't 'wait for them to complete' but they might be 
>>>>> accessing HW when we start suspend.
>> In read side, when the reader held the read lock, it will also check whether 
>> adev->in_gpu_reset is 1, if so, it will release read clock and is waiting 
>> for recovery finish event.
>
>> Best Regards
>> Dennis Li
>>
>> -Original Message-
>> From: Grodzovsky, Andrey 
>> Sent: Wednesday, March 10, 2021 2:26 AM
>> To: Christian König ; Li, Dennis
>> ; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: capture invalid hardware access v2
>>
>> On 2021-03-09 12:47 p.m., Christian König wrote:
>>> No it won't. Accessing the hardware without the lock is ok as long
>>> as the write side isn't taken.
>> Oh, forgot about the trylock part, sorry...
>>
>>> But that approach is illegal anyway because we suspend the hardware
>>> without proper protection from concurrent access.
>> For my understanding and from looking again at his steps related to
>> this
>>
>> Step 0: atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) - [AG] protects
>> from other TDR threads
>>
>> Step 1: cancel all delay works, stop drm schedule, complete all
>> unreceived fences and so on. Call amdgpu_device_pre_asic_reset...
>> e.t.c
>> - [AG] this is the HW suspend part
>>
>> Step 2: call down_write(&adev->reset_sem) to hold write lock, which will 
>> block recovery thread until other threads release read locks.
>>
>> Is the problem here that HW is suspended while some other threads that rely 
>> on the read side

RE: [PATCH] drm/amdgpu: capture invalid hardware access v2

2021-03-09 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

>>> But how this will help if TDR thread will start after you both took read 
>>> lock and checked that adev->in_gpu_reset is false ? Since TDR  now takes 
>>> write lock only after suspending HW and waiting for all fences there is 
>>> nothing that prevents both threads (e.g IOCTL and TDR) to access registers 
>>> concurrently.
When read thread both took read lock and checked that adev->in_gpu_reset is 
false, it means that recovery thread still not start, and read thread will 
continue, it will exit normally, or exit because of fence completed by 
pre-reset functions. I assumed that pre-reset functions could complete all 
fences, which could make related threads exit and release read locks. About 
this, Christian has different thinking, I am trying to understand his concern.  
TDR will be blocked until all read locks are released.

Best Regards
Dennis Li
-Original Message-
From: Grodzovsky, Andrey 
Sent: Wednesday, March 10, 2021 1:42 PM
To: Li, Dennis ; Christian König 
; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: capture invalid hardware access v2

But how this will help if TDR thread will start after you both took read lock 
and checked that adev->in_gpu_reset is false ? Since TDR  now takes write lock 
only after suspending HW and waiting for all fences there is nothing that 
prevents both threads (e.g IOCTL and TDR) to access registers concurrently.

Andrey

On 2021-03-09 9:59 p.m., Li, Dennis wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Andrey,
>>>> Is the problem here that HW is suspended while some other threads that 
>>>> rely on the read side lock still access HW ? Mostly what I am thinking 
>>>> about are IOCTls - we can't 'wait for them to complete' but they might be 
>>>> accessing HW when we start suspend.
> In read side, when the reader held the read lock, it will also check whether 
> adev->in_gpu_reset is 1, if so, it will release read clock and is waiting for 
> recovery finish event.


>
> Best Regards
> Dennis Li
>
> -Original Message-
> From: Grodzovsky, Andrey 
> Sent: Wednesday, March 10, 2021 2:26 AM
> To: Christian König ; Li, Dennis
> ; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: capture invalid hardware access v2
>
> On 2021-03-09 12:47 p.m., Christian König wrote:
>> No it won't. Accessing the hardware without the lock is ok as long as
>> the write side isn't taken.
>
> Oh, forgot about the trylock part, sorry...
>
>>
>> But that approach is illegal anyway because we suspend the hardware
>> without proper protection from concurrent access.
>
> For my understanding and from looking again at his steps related to
> this
>
> Step 0: atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) - [AG] protects from
> other TDR threads
>
> Step 1: cancel all delay works, stop drm schedule, complete all
> unreceived fences and so on. Call amdgpu_device_pre_asic_reset...
> e.t.c
> - [AG] this is the HW suspend part
>
> Step 2: call down_write(&adev->reset_sem) to hold write lock, which will 
> block recovery thread until other threads release read locks.
>
> Is the problem here that HW is suspended while some other threads that rely 
> on the read side lock still access HW ? Mostly what I am thinking about are 
> IOCTls - we can't 'wait for them to complete' but they might be accessing HW 
> when we start suspend.
>
> Andrey
>
>
>>
>> Christian.
>>
>> Am 09.03.21 um 17:40 schrieb Andrey Grodzovsky:
>>> Because he takes the write side lock post amdgpu_pre_asic_reset -
>>> where HW suspend sequence happens (touching registers) - so i think
>>> it will assert.
>>>
>>> Andrey
>>>
>>> On 2021-03-09 7:56 a.m., Christian König wrote:
>>>> Hi Dennis,
>>>>
>>>> why do you think that this will always assert in reset thread?
>>>>
>>>> In the reset thread while we are holding the reset lock write side
>>>> lockdep_assert_held() should be satisfied and not cause any splat
>>>> in the system log.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 09.03.21 um 03:03 schrieb Li, Dennis:
>>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>>
>>>>> Hi, Christian,
>>>>>  amdgpu_device_skip_hw_access will always assert in reset
>>>>> thread, which seems not a good idea.
>>>>>
>>>>> Best Regards
>>>>> Dennis Li
&

RE: [PATCH] drm/amdgpu: capture invalid hardware access v2

2021-03-09 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,
  Because register r/w functions are also used in ISR, we need add in_irq() 
check. I updated this change in v3.

Best Regards
Dennis Li
-Original Message-
From: Christian König 
Sent: Tuesday, March 9, 2021 8:57 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey 
Subject: Re: [PATCH] drm/amdgpu: capture invalid hardware access v2

Hi Dennis,

why do you think that this will always assert in reset thread?

In the reset thread while we are holding the reset lock write side
lockdep_assert_held() should be satisfied and not cause any splat in the system 
log.

Regards,
Christian.

Am 09.03.21 um 03:03 schrieb Li, Dennis:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Christian,
> amdgpu_device_skip_hw_access will always assert in reset thread, 
> which seems not a good idea.
>
> Best Regards
> Dennis Li
> -Original Message-
> From: Christian König 
> Sent: Tuesday, March 9, 2021 2:07 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey ; Li, Dennis
> 
> Subject: [PATCH] drm/amdgpu: capture invalid hardware access v2
>
> From: Dennis Li 
>
> When recovery thread has begun GPU reset, there should be not other threads 
> to access hardware, otherwise system randomly hang.
>
> v2 (chk): rewritten from scratch, use trylock and lockdep instead of
>hand wiring the logic.
>
> Signed-off-by: Dennis Li 
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 74 +-
>   1 file changed, 57 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e247c3a2ec08..c990af6a43ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -326,6 +326,34 @@ void amdgpu_device_vram_access(struct amdgpu_device 
> *adev, loff_t pos,
>   /*
>* register access helper functions.
>*/
> +
> +/* Check if hw access should be skipped because of hotplug or device
> +error */ static bool amdgpu_device_skip_hw_access(struct
> +amdgpu_device
> +*adev) {
> +if (adev->in_pci_err_recovery)
> +return true;
> +
> +#ifdef CONFIG_LOCKDEP
> +/*
> + * This is a bit complicated to understand, so worth a comment. What
> +we assert
> + * here is that the GPU reset is not running on another thread in parallel.
> + *
> + * For this we trylock the read side of the reset semaphore, if that
> +succeeds
> + * we know that the reset is not running in paralell.
> + *
> + * If the trylock fails we assert that we are either already holding
> +the read
> + * side of the lock or are the reset thread itself and hold the write
> +side of
> + * the lock.
> + */
> +if (down_read_trylock(&adev->reset_sem))
> +up_read(&adev->reset_sem);
> +else
> +lockdep_assert_held(&adev->reset_sem);
> +#endif
> +
> +return false;
> +}
> +
>   /**
>* amdgpu_device_rreg - read a memory mapped IO or indirect register
>*
> @@ -340,7 +368,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,  {
>   uint32_t ret;
>
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if ((reg * 4) < adev->rmmio_size) {
> @@ -377,7 +405,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
>*/
>   uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
> { -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if (offset < adev->rmmio_size)
> @@ -402,7 +430,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, 
> uint32_t offset)
>*/
>   void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset,
> uint8_t value)  { -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if (offset < adev->rmmio_size)
> @@ -425,7 +453,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
>   uint32_t reg, uint32_t v,
>   uint32_t acc_flags)
>   {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if ((reg * 4) < adev->rmmio_size) {
> @@ -452,7 +480,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,  void 
> amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
>uint32_t reg, uint32_t v)
>   {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if (amdgpu_sriov_fullaccess(adev) && @@ -475,7 +503,7 @@ void
> amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
>*/
>   u32 amdgp

RE: [PATCH] drm/amdgpu: capture invalid hardware access v2

2021-03-09 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Andrey,
>>> Is the problem here that HW is suspended while some other threads that rely 
>>> on the read side lock still access HW ? Mostly what I am thinking about are 
>>> IOCTls - we can't 'wait for them to complete' but they might be accessing 
>>> HW when we start suspend.
On the read side, when the reader holds the read lock, it also checks whether
adev->in_gpu_reset is 1; if so, it releases the read lock and waits for the
recovery finish event.
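
A minimal sketch of that read-side pattern (the helper name and the wait queue
below are illustrative assumptions, not the driver's actual code):

static void amdgpu_hw_access_begin(struct amdgpu_device *adev)
{
	for (;;) {
		down_read(&adev->reset_sem);

		/* No recovery in flight: safe to touch the hardware. */
		if (!atomic_read(&adev->in_gpu_reset))
			return;

		/* Recovery pending: drop the read lock and wait for the
		 * recovery-finished event before retrying.
		 */
		up_read(&adev->reset_sem);
		wait_event(adev->recovery_done_wq,
			   !atomic_read(&adev->in_gpu_reset));
	}
}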

Best Regards
Dennis Li

-Original Message-
From: Grodzovsky, Andrey 
Sent: Wednesday, March 10, 2021 2:26 AM
To: Christian König ; Li, Dennis 
; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: capture invalid hardware access v2

On 2021-03-09 12:47 p.m., Christian König wrote:
> No it won't. Accessing the hardware without the lock is ok as long as
> the write side isn't taken.

Oh, forgot about the trylock part, sorry...

>
> But that approach is illegal anyway because we suspend the hardware
> without proper protection from concurrent access.

For my understanding, and from looking again at his steps related to this:

Step 0: atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) - [AG] protects from other 
TDR threads

Step 1: cancel all delay works, stop drm schedule, complete all unreceived 
fences and so on. Call amdgpu_device_pre_asic_reset... e.t.c
- [AG] this is the HW suspend part

Step 2: call down_write(&adev->reset_sem) to hold write lock, which will block 
recovery thread until other threads release read locks.

Is the problem here that HW is suspended while some other threads that rely on
the read side lock still access HW? Mostly what I am thinking about are IOCTLs
- we can't 'wait for them to complete', but they might be accessing HW when we
start the suspend.
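
For reference, the ordering being questioned looks roughly like the outline below
(a simplified sketch of the steps listed above; arguments and error handling are
trimmed for illustration):

static int amdgpu_gpu_recover_outline(struct amdgpu_device *adev)
{
	/* Step 0: keep other TDR threads out. */
	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
		return 0;

	/* Step 1: cancel delayed work, park schedulers, suspend HW
	 * (amdgpu_device_pre_asic_reset and friends) -- note this touches
	 * registers before the write lock below is taken.
	 */

	/* Step 2: only now exclude all read-side users. */
	down_write(&adev->reset_sem);
	/* ... actual ASIC reset ... */
	up_write(&adev->reset_sem);

	atomic_set(&adev->in_gpu_reset, 0);
	return 0;
}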

Andrey


>
> Christian.
>
>> On 09.03.21 at 17:40, Andrey Grodzovsky wrote:
>> Because he takes the write side lock post amdgpu_pre_asic_reset -
>> where HW suspend sequence happens (touching registers) - so i think
>> it will assert.
>>
>> Andrey
>>
>> On 2021-03-09 7:56 a.m., Christian König wrote:
>>> Hi Dennis,
>>>
>>> why do you think that this will always assert in reset thread?
>>>
>>> In the reset thread while we are holding the reset lock write side
>>> lockdep_assert_held() should be satisfied and not cause any splat in
>>> the system log.
>>>
>>> Regards,
>>> Christian.
>>>
>>> On 09.03.21 at 03:03, Li, Dennis wrote:
>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>
>>>> Hi, Christian,
>>>>     amdgpu_device_skip_hw_access will always assert in reset
>>>> thread, which seems not a good idea.
>>>>
>>>> Best Regards
>>>> Dennis Li
>>>> -Original Message-
>>>> From: Christian König 
>>>> Sent: Tuesday, March 9, 2021 2:07 AM
>>>> To: amd-gfx@lists.freedesktop.org
>>>> Cc: Grodzovsky, Andrey ; Li, Dennis
>>>> 
>>>> Subject: [PATCH] drm/amdgpu: capture invalid hardware access v2
>>>>
>>>> From: Dennis Li 
>>>>
>>>> When recovery thread has begun GPU reset, there should be not other
>>>> threads to access hardware, otherwise system randomly hang.
>>>>
>>>> v2 (chk): rewritten from scratch, use trylock and lockdep instead
>>>> of
>>>>hand wiring the logic.
>>>>
>>>> Signed-off-by: Dennis Li 
>>>> Signed-off-by: Christian König 
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 74
>>>> +-
>>>>   1 file changed, 57 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index e247c3a2ec08..c990af6a43ca 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -326,6 +326,34 @@ void amdgpu_device_vram_access(struct
>>>> amdgpu_device *adev, loff_t pos,
>>>>   /*
>>>>* register access helper functions.
>>>>*/
>>>> +
>>>> +/* Check if hw access should be skipped because of hotplug or
>>>> +device error */ static bool amdgpu_device_skip_hw_access(struct
>>>> +amdgpu_device
>>>> +*adev) {
>>>> +if (adev->in_pci_err_recovery)
>>>> +return true;
&g

RE: [PATCH] drm/amdgpu: capture invalid hardware access v2

2021-03-08 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,
   amdgpu_device_skip_hw_access will always assert in the reset thread, which
does not seem like a good idea.

Best Regards
Dennis Li
-Original Message-
From: Christian König 
Sent: Tuesday, March 9, 2021 2:07 AM
To: amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey ; Li, Dennis 

Subject: [PATCH] drm/amdgpu: capture invalid hardware access v2

From: Dennis Li 

When the recovery thread has begun GPU reset, there should be no other threads
accessing the hardware; otherwise the system randomly hangs.

v2 (chk): rewritten from scratch, use trylock and lockdep instead of
  hand wiring the logic.

Signed-off-by: Dennis Li 
Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 74 +-
 1 file changed, 57 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..c990af6a43ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -326,6 +326,34 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, 
loff_t pos,
 /*
  * register access helper functions.
  */
+
+/* Check if hw access should be skipped because of hotplug or device
+error */ static bool amdgpu_device_skip_hw_access(struct amdgpu_device
+*adev) {
+if (adev->in_pci_err_recovery)
+return true;
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * This is a bit complicated to understand, so worth a comment. What we assert
+ * here is that the GPU reset is not running on another thread in parallel.
+ *
+ * For this we trylock the read side of the reset semaphore, if that succeeds
+ * we know that the reset is not running in parallel.
+ *
+ * If the trylock fails we assert that we are either already holding the read
+ * side of the lock or are the reset thread itself and hold the write side of
+ * the lock.
+ */
+if (down_read_trylock(&adev->reset_sem))
+up_read(&adev->reset_sem);
+else
+lockdep_assert_held(&adev->reset_sem);
+#endif
+
+return false;
+}
+
 /**
  * amdgpu_device_rreg - read a memory mapped IO or indirect register
  *
@@ -340,7 +368,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,  {
 uint32_t ret;

-if (adev->in_pci_err_recovery)
+if (amdgpu_device_skip_hw_access(adev))
 return 0;

 if ((reg * 4) < adev->rmmio_size) {
@@ -377,7 +405,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
  */
 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)  {
-if (adev->in_pci_err_recovery)
+if (amdgpu_device_skip_hw_access(adev))
 return 0;

 if (offset < adev->rmmio_size)
@@ -402,7 +430,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, 
uint32_t offset)
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t 
value)  {
-if (adev->in_pci_err_recovery)
+if (amdgpu_device_skip_hw_access(adev))
 return;

 if (offset < adev->rmmio_size)
@@ -425,7 +453,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 uint32_t reg, uint32_t v,
 uint32_t acc_flags)
 {
-if (adev->in_pci_err_recovery)
+if (amdgpu_device_skip_hw_access(adev))
 return;

 if ((reg * 4) < adev->rmmio_size) {
@@ -452,7 +480,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,  void 
amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
  uint32_t reg, uint32_t v)
 {
-if (adev->in_pci_err_recovery)
+if (amdgpu_device_skip_hw_access(adev))
 return;

 if (amdgpu_sriov_fullaccess(adev) &&
@@ -475,7 +503,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
  */
 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)  {
-if (adev->in_pci_err_recovery)
+if (amdgpu_device_skip_hw_access(adev))
 return 0;

 if ((reg * 4) < adev->rio_mem_size)
@@ -497,7 +525,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)  {
-if (adev->in_pci_err_recovery)
+if (amdgpu_device_skip_hw_access(adev))
 return;

 if ((reg * 4) < adev->rio_mem_size)
@@ -519,7 +547,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, 
u32 v)
  */
 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)  {
-if (adev->in_pci_err_recovery)
+if (amdgpu_device_skip_hw_access(adev))
 return 0;

 if (index < adev->doorbell.num_doorbells) { @@ -542,7 +570,7 @@ u32 
amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)  {
-if (adev->in_pci_err_recovery)
+if (amdgpu_device_skip_hw_access(adev))
 return;

 if (index < adev->doorbell.num_doorbells) { @@ -563,7 +591,7 @@ void 
amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
  */
 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)  {
-if (adev->in_pci_err_recovery)
+if (amdgpu_device_skip_hw_access(adev))
 return 0;

 if (index < adev->doorbell.num_doorbells) { @@

RE: [PATCH] drm/amdgpu: remove unnecessary reading for eeprom header

2021-02-25 Thread Li, Dennis
Hi, Hawking,
  Agreed with your suggestion; it could further simplify our code. I will
refactor them again.

Best Regards
Dennis Li
-Original Message-
From: Zhang, Hawking  
Sent: Friday, February 26, 2021 12:30 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Chen, Guchun 
; Koenig, Christian 
Cc: Li, Dennis 
Subject: RE: [PATCH] drm/amdgpu: remove unnecessary reading for eeprom header

[AMD Public Use]

What about merge this function with amdgpu_ras_check_err_threshold?

Regards,
Hawking

-Original Message-
From: Dennis Li  
Sent: Friday, February 26, 2021 09:26
To: amd-gfx@lists.freedesktop.org; Chen, Guchun ; Zhang, 
Hawking ; Koenig, Christian 
Cc: Li, Dennis 
Subject: [PATCH] drm/amdgpu: remove unnecessary reading for eeprom header

If the number of bad page records exceeds the threshold, the driver has already
updated both the eeprom header and control->tbl_hdr.header before GPU reset;
therefore the GPU recovery thread does not need to read the eeprom header directly.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 19d9aa76cfbf..4310ad63890c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -439,41 +439,19 @@ int amdgpu_ras_eeprom_check_err_threshold(
bool *exceed_err_limit)
 {
struct amdgpu_device *adev = to_amdgpu_device(control);
-   unsigned char buff[EEPROM_ADDRESS_SIZE +
-   EEPROM_TABLE_HEADER_SIZE] = { 0 };
-   struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
-   struct i2c_msg msg = {
-   .addr = control->i2c_address,
-   .flags = I2C_M_RD,
-   .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
-   .buf = buff,
-   };
-   int ret;
 
*exceed_err_limit = false;
 
if (!__is_ras_eeprom_supported(adev))
return 0;
 
-   /* read EEPROM table header */
-   mutex_lock(&control->tbl_mutex);
-   ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
-   if (ret < 1) {
-   dev_err(adev->dev, "Failed to read EEPROM table header.\n");
-   goto err;
-   }
-
-   __decode_table_header_from_buff(hdr, &buff[2]);
-
-   if (hdr->header == EEPROM_TABLE_HDR_BAD) {
+   if (control->tbl_hdr.header == EEPROM_TABLE_HDR_BAD) {
dev_warn(adev->dev, "This GPU is in BAD status.");
dev_warn(adev->dev, "Please retire it or setting one bigger "
"threshold value when reloading driver.\n");
*exceed_err_limit = true;
}
 
-err:
-   mutex_unlock(&control->tbl_mutex);
return 0;
 }
 
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: protect eeprom update from GPU reset

2020-10-14 Thread Li, Dennis
[AMD Public Use]

Hi, Hawking,
  The driver has multiple paths into GPU reset, so it can't guarantee that the
bad page record update has been done before GPU reset.

Best Regards
Dennis Li
-Original Message-
From: Zhang, Hawking  
Sent: Wednesday, October 14, 2020 5:52 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Koenig, Christian 
Cc: Li, Dennis 
Subject: RE: [PATCH] drm/amdgpu: protect eeprom update from GPU reset

[AMD Public Use]

Hmm, I think the bad page record update is done ahead of scheduling the gpu reset
work. For the mGPU case, shall we walk through all the nodes in a hive before
issuing the gpu reset work?
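
A rough sketch of that suggestion follows; the flush helper below is an assumed
name for illustration, while the hive walk itself mirrors how the driver already
iterates devices in an XGMI hive:

static void amdgpu_hive_flush_bad_pages(struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev;

	/* Visit every node in the hive and push its pending bad page
	 * records to eeprom before the reset work is scheduled.
	 */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
		amdgpu_ras_flush_bad_pages(tmp_adev);   /* assumed helper */
}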

Regards,
Hawking

-Original Message-
From: Dennis Li  
Sent: Wednesday, October 14, 2020 17:41
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Kuehling, Felix ; Zhang, 
Hawking ; Koenig, Christian 
Cc: Li, Dennis 
Subject: [PATCH] drm/amdgpu: protect eeprom update from GPU reset

Because i2c is unstable during GPU reset, the driver needs to protect eeprom
updates from GPU reset, so as not to miss any bad page record.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 0e64c39a2372..695bcfc5c983 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -149,7 +149,11 @@ static int __update_table_header(struct 
amdgpu_ras_eeprom_control *control,
 
msg.addr = control->i2c_address;
 
+   /* i2c may be unstable in gpu reset */
+   down_read(&adev->reset_sem);
ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
+   up_read(&adev->reset_sem);
+
if (ret < 1)
DRM_ERROR("Failed to write EEPROM table header, ret:%d", ret);
 
@@ -557,7 +561,11 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
control->next_addr += EEPROM_TABLE_RECORD_SIZE;
}
 
+   /* i2c may be unstable in gpu reset */
+   down_read(&adev->reset_sem);
ret = i2c_transfer(&adev->pm.smu_i2c, msgs, num);
+   up_read(&adev->reset_sem);
+
if (ret < 1) {
DRM_ERROR("Failed to process EEPROM table records, ret:%d", 
ret);
 
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: clean up ras sysfs creation (v2)

2020-09-24 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]


Reviewed-by: Dennis Li 

-Original Message-
From: Chen, Guchun  
Sent: Thursday, September 24, 2020 10:52 PM
To: amd-gfx@lists.freedesktop.org; Koenig, Christian 
; Zhang, Hawking ; Li, Dennis 
; Zhou1, Tao ; Clements, John 
; Deucher, Alexander ; Lazar, 
Lijo 
Cc: Chen, Guchun 
Subject: [PATCH] drm/amdgpu: clean up ras sysfs creation (v2)

Merge ras sysfs creation together by calling sysfs_create_group once, as 
sysfs_update_group may not work properly as expected.

v2: improve commit message

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 87 +
 1 file changed, 31 insertions(+), 56 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e5ea14774c0c..6c57521b21fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1027,58 +1027,6 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct 
device *dev,
return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", 
con->features);  }
 
-static void amdgpu_ras_sysfs_add_bad_page_node(struct amdgpu_device *adev) -{
-   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-   struct attribute_group group;
-   struct bin_attribute *bin_attrs[] = {
-   &con->badpages_attr,
-   NULL,
-   };
-
-   con->badpages_attr = (struct bin_attribute) {
-   .attr = {
-   .name = "gpu_vram_bad_pages",
-   .mode = S_IRUGO,
-   },
-   .size = 0,
-   .private = NULL,
-   .read = amdgpu_ras_sysfs_badpages_read,
-   };
-
-   group.name = RAS_FS_NAME;
-   group.bin_attrs = bin_attrs;
-
-   sysfs_bin_attr_init(bin_attrs[0]);
-
-   sysfs_update_group(&adev->dev->kobj, &group);
-}
-
-static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) -{
-   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-   struct attribute *attrs[] = {
-   &con->features_attr.attr,
-   NULL
-   };
-   struct attribute_group group = {
-   .name = RAS_FS_NAME,
-   .attrs = attrs,
-   };
-
-   con->features_attr = (struct device_attribute) {
-   .attr = {
-   .name = "features",
-   .mode = S_IRUGO,
-   },
-   .show = amdgpu_ras_sysfs_features_read,
-   };
-
-   sysfs_attr_init(attrs[0]);
-
-   return sysfs_create_group(&adev->dev->kobj, &group);
-}
-
 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)  
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1300,13 
+1248,40 @@ static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device 
*adev)
 /* debugfs end */
 
 /* ras fs */
-
+static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
+   amdgpu_ras_sysfs_badpages_read, NULL, 0); static 
+DEVICE_ATTR(features, S_IRUGO,
+   amdgpu_ras_sysfs_features_read, NULL);
 static int amdgpu_ras_fs_init(struct amdgpu_device *adev)  {
-   amdgpu_ras_sysfs_create_feature_node(adev);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct attribute_group group = {
+   .name = RAS_FS_NAME,
+   };
+   struct attribute *attrs[] = {
+   &con->features_attr.attr,
+   NULL
+   };
+   struct bin_attribute *bin_attrs[] = {
+   NULL,
+   NULL,
+   };
 
-   if (amdgpu_bad_page_threshold != 0)
-   amdgpu_ras_sysfs_add_bad_page_node(adev);
+   /* add features entry */
+   con->features_attr = dev_attr_features;
+   group.attrs = attrs;
+   sysfs_attr_init(attrs[0]);
+
+   if (amdgpu_bad_page_threshold != 0) {
+   /* add bad_page_features entry */
+   bin_attr_gpu_vram_bad_pages.private = NULL;
+   con->badpages_attr = bin_attr_gpu_vram_bad_pages;
+   bin_attrs[0] = &con->badpages_attr;
+   group.bin_attrs = bin_attrs;
+   sysfs_bin_attr_init(bin_attrs[0]);
+   }
+
+   sysfs_create_group(&adev->dev->kobj, &group);
 
return 0;
 }
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH v4 1/8] drm/amdgpu: Avoid accessing HW when suspending SW state

2020-09-02 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, andrey

Did you want to use adev->in_pci_err_recovery to prevent the hardware from being
accessed by other threads while doing PCI recovery? If so, it is better to use a
lock to protect them. This patch can't solve your issue completely.

Best Regards
Dennis Li
-Original Message-
From: Andrey Grodzovsky  
Sent: Thursday, September 3, 2020 2:42 AM
To: amd-gfx@lists.freedesktop.org; sathyanarayanan.kuppusw...@linux.intel.com; 
linux-...@vger.kernel.org
Cc: Deucher, Alexander ; Das, Nirmoy 
; Li, Dennis ; Koenig, Christian 
; Tuikov, Luben ; 
bhelg...@google.com; Grodzovsky, Andrey 
Subject: [PATCH v4 1/8] drm/amdgpu: Avoid accessing HW when suspending SW state

At this point the ASIC is already post reset by the HW/PSP so the HW not in 
proper state to be configured for suspension, some blocks might be even gated 
and so best is to avoid touching it.

v2: Rename in_dpc to more meaningful name

Signed-off-by: Andrey Grodzovsky 
Reviewed-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c|  6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c|  6 +
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 18 --
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c |  3 +++
 6 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c311a3c..b20354f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -992,6 +992,7 @@ struct amdgpu_device {
atomic_tthrottling_logging_enabled;
struct ratelimit_state  throttling_logging_rs;
uint32_tras_features;
+   boolin_pci_err_recovery;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff 
--git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 74a1c03..1fbf8a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, 
uint32_t reg,  {
uint32_t ret;
 
+   if (adev->in_pci_err_recovery)
+   return 0;
+
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
return amdgpu_kiq_rreg(adev, reg);
 
@@ -351,6 +354,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, 
uint32_t reg,
  * Returns the 8 bit value from the offset specified.
  */
 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
+   if (adev->in_pci_err_recovery)
+   return 0;
+
if (offset < adev->rmmio_size)
return (readb(adev->rmmio + offset));
BUG();
@@ -372,6 +378,9 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, 
uint32_t offset) {
  * Writes the value specified to the offset specified.
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t 
value) {
+   if (adev->in_pci_err_recovery)
+   return;
+
if (offset < adev->rmmio_size)
writeb(value, adev->rmmio + offset);
else
@@ -382,6 +391,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device 
*adev,
   uint32_t reg, uint32_t v,
   uint32_t acc_flags)
 {
+   if (adev->in_pci_err_recovery)
+   return;
+
trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
 
if ((reg * 4) < adev->rmmio_size)
@@ -409,6 +421,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device 
*adev,  void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t 
v,
uint32_t acc_flags)
 {
+   if (adev->in_pci_err_recovery)
+   return;
+
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
return amdgpu_kiq_wreg(adev, reg, v);
 
@@ -423,6 +438,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t 
reg, uint32_t v,  void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 
uint32_t reg, uint32_t v,
uint32_t acc_flags)
 {
+   if (adev->in_pci_err_recovery)
+   return;
+
if (amdgpu_sriov_fullaccess(adev) &&
adev->gfx.rlc.funcs &&
adev->gfx.rlc.funcs->is_rlcg_access_range) { @@ -444,6 +462,9 
@@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, 
uint32_t
  */
 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)  {
+   if (adev->in_pci_err_recovery)
+   return 0;
+
if ((reg * 4) < adev->rio_mem_size)
return ioread32(ade

RE: [PATCH] drm/kfd: fix a system crash issue during GPU recovery

2020-09-01 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Felix,

>>>The failure to execute_queues should probably not be reported to the caller 
>>>of create_queue, because the queue was already created, and the problem with 
>>>execute_queues has a much bigger scope than this one caller. So I think the 
>>>correct solution is to ignore the return value from 
>>>execute_queues.

Got it. I have created a patch v2 according to your suggestion. 

>>>As a follow up, we should probably handle all the error scenarios inside 
>>>execute_queues and make it a void function. Failure to unmap queues already 
>>>triggers a GPU reset, so nothing new needs to be done for that. 
>>>But we need to add handling of failures to map queues. It doesn't require a 
>>>GPU reset, because the problem is in the kernel (e.g. out of memory), not 
>>>the GPU. The best we can do is report this asynchronously as a GPU hang to 
>>>all KFD processes, so they know the GPU is no longer going to work 
>>>for them.

Understood.  I will follow up this issue and prepare a solution to discuss with 
you. 

Best Regards
Dennis Li

-Original Message-
From: Kuehling, Felix  
Sent: Wednesday, September 2, 2020 11:26 AM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhang, Hawking ; 
Koenig, Christian 
Subject: Re: [PATCH] drm/kfd: fix a system crash issue during GPU recovery

On 2020-09-01 11:21 a.m., Li, Dennis wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Felix,
>   If GPU hang, execute_queues_cpsch will fail to unmap or map queues and 
> then create_queue_cpsch will return error. If pqm_create_queue find 
> create_queue_cpsch failed, it will call uninit_queue to free queue object. 
> However this queue object has been added in to qpd->queues_list in the old 
> code.

Right, that's a problem. I think the intention here is to keep going because a 
failure to execute the runlist affects not just the queue that was just 
created, but all queues in all processes.

The failure to execute_queues should probably not be reported to the caller of 
create_queue, because the queue was already created, and the problem with 
execute_queues has a much bigger scope than this one caller. So I think the 
correct solution is to ignore the return value from execute_queues.

As a follow up, we should probably handle all the error scenarios inside 
execute_queues and make it a void function. Failure to unmap queues already 
triggers a GPU reset, so nothing new needs to be done for that. 
But we need to add handling of failures to map queues. It doesn't require a GPU 
reset, because the problem is in the kernel (e.g. out of memory), not the GPU. 
The best we can do is report this asynchronously as a GPU hang to all KFD 
processes, so they know the GPU is no longer going to work for them.

Regards,
   Felix

>
> Best Regards
> Dennis Li
>
> -Original Message-
> From: Kuehling, Felix 
> Sent: Tuesday, September 1, 2020 9:26 PM
> To: Li, Dennis ; amd-gfx@lists.freedesktop.org; 
> Deucher, Alexander ; Zhang, Hawking 
> ; Koenig, Christian 
> Subject: Re: [PATCH] drm/kfd: fix a system crash issue during GPU 
> recovery
>
> I'm not sure how the bug you're fixing is caused, but your fix is clearly in 
> the wrong place.
>
> A queue being disabled is not the same thing as a queue being destroyed.
> Queues can be disabled for legitimate reasons, but they still should exist 
> and be in the qpd->queues_list.
>
> If a destroyed queue is left on the qpd->queues_list, that would be a 
> problem. Can you point out where such a thing is happening?
>
> Thanks,
>    Felix
>
>
On 2020-08-31 at 9:36 p.m., Dennis Li wrote:
>> The crash log as the below:
>>
>> [Thu Aug 20 23:18:14 2020] general protection fault:  [#1] SMP NOPTI
>> [Thu Aug 20 23:18:14 2020] CPU: 152 PID: 1837 Comm: kworker/152:1 Tainted: G 
>>   OE 5.4.0-42-generic #46~18.04.1-Ubuntu
>> [Thu Aug 20 23:18:14 2020] Hardware name: GIGABYTE 
>> G482-Z53-YF/MZ52-G40-00, BIOS R12 05/13/2020 [Thu Aug 20 23:18:14 
>> 2020] Workqueue: events amdgpu_ras_do_recovery [amdgpu] [Thu Aug 20
>> 23:18:14 2020] RIP: 0010:evict_process_queues_cpsch+0xc9/0x130
>> [amdgpu] [Thu Aug 20 23:18:14 2020] Code: 49 8d 4d 10 48 39 c8 75 21 
>> eb 44 83 fa 03 74 36 80 78 72 00 74 0c 83 ab 68 01 00 00 01 41 c6 45
>> 41 00 48 8b 00 48 39 c8 74 25 <80> 78 70 00 c6 40 6d 01 74 ee 8b 50 
>> 28
>> c6 40 70 00 83 ab 60 01 00 [Thu Aug 20 23:18:14 2020] RSP:
>> 0018:b29b52f6fc90 EFLAGS: 00010213 [Thu Aug 20 23:18:14 2020] RAX:
>> 1c884edb0a118914 RBX: 8a0d45ff3c00 RCX: 8a2d83e41038 [Thu Aug

RE: [PATCH] drm/kfd: fix a system crash issue during GPU recovery

2020-09-01 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Felix,
 If the GPU hangs, execute_queues_cpsch will fail to unmap or map queues and
then create_queue_cpsch will return an error. If pqm_create_queue finds that
create_queue_cpsch failed, it will call uninit_queue to free the queue object.
However, this queue object has already been added to qpd->queues_list in the
old code.

Best Regards
Dennis Li

-Original Message-
From: Kuehling, Felix  
Sent: Tuesday, September 1, 2020 9:26 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhang, Hawking ; 
Koenig, Christian 
Subject: Re: [PATCH] drm/kfd: fix a system crash issue during GPU recovery

I'm not sure how the bug you're fixing is caused, but your fix is clearly in 
the wrong place.

A queue being disabled is not the same thing as a queue being destroyed.
Queues can be disabled for legitimate reasons, but they still should exist and 
be in the qpd->queues_list.

If a destroyed queue is left on the qpd->queues_list, that would be a problem. 
Can you point out where such a thing is happening?

Thanks,
  Felix


On 2020-08-31 at 9:36 p.m., Dennis Li wrote:
> The crash log as the below:
>
> [Thu Aug 20 23:18:14 2020] general protection fault:  [#1] SMP NOPTI
> [Thu Aug 20 23:18:14 2020] CPU: 152 PID: 1837 Comm: kworker/152:1 Tainted: G  
>  OE 5.4.0-42-generic #46~18.04.1-Ubuntu
> [Thu Aug 20 23:18:14 2020] Hardware name: GIGABYTE 
> G482-Z53-YF/MZ52-G40-00, BIOS R12 05/13/2020 [Thu Aug 20 23:18:14 
> 2020] Workqueue: events amdgpu_ras_do_recovery [amdgpu] [Thu Aug 20 
> 23:18:14 2020] RIP: 0010:evict_process_queues_cpsch+0xc9/0x130 
> [amdgpu] [Thu Aug 20 23:18:14 2020] Code: 49 8d 4d 10 48 39 c8 75 21 
> eb 44 83 fa 03 74 36 80 78 72 00 74 0c 83 ab 68 01 00 00 01 41 c6 45 
> 41 00 48 8b 00 48 39 c8 74 25 <80> 78 70 00 c6 40 6d 01 74 ee 8b 50 28 
> c6 40 70 00 83 ab 60 01 00 [Thu Aug 20 23:18:14 2020] RSP: 
> 0018:b29b52f6fc90 EFLAGS: 00010213 [Thu Aug 20 23:18:14 2020] RAX: 
> 1c884edb0a118914 RBX: 8a0d45ff3c00 RCX: 8a2d83e41038 [Thu Aug 
> 20 23:18:14 2020] RDX:  RSI: 0082 RDI: 
> 8a0e2e4178c0 [Thu Aug 20 23:18:14 2020] RBP: b29b52f6fcb0 R08: 
> 1b64 R09: 0004 [Thu Aug 20 23:18:14 2020] R10: 
> b29b52f6fb78 R11: 0001 R12: 8a0d45ff3d28 [Thu Aug 20 
> 23:18:14 2020] R13: 8a2d83e41028 R14:  R15: 
>  [Thu Aug 20 23:18:14 2020] FS:  () 
> GS:8a0e2e40() knlGS: [Thu Aug 20 23:18:14 2020] 
> CS:  0010 DS:  ES:  CR0: 80050033 [Thu Aug 20 23:18:14 2020] 
> CR2: 55c783c0e6a8 CR3: 0034a1284000 CR4: 00340ee0 [Thu Aug 20 
> 23:18:14 2020] Call Trace:
> [Thu Aug 20 23:18:14 2020]  kfd_process_evict_queues+0x43/0xd0 
> [amdgpu] [Thu Aug 20 23:18:14 2020]  
> kfd_suspend_all_processes+0x60/0xf0 [amdgpu] [Thu Aug 20 23:18:14 
> 2020]  kgd2kfd_suspend.part.7+0x43/0x50 [amdgpu] [Thu Aug 20 23:18:14 
> 2020]  kgd2kfd_pre_reset+0x46/0x60 [amdgpu] [Thu Aug 20 23:18:14 2020]  
> amdgpu_amdkfd_pre_reset+0x1a/0x20 [amdgpu] [Thu Aug 20 23:18:14 2020]  
> amdgpu_device_gpu_recover+0x377/0xf90 [amdgpu] [Thu Aug 20 23:18:14 
> 2020]  ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu] [Thu Aug 20 
> 23:18:14 2020]  amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [Thu Aug 
> 20 23:18:14 2020]  process_one_work+0x20f/0x400 [Thu Aug 20 23:18:14 
> 2020]  worker_thread+0x34/0x410
>
> When the GPU hangs, a user process will fail to create a compute queue whose
> struct object will be freed later, but the driver wrongly adds this queue
> to the queue list of the process. Then kfd_process_evict_queues will
> access freed memory, which causes a system crash.
>
> Signed-off-by: Dennis Li 
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 560adc57a050..d5e6b07ffb27 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1296,16 +1296,18 @@ static int create_queue_cpsch(struct 
> device_queue_manager *dqm, struct queue *q,
>   mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
>   &q->gart_mqd_addr, &q->properties);
>  
> - list_add(&q->list, &qpd->queues_list);
> - qpd->queue_count++;
> -
>   if (q->properties.is_active) {
>   increment_queue_count(dqm, q->properties.type);
>  
>   retval = execute_queues_cpsch(dqm,
>   KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> + if (retval)
> + goto out_execute_cps

RE: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery

2020-08-31 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Andrey,

RE- Isn't adev->reset_sem non-recursive ? How this works when you try to access 
registers from within GPU reset thread while adev->reset_sem is already write 
locked from amdgpu_device_lock_adev earlier in the same thread ?

Deli: down_read_trylock will fail in this case; it returns false immediately and
will not lock adev->reset_sem. In the GPU reset thread, we should use MMIO instead
of KIQ to access registers.
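
Condensed, that is the pattern in the hunks quoted further down; the read path
behaves like this sketch (a fragment of the register accessor, not a standalone
function):

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
	    amdgpu_sriov_runtime(adev) &&
	    down_read_trylock(&adev->reset_sem)) {
		/* No reset in flight: safe to go through KIQ. */
		ret = amdgpu_kiq_rreg(adev, reg);
		up_read(&adev->reset_sem);
		return ret;
	}

	/* Reset thread holds reset_sem for writing (or not SR-IOV runtime):
	 * fall back to direct MMIO.
	 */
	ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));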

Best Regards
Dennis Li
-Original Message-
From: Grodzovsky, Andrey  
Sent: Tuesday, September 1, 2020 9:40 AM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Zhang, Hawking ; Koenig, 
Christian 
Subject: Re: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery


On 8/31/20 9:17 PM, Dennis Li wrote:
> When the GPU is in reset, its status isn't stable and the ring buffer also
> needs to be reset when resuming. Therefore the driver should protect the GPU
> recovery thread from the ring buffer being accessed by other threads. Otherwise
> the GPU will randomly hang during recovery.
>
> Signed-off-by: Dennis Li 
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 172dc47b7f39..8db56a22cd1b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, 
> uint32_t reg,
>   {
>   uint32_t ret;
>   
> - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> - return amdgpu_kiq_rreg(adev, reg);
> + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> + amdgpu_sriov_runtime(adev) &&
> + down_read_trylock(&adev->reset_sem)) {
> + ret = amdgpu_kiq_rreg(adev, reg);
> + up_read(&adev->reset_sem);
> + return ret;
> + }


Isn't adev->reset_sem non-recursive? How does this work when you try to access
registers from within the GPU reset thread while adev->reset_sem is already write
locked from amdgpu_device_lock_adev earlier in the same thread?

Andrey


>   
>   if ((reg * 4) < adev->rmmio_size)
>   ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); @@ 
> -332,6 
> +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>   ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
>   spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
>   }
> +
>   trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
>   return ret;
>   }
> @@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct 
> amdgpu_device *adev, uint32_t reg,
>   void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>   uint32_t acc_flags)
>   {
> - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> - return amdgpu_kiq_wreg(adev, reg, v);
> + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> + amdgpu_sriov_runtime(adev) &&
> + down_read_trylock(&adev->reset_sem)) {
> + amdgpu_kiq_wreg(adev, reg, v);
> + up_read(&adev->reset_sem);
> + return;
> + }
>   
>   amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index ad9ad622ccce..4ea2a065daa9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device 
> *adev, uint32_t vmid,
>*/
>   if (adev->gfx.kiq.ring.sched.ready &&
>   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> - !amdgpu_in_reset(adev)) {
> + down_read_trylock(&adev->reset_sem)) {
>   
>   struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>   const unsigned eng = 17;
> @@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct 
> amdgpu_device *adev, uint32_t vmid,
>   
>   amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>   1 << vmid);
> +
> + up_read(&adev->reset_sem);
>   return;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index e1a0ae327cf5..33b7cf1c79ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -501,12 +501,13 @@

RE: [PATCH 1/7] drm/amdgpu: Implement DPC recovery

2020-08-26 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Andrey,
 I found that the sequence of amdgpu_pci_slot_reset is mostly similar to
amdgpu_do_asic_reset. Could you help us refactor them to reuse more code?
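
Purely as an illustration of that refactor (the helper name below is an
assumption; the phase calls only mirror what both paths already do today):

static int amdgpu_device_asic_resume_common(struct amdgpu_device *adev)
{
	int r;

	/* Common resume sequence that could be shared by
	 * amdgpu_do_asic_reset and the DPC slot-reset path.
	 */
	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	return amdgpu_device_ip_resume_phase2(adev);
}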

Best Regards
Dennis Li
-Original Message-
From: amd-gfx  On Behalf Of Andrey 
Grodzovsky
Sent: Wednesday, August 26, 2020 10:46 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Grodzovsky, Andrey 
; Das, Nirmoy 
Subject: [PATCH 1/7] drm/amdgpu: Implement DPC recovery

Add DPC handlers with basic recovery functionality.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|   9 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 181 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|   9 +-
 3 files changed, 196 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 49ea9fa..3399242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -49,6 +49,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -1263,6 +1265,13 @@ static inline int amdgpu_dm_display_resume(struct 
amdgpu_device *adev) { return  void amdgpu_register_gpu_instance(struct 
amdgpu_device *adev);  void amdgpu_unregister_gpu_instance(struct amdgpu_device 
*adev);
 
+pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev,
+  pci_channel_state_t state);
+pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev); 
+pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev); void 
+amdgpu_pci_resume(struct pci_dev *pdev);
+
+
 #include "amdgpu_object.h"
 
 /* used by df_v3_6.c and amdgpu_pmu.c */ diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5a948ed..84f8d14 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -350,7 +350,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, 
uint32_t reg,
  *
  * Returns the 8 bit value from the offset specified.
  */
-uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
+uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
+
if (offset < adev->rmmio_size)
return (readb(adev->rmmio + offset));
BUG();
@@ -371,7 +373,9 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, 
uint32_t offset) {
  *
  * Writes the value specified to the offset specified.
  */
-void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t 
value) {
+void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, 
+uint8_t value) {
+
if (offset < adev->rmmio_size)
writeb(value, adev->rmmio + offset);
else
@@ -380,6 +384,7 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t 
offset, uint8_t value)
 
 void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t 
reg, uint32_t v, uint32_t acc_flags)  {
+
trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
 
if ((reg * 4) < adev->rmmio_size)
@@ -407,6 +412,7 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device 
*adev, uint32_t reg,  void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t 
reg, uint32_t v,
uint32_t acc_flags)
 {
+
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
return amdgpu_kiq_wreg(adev, reg, v);
 
@@ -461,6 +467,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)  {
+
if ((reg * 4) < adev->rio_mem_size)
iowrite32(v, adev->rio_mem + (reg * 4));
else {
@@ -480,6 +487,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, 
u32 v)
  */
 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)  {
+
if (index < adev->doorbell.num_doorbells) {
return readl(adev->doorbell.ptr + index);
} else {
@@ -500,6 +508,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 
index)
  */
 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)  {
+
if (index < adev->doorbell.num_doorbells) {
writel(v, adev->doorbell.ptr + index);
} else {
@@ -518,6 +527,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 
index, u32 v)
  */
 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)  {
+
if (index < adev->doorbell.num_doorbells) {
return atomic64_read((atomic64_t *)(adev->doorbell.ptr + 
index));
} else {
@@ -538,6 +548,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 
index)
  */
 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)  {
+
if (index < adev->doorbell.num_doorbells) {
atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
} else {
@@

RE: [PATCH] drm/amdgpu: correct SE number for arcturus gfx ras

2020-08-26 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]


Reviewed-by: Dennis Li 

-Original Message-
From: Chen, Guchun  
Sent: Wednesday, August 26, 2020 3:53 PM
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; Li, 
Dennis ; Zhou1, Tao 
Cc: Chen, Guchun 
Subject: [PATCH] drm/amdgpu: correct SE number for arcturus gfx ras

Arcturus GFX has 8 SEs and 16 CUs per SE, so when resetting EDC related
registers, all CUs need to be visited; otherwise, garbage data from the EDC
registers of the missed SEs would be present.
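
Illustratively, the iteration implied by the fix looks like the fragment below;
only the 8 x 16 bounds come from this patch, while the select/read calls are
simplified stand-ins for what the EDC reset path does:

	int se, inst;

	for (se = 0; se < 8; se++) {
		for (inst = 0; inst < 16; inst++) {
			/* Select one instance on one SE, then read the EDC
			 * counter to latch/clear it.
			 */
			amdgpu_gfx_select_se_sh(adev, se, 0xffffffff, inst);
			RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_EDC_CNT));
		}
	}
	/* Restore broadcast mode. */
	amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);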

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
index 46351db36922..bd85aed3523a 100755
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
@@ -57,10 +57,10 @@ static const struct soc15_reg_entry 
gfx_v9_4_edc_counter_regs[] = {
/* SPI */
{ SOC15_REG_ENTRY(GC, 0, mmSPI_EDC_CNT), 0, 4, 1 },
/* SQ */
-   { SOC15_REG_ENTRY(GC, 0, mmSQ_EDC_CNT), 0, 4, 16 },
-   { SOC15_REG_ENTRY(GC, 0, mmSQ_EDC_DED_CNT), 0, 4, 16 },
-   { SOC15_REG_ENTRY(GC, 0, mmSQ_EDC_INFO), 0, 4, 16 },
-   { SOC15_REG_ENTRY(GC, 0, mmSQ_EDC_SEC_CNT), 0, 4, 16 },
+   { SOC15_REG_ENTRY(GC, 0, mmSQ_EDC_CNT), 0, 8, 16 },
+   { SOC15_REG_ENTRY(GC, 0, mmSQ_EDC_DED_CNT), 0, 8, 16 },
+   { SOC15_REG_ENTRY(GC, 0, mmSQ_EDC_INFO), 0, 8, 16 },
+   { SOC15_REG_ENTRY(GC, 0, mmSQ_EDC_SEC_CNT), 0, 8, 16 },
/* SQC */
{ SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT), 0, 4, 6 },
{ SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT2), 0, 4, 6 },
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH v2] drm/amdgpu: change reset lock from mutex to rw_semaphore

2020-08-20 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Monk,
 Got it, thanks for your explanation.

Best Regards
Dennis Li
-Original Message-
From: Liu, Monk 
Sent: Friday, August 21, 2020 11:14 AM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Zhang, Hawking ; Koenig, 
Christian 
Subject: RE: [PATCH v2] drm/amdgpu: change reset lock from mutex to rw_semaphore

[AMD Official Use Only - Internal Distribution Only]

>>Locked = down_read_trylock(&adev->reset_sem);
>>If (!locked)
>>Return;
>>atomic_set(&adev->in_gpu_reset, 1);

[Dennis Li] why do we need to set adev->in_gpu_reset to 1 here? It should be set
when doing GPU recovery.


[ML] because "in_gpu_reset" means GPU is under reset or FLR (VF FLR actually)

If we get the reset_sem in flr_work routine, that means Host side is doing the 
VF FLR (flr_work is initiated from host side through an interrupt to guest)

Since the host side is doing the VF FLR, we want to occupy the GPU by 1) taking
the reset_sem first to prevent the guest-side GPU recovery routine from occupying
the GPU; 2) marking the GPU as under reset by setting "in_gpu_reset" to true
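
A condensed sketch of that flr_work flow (simplified from the mxgpu mailbox
handler under discussion; the FLR completion polling is elided):

static void xgpu_ai_mailbox_flr_work_sketch(struct work_struct *work)
{
	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);

	/* 1) Take the reset_sem so guest-side gpu_recover() cannot run
	 *    while the host performs the VF FLR.
	 */
	if (!down_read_trylock(&adev->reset_sem))
		return;

	/* 2) Mark the GPU as under reset for everyone else. */
	atomic_set(&adev->in_gpu_reset, 1);

	/* ... wait for FLR_NOTIFICATION_CMPL from the host ... */

	atomic_set(&adev->in_gpu_reset, 0);
	up_read(&adev->reset_sem);
}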

Thanks

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Li, Dennis 
Sent: Thursday, August 20, 2020 7:36 PM
To: Liu, Monk ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Zhang, Hawking ; Koenig, 
Christian 
Subject: RE: [PATCH v2] drm/amdgpu: change reset lock from mutex to rw_semaphore

[AMD Official Use Only - Internal Distribution Only]

Hi, Monk,
  See my below comments.

Best Regards
Dennis Li

[AMD Official Use Only - Internal Distribution Only]

--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -238,19 +238,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)  struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
flr_work);  struct amdgpu_device *adev = container_of(virt, struct 
amdgpu_device, virt);  int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; -int locked;

 /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
  * otherwise the mailbox msg will be ruined/reseted by
  * the VF FLR.
- *
- * we can unlock the lock_reset to allow "amdgpu_job_timedout"
- * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
- * which means host side had finished this VF's FLR.
  */
-locked = mutex_trylock(&adev->lock_reset); -if (locked) 
-atomic_set(&adev->in_gpu_reset, 1);
+down_read(&adev->reset_sem);

>> Above piece looks suspicious :

>> The original logic (before this patch and your another patch) is :

>> 260 locked = mutex_trylock(&adev->lock_reset);
>> 261 if (!locked)
>> 262 return;
>> 263
>> 264 adev->in_gpu_reset = true;

>> So we only continue after the trylock success, and we "return"
>> immediately upon the trylock fail,

>> With your change the code path continue anyway (did you change the
>> logic in your another patch recently ??)

[Dennis Li] I didn't change the logic before; I guess that your local branch is
different from drm-next. In drm-next, the logic is:
locked = mutex_trylock(&adev->lock_reset);
if (locked)
	atomic_set(&adev->in_gpu_reset, 1);
According to the comments before this code, it wanted to block amdgpu_gpu_recover
till the msg FLR COMPLETE is received, so I changed it in this patch.

>>Please modify it as:

>>Locked = down_read_trylock(&adev->reset_sem);
>>If (!locked)
>>Return;
>>atomic_set(&adev->in_gpu_reset, 1);

[Dennis Li] why need we set adev->in_gpu_reset as 1 here?  It should be set 
when do GPU recovery.

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: amd-gfx  On Behalf Of Dennis Li
Sent: Thursday, August 20, 2020 5:33 PM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Kuehling, Felix ; Zhang, 
Hawking ; Koenig, Christian 
Cc: Li, Dennis 
Subject: [PATCH v2] drm/amdgpu: change reset lock from mutex to rw_semaphore

clients don't need reset-lock for synchronization when no GPU recovery.

v2:
change to return the return value of down_read_killable.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c8aec832b244..ec11ed2a9ca4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -954,7 +954,7 @@ struct amdgpu_device {

 atomic_t in_gpu_reset;
 enum pp_mp1_state   mp1_state;
-struct mutex  lock_reset;
+struct rw_semaphore reset_sem;
 struct amdgpu_doorbell_index doorbell_index;

 struct mutexnotifier_lock;
diff --git a/drivers/g

RE: [PATCH] drm/amd/display: remove unintended executable mode

2020-08-20 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Lukas,
  Thanks for your fix. This issue was caused by my modifying these files on a
Windows system over Samba. I will take care in the future.

Best Regards
Dennis Li
-Original Message-
From: Lukas Bulwahn  
Sent: Wednesday, August 19, 2020 4:18 PM
To: Deucher, Alexander ; Koenig, Christian 
; Li, Dennis ; Zuo, Jerry 

Cc: amd-gfx@lists.freedesktop.org; dri-de...@lists.freedesktop.org; 
linux-ker...@vger.kernel.org; Chen, Guchun ; Wu, Hersen 
; Lukas Bulwahn 
Subject: [PATCH] drm/amd/display: remove unintended executable mode

Besides the intended change, commit 4cc1178e166a ("drm/amdgpu: replace DRM 
prefix with PCI device info for gfx/mmhub") also set the source files 
mmhub_v1_0.c and gfx_v9_4.c to be executable, i.e., changed from old mode
644 to new mode 755.

Commit 241b2ec9317e ("drm/amd/display: Add dcn30 Headers (v2)") added the four 
header files {dpcs,dcn}_3_0_0_{offset,sh_mask}.h as executable, i.e., mode 755.

Set to the usual modes for source and headers files and clean up those 
mistakes. No functional change.

Signed-off-by: Lukas Bulwahn 
---
applies cleanly on current master and next-20200819

Alex, Christian, please pick this minor non-urgent cleanup patch.

Dennis, Jerry, please ack.

Dennis, Jerry, you might want to check your development environment introducing 
those executable modes on files.

 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 0
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c   | 0
 drivers/gpu/drm/amd/include/asic_reg/dcn/dcn_3_0_0_offset.h   | 0
 drivers/gpu/drm/amd/include/asic_reg/dcn/dcn_3_0_0_sh_mask.h  | 0  
drivers/gpu/drm/amd/include/asic_reg/dcn/dpcs_3_0_0_offset.h  | 0  
drivers/gpu/drm/amd/include/asic_reg/dcn/dpcs_3_0_0_sh_mask.h | 0
 6 files changed, 0 insertions(+), 0 deletions(-)  mode change 100755 => 100644 
drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
 mode change 100755 => 100644 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
 mode change 100755 => 100644 
drivers/gpu/drm/amd/include/asic_reg/dcn/dcn_3_0_0_offset.h
 mode change 100755 => 100644 
drivers/gpu/drm/amd/include/asic_reg/dcn/dcn_3_0_0_sh_mask.h
 mode change 100755 => 100644 
drivers/gpu/drm/amd/include/asic_reg/dcn/dpcs_3_0_0_offset.h
 mode change 100755 => 100644 
drivers/gpu/drm/amd/include/asic_reg/dcn/dpcs_3_0_0_sh_mask.h

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
old mode 100755
new mode 100644
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
old mode 100755
new mode 100644
diff --git a/drivers/gpu/drm/amd/include/asic_reg/dcn/dcn_3_0_0_offset.h 
b/drivers/gpu/drm/amd/include/asic_reg/dcn/dcn_3_0_0_offset.h
old mode 100755
new mode 100644
diff --git a/drivers/gpu/drm/amd/include/asic_reg/dcn/dcn_3_0_0_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/dcn/dcn_3_0_0_sh_mask.h
old mode 100755
new mode 100644
diff --git a/drivers/gpu/drm/amd/include/asic_reg/dcn/dpcs_3_0_0_offset.h 
b/drivers/gpu/drm/amd/include/asic_reg/dcn/dpcs_3_0_0_offset.h
old mode 100755
new mode 100644
diff --git a/drivers/gpu/drm/amd/include/asic_reg/dcn/dpcs_3_0_0_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/dcn/dpcs_3_0_0_sh_mask.h
old mode 100755
new mode 100644
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH v2] drm/amdgpu: change reset lock from mutex to rw_semaphore

2020-08-20 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Monk,
  See my below comments.

Best Regards
Dennis Li

[AMD Official Use Only - Internal Distribution Only]

--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -238,19 +238,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)  struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
flr_work);  struct amdgpu_device *adev = container_of(virt, struct 
amdgpu_device, virt);  int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; -int locked;

 /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
  * otherwise the mailbox msg will be ruined/reseted by
  * the VF FLR.
- *
- * we can unlock the lock_reset to allow "amdgpu_job_timedout"
- * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
- * which means host side had finished this VF's FLR.
  */
-locked = mutex_trylock(&adev->lock_reset); -if (locked) 
-atomic_set(&adev->in_gpu_reset, 1);
+down_read(&adev->reset_sem);

>> Above piece looks suspicious :

>> The original logic (before this patch and your another patch) is :

>> 260 locked = mutex_trylock(&adev->lock_reset);
>> 261 if (!locked)
>> 262 return;
>> 263
>> 264 adev->in_gpu_reset = true;

>> So we only continue after the trylock success, and we "return" immediately 
>> upon the trylock fail,

>> With your change the code path continue anyway (did you change the logic in 
>> your another patch recently ??)

[Dennis Li] I didn't change the logic before, I guess that your local branch is 
different from drm-next.  In drm-next, the logic is:
locked = mutex_trylock(&adev->lock_reset);
if (locked)
atomic_set(&adev->in_gpu_reset, 1);
According to the comments before these codes,  it wanted to block 
amdgpu_gpu_recover till msg FLR COMPLETE received, so I changed it in this 
patch.

>>Please modify it as:

>>Locked = down_read_trylock(&adev->reset_sem);
>>If (!locked)
>>Return;
>>atomic_set(&adev->in_gpu_reset, 1);

[Dennis Li] why need we set adev->in_gpu_reset as 1 here?  It should be set 
when do GPU recovery.

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: amd-gfx  On Behalf Of Dennis Li
Sent: Thursday, August 20, 2020 5:33 PM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Kuehling, Felix ; Zhang, 
Hawking ; Koenig, Christian 
Cc: Li, Dennis 
Subject: [PATCH v2] drm/amdgpu: change reset lock from mutex to rw_semaphore

clients don't need reset-lock for synchronization when no GPU recovery.

v2:
change to return the return value of down_read_killable.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c8aec832b244..ec11ed2a9ca4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -954,7 +954,7 @@ struct amdgpu_device {

 atomic_t in_gpu_reset;
 enum pp_mp1_state   mp1_state;
-struct mutex  lock_reset;
+struct rw_semaphore reset_sem;
 struct amdgpu_doorbell_index doorbell_index;

 struct mutexnotifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 79b397800cbc..cc5c7f81c540 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -101,14 +101,18 @@ static int amdgpu_debugfs_autodump_open(struct inode 
*inode, struct file *file)

 file->private_data = adev;

-mutex_lock(&adev->lock_reset);
+ret = down_read_killable(&adev->reset_sem);
+if (ret)
+return ret;
+
 if (adev->autodump.dumping.done) {
 reinit_completion(&adev->autodump.dumping);
 ret = 0;
 } else {
 ret = -EBUSY;
 }
-mutex_unlock(&adev->lock_reset);
+
+up_read(&adev->reset_sem);

 return ret;
 }
@@ -1242,7 +1246,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, 
void *data)  }

 /* Avoid accidently unparking the sched thread during GPU reset */ 
-mutex_lock(&adev->lock_reset);
+r = down_read_killable(&adev->reset_sem);
+if (r)
+return r;

 /* hold on the scheduler */
 for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1269,7 +1275,7 @@ static int 
amdgpu_debugfs_test_ib(struct seq_file *m, void *data)  
kthread_unpark(ring->sched.thread);
 }

-mutex_unlock(&adev->lock_reset);
+up_read(&adev->reset_sem);

 pm_runtime_mark_last_busy(dev->dev);
 pm_runtime_put_autosuspend(dev->dev);
@@ -1459,7 +1465,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) 
 return -ENOMEM;

 /* Avoid accidently unparking the sched thread during GPU reset */ 
-mutex_lock(&adev->lock_reset);
+r = down_read_killable(&adev->reset_sem);
+if (r

RE: [PATCH v2] drm/amdgpu: change reset lock from mutex to rw_semaphore

2020-08-20 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Frank and Monk,

Could you help review this patch? It also changes some virtualization-related
code.

Best Regards
Dennis Li
-Original Message-
From: Dennis Li  
Sent: Thursday, August 20, 2020 5:33 PM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Kuehling, Felix ; Zhang, 
Hawking ; Koenig, Christian 
Cc: Li, Dennis 
Subject: [PATCH v2] drm/amdgpu: change reset lock from mutex to rw_semaphore

clients don't need reset-lock for synchronization when no GPU recovery.

v2:
change to return the return value of down_read_killable.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c8aec832b244..ec11ed2a9ca4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -954,7 +954,7 @@ struct amdgpu_device {
 
atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct mutex  lock_reset;
+   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 79b397800cbc..cc5c7f81c540 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -101,14 +101,18 @@ static int amdgpu_debugfs_autodump_open(struct inode 
*inode, struct file *file)
 
file->private_data = adev;
 
-   mutex_lock(&adev->lock_reset);
+   ret = down_read_killable(&adev->reset_sem);
+   if (ret)
+   return ret;
+
if (adev->autodump.dumping.done) {
reinit_completion(&adev->autodump.dumping);
ret = 0;
} else {
ret = -EBUSY;
}
-   mutex_unlock(&adev->lock_reset);
+
+   up_read(&adev->reset_sem);
 
return ret;
 }
@@ -1242,7 +1246,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, 
void *data)
}
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   mutex_lock(&adev->lock_reset);
+   r = down_read_killable(&adev->reset_sem);
+   if (r)
+   return r;
 
/* hold on the scheduler */
for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1269,7 +1275,7 @@ static 
int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
kthread_unpark(ring->sched.thread);
}
 
-   mutex_unlock(&adev->lock_reset);
+   up_read(&adev->reset_sem);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1459,7 +1465,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   mutex_lock(&adev->lock_reset);
+   r = down_read_killable(&adev->reset_sem);
+   if (r)
+   goto pro_end;
 
/* stop the scheduler */
kthread_park(ring->sched.thread);
@@ -1500,13 +1508,14 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
val)
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
 
-   mutex_unlock(&adev->lock_reset);
+   up_read(&adev->reset_sem);
 
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
 
+pro_end:
kfree(fences);
 
-   return 0;
+   return r;
 }
 
 static int amdgpu_debugfs_sclk_set(void *data, u64 val) diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 78fd2c9a7b7d..82242e2f5658 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
atomic_set(&adev->in_gpu_reset, 0);
-   mutex_init(&adev->lock_reset);
+   init_rwsem(&adev->reset_sem);
mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock);
 
@@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device 
*adev)
if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
return false;
 
-   mutex_lock(&adev->lock_reset);
+   down_write(&adev->reset_sem);
 
atomic_inc(&adev->gpu_reset_counter);
switch (amdgpu_asic_reset_method(adev)) { @@ -4229,7 +4229,7 @@ static 
void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
atomic_set(&adev->in_gpu_reset, 0);
-   mutex_unlock(&adev->lock_reset)

RE: [PATCH] drm/amdgpu: change reset lock from mutex to rw_semaphore

2020-08-20 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,
 Thanks for your review. I will update it according to your suggestion. 

Best Regards
Dennis Li
-Original Message-
From: Christian König  
Sent: Thursday, August 20, 2020 5:11 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Zhang, Hawking ; Koenig, 
Christian 
Subject: Re: [PATCH] drm/amdgpu: change reset lock from mutex to rw_semaphore

Am 20.08.20 um 04:09 schrieb Dennis Li:
> clients don't need reset-lock for synchronization when no GPU 
> recovery.
>
> Signed-off-by: Dennis Li 
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index c8aec832b244..ec11ed2a9ca4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -954,7 +954,7 @@ struct amdgpu_device {
>   
>   atomic_tin_gpu_reset;
>   enum pp_mp1_state   mp1_state;
> - struct mutex  lock_reset;
> + struct rw_semaphore reset_sem;
>   struct amdgpu_doorbell_index doorbell_index;
>   
>   struct mutexnotifier_lock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 79b397800cbc..0090e850eab9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -101,14 +101,17 @@ static int amdgpu_debugfs_autodump_open(struct 
> inode *inode, struct file *file)
>   
>   file->private_data = adev;
>   
> - mutex_lock(&adev->lock_reset);
> + if (down_read_killable(&adev->reset_sem))
> + return -EINTR;

Better use ret = down_read_killable(); if (ret) return ret; here. Same for all 
other places of course.

> +
>   if (adev->autodump.dumping.done) {
>   reinit_completion(&adev->autodump.dumping);
>   ret = 0;
>   } else {
>   ret = -EBUSY;
>   }
> - mutex_unlock(&adev->lock_reset);
> +
> + up_read(&adev->reset_sem);
>   
>   return ret;
>   }
> @@ -1242,7 +1245,8 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, 
> void *data)
>   }
>   
>   /* Avoid accidently unparking the sched thread during GPU reset */
> - mutex_lock(&adev->lock_reset);
> + if (down_read_killable(&adev->reset_sem))
> + return -EINTR;
>   
>   /* hold on the scheduler */
>   for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1269,7 +1273,7 @@ 
> static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
>   kthread_unpark(ring->sched.thread);
>   }
>   
> - mutex_unlock(&adev->lock_reset);
> + up_read(&adev->reset_sem);
>   
>   pm_runtime_mark_last_busy(dev->dev);
>   pm_runtime_put_autosuspend(dev->dev);
> @@ -1459,7 +1463,10 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
> val)
>   return -ENOMEM;
>   
>   /* Avoid accidently unparking the sched thread during GPU reset */
> - mutex_lock(&adev->lock_reset);
> + if (down_read_killable(&adev->reset_sem)) {
> + kfree(fences);
> + return -EINTR;

Maybe better use a "goto err;" style error handling here.

> + }
>   
>   /* stop the scheduler */
>   kthread_park(ring->sched.thread);
> @@ -1500,7 +1507,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
> val)
>   /* restart the scheduler */
>   kthread_unpark(ring->sched.thread);
>   
> - mutex_unlock(&adev->lock_reset);
> + up_read(&adev->reset_sem);
>   
>   ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 78fd2c9a7b7d..82242e2f5658 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   mutex_init(&adev->virt.vf_errors.lock);
>   hash_init(adev->mn_hash);
>   atomic_set(&adev->in_gpu_reset, 0);
> - mutex_init(&adev->lock_reset);
> + init_rwsem(&adev->reset_sem);
>   mutex_init(&adev->psp.mutex);
>   mutex_init(&adev->notifier_lock);
>   
> @@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct 
> amdgpu_device *adev)
>   if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
>   return false;
>   
> - mutex_lock(&

RE: [PATCH] drm/amdgpu: fix the nullptr issue when reenter GPU recovery

2020-08-20 Thread Li, Dennis
[AMD Public Use]

Hi, Hawking,
  When a RAS uncorrectable error happens, the RAS interrupt will trigger a GPU 
recovery. At the same time, if a GFX or compute job times out, the driver will 
trigger another one.
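
For context, a minimal sketch of the locking helper involved (matching the
amdgpu_device_lock_adev code quoted elsewhere in this archive) shows why the
second, concurrent recovery attempt bails out and then, before this patch, ran
into the NULL hive pointer on the early-return path:

static bool amdgpu_device_lock_adev_sketch(struct amdgpu_device *adev)
{
	/* only the first caller flips in_gpu_reset 0 -> 1 and proceeds */
	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
		return false;	/* recovery re-entered: caller must skip,
				 * not unconditionally unlock hive->hive_lock */

	down_write(&adev->reset_sem);
	return true;
}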

Best Regards
Dennis Li
-Original Message-
From: Zhang, Hawking  
Sent: Thursday, August 20, 2020 4:24 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Koenig, Christian 
Cc: Li, Dennis 
Subject: RE: [PATCH] drm/amdgpu: fix the nullptr issue when reenter GPU recovery

[AMD Public Use]

Hi Dennis,

Can you elaborate on the case where the driver re-enters GPU recovery in an 
sGPU system? I'm wondering whether this is a valid case or whether we should 
prevent it from the beginning.

Regards,
Hawking

-Original Message-
From: Dennis Li  
Sent: Thursday, August 20, 2020 10:21
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Kuehling, Felix ; Zhang, 
Hawking ; Koenig, Christian 
Cc: Li, Dennis 
Subject: [PATCH] drm/amdgpu: fix the nullptr issue when reenter GPU recovery

in a single-GPU system, if the driver re-enters GPU recovery, 
amdgpu_device_lock_adev will return false, but hive is NULL at that point.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 82242e2f5658..81b1d9a1dca0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4371,8 +4371,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (!amdgpu_device_lock_adev(tmp_adev)) {
DRM_INFO("Bailing on TDR for s_job:%llx, as another 
already in progress",
  job ? job->base.id : -1);
-   mutex_unlock(&hive->hive_lock);
-   return 0;
+   r = 0;
+   goto skip_recovery;
}
 
/*
@@ -4505,6 +4505,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
amdgpu_device_unlock_adev(tmp_adev);
}
 
+skip_recovery:
if (hive) {
atomic_set(&hive->in_reset, 0);
mutex_unlock(&hive->hive_lock);
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH v2] drm/amdgpu: refine create and release logic of hive info

2020-08-18 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,
  Thanks for your review. I will send an updated patch according to your 
suggestion. 

Best Regards
Dennis Li
-Original Message-
From: Christian König  
Sent: Tuesday, August 18, 2020 7:50 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Zhang, Hawking ; Koenig, 
Christian 
Subject: Re: [PATCH v2] drm/amdgpu: refine create and release logic of hive info

Am 18.08.20 um 13:42 schrieb Dennis Li:
> Change to dynamically create and release hive info object, which help 
> driver support more hives in the future.
>
> v2:
> Change to save hive object pointer in adev, to avoid locking 
> xgmi_mutex every time when calling amdgpu_get_xgmi_hive.
>
> Signed-off-by: Dennis Li 
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 98d0c6e5ab3c..894886d6381b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -730,7 +730,7 @@ struct amdgpu_device {
>   #ifdef CONFIG_DRM_AMD_ACP
>   struct amdgpu_acp   acp;
>   #endif
> -
> + void*hive;

Any reason not to use the struct amdgpu_hive_info here?



>   /* ASIC */
>   enum amd_asic_type  asic_type;
>   uint32_tfamily;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f323281c82b0..bc6ef0caf157 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2857,7 +2857,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
> work_struct *__work)
>   {
>   struct amdgpu_device *adev =
>   container_of(__work, struct amdgpu_device, xgmi_reset_work);
> - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
> + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
>   
>   /* It's a bug to not have a hive within this function */
>   if (WARN_ON(!hive))
> @@ -2895,6 +2895,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
> work_struct *__work)
>   if (adev->asic_reset_res)
>   DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>adev->asic_reset_res, adev->ddev->unique);
> + amdgpu_put_xgmi_hive(hive);
>   }
>   
>   static int amdgpu_device_get_job_timeout_settings(struct 
> amdgpu_device *adev) @@ -4315,7 +4316,7 @@ int 
> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>* We always reset all schedulers for device and all devices for XGMI
>* hive so that should take care of them too.
>*/
> - hive = amdgpu_get_xgmi_hive(adev, false);
> + hive = amdgpu_get_xgmi_hive(adev);
>   if (hive) {
>   if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
>   DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as 
> another 
> already in progress", diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index bf71f0a58786..18cdd259d568 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1555,9 +1555,10 @@ static void amdgpu_ras_do_recovery(struct work_struct 
> *work)
>   struct amdgpu_device *remote_adev = NULL;
>   struct amdgpu_device *adev = ras->adev;
>   struct list_head device_list, *device_list_handle =  NULL;
> - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
>   
>   if (!ras->disable_ras_err_cnt_harvest) {
> + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
> +
>   /* Build list of devices to query RAS related errors */
>   if  (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
>   device_list_handle = &hive->device_list; @@ -1570,6 
> +1571,8 @@ 
> static void amdgpu_ras_do_recovery(struct work_struct *work)
>   list_for_each_entry(remote_adev,
>   device_list_handle, gmc.xgmi.head)
>   amdgpu_ras_log_on_err_counter(remote_adev);
> +
> + amdgpu_put_xgmi_hive(hive);
>   }
>   
>   if (amdgpu_device_should_recover_gpu(ras->adev))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 7a61dc6738eb..c6bd5f0c1339 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -35,11 +35,9 @@
>   
>   static DEFINE_MUTEX(xgmi_mutex);
>   
> -#defi

RE: [PATCH] drm/amdgpu: fix amdgpu_bo_release_notify() comment error

2020-08-17 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]


Reviewed-by: Dennis Li 

-Original Message-
From: amd-gfx  On Behalf Of Kevin Wang
Sent: Monday, August 17, 2020 3:36 PM
To: amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Wang, Kevin(Yang) 
; Koenig, Christian 
Subject: [PATCH] drm/amdgpu: fix amdgpu_bo_release_notify() comment error

fix amdgpu_bo_release_notify() comment error.

Signed-off-by: Kevin Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 3d95b3edb635..4cb750ed6851 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1301,7 +1301,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,  
}
 
 /**
- * amdgpu_bo_move_notify - notification about a BO being released
+ * amdgpu_bo_release_notify - notification about a BO being released
  * @bo: pointer to a buffer object
  *
  * Wipes VRAM buffers whose contents should not be leaked before the
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7CDennis.Li%40amd.com%7C2c67e088110b4b1a4e9f08d8428033ca%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637332465681957370&sdata=wbSpFkp1XpgMw7eogSPgplu8ySGiIGAFVRSdlD%2BtYHo%3D&reserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: Fix incorrect return value in sysfs for pp_od_clk_voltage

2020-08-13 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Matt,
  With your change, I can still reproduce the following issue:

# echo "s 1 1900" > /sys/class/drm/card0/device/pp_od_clk_voltage
bash: echo: write error: Invalid argument

 I found that it is related to the following code; could you help double-check 
it?

while ((sub_str = strsep(&tmp_str, delimiter)) != NULL) {  // sub_str 
will be empty string
        ret = kstrtol(sub_str, 0, &parameter[parameter_size]);
if (ret)
return -EINVAL; // return here
parameter_size++;

while (isspace(*tmp_str))
tmp_str++;
}
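
One possible guard, sketched under the assumption that the empty sub_str token
described above is what makes kstrtol() return an error, is to skip empty
tokens instead of failing; this is illustrative only, not the actual fix:

	while ((sub_str = strsep(&tmp_str, delimiter)) != NULL) {
		if (*sub_str == '\0')
			continue;	/* skip empty tokens, e.g. from the trailing newline */

		ret = kstrtol(sub_str, 0, &parameter[parameter_size]);
		if (ret)
			return -EINVAL;
		parameter_size++;

		/* strsep() sets tmp_str to NULL after the last token */
		while (tmp_str && isspace(*tmp_str))
			tmp_str++;
	}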

Best Regards
Dennis Li
-Original Message-
From: Matt Coffin  
Sent: Friday, August 14, 2020 9:15 AM
To: amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian ; Li, Dennis 
; Matt Coffin 
Subject: [PATCH] drm/amdgpu: Fix incorrect return value in sysfs for 
pp_od_clk_voltage

The changes in edad8312cbbf9a33c86873fc4093664f150dd5c1 introduced an issue 
with the sysfs interface for pp_od_clk_voltage. It overwrites the return value 
to 0 when it calls another function, then returns 0. The intended behavior is 
that a positive return value indicates the number of bytes from the buffer that 
you processed in that call.

With the 0 return value, clients would submit the same value to be written over 
and over again, resulting in an infinite loop.

This is resolved by returning the count of bytes read (in this case the whole 
message), when the desired return is 0 (success).

Fixes: edad8312cbbf ("drm/amdgpu: fix system hang issue during GPU")
Bug: 
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fdrm%2Famd%2F-%2Fissues%2F1245&data=02%7C01%7CDennis.Li%40amd.com%7C4de8308bf7974ea9e62308d83fef922b%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637329646078379799&sdata=N9c6e7cUMCDpvBIYUEzxkadJbJdBryXyfhfhb%2BUEwjg%3D&reserved=0
Signed-off-by: Matt Coffin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index 1705e328c6fc..f00c7ed361d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -937,7 +937,11 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device 
*dev,
 
 pro_end:
up_read(&adev->reset_sem);
-   return ret;
+   if (ret) {
+   return ret;
+   } else {
+   return count;
+   }
 }
 
 static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev,
--
2.28.0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: guard ras debugfs creation/removal based on CONFIG_DEBUG_FS

2020-08-13 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]


Reviewed-by: Dennis Li 

-Original Message-
From: Chen, Guchun  
Sent: Thursday, August 13, 2020 3:04 PM
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; Li, 
Dennis ; Zhou1, Tao ; Clements, John 
; Deucher, Alexander 
Cc: Chen, Guchun 
Subject: [PATCH] drm/amdgpu: guard ras debugfs creation/removal based on 
CONFIG_DEBUG_FS

It can avoid potential build warn/error when CONFIG_DEBUG_FS is not set.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 35d5bf9e6f6f..e1d78cb448e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1244,6 +1244,7 @@ void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
 
 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)  {
+#if defined(CONFIG_DEBUG_FS)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
struct ras_fs_if fs_info;
@@ -1266,6 +1267,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device 
*adev)
amdgpu_ras_debugfs_create(adev, &fs_info);
}
}
+#endif
 }
 
 void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, @@ -1282,6 +1284,7 
@@ void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
 
 static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)  {
+#if defined(CONFIG_DEBUG_FS)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj, *tmp;
 
@@ -1290,6 +1293,7 @@ static void amdgpu_ras_debugfs_remove_all(struct 
amdgpu_device *adev)
}
 
con->dir = NULL;
+#endif
 }
 /* debugfs end */
 
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: fix a potential circular locking dependency

2020-08-12 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Am 12.08.20 um 12:02 schrieb Li, Dennis:
> [AMD Official Use Only - Internal Distribution Only]
>
> Am 12.08.20 um 11:23 schrieb Li, Dennis:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Am 12.08.20 um 03:33 schrieb Li, Dennis:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>> Hi, Christian,
>>>
>>> Re: I was wondering the same thing for the amdgpu_gem_va_ioctl() as well. 
>>> We shouldn't have any hardware access here, so taking the reset_sem looks 
>>> like overkill to me.
>>>
>>> [Dennis Li] amdgpu_vm_bo_unmap, amdgpu_vm_bo_clear_mappings, 
>>> amdgpu_vm_bo_replace_map  and amdgpu_gem_va_update_vm all a chance to 
>>> access hardware.
>> This is complete nonsense. The functions intentionally work through the 
>> scheduler to avoid accessing the hardware directly for exactly that reason.
>>
>> The only hardware access we have here is the HDP flush and that can fail in 
>> the case of a GPU reset without causing problems.
>>
>> [Dennis Li]  amdgpu_vm_bo_clear_mappings -> amdgpu_vm_prt_get -> 
>> amdgpu_vm_update_prt_state -> gmc_v8_0_set_prt
> That is for pre gfx9 hardware and only called once during initial enabling of 
> the feature.
>
> Please remove that locking again since it is clearly completely against the 
> driver design.
>
> [Dennis Li] okay, if you agree, I will change to only protect 
> amdgpu_gem_va_update_vm in this function.

Better even only protect the amdgpu_vm_update_prt_state() function.
[Dennis Li] Got it. According to your suggestion, I will also narrow down the 
scope of reset_sem in other functions. 

Christian.

>
> Christian.
>
>> Regards,
>> Christian.
>>
>>> Best Regards
>>> Dennis Li
>>> -Original Message-
>>> From: Koenig, Christian 
>>> Sent: Wednesday, August 12, 2020 12:15 AM
>>> To: Kuehling, Felix ; Li, Dennis 
>>> ; amd-gfx@lists.freedesktop.org; Deucher, 
>>> Alexander ; Zhang, Hawking 
>>> 
>>> Subject: Re: [PATCH] drm/amdgpu: fix a potential circular locking 
>>> dependency
>>>
>>> Am 11.08.20 um 15:57 schrieb Felix Kuehling:
>>>> Am 2020-08-11 um 5:32 a.m. schrieb Dennis Li:
>>>>> [  653.902305]
>>>>> ==
>>>>> [  653.902928] WARNING: possible circular locking dependency detected
>>>>> [  653.903517] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   
>>>>> OE
>>>>> [  653.904098]
>>>>> --
>>>>> [  653.904675] amdgpu_test/3975 is trying to acquire lock:
>>>>> [  653.905241] 97848f8647a0 (&adev->reset_sem){.+.+}, at:
>>>>> amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu] [  653.905953]
>>>>>but task is already holding lock:
>>>>> [  653.907087] 9744adbee1f8 
>>>>> (reservation_ww_class_mutex){+.+.},
>>>>> at: ttm_eu_reserve_buffers+0x1ae/0x520 [ttm] [  653.907694]
>>>>>which lock already depends on the new lock.
>>>>>
>>>>> [  653.909423]
>>>>>the existing dependency chain (in reverse order) is:
>>>>> [  653.910594]
>>>>>-> #1 (reservation_ww_class_mutex){+.+.}:
>>>>> [  653.911759]__ww_mutex_lock.constprop.15+0xca/0x1120
>>>>> [  653.912350]ww_mutex_lock+0x73/0x80
>>>>> [  653.913044]amdgpu_amdkfd_alloc_gtt_mem+0xde/0x380 [amdgpu]
>>>>> [  653.913724]kgd2kfd_device_init+0x13f/0x5e0 [amdgpu]
>>>>> [  653.914388]amdgpu_amdkfd_device_init+0x155/0x190 [amdgpu]
>>>>> [  653.915033]amdgpu_device_init+0x1303/0x1e10 [amdgpu]
>>>>> [  653.915685]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
>>>>> [  653.916349]amdgpu_pci_probe+0x11d/0x200 [amdgpu]
>>>>> [  653.916959]local_pci_probe+0x47/0xa0
>>>>> [  653.917570]work_for_cpu_fn+0x1a/0x30
>>>>> [  653.918184]process_one_work+0x29e/0x630
>>>>> [  653.918803]worker_thread+0x22b/0x3f0
>>>>> [  653.919427]kthread+0x12f/0x150
>>>>> [  653.920047]ret_from_fork+0x3a/0x50
>>>>> [  653.920661]
>>>>>

RE: [PATCH] drm/amdgpu: fix a potential circular locking dependency

2020-08-12 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Am 12.08.20 um 11:23 schrieb Li, Dennis:
> [AMD Official Use Only - Internal Distribution Only]
>
> Am 12.08.20 um 03:33 schrieb Li, Dennis:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Hi, Christian,
>>
>> Re: I was wondering the same thing for the amdgpu_gem_va_ioctl() as well. We 
>> shouldn't have any hardware access here, so taking the reset_sem looks like 
>> overkill to me.
>>
>> [Dennis Li] amdgpu_vm_bo_unmap, amdgpu_vm_bo_clear_mappings, 
>> amdgpu_vm_bo_replace_map  and amdgpu_gem_va_update_vm all a chance to access 
>> hardware.
> This is complete nonsense. The functions intentionally work through the 
> scheduler to avoid accessing the hardware directly for exactly that reason.
>
> The only hardware access we have here is the HDP flush and that can fail in 
> the case of a GPU reset without causing problems.
>
> [Dennis Li]  amdgpu_vm_bo_clear_mappings -> amdgpu_vm_prt_get -> 
> amdgpu_vm_update_prt_state -> gmc_v8_0_set_prt

That is for pre gfx9 hardware and only called once during initial enabling of 
the feature.

Please remove that locking again since it is clearly completely against the 
driver design.

[Dennis Li] Okay, if you agree, I will change it to protect only 
amdgpu_gem_va_update_vm in this function. 

Christian.

>
> Regards,
> Christian.
>
>> Best Regards
>> Dennis Li
>> -Original Message-
>> From: Koenig, Christian 
>> Sent: Wednesday, August 12, 2020 12:15 AM
>> To: Kuehling, Felix ; Li, Dennis 
>> ; amd-gfx@lists.freedesktop.org; Deucher, 
>> Alexander ; Zhang, Hawking 
>> 
>> Subject: Re: [PATCH] drm/amdgpu: fix a potential circular locking 
>> dependency
>>
>> Am 11.08.20 um 15:57 schrieb Felix Kuehling:
>>> Am 2020-08-11 um 5:32 a.m. schrieb Dennis Li:
>>>> [  653.902305]
>>>> ==
>>>> [  653.902928] WARNING: possible circular locking dependency detected
>>>> [  653.903517] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   
>>>> OE
>>>> [  653.904098]
>>>> --
>>>> [  653.904675] amdgpu_test/3975 is trying to acquire lock:
>>>> [  653.905241] 97848f8647a0 (&adev->reset_sem){.+.+}, at:
>>>> amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu] [  653.905953]
>>>>   but task is already holding lock:
>>>> [  653.907087] 9744adbee1f8 (reservation_ww_class_mutex){+.+.},
>>>> at: ttm_eu_reserve_buffers+0x1ae/0x520 [ttm] [  653.907694]
>>>>   which lock already depends on the new lock.
>>>>
>>>> [  653.909423]
>>>>   the existing dependency chain (in reverse order) is:
>>>> [  653.910594]
>>>>   -> #1 (reservation_ww_class_mutex){+.+.}:
>>>> [  653.911759]__ww_mutex_lock.constprop.15+0xca/0x1120
>>>> [  653.912350]ww_mutex_lock+0x73/0x80
>>>> [  653.913044]amdgpu_amdkfd_alloc_gtt_mem+0xde/0x380 [amdgpu]
>>>> [  653.913724]kgd2kfd_device_init+0x13f/0x5e0 [amdgpu]
>>>> [  653.914388]amdgpu_amdkfd_device_init+0x155/0x190 [amdgpu]
>>>> [  653.915033]amdgpu_device_init+0x1303/0x1e10 [amdgpu]
>>>> [  653.915685]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
>>>> [  653.916349]amdgpu_pci_probe+0x11d/0x200 [amdgpu]
>>>> [  653.916959]local_pci_probe+0x47/0xa0
>>>> [  653.917570]work_for_cpu_fn+0x1a/0x30
>>>> [  653.918184]process_one_work+0x29e/0x630
>>>> [  653.918803]worker_thread+0x22b/0x3f0
>>>> [  653.919427]kthread+0x12f/0x150
>>>> [  653.920047]ret_from_fork+0x3a/0x50
>>>> [  653.920661]
>>>>   -> #0 (&adev->reset_sem){.+.+}:
>>>> [  653.921893]__lock_acquire+0x13ec/0x16e0
>>>> [  653.922531]lock_acquire+0xb8/0x1c0
>>>> [  653.923174]down_read+0x48/0x230
>>>> [  653.923886]amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu]
>>>> [  653.924588]drm_ioctl_kernel+0xb6/0x100 [drm]
>>>> [  653.925283]drm_ioctl+0x389/0x450 [drm]
>>>> [  653.926013]amdgpu_drm_ioctl+0x4f/0x80 [amdgpu]
>>>> [  653.926686]ksys_ioctl+0x98/0xb0
>>>> [  653.927357]__x64_sys_ioctl+0x1a/0x20
>>>> [  653.928030]do_syscall_6

RE: [PATCH] drm/amdgpu: fix a potential circular locking dependency

2020-08-12 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Am 12.08.20 um 03:19 schrieb Li, Dennis:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Felix,
>
> Re: It may be better to fix it the other way around in 
> amdgpu_amdkfd_alloc_gtt_mem. Always take the reset_sem inside the 
> reservation. Otherwise you will never be able to take the reset_sem while any 
> BOs are reserved. That's probably going to cause you other problems later.
> [Dennis Li] Thanks that you find the potential issue, I will change it in 
> version 2.
>
> Re: That makes me wonder, why do you need the reset_sem in 
> amdgpu_amdkfd_alloc_gtt_mem in the first place? There is no obvious hardware 
> access in that function. Is it for amdgpu_ttm_alloc_gart updating the GART 
> table through HDP?
> [Dennis Li] Yes, amdgpu_gart_bind will flush HDP and TLB. I have considered 
> to only protect amdgpu_ttm_alloc_gart before.

That access is irrelevant and the lock should be removed or changed into a 
trylock.

See we need the HDP flush only because the hardware could have accessed the 
data before.

But after a GPU reset the HDP is known to be clean, so this doesn't need any 
protection.

>   But I worry other functions will access hardware in the future. Therefore I 
> select an aggressive approach which lock reset_sem at the beginning of entry 
> functions of amdgpu driver.

This is not a good idea. We used to have such a global lock before and removed 
it because it caused all kind of problems.

[Dennis Li] Okay. If you don't agree with this aggressive approach, I will 
change it to protect amdgpu_ttm_alloc_gart only. 
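
For illustration, the narrowed variant being discussed could look roughly like
the sketch below; the helper name is invented, and whether a trylock (or no
lock at all) is appropriate is exactly the open question above:

static int alloc_gtt_mem_gart_sketch(struct amdgpu_device *adev,
				     struct amdgpu_bo *bo)
{
	int r;
	bool locked;

	/* Don't block on an in-progress reset; after a reset the HDP is
	 * known to be clean, so skipping the protection is acceptable. */
	locked = down_read_trylock(&adev->reset_sem);

	r = amdgpu_ttm_alloc_gart(&bo->tbo);	/* updates GART, flushes HDP */

	if (locked)
		up_read(&adev->reset_sem);

	return r;
}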

When was this added? Looks like it slipped under my radar or I wasn't awake 
enough at that moment.

Christian.

>
> Best Regards
> Dennis Li
> -Original Message-
> From: Kuehling, Felix 
> Sent: Tuesday, August 11, 2020 9:57 PM
> To: Li, Dennis ; amd-gfx@lists.freedesktop.org; 
> Deucher, Alexander ; Zhang, Hawking 
> ; Koenig, Christian 
> Subject: Re: [PATCH] drm/amdgpu: fix a potential circular locking 
> dependency
>
> Am 2020-08-11 um 5:32 a.m. schrieb Dennis Li:
>> [  653.902305] ==
>> [  653.902928] WARNING: possible circular locking dependency detected
>> [  653.903517] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
>> [  653.904098] --
>> [  653.904675] amdgpu_test/3975 is trying to acquire lock:
>> [  653.905241] 97848f8647a0 (&adev->reset_sem){.+.+}, at:
>> amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu] [  653.905953]
>> but task is already holding lock:
>> [  653.907087] 9744adbee1f8 (reservation_ww_class_mutex){+.+.},
>> at: ttm_eu_reserve_buffers+0x1ae/0x520 [ttm] [  653.907694]
>> which lock already depends on the new lock.
>>
>> [  653.909423]
>> the existing dependency chain (in reverse order) is:
>> [  653.910594]
>> -> #1 (reservation_ww_class_mutex){+.+.}:
>> [  653.911759]__ww_mutex_lock.constprop.15+0xca/0x1120
>> [  653.912350]ww_mutex_lock+0x73/0x80
>> [  653.913044]amdgpu_amdkfd_alloc_gtt_mem+0xde/0x380 [amdgpu]
>> [  653.913724]kgd2kfd_device_init+0x13f/0x5e0 [amdgpu]
>> [  653.914388]amdgpu_amdkfd_device_init+0x155/0x190 [amdgpu]
>> [  653.915033]amdgpu_device_init+0x1303/0x1e10 [amdgpu]
>> [  653.915685]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
>> [  653.916349]amdgpu_pci_probe+0x11d/0x200 [amdgpu]
>> [  653.916959]local_pci_probe+0x47/0xa0
>> [  653.917570]work_for_cpu_fn+0x1a/0x30
>> [  653.918184]process_one_work+0x29e/0x630
>> [  653.918803]worker_thread+0x22b/0x3f0
>> [  653.919427]kthread+0x12f/0x150
>> [  653.920047]ret_from_fork+0x3a/0x50
>> [  653.920661]
>> -> #0 (&adev->reset_sem){.+.+}:
>> [  653.921893]__lock_acquire+0x13ec/0x16e0
>> [  653.922531]lock_acquire+0xb8/0x1c0
>> [  653.923174]down_read+0x48/0x230
>> [  653.923886]amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu]
>> [  653.924588]drm_ioctl_kernel+0xb6/0x100 [drm]
>> [  653.925283]drm_ioctl+0x389/0x450 [drm]
>> [  653.926013]amdgpu_drm_ioctl+0x4f/0x80 [amdgpu]
>> [  653.926686]ksys_ioctl+0x98/0xb0
>> [  653.927357]__x64_sys_ioctl+0x1a/0x20
>> [  653.928030]do_syscall_64+0x5f/0x250
>> [  653.928697]entry_SYSCALL_64_after_hwframe+0x49/0xbe
>> [  653.929373]
>> other info that might help us debug this:

RE: [PATCH] drm/amdgpu: fix a potential circular locking dependency

2020-08-12 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Am 12.08.20 um 03:33 schrieb Li, Dennis:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Christian,
>
> Re: I was wondering the same thing for the amdgpu_gem_va_ioctl() as well. We 
> shouldn't have any hardware access here, so taking the reset_sem looks like 
> overkill to me.
>
> [Dennis Li] amdgpu_vm_bo_unmap, amdgpu_vm_bo_clear_mappings, 
> amdgpu_vm_bo_replace_map  and amdgpu_gem_va_update_vm all a chance to access 
> hardware.

This is complete nonsense. The functions intentionally work through the 
scheduler to avoid accessing the hardware directly for exactly that reason.

The only hardware access we have here is the HDP flush and that can fail in the 
case of a GPU reset without causing problems.

[Dennis Li]  amdgpu_vm_bo_clear_mappings -> amdgpu_vm_prt_get -> 
amdgpu_vm_update_prt_state -> gmc_v8_0_set_prt

Regards,
Christian.

>
> Best Regards
> Dennis Li
> -Original Message-
> From: Koenig, Christian 
> Sent: Wednesday, August 12, 2020 12:15 AM
> To: Kuehling, Felix ; Li, Dennis 
> ; amd-gfx@lists.freedesktop.org; Deucher, Alexander 
> ; Zhang, Hawking 
> Subject: Re: [PATCH] drm/amdgpu: fix a potential circular locking 
> dependency
>
> Am 11.08.20 um 15:57 schrieb Felix Kuehling:
>> Am 2020-08-11 um 5:32 a.m. schrieb Dennis Li:
>>> [  653.902305] 
>>> ==
>>> [  653.902928] WARNING: possible circular locking dependency detected
>>> [  653.903517] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
>>> [  653.904098] 
>>> --
>>> [  653.904675] amdgpu_test/3975 is trying to acquire lock:
>>> [  653.905241] 97848f8647a0 (&adev->reset_sem){.+.+}, at:
>>> amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu] [  653.905953]
>>>  but task is already holding lock:
>>> [  653.907087] 9744adbee1f8 (reservation_ww_class_mutex){+.+.},
>>> at: ttm_eu_reserve_buffers+0x1ae/0x520 [ttm] [  653.907694]
>>>  which lock already depends on the new lock.
>>>
>>> [  653.909423]
>>>  the existing dependency chain (in reverse order) is:
>>> [  653.910594]
>>>  -> #1 (reservation_ww_class_mutex){+.+.}:
>>> [  653.911759]__ww_mutex_lock.constprop.15+0xca/0x1120
>>> [  653.912350]ww_mutex_lock+0x73/0x80
>>> [  653.913044]amdgpu_amdkfd_alloc_gtt_mem+0xde/0x380 [amdgpu]
>>> [  653.913724]kgd2kfd_device_init+0x13f/0x5e0 [amdgpu]
>>> [  653.914388]amdgpu_amdkfd_device_init+0x155/0x190 [amdgpu]
>>> [  653.915033]amdgpu_device_init+0x1303/0x1e10 [amdgpu]
>>> [  653.915685]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
>>> [  653.916349]amdgpu_pci_probe+0x11d/0x200 [amdgpu]
>>> [  653.916959]local_pci_probe+0x47/0xa0
>>> [  653.917570]work_for_cpu_fn+0x1a/0x30
>>> [  653.918184]process_one_work+0x29e/0x630
>>> [  653.918803]worker_thread+0x22b/0x3f0
>>> [  653.919427]kthread+0x12f/0x150
>>> [  653.920047]ret_from_fork+0x3a/0x50
>>> [  653.920661]
>>>  -> #0 (&adev->reset_sem){.+.+}:
>>> [  653.921893]__lock_acquire+0x13ec/0x16e0
>>> [  653.922531]lock_acquire+0xb8/0x1c0
>>> [  653.923174]down_read+0x48/0x230
>>> [  653.923886]amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu]
>>> [  653.924588]drm_ioctl_kernel+0xb6/0x100 [drm]
>>> [  653.925283]drm_ioctl+0x389/0x450 [drm]
>>> [  653.926013]amdgpu_drm_ioctl+0x4f/0x80 [amdgpu]
>>> [  653.926686]ksys_ioctl+0x98/0xb0
>>> [  653.927357]__x64_sys_ioctl+0x1a/0x20
>>> [  653.928030]do_syscall_64+0x5f/0x250
>>> [  653.928697]entry_SYSCALL_64_after_hwframe+0x49/0xbe
>>> [  653.929373]
>>>  other info that might help us debug this:
>>>
>>> [  653.931356]  Possible unsafe locking scenario:
>>>
>>> [  653.932647]CPU0CPU1
>>> [  653.933287]
>>> [  653.933911]   lock(reservation_ww_class_mutex);
>>> [  653.934530]lock(&adev->reset_sem);
>>> [  653.935154]
>>> lock(reservation_ww_class_mutex);
>>> [  653.935766]   lock(&adev->reset_sem);
>>> [  653.936360]
>>> 

RE: [PATCH] drm/amdgpu: fix a potential circular locking dependency

2020-08-11 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,

Re: I was wondering the same thing for the amdgpu_gem_va_ioctl() as well. We 
shouldn't have any hardware access here, so taking the reset_sem looks like 
overkill to me.

[Dennis Li] amdgpu_vm_bo_unmap, amdgpu_vm_bo_clear_mappings, 
amdgpu_vm_bo_replace_map and amdgpu_gem_va_update_vm all have a chance to 
access hardware. 

Best Regards
Dennis Li
-Original Message-
From: Koenig, Christian  
Sent: Wednesday, August 12, 2020 12:15 AM
To: Kuehling, Felix ; Li, Dennis ; 
amd-gfx@lists.freedesktop.org; Deucher, Alexander ; 
Zhang, Hawking 
Subject: Re: [PATCH] drm/amdgpu: fix a potential circular locking dependency

Am 11.08.20 um 15:57 schrieb Felix Kuehling:
> Am 2020-08-11 um 5:32 a.m. schrieb Dennis Li:
>> [  653.902305] ==
>> [  653.902928] WARNING: possible circular locking dependency detected
>> [  653.903517] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
>> [  653.904098] --
>> [  653.904675] amdgpu_test/3975 is trying to acquire lock:
>> [  653.905241] 97848f8647a0 (&adev->reset_sem){.+.+}, at: 
>> amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu] [  653.905953]
>> but task is already holding lock:
>> [  653.907087] 9744adbee1f8 (reservation_ww_class_mutex){+.+.}, 
>> at: ttm_eu_reserve_buffers+0x1ae/0x520 [ttm] [  653.907694]
>> which lock already depends on the new lock.
>>
>> [  653.909423]
>> the existing dependency chain (in reverse order) is:
>> [  653.910594]
>> -> #1 (reservation_ww_class_mutex){+.+.}:
>> [  653.911759]__ww_mutex_lock.constprop.15+0xca/0x1120
>> [  653.912350]ww_mutex_lock+0x73/0x80
>> [  653.913044]amdgpu_amdkfd_alloc_gtt_mem+0xde/0x380 [amdgpu]
>> [  653.913724]kgd2kfd_device_init+0x13f/0x5e0 [amdgpu]
>> [  653.914388]amdgpu_amdkfd_device_init+0x155/0x190 [amdgpu]
>> [  653.915033]amdgpu_device_init+0x1303/0x1e10 [amdgpu]
>> [  653.915685]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
>> [  653.916349]amdgpu_pci_probe+0x11d/0x200 [amdgpu]
>> [  653.916959]local_pci_probe+0x47/0xa0
>> [  653.917570]work_for_cpu_fn+0x1a/0x30
>> [  653.918184]process_one_work+0x29e/0x630
>> [  653.918803]worker_thread+0x22b/0x3f0
>> [  653.919427]kthread+0x12f/0x150
>> [  653.920047]ret_from_fork+0x3a/0x50
>> [  653.920661]
>> -> #0 (&adev->reset_sem){.+.+}:
>> [  653.921893]__lock_acquire+0x13ec/0x16e0
>> [  653.922531]lock_acquire+0xb8/0x1c0
>> [  653.923174]down_read+0x48/0x230
>> [  653.923886]amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu]
>> [  653.924588]drm_ioctl_kernel+0xb6/0x100 [drm]
>> [  653.925283]drm_ioctl+0x389/0x450 [drm]
>> [  653.926013]amdgpu_drm_ioctl+0x4f/0x80 [amdgpu]
>> [  653.926686]ksys_ioctl+0x98/0xb0
>> [  653.927357]__x64_sys_ioctl+0x1a/0x20
>> [  653.928030]do_syscall_64+0x5f/0x250
>> [  653.928697]entry_SYSCALL_64_after_hwframe+0x49/0xbe
>> [  653.929373]
>> other info that might help us debug this:
>>
>> [  653.931356]  Possible unsafe locking scenario:
>>
>> [  653.932647]CPU0CPU1
>> [  653.933287]
>> [  653.933911]   lock(reservation_ww_class_mutex);
>> [  653.934530]lock(&adev->reset_sem);
>> [  653.935154]
>> lock(reservation_ww_class_mutex);
>> [  653.935766]   lock(&adev->reset_sem);
>> [  653.936360]
>>  *** DEADLOCK ***
>>
>> [  653.938028] 2 locks held by amdgpu_test/3975:
>> [  653.938574]  #0: b2a862d6bcd0 
>> (reservation_ww_class_acquire){+.+.}, at: 
>> amdgpu_gem_va_ioctl+0x39b/0x4f0 [amdgpu] [  653.939233]  #1: 
>> 9744adbee1f8 (reservation_ww_class_mutex){+.+.}, at: 
>> ttm_eu_reserve_buffers+0x1ae/0x520 [ttm]
>>
>> change the order of reservation_ww_class_mutex and adev->reset_sem in 
>> amdgpu_gem_va_ioctl the same as ones in amdgpu_amdkfd_alloc_gtt_mem, 
>> to avoid potential dead lock.
> It may be better to fix it the other way around in 
> amdgpu_amdkfd_alloc_gtt_mem. Always take the reset_sem inside the 
> reservation. Otherwise you will never be able to take the reset_sem 
> while any BOs are reserved. That's probably going to cause you other 
> problem

RE: [PATCH] drm/amdgpu: fix a potential circular locking dependency

2020-08-11 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Felix,

Re: It may be better to fix it the other way around in 
amdgpu_amdkfd_alloc_gtt_mem. Always take the reset_sem inside the reservation. 
Otherwise you will never be able to take the reset_sem while any BOs are 
reserved. That's probably going to cause you other problems later.
[Dennis Li] Thanks for finding the potential issue; I will change it in 
version 2.

Re: That makes me wonder, why do you need the reset_sem in 
amdgpu_amdkfd_alloc_gtt_mem in the first place? There is no obvious hardware 
access in that function. Is it for amdgpu_ttm_alloc_gart updating the GART 
table through HDP?
[Dennis Li] Yes, amdgpu_gart_bind will flush the HDP and TLB. I had considered 
protecting only amdgpu_ttm_alloc_gart before, but I worry that other functions 
will access hardware in the future. Therefore I chose an aggressive approach 
that locks reset_sem at the beginning of the amdgpu driver's entry functions.
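
For illustration, a sketch of the ordering Felix suggests (take reset_sem
inside the BO reservation); the helper name is invented and the
hardware-touching work is elided:

static int reserve_then_lock_reset_sketch(struct amdgpu_device *adev,
					  struct amdgpu_bo *bo)
{
	int r;

	r = amdgpu_bo_reserve(bo, false);
	if (r)
		return r;

	/* reset_sem is always taken *inside* the reservation, so the
	 * reservation -> reset_sem order stays consistent driver-wide. */
	r = down_read_killable(&adev->reset_sem);
	if (r)
		goto out_unreserve;

	/* ... hardware-touching work, e.g. amdgpu_ttm_alloc_gart(&bo->tbo) ... */

	up_read(&adev->reset_sem);
out_unreserve:
	amdgpu_bo_unreserve(bo);
	return r;
}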

Best Regards
Dennis Li
-Original Message-
From: Kuehling, Felix  
Sent: Tuesday, August 11, 2020 9:57 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhang, Hawking ; 
Koenig, Christian 
Subject: Re: [PATCH] drm/amdgpu: fix a potential circular locking dependency

Am 2020-08-11 um 5:32 a.m. schrieb Dennis Li:
> [  653.902305] ==
> [  653.902928] WARNING: possible circular locking dependency detected
> [  653.903517] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
> [  653.904098] --
> [  653.904675] amdgpu_test/3975 is trying to acquire lock:
> [  653.905241] 97848f8647a0 (&adev->reset_sem){.+.+}, at: 
> amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu] [  653.905953]
>but task is already holding lock:
> [  653.907087] 9744adbee1f8 (reservation_ww_class_mutex){+.+.}, 
> at: ttm_eu_reserve_buffers+0x1ae/0x520 [ttm] [  653.907694]
>which lock already depends on the new lock.
>
> [  653.909423]
>the existing dependency chain (in reverse order) is:
> [  653.910594]
>-> #1 (reservation_ww_class_mutex){+.+.}:
> [  653.911759]__ww_mutex_lock.constprop.15+0xca/0x1120
> [  653.912350]ww_mutex_lock+0x73/0x80
> [  653.913044]amdgpu_amdkfd_alloc_gtt_mem+0xde/0x380 [amdgpu]
> [  653.913724]kgd2kfd_device_init+0x13f/0x5e0 [amdgpu]
> [  653.914388]amdgpu_amdkfd_device_init+0x155/0x190 [amdgpu]
> [  653.915033]amdgpu_device_init+0x1303/0x1e10 [amdgpu]
> [  653.915685]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
> [  653.916349]amdgpu_pci_probe+0x11d/0x200 [amdgpu]
> [  653.916959]local_pci_probe+0x47/0xa0
> [  653.917570]work_for_cpu_fn+0x1a/0x30
> [  653.918184]process_one_work+0x29e/0x630
> [  653.918803]worker_thread+0x22b/0x3f0
> [  653.919427]kthread+0x12f/0x150
> [  653.920047]ret_from_fork+0x3a/0x50
> [  653.920661]
>-> #0 (&adev->reset_sem){.+.+}:
> [  653.921893]__lock_acquire+0x13ec/0x16e0
> [  653.922531]lock_acquire+0xb8/0x1c0
> [  653.923174]down_read+0x48/0x230
> [  653.923886]amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu]
> [  653.924588]drm_ioctl_kernel+0xb6/0x100 [drm]
> [  653.925283]drm_ioctl+0x389/0x450 [drm]
> [  653.926013]amdgpu_drm_ioctl+0x4f/0x80 [amdgpu]
> [  653.926686]ksys_ioctl+0x98/0xb0
> [  653.927357]__x64_sys_ioctl+0x1a/0x20
> [  653.928030]do_syscall_64+0x5f/0x250
> [  653.928697]entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  653.929373]
>other info that might help us debug this:
>
> [  653.931356]  Possible unsafe locking scenario:
>
> [  653.932647]CPU0CPU1
> [  653.933287]
> [  653.933911]   lock(reservation_ww_class_mutex);
> [  653.934530]lock(&adev->reset_sem);
> [  653.935154]
> lock(reservation_ww_class_mutex);
> [  653.935766]   lock(&adev->reset_sem);
> [  653.936360]
> *** DEADLOCK ***
>
> [  653.938028] 2 locks held by amdgpu_test/3975:
> [  653.938574]  #0: b2a862d6bcd0 
> (reservation_ww_class_acquire){+.+.}, at: 
> amdgpu_gem_va_ioctl+0x39b/0x4f0 [amdgpu] [  653.939233]  #1: 
> 9744adbee1f8 (reservation_ww_class_mutex){+.+.}, at: 
> ttm_eu_reserve_buffers+0x1ae/0x520 [ttm]
>
> change the order of reservation_ww_class_mutex and adev->reset_sem in 
> amdgpu_gem_va_ioctl the same as ones in amdgpu_amdkfd_alloc_gtt_mem, 
> to avoid potential dead lock.

It may be better to fix it the other way around in am

RE: [PATCH v3] drm/amdgpu: annotate a false positive recursive locking

2020-08-11 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]


Quick grep says it is protected like that ... can you pls paste the full build 
error without {}?
-Daniel

[Dennis Li] Hi, Daniel, the full build error is as follows: 

make: Entering directory 
'/home/yajunl/workspace/amd/brahma-staging/BUILD/x86_64/linux'
  CC [M]  drivers/gpu/drm/amd/amdgpu/amdgpu_device.o
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c: In function 
‘amdgpu_device_lock_adev’:
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:4155:2: error: ‘else’ without a 
previous ‘if’
  else
  ^~~~
scripts/Makefile.build:267: recipe for target 
'drivers/gpu/drm/amd/amdgpu/amdgpu_device.o' failed
make[1]: *** [drivers/gpu/drm/amd/amdgpu/amdgpu_device.o] Error 1
Makefile:1683: recipe for target 'drivers/gpu/drm/amd/amdgpu' failed
make: *** [drivers/gpu/drm/amd/amdgpu] Error 2



>
> Let me take a look,
> Christian.
>
> >
> > Regards,
> > Guchun
> >
> > -Original Message-
> > From: amd-gfx  On Behalf Of 
> > Christian König
> > Sent: Tuesday, August 11, 2020 2:53 PM
> > To: Li, Dennis ; amd-gfx@lists.freedesktop.org; 
> > Deucher, Alexander ; Kuehling, Felix 
> > ; Zhang, Hawking ; 
> > dan...@ffwll.ch
> > Subject: Re: [PATCH v3] drm/amdgpu: annotate a false positive 
> > recursive locking
> >
> > Am 11.08.20 um 04:12 schrieb Dennis Li:
> >> [  584.110304] 
> >> [  584.110590] WARNING: possible recursive locking detected
> >> [  584.110876] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   
> >> OE
> >> [  584.64] 
> >> [  584.111456] kworker/38:1/553 is trying to acquire lock:
> >> [  584.111721] 9b15ff0a47a0 (&adev->reset_sem){}, at:
> >> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.112112]
> >>  but task is already holding lock:
> >> [  584.112673] 9b1603d247a0 (&adev->reset_sem){}, at:
> >> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.113068]
> >>  other info that might help us debug this:
> >> [  584.113689]  Possible unsafe locking scenario:
> >>
> >> [  584.114350]CPU0
> >> [  584.114685]
> >> [  584.115014]   lock(&adev->reset_sem);
> >> [  584.115349]   lock(&adev->reset_sem);
> >> [  584.115678]
> >>   *** DEADLOCK ***
> >>
> >> [  584.116624]  May be due to missing lock nesting notation
> >>
> >> [  584.117284] 4 locks held by kworker/38:1/553:
> >> [  584.117616]  #0: 9ad635c1d348 ((wq_completion)events){+.+.},
> >> at: process_one_work+0x21f/0x630 [  584.117967]  #1: 
> >> ac708e1c3e58 ((work_completion)(&con->recovery_work)){+.+.}, at:
> >> process_one_work+0x21f/0x630 [  584.118358]  #2: c1c2a5d0 
> >> (&tmp->hive_lock){+.+.}, at: amdgpu_device_gpu_recover+0xae/0x1030 
> >> [amdgpu] [  584.118786]  #3: 9b1603d247a0 (&adev->reset_sem){}, 
> >> at: amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.119222]
> >>  stack backtrace:
> >> [  584.119990] CPU: 38 PID: 553 Comm: kworker/38:1 Kdump: loaded Tainted: 
> >> G   OE 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1
> >> [  584.120782] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, 
> >> BIOS 3.1 05/23/2019 [  584.121223] Workqueue: events 
> >> amdgpu_ras_do_recovery [amdgpu] [  584.121638] Call Trace:
> >> [  584.122050]  dump_stack+0x98/0xd5 [  584.122499]  
> >> __lock_acquire+0x1139/0x16e0 [  584.122931]  ?
> >> trace_hardirqs_on+0x3b/0xf0 [  584.123358]  ?
> >> cancel_delayed_work+0xa6/0xc0 [  584.123771]  
> >> lock_acquire+0xb8/0x1c0 [  584.124197]  ? 
> >> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.124599]  
> >> down_write+0x49/0x120 [  584.125032]  ?
> >> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.125472]
> >> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.125910]  ?
> >> amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu] [  584.126367]
> >> amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [  584.126789]
> >> process_one_work+0x29e/0x630 [  584.127208]  
> >> worker_thread+0x3c/0x3f0 [  584.127621]  ? 
> >> __kthread_parkme+0x61/0x90 [  584.128014]
> >> kthread+0x12f/0x150 [  584.128402]  ? process_one_work+0x630/0x630 
> >> kthread+[
> >> 584.128790]  ? kthread_park+0x90/0x90 [  584.129174]
> >> ret_from_fork+0x3a/0x50
> >>

RE: [PATCH 2/2] drm/amdgpu: add debugfs node to toggle ras error cnt harvest

2020-08-09 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]


Reviewed-by: DennisLi 

Best Regards
Dennis Li
-Original Message-
From: Chen, Guchun  
Sent: Monday, August 10, 2020 1:23 PM
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; Li, 
Dennis ; Lazar, Lijo ; Zhou1, Tao 
; Clements, John 
Cc: Chen, Guchun 
Subject: [PATCH 2/2] drm/amdgpu: add debugfs node to toggle ras error cnt 
harvest

Before ras recovery is issued, the user can operate this debugfs node to 
enable/disable the harvest of all RAS IPs' ras error count registers, which 
helps preserve the hardware registers' status instead of cleaning them up.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e6978b8e2143..31df6bf2dc1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1215,6 +1215,13 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct 
amdgpu_device *adev)
 */
debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
&con->reboot);
+
+   /*
+* User could set this not to clean up hardware's error count register
+* of RAS IPs during ras recovery.
+*/
+   debugfs_create_bool("disable_ras_err_cnt_harvest", 0644,
+   con->dir, &con->disable_ras_err_cnt_harvest);
 }
 
 void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: annotate a false positive recursive locking

2020-08-07 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

On Fri, Aug 7, 2020 at 5:32 PM Li, Dennis  wrote:
>
> [AMD Public Use]
>
> On Fri, Aug 7, 2020 at 1:59 PM Li, Dennis  wrote:
> >
> > [AMD Public Use]
> >
> > Hi, Daniel,
> >   Thanks your review. And I also understand your concern. I guess you 
> > missed the description of this issue, so I paste it again in the below, and 
> > explain why this issue happens.
> >
> >   For example, in a XGMI system with 2 GPU devices whose device entity 
> > is named adev. And each adev has a separated reset_sem.
> >   // device init function
> >   void device_init(adev) {
> > ...
> > init_rwsem(&adev->reset_sem);
> >   ...
> >   }
> >
> >  XGMI system have two GPU devices, so system will call device_init 
> > twice. However the definition of init_rwsem macro has a limitation which 
> > use a local static lock_class_key to initialize rw_sem, which cause each 
> > adev->reset_sem share the same lock_class_key.
> >
> >  #define init_rwsem(sem)
> >  \
> >  do {   \
> >  static struct lock_class_key __key;\
> > \
> > __init_rwsem((sem), #sem, &__key);  \
> >  } while (0)
> >
> >  And when GPU hang, we should do gpu recovery for all devices in the 
> > hive. Therefore we should lock each adev->reset_sem to protect GPU from be 
> > accessed by other threads during recovery. The detailed recovery sequence 
> > as the following:
> >  // Step1: lock all GPU firstly:
> >  for each adev of GPU0 or GPU1 {
> >down_write(adev->reset_sem);
> >do_suspend(adev);
> > }
> >
> > // Step2:
> > do_hive_recovery(hive);
> >
> > // Step3: resume and unlock GPU
> > for each adev of GPU0 or GPU1 {
> >   do_resume(adev)
> >   up_write(adev->reset_sem);
> > }
> >
> > Each adev->reset_sem has the same lock_class_key, and lockdep will take 
> > them as the same rw_semaphore object. Therefore in step1, when lock GPU1, 
> > lockdep will pop the below warning.
> >
> > I have considered your proposal (using  _nest_lock() ) before. Just as 
> > you said, _nest_lock() will hide true positive recursive locking. So I gave 
> > up it in the end.
> >
> > After reviewing codes of lockdep, I found the lockdep support 
> > dynamic_key, so using separated lock_class_key has no any side effect. In 
> > fact, init_rwsem also use dynamic_key. Please see the call sequence of 
> > init_rwsem and lockdep_set_class as the below:
> >1) init_rwsem -> __init_rwsem -> lockdep_init_map;
> >2) lockdep_set_class -> lockdep_init_map;
> >
> > Finally we go back to your concern, you maybe worry this change will 
> > cause the below dead-lock can't be detected. In fact, lockdep still is able 
> > to detect the issue as circular locking dependency, but there is no warning 
> > "recursive locking " in this case.
> > Thread A: down_write(adev->reset_sem) for GPU 0 -> 
> > down_write(adev->reset_sem) for GPU 1 -> ... -> up_write(adev->reset_sem) 
> > for GPU 1 -> up_write(adev->reset_sem) for GPU 0
> > Thread B: down_write(adev->reset_sem) for GPU 1 ->
> > down_write(adev->reset_sem) for GPU 0 -> ... ->
> > up_write(adev->reset_sem) for GPU 0 -> up_write(adev->reset_sem) for 
> > GPU 1
> >
> > But lockdep still can detect recursive locking for this case:
> > Thread A: down_write(adev->reset_sem) for GPU 0 -> ...-> ...->
> > down_write(adev->reset_sem) for GPU 0
>
> Yeah, I guessed correctly what you're doing. My recommendation to use 
> down_write_nest_lock still stands. This is for reset only, kernel-wide reset 
> lock really shouldn't hurt. Or make it a lock per xgmi hive, I'm assuming 
> that information is known somewhere.
>
> The problem with manual lockdep annotations is that they increase complexity. 
> You have to keep all the annotations in mind, including justifcations and 
> which parts they still catch and which parts they don't catch. And there's 
> zero performance justification for a gpu reset path to create some fancy 
> lockdep special cases.
>
> Locking needs to 

RE: [PATCH] drm/amdgpu: annotate a false positive recursive locking

2020-08-07 Thread Li, Dennis
[AMD Public Use]

On Fri, Aug 7, 2020 at 1:59 PM Li, Dennis  wrote:
>
> [AMD Public Use]
>
> Hi, Daniel,
>   Thanks your review. And I also understand your concern. I guess you 
> missed the description of this issue, so I paste it again in the below, and 
> explain why this issue happens.
>
>   For example, in a XGMI system with 2 GPU devices whose device entity is 
> named adev. And each adev has a separated reset_sem.
>   // device init function
>   void device_init(adev) {
> ...
> init_rwsem(&adev->reset_sem);
>   ...
>   }
>
>  XGMI system have two GPU devices, so system will call device_init twice. 
> However the definition of init_rwsem macro has a limitation which use a local 
> static lock_class_key to initialize rw_sem, which cause each adev->reset_sem 
> share the same lock_class_key.
>
>  #define init_rwsem(sem) \
>  do {   \
>  static struct lock_class_key __key;\
> \
> __init_rwsem((sem), #sem, &__key);  \
>  } while (0)
>
>  And when GPU hang, we should do gpu recovery for all devices in the 
> hive. Therefore we should lock each adev->reset_sem to protect GPU from be 
> accessed by other threads during recovery. The detailed recovery sequence as 
> the following:
>  // Step1: lock all GPU firstly:
>  for each adev of GPU0 or GPU1 {
>down_write(adev->reset_sem);
>do_suspend(adev);
> }
>
> // Step2:
> do_hive_recovery(hive);
>
> // Step3: resume and unlock GPU
> for each adev of GPU0 or GPU1 {
>   do_resume(adev)
>   up_write(adev->reset_sem);
> }
>
> Each adev->reset_sem has the same lock_class_key, and lockdep will take 
> them as the same rw_semaphore object. Therefore in step1, when lock GPU1, 
> lockdep will pop the below warning.
>
> I have considered your proposal (using  _nest_lock() ) before. Just as 
> you said, _nest_lock() will hide true positive recursive locking. So I gave 
> up it in the end.
>
> After reviewing codes of lockdep, I found the lockdep support 
> dynamic_key, so using separated lock_class_key has no any side effect. In 
> fact, init_rwsem also use dynamic_key. Please see the call sequence of 
> init_rwsem and lockdep_set_class as the below:
>1) init_rwsem -> __init_rwsem -> lockdep_init_map;
>2) lockdep_set_class -> lockdep_init_map;
>
> Finally we go back to your concern, you maybe worry this change will 
> cause the below dead-lock can't be detected. In fact, lockdep still is able 
> to detect the issue as circular locking dependency, but there is no warning 
> "recursive locking " in this case.
> Thread A: down_write(adev->reset_sem) for GPU 0 -> 
> down_write(adev->reset_sem) for GPU 1 -> ... -> up_write(adev->reset_sem) for 
> GPU 1 -> up_write(adev->reset_sem) for GPU 0
> Thread B: down_write(adev->reset_sem) for GPU 1 -> 
> down_write(adev->reset_sem) for GPU 0 -> ... -> 
> up_write(adev->reset_sem) for GPU 0 -> up_write(adev->reset_sem) for 
> GPU 1
>
> But lockdep still can detect recursive locking for this case:
> Thread A: down_write(adev->reset_sem) for GPU 0 -> ...-> ...-> 
> down_write(adev->reset_sem) for GPU 0

Yeah, I guessed correctly what you're doing. My recommendation to use 
down_write_nest_lock still stands. This is for reset only, kernel-wide reset 
lock really shouldn't hurt. Or make it a lock per xgmi hive, I'm assuming that 
information is known somewhere.
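
For readers following along, here is a rough sketch (plain C; the struct names, 
fields and the fixed two-device array are made up for illustration and are not 
from any of the patches) of the down_write_nest_lock() pattern being recommended 
here, with a per-hive mutex acting as the outer lock that documents the nesting 
to lockdep:

#include <linux/mutex.h>
#include <linux/rwsem.h>

struct example_device {
	struct rw_semaphore reset_sem;
};

struct example_hive {
	struct mutex hive_lock;			/* outer lock, one per XGMI hive */
	struct example_device *devices[2];
	int num_devices;
};

static void example_hive_lock_all(struct example_hive *hive)
{
	int i;

	mutex_lock(&hive->hive_lock);		/* serializes hive-wide resets */
	for (i = 0; i < hive->num_devices; i++)
		/* nest_lock tells lockdep that taking several write locks of
		 * the same class here is guarded by hive_lock, so it does not
		 * report a recursive-locking splat. */
		down_write_nest_lock(&hive->devices[i]->reset_sem,
				     &hive->hive_lock);
}

static void example_hive_unlock_all(struct example_hive *hive)
{
	int i;

	for (i = 0; i < hive->num_devices; i++)
		up_write(&hive->devices[i]->reset_sem);
	mutex_unlock(&hive->hive_lock);
}

The trade-off, discussed in the rest of the thread, is that nest_lock also hides
genuine recursive-locking reports for this lock class, so the outer hive_lock
must always be taken first.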

The problem with manual lockdep annotations is that they increase complexity. 
You have to keep all the annotations in mind, including justifications and which 
parts they still catch and which parts they don't catch. And there's zero 
performance justification for a gpu reset path to create some fancy lockdep 
special cases.

Locking needs to be
1. maintainable, i.e. every time you need to write a multi-paragraph 
explanation like the above it's probably not. This obviously includes 
correctness, but it's even more important that people can easily understand 
what you're doing.
2. fast enough, where it matters. gpu reset just doesn't.

[Dennis Li] Yeah. I strongly agree that manual lockdep annotations will increase 
complexity. However my patch isn't a lockdep annotation; in fact it fixes a bug. 
According to the design of lockdep, every lock object should have a 
separate lock_class_key.

RE: [PATCH] drm/amdgpu: annotate a false positive recursive locking

2020-08-07 Thread Li, Dennis
[AMD Public Use]

Hi, Daniel,
  Thanks for your review, and I also understand your concern. I guess you 
missed the description of this issue, so I paste it again below and 
explain why it happens.

  For example, consider an XGMI system with 2 GPU devices, each represented by 
a device entity named adev. Each adev has a separate reset_sem.
  // device init function 
  void device_init(adev) {
...
init_rwsem(&adev->reset_sem);
  ...
  }

 An XGMI system has two GPU devices, so the system will call device_init twice. 
However, the init_rwsem macro has a limitation: it uses a local 
static lock_class_key to initialize the rw_sem, which causes every adev->reset_sem 
to share the same lock_class_key.

 #define init_rwsem(sem) \ 
 do {   \
 static struct lock_class_key __key;\
\
__init_rwsem((sem), #sem, &__key);  \
 } while (0)

 And when the GPU hangs, we should do GPU recovery for all devices in the hive. 
Therefore we should lock each adev->reset_sem to protect the GPU from being accessed 
by other threads during recovery. The detailed recovery sequence is as 
follows:
 // Step1: lock all GPU firstly:
 for each adev of GPU0 or GPU1 {
   down_write(adev->reset_sem); 
   do_suspend(adev); 
}

// Step2:
do_hive_recovery(hive);

// Step3: resume and unlock GPU
for each adev of GPU0 or GPU1 {
  do_resume(adev)
  up_write(adev->reset_sem);
}
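
The same three steps, written out as a compact C sketch (the hive/device 
structures are illustrative, and do_suspend()/do_hive_recovery()/do_resume() are 
the pseudo-code names used above, declared here only as stubs). The property the 
rest of this thread hinges on is that every thread walks the devices in the same 
order:

#include <linux/rwsem.h>

struct example_device {
	struct rw_semaphore reset_sem;
};

struct example_hive {
	struct example_device *devices[2];
	int num_devices;
};

void do_suspend(struct example_device *adev);
void do_resume(struct example_device *adev);
void do_hive_recovery(struct example_hive *hive);

static void example_gpu_recover(struct example_hive *hive)
{
	int i;

	/* Step 1: lock and suspend every device, always in the same order */
	for (i = 0; i < hive->num_devices; i++) {
		down_write(&hive->devices[i]->reset_sem);
		do_suspend(hive->devices[i]);
	}

	/* Step 2: recover the whole hive */
	do_hive_recovery(hive);

	/* Step 3: resume and unlock, again in the same order */
	for (i = 0; i < hive->num_devices; i++) {
		do_resume(hive->devices[i]);
		up_write(&hive->devices[i]->reset_sem);
	}
}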

Each adev->reset_sem has the same lock_class_key, so lockdep treats 
them as the same rw_semaphore object. Therefore in step 1, when locking GPU1, 
lockdep pops the warning below.

I considered your proposal (using _nest_lock()) before. Just as you 
said, _nest_lock() will hide true positives for recursive locking, so I gave it up 
in the end.

After reviewing the lockdep code, I found that lockdep supports dynamic keys, 
so using separate lock_class_keys has no side effect. In fact, init_rwsem 
also uses a dynamic key. Please see the call sequences of init_rwsem and 
lockdep_set_class below:
   1) init_rwsem -> __init_rwsem -> lockdep_init_map;
   2) lockdep_set_class -> lockdep_init_map;
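
As a concrete illustration of that call sequence (a sketch only, not the actual 
patch; it reuses the illustrative example_device from the earlier sketch, now 
extended with its own key), each semaphore can be re-keyed right after 
init_rwsem() so that every device ends up in its own lockdep class:

#include <linux/rwsem.h>
#include <linux/lockdep.h>

struct example_device {
	struct rw_semaphore reset_sem;
	struct lock_class_key reset_sem_key;	/* one key per device */
};

static void example_device_init(struct example_device *dev)
{
	init_rwsem(&dev->reset_sem);
	/* Re-key the semaphore: lockdep_set_class() calls lockdep_init_map()
	 * with this per-device key, replacing the static key registered by
	 * the init_rwsem() macro, so two devices no longer share one class. */
	lockdep_set_class(&dev->reset_sem, &dev->reset_sem_key);
}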

Finally, back to your concern: you may worry that this change will make 
the deadlock below undetectable. In fact, lockdep is still able to detect 
the issue as a circular locking dependency; there is just no "recursive 
locking" warning in this case.
Thread A: down_write(adev->reset_sem) for GPU 0 -> 
down_write(adev->reset_sem) for GPU 1 -> ... -> up_write(adev->reset_sem) for 
GPU 1 -> up_write(adev->reset_sem) for GPU 0
Thread B: down_write(adev->reset_sem) for GPU 1 -> 
down_write(adev->reset_sem) for GPU 0 -> ... -> up_write(adev->reset_sem) for 
GPU 0 -> up_write(adev->reset_sem) for GPU 1

But lockdep still can detect recursive locking for this case:
Thread A: down_write(adev->reset_sem) for GPU 0 -> ...-> ...-> 
down_write(adev->reset_sem) for GPU 0 

[  584.110304] 
[  584.110590] WARNING: possible recursive locking detected
[  584.110876] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
[  584.64] 
[  584.111456] kworker/38:1/553 is trying to acquire lock:
[  584.111721] 9b15ff0a47a0 (&adev->reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.112112]
   but task is already holding lock:
[  584.112673] 9b1603d247a0 (&adev->reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.113068]
   other info that might help us debug this:
[  584.113689]  Possible unsafe locking scenario:

[  584.114350]CPU0
[  584.114685]
[  584.115014]   lock(&adev->reset_sem);
[  584.115349]   lock(&adev->reset_sem);
[  584.115678]
*** DEADLOCK ***

[  584.116624]  May be due to missing lock nesting notation

[  584.117284] 4 locks held by kworker/38:1/553:
[  584.117616]  #0: 9ad635c1d348 ((wq_completion)events){+.+.}, at: 
process_one_work+0x21f/0x630
[  584.117967]  #1: ac708e1c3e58 
((work_completion)(&con->recovery_work)){+.+.}, at: process_one_work+0x21f/0x630
[  584.118358]  #2: c1c2a5d0 (&tmp->hive_lock){+.+.}, at: 
amdgpu_device_gpu_recover+0xae/0x1030 [amdgpu]
[  584.118786]  #3: 9b1603d247a0 (&adev->reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]

Best Regards
Dennis Li
-Original Message-
From: Daniel Vetter  
Sent: Friday, August 7, 2020 5:45 PM
To: Koenig, Christian 
C

RE: [PATCH] drm/amdgpu: annotate a false positive recursive locking

2020-08-07 Thread Li, Dennis
[AMD Public Use]

> [AMD Public Use]
>
>> [SNIP]
>>>> I think it is a limitation of init_rwsem.
>>> And exactly that's wrong, this is intentional and perfectly correct.
>>>
>>> [Dennis Li] I couldn't understand. Why is it a perfectly correct?
>>> For example, we define two rw_sem: a and b. If we don't check init_rwsem 
>>> definition, we may think case#1 and case#2 have the same behavior, but in 
>>> fact they are different.
>>>
>>> Case 1:
>>> init_rwsem(&a);
>>> init_rwsem(&b);
>>>
>>> Case2:
>>> void init_rwsem_ext(rw_sem* sem)
>>> {
>>> init_rwsem(sem);
>>> }
>>> init_rwsem_ext(&a);
>>> init_rwsem_ext(&b);
>>>
>>> As far as I know it is perfectly possible that the locks in the hive are 
>>> not always grabbed in the same order. And that's why lockdep is complaining 
>>> about this.
>>> [Dennis Li] No. I takes a hive with two devices(a and b) to explain why 
>>> lockdep complain.
>>>
>>> // Firstly driver lock the reset_sem of all devices 
>>> down_write(&a->reset_sem); do_suspend(a);
>>> down_write(&b->reset_sem);   // Because  b shared the same lock_class_key 
>>> with a, lockdep will take a and b as the same rw_sem and complain here.
>>> do_suspend(b);
>>>
>>> // do recovery
>>> do_hive_recovery()
>>>
>>> // unlock the reset_sem of all devices do_resume(a); 
>>> up_write(&a->reset_sem); do_resume(b); up_write(&b->reset_sem);
>> Ah! Now I understand what you are working around. So the problem is the 
>> static lock_class_key in the macro?
>> [Dennis Li] Yes. The author of init_rwsem might not consider our similar use 
>> case.

Well this is also really not the intended use case.

When you lock the same rwsem class recursively you can easily run into 
deadlocks if you don't keep the order the same all the time.

And abstracting init functions behind your own layer is a no-go in the Linux 
kernel as well.

>>> What we should do instead is to make sure we have only a single lock for 
>>> the complete hive instead.
>>> [Dennis Li] If we use a single lock, users will must wait for all devices 
>>> resuming successfully, but in fact their tasks are only running in device 
>>> a. It is not friendly to users.
>> Well that is intentional I would say. We can only restart submissions after 
>> all devices are resumed successfully, cause otherwise it can happen that a 
>> job on device A depends on memory on device B which isn't accessible.
>> [Dennis Li] Yes, you are right. Driver have make sure that the shared 
>> resources(such as the shard memory) are ready before unlock the lock of adev 
>> one by one. But each device still has private hardware resources such as 
>> video  and display.
> Yeah, but those are negligible and we have a team working on display support 
> for XGMI. So this will sooner or later become a problem as well.
>
> I still think that a single rwsem for the whole hive is still the best option 
> here.
>
> [Dennis Li] For your above concern, we can open a new thread to discuss it. 
> If we make a decision to use a single after discussing, we can create another 
> patch for it.

That's a really good argument, but I still hesitate to merge this patch. 
How severe is the lockdep splat?

At bare minimum we need a "/* TODO: " comment why we do this and how to 
remove the workaround again.
[Dennis Li] It is not a workaround. According to the design of lockdep, each 
rw_semaphore should have a separate lock_class_key. I have explained that 
init_rwsem has the above limitation, so we must correct it. The core network driver 
(net/core/dev.c) has a similar use case to ours.

Regards,
Christian.

>
> Best Regards
> Dennis lI
>> Regards,
>> Christian.
>>
>>> Regards,
>>> Christian.
>>>
>>> On 06.08.20 at 11:15, Li, Dennis wrote:
>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>
>>>> Hi, Christian,
>>>>  For this case, it is safe to use separated lock key. Please see 
>>>> the definition of init_rwsem as the below. Every init_rwsem calling will 
>>>> use a new static key, but devices of  the hive share the same code to do 
>>>> initialization, so their lock_class_key are the same. I think it is a 
>>>> limitation of init_rwsem.  In our case, it should be correct that 
>>>> reset_sem of each adev has different  lock_class_key. BTW, this c

RE: [PATCH] drm/amdgpu: annotate a false positive recursive locking

2020-08-06 Thread Li, Dennis
[AMD Public Use]

> [SNIP]
>>> I think it is a limitation of init_rwsem.
>> And exactly that's wrong, this is intentional and perfectly correct.
>>
>> [Dennis Li] I couldn't understand. Why is it a perfectly correct?
>> For example, we define two rw_sem: a and b. If we don't check init_rwsem 
>> definition, we may think case#1 and case#2 have the same behavior, but in 
>> fact they are different.
>>
>> Case 1:
>> init_rwsem(&a);
>> init_rwsem(&b);
>>
>> Case2:
>> void init_rwsem_ext(rw_sem* sem)
>> {
>>init_rwsem(sem);
>> }
>> init_rwsem_ext(&a);
>> init_rwsem_ext(&b);
>>
>> As far as I know it is perfectly possible that the locks in the hive are not 
>> always grabbed in the same order. And that's why lockdep is complaining 
>> about this.
>> [Dennis Li] No. I takes a hive with two devices(a and b) to explain why 
>> lockdep complain.
>>
>> // Firstly driver lock the reset_sem of all devices 
>> down_write(&a->reset_sem); do_suspend(a);
>> down_write(&b->reset_sem);   // Because  b shared the same lock_class_key 
>> with a, lockdep will take a and b as the same rw_sem and complain here.
>> do_suspend(b);
>>
>> // do recovery
>> do_hive_recovery()
>>
>> // unlock the reset_sem of all devices do_resume(a); 
>> up_write(&a->reset_sem); do_resume(b); up_write(&b->reset_sem);
> Ah! Now I understand what you are working around. So the problem is the 
> static lock_class_key in the macro?
> [Dennis Li] Yes. The author of init_rwsem might not consider our similar use 
> case.
>
>> What we should do instead is to make sure we have only a single lock for the 
>> complete hive instead.
>> [Dennis Li] If we use a single lock, users will must wait for all devices 
>> resuming successfully, but in fact their tasks are only running in device a. 
>> It is not friendly to users.
> Well that is intentional I would say. We can only restart submissions after 
> all devices are resumed successfully, cause otherwise it can happen that a 
> job on device A depends on memory on device B which isn't accessible.
> [Dennis Li] Yes, you are right. Driver have make sure that the shared 
> resources(such as the shard memory) are ready before unlock the lock of adev 
> one by one. But each device still has private hardware resources such as 
> video  and display.

Yeah, but those are negligible and we have a team working on display support 
for XGMI. So this will sooner or later become a problem as well.

I still think that a single rwsem for the whole hive is the best option 
here.

[Dennis Li] For your above concern, we can open a new thread to discuss it. If 
we decide to use a single lock after that discussion, we can create another 
patch for it.
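
For completeness, a rough sketch of the per-hive lock being argued for above 
(illustrative names only, not a patch): the reset semaphore lives in the hive 
object instead of in each adev, so there is only one instance and one lock class 
to take during recovery:

#include <linux/rwsem.h>

struct example_xgmi_hive {
	struct rw_semaphore reset_sem;	/* single reset lock for the whole hive */
	/* ... device list, hive id, etc. ... */
};

static void example_hive_recover(struct example_xgmi_hive *hive)
{
	down_write(&hive->reset_sem);	/* one lock, so no ordering problem */
	/* suspend all devices, run hive recovery, resume all devices */
	up_write(&hive->reset_sem);
}

The cost, as Dennis points out, is that every device in the hive stays locked
until the whole hive has resumed.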

Best Regards
Dennis lI
>
> Regards,
> Christian.
>
>> Regards,
>> Christian.
>>
>> On 06.08.20 at 11:15, Li, Dennis wrote:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>> Hi, Christian,
>>> For this case, it is safe to use separated lock key. Please see the 
>>> definition of init_rwsem as the below. Every init_rwsem calling will use a 
>>> new static key, but devices of  the hive share the same code to do 
>>> initialization, so their lock_class_key are the same. I think it is a 
>>> limitation of init_rwsem.  In our case, it should be correct that reset_sem 
>>> of each adev has different  lock_class_key. BTW, this change doesn't effect 
>>> dead-lock detection and just correct it.
>>>
>>> #define init_rwsem(sem) \
>>> do {\
>>> static struct lock_class_key __key; \
>>> \
>>> __init_rwsem((sem), #sem, &__key);  \
>>> } while (0)
>>>
>>> Best Regards
>>> Dennis Li
>>> -Original Message-
>>> From: Koenig, Christian 
>>> Sent: Thursday, August 6, 2020 4:13 PM
>>> To: Li, Dennis ; amd-gfx@lists.freedesktop.org; 
>>> Deucher, Alexander ; Kuehling, Felix 
>>> ; Zhang, Hawking 
>>> Subject: Re: [PATCH] drm/amdgpu: annotate a false positive recursive 
>>> locking
>>>
>>> Preventing locking problems during implementation is obviously a good 
>>> approach, but lockdep has proven to be massively useful for finding and 
>>> fixing problems.
>>>
>>&

RE: [PATCH] drm/amdgpu: annotate a false positive recursive locking

2020-08-06 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

[SNIP]
>> I think it is a limitation of init_rwsem.
> And exactly that's wrong, this is intentional and perfectly correct.
>
> [Dennis Li] I couldn't understand. Why is it a perfectly correct?
> For example, we define two rw_sem: a and b. If we don't check init_rwsem 
> definition, we may think case#1 and case#2 have the same behavior, but in 
> fact they are different.
>
> Case 1:
> init_rwsem(&a);
> init_rwsem(&b);
>
> Case2:
> void init_rwsem_ext(rw_sem* sem)
> {
>   init_rwsem(sem);
> }
> init_rwsem_ext(&a);
> init_rwsem_ext(&b);
>
> As far as I know it is perfectly possible that the locks in the hive are not 
> always grabbed in the same order. And that's why lockdep is complaining about 
> this.
> [Dennis Li] No. I takes a hive with two devices(a and b) to explain why 
> lockdep complain.
>
> // Firstly driver lock the reset_sem of all devices 
> down_write(&a->reset_sem); do_suspend(a);
> down_write(&b->reset_sem);   // Because  b shared the same lock_class_key 
> with a, lockdep will take a and b as the same rw_sem and complain here.
> do_suspend(b);
>
> // do recovery
> do_hive_recovery()
>
> // unlock the reset_sem of all devices do_resume(a); 
> up_write(&a->reset_sem); do_resume(b); up_write(&b->reset_sem);

Ah! Now I understand what you are working around. So the problem is the static 
lock_class_key in the macro?
[Dennis Li] Yes. The author of init_rwsem might not have considered a use case 
like ours.

> What we should do instead is to make sure we have only a single lock for the 
> complete hive instead.
> [Dennis Li] If we use a single lock, users will must wait for all devices 
> resuming successfully, but in fact their tasks are only running in device a. 
> It is not friendly to users.

Well that is intentional I would say. We can only restart submissions after all 
devices are resumed successfully, cause otherwise it can happen that a job on 
device A depends on memory on device B which isn't accessible.
[Dennis Li] Yes, you are right. The driver has to make sure that the shared 
resources (such as the shared memory) are ready before unlocking each adev 
one by one. But each device still has private hardware resources such as video 
and display.

Regards,
Christian.

>
> Regards,
> Christian.
>
> On 06.08.20 at 11:15, Li, Dennis wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Hi, Christian,
>>For this case, it is safe to use separated lock key. Please see the 
>> definition of init_rwsem as the below. Every init_rwsem calling will use a 
>> new static key, but devices of  the hive share the same code to do 
>> initialization, so their lock_class_key are the same. I think it is a 
>> limitation of init_rwsem.  In our case, it should be correct that reset_sem 
>> of each adev has different  lock_class_key. BTW, this change doesn't effect 
>> dead-lock detection and just correct it.
>>
>> #define init_rwsem(sem)  \
>> do { \
>>  static struct lock_class_key __key;         \
>>  \
>>  __init_rwsem((sem), #sem, &__key);  \
>> } while (0)
>>
>> Best Regards
>> Dennis Li
>> -Original Message-
>> From: Koenig, Christian 
>> Sent: Thursday, August 6, 2020 4:13 PM
>> To: Li, Dennis ; amd-gfx@lists.freedesktop.org; 
>> Deucher, Alexander ; Kuehling, Felix 
>> ; Zhang, Hawking 
>> Subject: Re: [PATCH] drm/amdgpu: annotate a false positive recursive 
>> locking
>>
>> Preventing locking problems during implementation is obviously a good 
>> approach, but lockdep has proven to be massively useful for finding and 
>> fixing problems.
>>
>> Disabling lockdep splat by annotating lock with separate classes is usually 
>> a no-go and only allowed if there is no other potential approach.
>>
>> In this case here we should really clean things up instead.
>>
>> Christian.
>>
>> Am 06.08.20 um 09:44 schrieb Li, Dennis:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>> Hi, Christian,
>>>  I agree with your concern. However we shouldn't rely on system to 
>>> detect dead-lock, and should consider this when doing code implementation. 
>>> In fact, dead-lock detection isn't enabled by default.
>>>  For your proposal to remove reset_sem into the hive structure, we 
>>&

RE: [PATCH] drm/amdgpu: annotate a false positive recursive locking

2020-08-06 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,
   See my below comments.

Best Regards
Dennis Li
-Original Message-
From: Koenig, Christian  
Sent: Thursday, August 6, 2020 5:19 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Zhang, Hawking 
Subject: Re: [PATCH] drm/amdgpu: annotate a false positive recursive locking

> I think it is a limitation of init_rwsem.

And exactly that's wrong, this is intentional and perfectly correct.

[Dennis Li] I don't understand. Why is it perfectly correct? 
For example, we define two rw_sems: a and b. If we don't check the init_rwsem 
definition, we may think case #1 and case #2 have the same behavior, but in fact 
they are different.

Case 1:
init_rwsem(&a);
init_rwsem(&b);

Case2:
void init_rwsem_ext(rw_sem* sem)
{
 init_rwsem(sem);
}
init_rwsem_ext(&a);
init_rwsem_ext(&b);

As far as I know it is perfectly possible that the locks in the hive are not 
always grabbed in the same order. And that's why lockdep is complaining about 
this.
[Dennis Li] No. Let me take a hive with two devices (a and b) to explain why lockdep 
complains.

// Firstly driver lock the reset_sem of all devices
down_write(&a->reset_sem);
do_suspend(a);
down_write(&b->reset_sem);   // Because  b shared the same lock_class_key with 
a, lockdep will take a and b as the same rw_sem and complain here. 
do_suspend(b);

// do recovery
do_hive_recovery()

// unlock the reset_sem of all devices
do_resume(a);
up_write(&a->reset_sem);
do_resume(b);
up_write(&b->reset_sem);

What we should do instead is to make sure we have only a single lock for the 
complete hive.
[Dennis Li] If we use a single lock, users must wait for all devices to 
resume successfully, even though their tasks are only running on device a. It 
is not friendly to users.

Regards,
Christian.

On 06.08.20 at 11:15, Li, Dennis wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Christian,
>   For this case, it is safe to use separated lock key. Please see the 
> definition of init_rwsem as the below. Every init_rwsem calling will use a 
> new static key, but devices of  the hive share the same code to do 
> initialization, so their lock_class_key are the same. I think it is a 
> limitation of init_rwsem.  In our case, it should be correct that reset_sem 
> of each adev has different  lock_class_key. BTW, this change doesn't effect 
> dead-lock detection and just correct it.
>
> #define init_rwsem(sem)   \
> do {  \
>   static struct lock_class_key __key; \
>   \
>   __init_rwsem((sem), #sem, &__key);  \
> } while (0)
>
> Best Regards
> Dennis Li
> -Original Message-
> From: Koenig, Christian 
> Sent: Thursday, August 6, 2020 4:13 PM
> To: Li, Dennis ; amd-gfx@lists.freedesktop.org; 
> Deucher, Alexander ; Kuehling, Felix 
> ; Zhang, Hawking 
> Subject: Re: [PATCH] drm/amdgpu: annotate a false positive recursive 
> locking
>
> Preventing locking problems during implementation is obviously a good 
> approach, but lockdep has proven to be massively useful for finding and 
> fixing problems.
>
> Disabling lockdep splat by annotating lock with separate classes is usually a 
> no-go and only allowed if there is no other potential approach.
>
> In this case here we should really clean things up instead.
>
> Christian.
>
> On 06.08.20 at 09:44, Li, Dennis wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Hi, Christian,
>> I agree with your concern. However we shouldn't rely on system to 
>> detect dead-lock, and should consider this when doing code implementation. 
>> In fact, dead-lock detection isn't enabled by default.
>> For your proposal to remove reset_sem into the hive structure, we 
>> can open a new topic to discuss it. Currently we couldn't make sure which is 
>> the best solution. For example, with your proposal, we must wait for all 
>> devices resuming successfully before resubmit an old task in one device, 
>> which will effect performance.
>>
>> Best Regards
>> Dennis Li
>> -Original Message-
>> From: amd-gfx  On Behalf Of 
>> Christian König
>> Sent: Thursday, August 6, 2020 3:08 PM
>> To: Li, Dennis ; amd-gfx@lists.freedesktop.org; 
>> Deucher, Alexander ; Kuehling, Felix 
>> ; Zhang, Hawking 
>> Subject: Re: [PATCH] drm/amdgpu: annotate a false positive recursive 
>> locking
>>
>> Am 06.08.20 um 09:02 schrieb 

RE: [PATCH] drm/amdgpu: annotate a false positive recursive locking

2020-08-06 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,
 For this case, it is safe to use separate lock keys. Please see the 
definition of init_rwsem below. Every init_rwsem call uses a new 
static key, but the devices of the hive share the same code to do initialization, 
so their lock_class_keys are the same. I think it is a limitation of init_rwsem. 
In our case, it is correct for the reset_sem of each adev to have a different 
lock_class_key. BTW, this change doesn't affect deadlock detection; it just 
corrects it.

#define init_rwsem(sem) \
do {\
static struct lock_class_key __key; \
\
__init_rwsem((sem), #sem, &__key);  \
} while (0)

Best Regards
Dennis Li
-Original Message-
From: Koenig, Christian  
Sent: Thursday, August 6, 2020 4:13 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Zhang, Hawking 
Subject: Re: [PATCH] drm/amdgpu: annotate a false positive recursive locking

Preventing locking problems during implementation is obviously a good approach, 
but lockdep has proven to be massively useful for finding and fixing problems.

Disabling lockdep splat by annotating lock with separate classes is usually a 
no-go and only allowed if there is no other potential approach.

In this case here we should really clean things up instead.

Christian.

On 06.08.20 at 09:44, Li, Dennis wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Christian,
>I agree with your concern. However we shouldn't rely on system to 
> detect dead-lock, and should consider this when doing code implementation. In 
> fact, dead-lock detection isn't enabled by default.
>For your proposal to remove reset_sem into the hive structure, we can 
> open a new topic to discuss it. Currently we couldn't make sure which is the 
> best solution. For example, with your proposal, we must wait for all devices 
> resuming successfully before resubmit an old task in one device, which will 
> effect performance.
>
> Best Regards
> Dennis Li
> -Original Message-
> From: amd-gfx  On Behalf Of 
> Christian König
> Sent: Thursday, August 6, 2020 3:08 PM
> To: Li, Dennis ; amd-gfx@lists.freedesktop.org; 
> Deucher, Alexander ; Kuehling, Felix 
> ; Zhang, Hawking 
> Subject: Re: [PATCH] drm/amdgpu: annotate a false positive recursive 
> locking
>
> On 06.08.20 at 09:02, Dennis Li wrote:
>> [  584.110304] 
>> [  584.110590] WARNING: possible recursive locking detected
>> [  584.110876] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
>> [  584.64] 
>> [  584.111456] kworker/38:1/553 is trying to acquire lock:
>> [  584.111721] 9b15ff0a47a0 (&adev->reset_sem){}, at:
>> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.112112]
>>  but task is already holding lock:
>> [  584.112673] 9b1603d247a0 (&adev->reset_sem){}, at:
>> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.113068]
>>  other info that might help us debug this:
>> [  584.113689]  Possible unsafe locking scenario:
>>
>> [  584.114350]CPU0
>> [  584.114685]
>> [  584.115014]   lock(&adev->reset_sem);
>> [  584.115349]   lock(&adev->reset_sem);
>> [  584.115678]
>>   *** DEADLOCK ***
>>
>> [  584.116624]  May be due to missing lock nesting notation
>>
>> [  584.117284] 4 locks held by kworker/38:1/553:
>> [  584.117616]  #0: 9ad635c1d348 ((wq_completion)events){+.+.},
>> at: process_one_work+0x21f/0x630 [  584.117967]  #1: ac708e1c3e58 
>> ((work_completion)(&con->recovery_work)){+.+.}, at:
>> process_one_work+0x21f/0x630 [  584.118358]  #2: c1c2a5d0 
>> (&tmp->hive_lock){+.+.}, at: amdgpu_device_gpu_recover+0xae/0x1030 [amdgpu] 
>> [  584.118786]  #3: 9b1603d247a0 (&adev->reset_sem){}, at: 
>> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.119222]
>>  stack backtrace:
>> [  584.119990] CPU: 38 PID: 553 Comm: kworker/38:1 Kdump: loaded Tainted: G  
>>  OE 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1
>> [  584.120782] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, 
>> BIOS 3.1 05/23/2019 [  584.121223] Workqueue: events 
>> amdgpu_ras_do_recovery [amdgpu] [  584.121638] Call Trace:
>> [  584.122050]  dump_stack+0x98/0xd5
>> [ 

RE: [PATCH] drm/amdgpu: annotate a false positive recursive locking

2020-08-06 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,
  I agree with your concern. However we shouldn't rely on the system to detect 
deadlocks; we should consider this when implementing the code. In fact, 
deadlock detection isn't enabled by default. 
  As for your proposal to move reset_sem into the hive structure, we can 
open a new topic to discuss it. Currently we can't be sure which is the 
best solution. For example, with your proposal, we must wait for all devices 
to resume successfully before resubmitting an old task on one device, which will 
affect performance.

Best Regards
Dennis Li
-Original Message-
From: amd-gfx  On Behalf Of Christian 
König
Sent: Thursday, August 6, 2020 3:08 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Kuehling, Felix 
; Zhang, Hawking 
Subject: Re: [PATCH] drm/amdgpu: annotate a false positive recursive locking

On 06.08.20 at 09:02, Dennis Li wrote:
> [  584.110304] 
> [  584.110590] WARNING: possible recursive locking detected
> [  584.110876] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
> [  584.64] 
> [  584.111456] kworker/38:1/553 is trying to acquire lock:
> [  584.111721] 9b15ff0a47a0 (&adev->reset_sem){}, at: 
> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.112112]
> but task is already holding lock:
> [  584.112673] 9b1603d247a0 (&adev->reset_sem){}, at: 
> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.113068]
> other info that might help us debug this:
> [  584.113689]  Possible unsafe locking scenario:
>
> [  584.114350]CPU0
> [  584.114685]
> [  584.115014]   lock(&adev->reset_sem);
> [  584.115349]   lock(&adev->reset_sem);
> [  584.115678]
>  *** DEADLOCK ***
>
> [  584.116624]  May be due to missing lock nesting notation
>
> [  584.117284] 4 locks held by kworker/38:1/553:
> [  584.117616]  #0: 9ad635c1d348 ((wq_completion)events){+.+.}, 
> at: process_one_work+0x21f/0x630 [  584.117967]  #1: ac708e1c3e58 
> ((work_completion)(&con->recovery_work)){+.+.}, at: 
> process_one_work+0x21f/0x630 [  584.118358]  #2: c1c2a5d0 
> (&tmp->hive_lock){+.+.}, at: amdgpu_device_gpu_recover+0xae/0x1030 [amdgpu] [ 
>  584.118786]  #3: 9b1603d247a0 (&adev->reset_sem){}, at: 
> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.119222]
> stack backtrace:
> [  584.119990] CPU: 38 PID: 553 Comm: kworker/38:1 Kdump: loaded Tainted: G   
> OE 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1
> [  584.120782] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, 
> BIOS 3.1 05/23/2019 [  584.121223] Workqueue: events 
> amdgpu_ras_do_recovery [amdgpu] [  584.121638] Call Trace:
> [  584.122050]  dump_stack+0x98/0xd5
> [  584.122499]  __lock_acquire+0x1139/0x16e0 [  584.122931]  ? 
> trace_hardirqs_on+0x3b/0xf0 [  584.123358]  ? 
> cancel_delayed_work+0xa6/0xc0 [  584.123771]  lock_acquire+0xb8/0x1c0 
> [  584.124197]  ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  
> 584.124599]  down_write+0x49/0x120 [  584.125032]  ? 
> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.125472]  
> amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [  584.125910]  ? 
> amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu] [  584.126367]  
> amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [  584.126789]  
> process_one_work+0x29e/0x630 [  584.127208]  worker_thread+0x3c/0x3f0 
> [  584.127621]  ? __kthread_parkme+0x61/0x90 [  584.128014]  
> kthread+0x12f/0x150 [  584.128402]  ? process_one_work+0x630/0x630 [  
> 584.128790]  ? kthread_park+0x90/0x90 [  584.129174]  
> ret_from_fork+0x3a/0x50
>
> Each adev has owned lock_class_key to avoid false positive recursive 
> locking.

NAK, that is not a false positive but a real problem.

The issue here is that we have multiple reset semaphores, one for each device 
in the hive. If those are not acquired in the correct order we deadlock.

The real solution would be to move the reset_sem into the hive structure and 
make sure that we lock it only once.

Christian.

>
> Signed-off-by: Dennis Li 
> Change-Id: I7571efeccbf15483982031d00504a353031a854a
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e97c088d03b3..766dc8f8c8a0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -967,6 +967,7 @@ struct amdgpu_device {
>   atomic_tin_gpu_reset;
>   enum pp_mp1_state   mp1_state;
>   struct rw_semaphore reset_sem;
> + struct lock_class

RE: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU

2020-07-28 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Guchun,
  Please see my below comments.

Best Regards
Dennis Li
-Original Message-
From: Chen, Guchun  
Sent: Tuesday, July 28, 2020 3:49 PM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Zhang, Hawking ; Li, Dennis 
; Grodzovsky, Andrey ; Zhou1, Tao 
; Clements, John ; Lazar, Lijo 
; Koenig, Christian ; Yang, 
Stanley 
Cc: Chen, Guchun 
Subject: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU

When retrieving bad gpu tag from eeprom, GPU init should fail as the GPU needs 
to be retired for further check.

v2: Fix spelling typo, correct the condition to detect
bad gpu tag and refine error message.

v3: Refine function argument name.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h |  3 ++-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2662cd7c8685..30af0dfee1a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
 * it should be called after amdgpu_device_ip_hw_init_phase2  since
 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
 * for I2C communication which only true at this point.
-* recovery_init may fail, but it can free all resources allocated by
-* itself and its failure should not stop amdgpu init process.
+*
+* amdgpu_ras_recovery_init may fail, but the upper only cares the
+* failure from bad gpu situation and stop amdgpu init process
+* accordingly. For other failed cases, it will still release all
+* the resource and print error message, rather than returning one
+* negative value to upper level.
 *
 * Note: theoretically, this should be called before all vram 
allocations
 * to protect retired page from abusing
 */
-   amdgpu_ras_recovery_init(adev);
+   r = amdgpu_ras_recovery_init(adev);
+   if (r)
+   goto init_failed;
 
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 3c4c142e9d8a..56e1aeba2d64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1822,6 +1822,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
uint32_t max_eeprom_records_len = 0;
+   bool exc_err_limit = false;
int ret;
 
if (con)
@@ -1843,9 +1844,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
-   ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-   if (ret)
+   ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+   /*
+* We only fail this calling and halt booting up
+* when exc_err_limit is true.
+*/
+   if (exc_err_limit) {
+   ret = -EINVAL;
goto free;
+   }

[Dennis Li] Compared with the old code, the new change misses checking ret.
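A minimal sketch of the check being asked for (illustrative; relative to the hunk 
above, only the trailing "if (ret)" test is new):

	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
	/* Halt driver init when the bad-page threshold was exceeded... */
	if (exc_err_limit) {
		ret = -EINVAL;
		goto free;
	}
	/* ...but still bail out on an ordinary eeprom init failure, too. */
	if (ret)
		goto free;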
 
if (con->eeprom_control.num_recs) {
ret = amdgpu_ras_load_bad_pages(adev); @@ -1868,6 +1875,13 @@ 
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 out:
dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
 
+   /*
+* Except error threshold exceeding case, other failure cases in this
+* function would not fail amdgpu driver init.
+*/
+   if (!exc_err_limit)
+   ret = 0;
+
return ret;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 35c0c849d49b..67995b66d7d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct 
amdgpu_ras_eeprom_control *control)
 
 }
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
+   bool *exceed_err_limit)

 {
int ret = 0;
struct amdgpu_device *adev = to_amdgpu_device(control); @@ -254,6 
+255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *co

RE: [PATCH 1/5] drm/amdgpu: add bad page count threshold in module parameter

2020-07-22 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Guchun,
  It is better to let users change amdgpu_bad_page_threshold via 
sysfs, so that they don't need to reboot the system when they want to change their 
strategy.
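One minimal way to do that (a sketch of the suggestion, not part of the patch 
below) is to register the parameter with 0644 instead of 0444, which makes 
/sys/module/amdgpu/parameters/bad_page_threshold writable by root at runtime; the 
driver would then need to re-validate the value whenever it is consumed:

module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0644);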

Best Regards
Dennis Li
-Original Message-
From: Chen, Guchun  
Sent: Wednesday, July 22, 2020 11:14 AM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Zhang, Hawking ; Li, Dennis 
; Yang, Stanley ; Zhou1, Tao 
; Clements, John 
Cc: Chen, Guchun 
Subject: [PATCH 1/5] drm/amdgpu: add bad page count threshold in module 
parameter

bad_page_threshold could be specified to detect and retire bad GPU if faulty 
bad pages exceed it.

When it's -1, ras will use typical bad page failure value.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +++
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 06bfb8658dec..bb83ffb5e26a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -181,6 +181,7 @@ extern uint amdgpu_dm_abm_level;  extern struct 
amdgpu_mgpu_info mgpu_info;  extern int amdgpu_ras_enable;  extern uint 
amdgpu_ras_mask;
+extern int amdgpu_bad_page_threshold;
 extern int amdgpu_async_gfx_ring;
 extern int amdgpu_mcbp;
 extern int amdgpu_discovery;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index d28b95f721c4..f99671101746 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -161,6 +161,7 @@ struct amdgpu_mgpu_info mgpu_info = {  };  int 
amdgpu_ras_enable = -1;  uint amdgpu_ras_mask = 0x;
+int amdgpu_bad_page_threshold = -1;
 
 /**
  * DOC: vramlimit (int)
@@ -801,6 +802,16 @@ module_param_named(tmz, amdgpu_tmz, int, 0444);  
MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = 
legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)");  
module_param_named(reset_method, amdgpu_reset_method, int, 0444);
 
+/**
+ * DOC: bad_page_threshold (int)
+ * Bad page threshold configuration is driven by RMA(Return Merchandise
+ * Authorization) policy, which is to specify the threshold value of 
+faulty
+ * pages detected by ECC, which may result in GPU's retirement if total
+ * faulty pages by ECC exceed threshold value.
+ */
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = 
+auto(default typical value))"); module_param_named(bad_page_threshold, 
+amdgpu_bad_page_threshold, int, 0444);
+
 static const struct pci_device_id pciidlist[] = {  #ifdef  CONFIG_DRM_AMDGPU_SI
{0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI},
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 3/5] drm/amd/sriov add mmsch_v3 interface

2020-07-14 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Jack,
  Thanks for your explanation. It looks good to me.

Best Regards
Dennis Li

-Original Message-
From: Zhang, Jack (Jian)  
Sent: Tuesday, July 14, 2020 5:01 PM
To: Li, Dennis ; Liu, Leo ; Zhang, Hawking 
; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 3/5] drm/amd/sriov add mmsch_v3 interface

Hi, Dennis,

I gave some feedback in the comments.
Thank you for your review.

Best Regards,
Jack Zhang

-Original Message-
From: Li, Dennis 
Sent: Tuesday, July 14, 2020 12:35 PM
To: Zhang, Jack (Jian) ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Jack (Jian) ; Liu, Leo ; 
Zhang, Hawking 
Subject: RE: [PATCH 3/5] drm/amd/sriov add mmsch_v3 interface

[AMD Official Use Only - Internal Distribution Only]

Hi, Jack,
  Please see the following comments. 

Best Regards
Dennis Li
-Original Message-
From: amd-gfx  On Behalf Of Jack Zhang
Sent: Tuesday, July 14, 2020 10:47 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Jack (Jian) ; Liu, Leo ; 
Zhang, Hawking 
Subject: [PATCH 3/5] drm/amd/sriov add mmsch_v3 interface

For VCN3.0 SRIOV, Guest driver needs to communicate with mmsch to set the World 
Switch for MM appropriately. This patch add the interface for mmsch_v3.0.

Signed-off-by: Jack Zhang 
---
 drivers/gpu/drm/amd/amdgpu/mmsch_v3_0.h | 130 
 1 file changed, 130 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/mmsch_v3_0.h

diff --git a/drivers/gpu/drm/amd/amdgpu/mmsch_v3_0.h 
b/drivers/gpu/drm/amd/amdgpu/mmsch_v3_0.h
new file mode 100644
index ..3e4e858a6965
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/mmsch_v3_0.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person 
+obtaining a
+ * copy of this software and associated documentation files (the 
+"Software"),
+ * to deal in the Software without restriction, including without 
+limitation
+ * the rights to use, copy, modify, merge, publish, distribute, 
+sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom 
+the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT 
+SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
+DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __MMSCH_V3_0_H__
+#define __MMSCH_V3_0_H__
+
+#include "amdgpu_vcn.h"
+
+#define MMSCH_VERSION_MAJOR3
+#define MMSCH_VERSION_MINOR0
+#define MMSCH_VERSION  (MMSCH_VERSION_MAJOR << 16 | MMSCH_VERSION_MINOR)
+
+enum mmsch_v3_0_command_type {
+   MMSCH_COMMAND__DIRECT_REG_WRITE = 0,
+   MMSCH_COMMAND__DIRECT_REG_POLLING = 2,
+   MMSCH_COMMAND__DIRECT_REG_READ_MODIFY_WRITE = 3,
+   MMSCH_COMMAND__INDIRECT_REG_WRITE = 8,
+   MMSCH_COMMAND__END = 0xf
+};
+
+struct mmsch_v3_0_table_info {
+   uint32_t init_status;
+   uint32_t table_offset;
+   uint32_t table_size;
+};
+
+struct mmsch_v3_0_init_header {
+   uint32_t version;
+   uint32_t total_size;
+   struct mmsch_v3_0_table_info inst[AMDGPU_MAX_VCN_INSTANCES]; };

[Dennis]  You have defined total_size, so why is the inst array sized 
AMDGPU_MAX_VCN_INSTANCES? That wastes memory.
[Jack] In our case, AMDGPU_MAX_VCN_INSTANCES is a fixed number, which equals 2, 
and struct mmsch_v3_0_table_info occupies 3 dwords, so there's not too much 
memory waste.

+struct mmsch_v3_0_cmd_direct_reg_header {
+   uint32_t reg_offset   : 28;
+   uint32_t command_type : 4;
+};
+
+struct mmsch_v3_0_cmd_indirect_reg_header {
+   uint32_t reg_offset: 20;
+   uint32_t reg_idx_space : 8;
+   uint32_t command_type  : 4;
+};
+
+struct mmsch_v3_0_cmd_direct_write {
+   struct mmsch_v3_0_cmd_direct_reg_header cmd_header;
+   uint32_t reg_value;
+};
+
+struct mmsch_v3_0_cmd_direct_read_modify_write {
+   struct mmsch_v3_0_cmd_direct_reg_header cmd_header;
+   uint32_t write_data;
+   uint32_t mask_value;
+};
+
+struct mmsch_v3_0_cmd_direct_polling {
+   struct mmsch_v3_0_cmd_direct_reg_header cmd_header;
+   uint32_t mask_value;
+   uint32_t wait_value;
+};
+
+struct mmsch_v3_0_cmd_end {
+   struct mmsch_v3_0_cmd_direct_reg_header cmd_header; };
+
+struct mmsch_v3_0_cmd_indirect_write {
+   struct mmsch_v3_0_cmd_indirect_reg_header cmd_header;
+   uint32_t reg_value;
+};
+
+#define MMSCH_V3_

RE: [PATCH 1/1] drm/amdkfd: Add IPC API

2020-07-13 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Felix,
  amdgpu_gem_prime_export has a different definition in the old driver. I added 
some comments in the code below.

Best Regards
Dennis Li
-Original Message-
From: amd-gfx  On Behalf Of Felix 
Kuehling
Sent: Tuesday, July 14, 2020 11:15 AM
To: amd-gfx@lists.freedesktop.org; dri-de...@lists.freedesktop.org
Cc: Deucher, Alexander ; daniel.vet...@ffwll.ch; 
airl...@gmail.com
Subject: [PATCH 1/1] drm/amdkfd: Add IPC API

This allows exporting and importing buffers. The API generates handles that can 
be used with the HIP IPC API, i.e. big numbers rather than file descriptors.

Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   5 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  56 +++-
 drivers/gpu/drm/amd/amdkfd/Makefile   |   3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  74 ++---
 drivers/gpu/drm/amd/amdkfd/kfd_ipc.c  | 263 ++
 drivers/gpu/drm/amd/amdkfd/kfd_ipc.h  |  55 
 drivers/gpu/drm/amd/amdkfd/kfd_module.c   |   5 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   5 +
 include/uapi/linux/kfd_ioctl.h|  62 -
 9 files changed, 488 insertions(+), 40 deletions(-)  create mode 100644 
drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.h

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3f2b695cf19e..0f8dc4c4f924 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -49,6 +49,7 @@ struct kfd_bo_va_list {  struct kgd_mem {
struct mutex lock;
struct amdgpu_bo *bo;
+   struct kfd_ipc_obj *ipc_obj;
struct list_head bo_va_list;
/* protected by amdkfd_process_info.lock */
struct ttm_validate_buffer validate_list; @@ -240,9 +241,13 @@ int 
amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
 
 int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
  struct dma_buf *dmabuf,
+ struct kfd_ipc_obj *ipc_obj,
  uint64_t va, void *vm,
  struct kgd_mem **mem, uint64_t *size,
  uint64_t *mmap_offset);
+int amdgpu_amdkfd_gpuvm_export_ipc_obj(struct kgd_dev *kgd, void *vm,
+  struct kgd_mem *mem,
+  struct kfd_ipc_obj **ipc_obj);
 
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
 void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo); diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index c408936e8f98..cd5f23c0c2ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -29,6 +29,7 @@
 #include "amdgpu_vm.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_dma_buf.h"
+#include "kfd_ipc.h"
 #include 
 
 /* BO flag to indicate a KFD userptr BO */ @@ -1353,6 +1354,9 @@ int 
amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
*size = 0;
}
 
+   /* Unreference the ipc_obj if applicable */
+   kfd_ipc_obj_put(&mem->ipc_obj);
+
/* Free the BO*/
drm_gem_object_put_unlocked(&mem->bo->tbo.base);
mutex_destroy(&mem->lock);
@@ -1656,6 +1660,7 @@ int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev 
*kgd,
 
 int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
  struct dma_buf *dma_buf,
+ struct kfd_ipc_obj *ipc_obj,
  uint64_t va, void *vm,
  struct kgd_mem **mem, uint64_t *size,
  uint64_t *mmap_offset)
@@ -1692,15 +1697,18 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev 
*kgd,
 
INIT_LIST_HEAD(&(*mem)->bo_va_list);
mutex_init(&(*mem)->lock);
-   
-   (*mem)->alloc_flags =
-   ((bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
-   KFD_IOC_ALLOC_MEM_FLAGS_VRAM : KFD_IOC_ALLOC_MEM_FLAGS_GTT)
-   | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE
-   | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE;
+   if (bo->kfd_bo)
+   (*mem)->alloc_flags = bo->kfd_bo->alloc_flags;
+   else
+   (*mem)->alloc_flags =
+   ((bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
+   KFD_IOC_ALLOC_MEM_FLAGS_VRAM : 
KFD_IOC_ALLOC_MEM_FLAGS_GTT)
+   | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE
+   | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE;
 
drm_gem_object_get(&bo->tbo.base);
(*mem)->bo = bo;
+   (*mem)->ipc_obj = ipc_obj;
(*mem)->va = va;
(*mem)->domain = (bo->preferred_dom

RE: [PATCH 3/5] drm/amd/sriov add mmsch_v3 interface

2020-07-13 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Jack,
  Please see the following comments. 

Best Regards
Dennis Li
-Original Message-
From: amd-gfx  On Behalf Of Jack Zhang
Sent: Tuesday, July 14, 2020 10:47 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Jack (Jian) ; Liu, Leo ; 
Zhang, Hawking 
Subject: [PATCH 3/5] drm/amd/sriov add mmsch_v3 interface

For VCN3.0 SRIOV, Guest driver needs to communicate with mmsch to set the World 
Switch for MM appropriately. This patch add the interface for mmsch_v3.0.

Signed-off-by: Jack Zhang 
---
 drivers/gpu/drm/amd/amdgpu/mmsch_v3_0.h | 130 
 1 file changed, 130 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/mmsch_v3_0.h

diff --git a/drivers/gpu/drm/amd/amdgpu/mmsch_v3_0.h 
b/drivers/gpu/drm/amd/amdgpu/mmsch_v3_0.h
new file mode 100644
index ..3e4e858a6965
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/mmsch_v3_0.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person 
+obtaining a
+ * copy of this software and associated documentation files (the 
+"Software"),
+ * to deal in the Software without restriction, including without 
+limitation
+ * the rights to use, copy, modify, merge, publish, distribute, 
+sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom 
+the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT 
+SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
+DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __MMSCH_V3_0_H__
+#define __MMSCH_V3_0_H__
+
+#include "amdgpu_vcn.h"
+
+#define MMSCH_VERSION_MAJOR3
+#define MMSCH_VERSION_MINOR0
+#define MMSCH_VERSION  (MMSCH_VERSION_MAJOR << 16 | MMSCH_VERSION_MINOR)
+
+enum mmsch_v3_0_command_type {
+   MMSCH_COMMAND__DIRECT_REG_WRITE = 0,
+   MMSCH_COMMAND__DIRECT_REG_POLLING = 2,
+   MMSCH_COMMAND__DIRECT_REG_READ_MODIFY_WRITE = 3,
+   MMSCH_COMMAND__INDIRECT_REG_WRITE = 8,
+   MMSCH_COMMAND__END = 0xf
+};
+
+struct mmsch_v3_0_table_info {
+   uint32_t init_status;
+   uint32_t table_offset;
+   uint32_t table_size;
+};
+
+struct mmsch_v3_0_init_header {
+   uint32_t version;
+   uint32_t total_size;
+   struct mmsch_v3_0_table_info inst[AMDGPU_MAX_VCN_INSTANCES]; };

[Dennis]  You have defined total_size, so why is the inst array sized 
AMDGPU_MAX_VCN_INSTANCES? That wastes memory.

+struct mmsch_v3_0_cmd_direct_reg_header {
+   uint32_t reg_offset   : 28;
+   uint32_t command_type : 4;
+};
+
+struct mmsch_v3_0_cmd_indirect_reg_header {
+   uint32_t reg_offset: 20;
+   uint32_t reg_idx_space : 8;
+   uint32_t command_type  : 4;
+};
+
+struct mmsch_v3_0_cmd_direct_write {
+   struct mmsch_v3_0_cmd_direct_reg_header cmd_header;
+   uint32_t reg_value;
+};
+
+struct mmsch_v3_0_cmd_direct_read_modify_write {
+   struct mmsch_v3_0_cmd_direct_reg_header cmd_header;
+   uint32_t write_data;
+   uint32_t mask_value;
+};
+
+struct mmsch_v3_0_cmd_direct_polling {
+   struct mmsch_v3_0_cmd_direct_reg_header cmd_header;
+   uint32_t mask_value;
+   uint32_t wait_value;
+};
+
+struct mmsch_v3_0_cmd_end {
+   struct mmsch_v3_0_cmd_direct_reg_header cmd_header; };
+
+struct mmsch_v3_0_cmd_indirect_write {
+   struct mmsch_v3_0_cmd_indirect_reg_header cmd_header;
+   uint32_t reg_value;
+};
+
+#define MMSCH_V3_0_INSERT_DIRECT_RD_MOD_WT(reg, mask, data) { \
+   size = sizeof(struct mmsch_v3_0_cmd_direct_read_modify_write); \
+   size_dw = size / 4; \
+   direct_rd_mod_wt.cmd_header.reg_offset = reg; \
+   direct_rd_mod_wt.mask_value = mask; \
+   direct_rd_mod_wt.write_data = data; \
+   memcpy((void *)table_loc, &direct_rd_mod_wt, size); \
+   table_loc += size_dw; \
+   table_size += size_dw; \
+}

[Dennis]  direct_rd_mod_wt, table_loc and table_size are local variables; it is 
better not to reference them inside the macro, since that makes the code hard to read.
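A sketch of the kind of helper this comment hints at (illustrative only; it reuses 
the structures and the command enum defined earlier in this patch, and passes the 
table cursor and size explicitly instead of touching local variables from inside a 
macro):

static uint32_t *mmsch_v3_0_emit_direct_wt(uint32_t *table_loc,
					   uint32_t *table_size,
					   uint32_t reg, uint32_t value)
{
	struct mmsch_v3_0_cmd_direct_write direct_wt = {};
	uint32_t size_dw = sizeof(direct_wt) / 4;

	direct_wt.cmd_header.command_type = MMSCH_COMMAND__DIRECT_REG_WRITE;
	direct_wt.cmd_header.reg_offset = reg;
	direct_wt.reg_value = value;
	memcpy(table_loc, &direct_wt, sizeof(direct_wt));
	*table_size += size_dw;
	return table_loc + size_dw;	/* caller advances its cursor */
}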

+#define MMSCH_V3_0_INSERT_DIRECT_WT(reg, value) { \
+   size = sizeof(struct mmsch_v3_0_cmd_direct_write); \
+   size_dw = size / 4; \
+   direct_wt.cmd_header.reg_offset = reg; \
+   direct_wt.reg_value = value; \
+   memcpy((void *)table_loc, &direct_wt, size); \
+   table_loc += size_dw; \
+   table_size += size_dw; \
+}
+
+#define MMSCH_V3_0_INS

RE: [PATCH 1/3] drm/amd/powerplay: add SMU mode1 reset

2020-07-12 Thread Li, Dennis
[AMD Public Use]

Hi, Hawking,
   Got it. Thanks for your explanation. It looks good to me now.

Reviewed-by: Dennis Li 

Best Regards
Dennis Li
-Original Message-
From: Zhang, Hawking  
Sent: Monday, July 13, 2020 1:44 PM
To: Sheng, Wenhui ; Li, Dennis ; 
amd-gfx@lists.freedesktop.org
Cc: Gao, Likun 
Subject: RE: [PATCH 1/3] drm/amd/powerplay: add SMU mode1 reset

[AMD Public Use]

RE - [Dennis Li] It is better to change to RREG32_SOC15_NO_KIQ, because when the 
GPU hangs, RREG32_SOC15 will fail if it uses RREG32_KIQ to read the register

RREG32_SOC15_NO_KIQ should be no different from RREG32_SOC15 in this use 
scenario. This feature is only supported in a bare-metal environment and 
never runs in a guest environment.
But this does remind us to exclude the feature from the guest run-time environment by 
checking amdgpu_sriov_vf().
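
A sketch of that guard (illustrative; the function body is copied from the patch 
further down in this thread, and only the amdgpu_sriov_vf() check is the addition, 
assuming smu->adev points at the owning device as it does in the smu_context used 
by the patch):

bool smu_mode1_reset_is_support(struct smu_context *smu)
{
	bool ret = false;

	/* SMU mode1 reset is a bare-metal feature; deny it for SR-IOV guests. */
	if (!smu->pm_enabled || amdgpu_sriov_vf(smu->adev))
		return false;

	mutex_lock(&smu->mutex);

	if (smu->ppt_funcs && smu->ppt_funcs->mode1_reset_is_support)
		ret = smu->ppt_funcs->mode1_reset_is_support(smu);

	mutex_unlock(&smu->mutex);

	return ret;
}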

Regards,
Hawking

-Original Message-
From: Sheng, Wenhui 
Sent: Monday, July 13, 2020 11:43
To: Li, Dennis ; amd-gfx@lists.freedesktop.org
Cc: Gao, Likun ; Zhang, Hawking 
Subject: RE: [PATCH 1/3] drm/amd/powerplay: add SMU mode1 reset

[AMD Official Use Only - Internal Distribution Only]

Ok, will refine it.


Brs
Wenhui

-Original Message-
From: Li, Dennis 
Sent: Monday, July 13, 2020 11:10 AM
To: Sheng, Wenhui ; amd-gfx@lists.freedesktop.org
Cc: Gao, Likun ; Sheng, Wenhui ; 
Zhang, Hawking 
Subject: RE: [PATCH 1/3] drm/amd/powerplay: add SMU mode1 reset

[AMD Official Use Only - Internal Distribution Only]



-Original Message-
From: amd-gfx  On Behalf Of Wenhui Sheng
Sent: Friday, July 10, 2020 10:17 PM
To: amd-gfx@lists.freedesktop.org
Cc: Gao, Likun ; Sheng, Wenhui ; 
Zhang, Hawking 
Subject: [PATCH 1/3] drm/amd/powerplay: add SMU mode1 reset

>From PM FW 58.26.0 for sienna cichlid, SMU mode1 reset is support, driver 
>sends PPSMC_MSG_Mode1Reset message to PM FW could trigger this reset.

v2: add mode1 reset dpm interface

Signed-off-by: Likun Gao 
Signed-off-by: Wenhui Sheng 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c   | 20 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h   |  3 ++
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c| 34 +++
 .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  4 +++
 drivers/gpu/drm/amd/powerplay/inc/smu_types.h |  1 +  
drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h |  2 ++
 .../drm/amd/powerplay/sienna_cichlid_ppt.c| 31 +++--
 drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 13 +++
 8 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
index 65472b3dd815..16668fc52d0d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
@@ -1141,6 +1141,26 @@ int amdgpu_dpm_baco_reset(struct amdgpu_device *adev)
return 0;
 }
 
+bool amdgpu_dpm_is_mode1_reset_supported(struct amdgpu_device *adev) {
+   struct smu_context *smu = &adev->smu;
+
+   if (is_support_sw_smu(adev))
+   return smu_mode1_reset_is_support(smu);
+
+   return false;
+}
+
+int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev) {
+   struct smu_context *smu = &adev->smu;
+
+   if (is_support_sw_smu(adev))
+   return smu_mode1_reset(smu);
+
+   return -EOPNOTSUPP;
+}
+
 int amdgpu_dpm_switch_power_profile(struct amdgpu_device *adev,
enum PP_SMC_POWER_PROFILE type,
bool en)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h
index 6a8aae70a0e6..7f3cd7185650 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h
@@ -529,6 +529,9 @@ int amdgpu_dpm_mode2_reset(struct amdgpu_device *adev);
 
 bool amdgpu_dpm_is_baco_supported(struct amdgpu_device *adev);
 
+bool amdgpu_dpm_is_mode1_reset_supported(struct amdgpu_device *adev); 
+int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev);
+
 int amdgpu_dpm_set_mp1_state(struct amdgpu_device *adev,
 enum pp_mp1_state mp1_state);
 
diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index fe4948aa662f..b5a7422d9548 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -2737,6 +2737,40 @@ int smu_baco_exit(struct smu_context *smu)
return ret;
 }
 
+bool smu_mode1_reset_is_support(struct smu_context *smu) {
+   bool ret = false;
+
+   if (!smu->pm_enabled)
+   return false;
+
+   mutex_lock(&smu->mutex);
+
+   if (smu->ppt_funcs && smu->ppt_funcs->mode1_reset_is_support)
+   ret = smu->ppt_funcs->mode1_reset_is_support(smu);
+
+   mutex_unlock(&smu->mutex);
+
+   return ret;
+}
+
+int smu_mode1_reset(struct smu_context *smu) {
+   int ret = 0;
+
+   if (!smu->pm_enabled)
+

RE: [PATCH 1/3] drm/amd/powerplay: add SMU mode1 reset

2020-07-12 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]



-Original Message-
From: amd-gfx  On Behalf Of Wenhui Sheng
Sent: Friday, July 10, 2020 10:17 PM
To: amd-gfx@lists.freedesktop.org
Cc: Gao, Likun ; Sheng, Wenhui ; 
Zhang, Hawking 
Subject: [PATCH 1/3] drm/amd/powerplay: add SMU mode1 reset

From PM FW 58.26.0 for sienna cichlid, SMU mode1 reset is supported; the driver 
sends the PPSMC_MSG_Mode1Reset message to PM FW to trigger this reset.

v2: add mode1 reset dpm interface

Signed-off-by: Likun Gao 
Signed-off-by: Wenhui Sheng 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c   | 20 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h   |  3 ++
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c| 34 +++
 .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  4 +++
 drivers/gpu/drm/amd/powerplay/inc/smu_types.h |  1 +  
drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h |  2 ++
 .../drm/amd/powerplay/sienna_cichlid_ppt.c| 31 +++--
 drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 13 +++
 8 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
index 65472b3dd815..16668fc52d0d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
@@ -1141,6 +1141,26 @@ int amdgpu_dpm_baco_reset(struct amdgpu_device *adev)
return 0;
 }
 
+bool amdgpu_dpm_is_mode1_reset_supported(struct amdgpu_device *adev) {
+   struct smu_context *smu = &adev->smu;
+
+   if (is_support_sw_smu(adev))
+   return smu_mode1_reset_is_support(smu);
+
+   return false;
+}
+
+int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev) {
+   struct smu_context *smu = &adev->smu;
+
+   if (is_support_sw_smu(adev))
+   return smu_mode1_reset(smu);
+
+   return -EOPNOTSUPP;
+}
+
 int amdgpu_dpm_switch_power_profile(struct amdgpu_device *adev,
enum PP_SMC_POWER_PROFILE type,
bool en)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h
index 6a8aae70a0e6..7f3cd7185650 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h
@@ -529,6 +529,9 @@ int amdgpu_dpm_mode2_reset(struct amdgpu_device *adev);
 
 bool amdgpu_dpm_is_baco_supported(struct amdgpu_device *adev);
 
+bool amdgpu_dpm_is_mode1_reset_supported(struct amdgpu_device *adev); 
+int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev);
+
 int amdgpu_dpm_set_mp1_state(struct amdgpu_device *adev,
 enum pp_mp1_state mp1_state);
 
diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index fe4948aa662f..b5a7422d9548 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -2737,6 +2737,40 @@ int smu_baco_exit(struct smu_context *smu)
return ret;
 }
 
+bool smu_mode1_reset_is_support(struct smu_context *smu) {
+   bool ret = false;
+
+   if (!smu->pm_enabled)
+   return false;
+
+   mutex_lock(&smu->mutex);
+
+   if (smu->ppt_funcs && smu->ppt_funcs->mode1_reset_is_support)
+   ret = smu->ppt_funcs->mode1_reset_is_support(smu);
+
+   mutex_unlock(&smu->mutex);
+
+   return ret;
+}
+
+int smu_mode1_reset(struct smu_context *smu) {
+   int ret = 0;
+
+   if (!smu->pm_enabled)
+   return -EOPNOTSUPP;
+
+   mutex_lock(&smu->mutex);
+
+   if (smu->ppt_funcs->mode1_reset)
+   ret = smu->ppt_funcs->mode1_reset(smu);
+
+   mutex_unlock(&smu->mutex);
+
+   return ret;
+}
+
 int smu_mode2_reset(struct smu_context *smu)  {
int ret = 0;
diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
index 7b349e038972..ba59620950d7 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
@@ -561,6 +561,8 @@ struct pptable_funcs {
int (*baco_set_state)(struct smu_context *smu, enum smu_baco_state 
state);
int (*baco_enter)(struct smu_context *smu);
int (*baco_exit)(struct smu_context *smu);
+   bool (*mode1_reset_is_support)(struct smu_context *smu);
+   int (*mode1_reset)(struct smu_context *smu);
int (*mode2_reset)(struct smu_context *smu);
int (*get_dpm_ultimate_freq)(struct smu_context *smu, enum smu_clk_type 
clk_type, uint32_t *min, uint32_t *max);
int (*set_soft_freq_limited_range)(struct smu_context *smu, enum 
smu_clk_type clk_type, uint32_t min, uint32_t max); @@ -672,6 +674,8 @@ int 
smu_baco_get_state(struct smu_context *smu, enum smu_baco_state *state);  int 
smu_baco_enter(struct smu_context *smu);  int smu_baco_exit(struct smu_context 
*smu);
 
+bool smu_mode1_reset_is_support(struct smu_context *smu); int 
+smu_mode1_reset(struct smu_contex

RE: [PATCH 06/10] drm/amd/display: fix dcn3 p_state_change_support validation

2020-07-12 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]


-Original Message-
From: amd-gfx  On Behalf Of Rodrigo 
Siqueira
Sent: Saturday, July 11, 2020 4:33 AM
To: amd-gfx@lists.freedesktop.org
Cc: Laktyushkin, Dmytro ; Brol, Eryk 
; Li, Sun peng (Leo) ; Wentland, Harry 
; Zhuo, Qingqing ; Siqueira, 
Rodrigo ; Pillai, Aurabindo 
; Lee, Alvin ; Lakha, Bhawanpreet 

Subject: [PATCH 06/10] drm/amd/display: fix dcn3 p_state_change_support 
validation

From: Dmytro Laktyushkin 

Our validation is a known mess with actual validation mixed with topology 
configuration. This change makes sure topolgical validation is completed before 
any topology changes are made so we do not run into issues where we merge and 
split a pipe over the course of a single call.

Signed-off-by: Dmytro Laktyushkin 
Reviewed-by: Alvin Lee 
Acked-by: Rodrigo Siqueira 
---
 .../drm/amd/display/dc/dcn30/dcn30_resource.c | 46 ---
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c 
b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c
index d7ba895de765..653a571e366d 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c
@@ -1870,12 +1870,14 @@ static bool dcn30_split_stream_for_mpc_or_odm(
 
return true;
 }
-static bool dcn30_fast_validate_bw(
+
+static bool dcn30_internal_validate_bw(
struct dc *dc,
struct dc_state *context,
display_e2e_pipe_params_st *pipes,
int *pipe_cnt_out,
-   int *vlevel_out)
+   int *vlevel_out,
+   bool fast_validate)
 {
bool out = false;
bool repopulate_pipes = false;
@@ -1898,7 +1900,38 @@ static bool dcn30_fast_validate_bw(
 
dml_log_pipe_params(&context->bw_ctx.dml, pipes, pipe_cnt);
 
-   vlevel = dml_get_voltage_level(&context->bw_ctx.dml, pipes, pipe_cnt);
+   if (!fast_validate) {
+   /*
+* DML favors voltage over p-state, but we're more interested in
+* supporting p-state over voltage. We can't support p-state in
+* prefetch mode > 0 so try capping the prefetch mode to start.
+*/
+   
context->bw_ctx.dml.soc.allow_dram_self_refresh_or_dram_clock_change_in_vblank =
+   dm_allow_self_refresh_and_mclk_switch;
+   vlevel = dml_get_voltage_level(&context->bw_ctx.dml, pipes, 
pipe_cnt);
+   /* This may adjust vlevel and maxMpcComb */
+   if (vlevel < context->bw_ctx.dml.soc.num_states)
+   vlevel = dcn20_validate_apply_pipe_split_flags(dc, 
context, vlevel, split, merge);
+   }
+   if (fast_validate || vlevel == context->bw_ctx.dml.soc.num_states ||
+   vba->DRAMClockChangeSupport[vlevel][vba->maxMpcComb] == 
dm_dram_clock_change_unsupported) {
+   /*
+* If mode is unsupported or there's still no p-state support 
then
+* fall back to favoring voltage.
+*
+* We don't actually support prefetch mode 2, so require that we
+* at least support prefetch mode 1.
+*/
+   
context->bw_ctx.dml.soc.allow_dram_self_refresh_or_dram_clock_change_in_vblank =
+   dm_allow_self_refresh;
+
+   vlevel = dml_get_voltage_level(&context->bw_ctx.dml, pipes, 
pipe_cnt);
+   if (vlevel < context->bw_ctx.dml.soc.num_states) {
+   memset(split, 0, sizeof(split));
+   memset(merge, 0, sizeof(merge));

[Dennis] It seems that the above code is wrong. Should they be the following:
memset(split, 0, sizeof(*split));
memset(merge, 0, sizeof(*merge));
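
Whether the original memset is right depends on whether split and merge are
declared as arrays in this scope or arrive as pointer parameters; a small
stand-alone C example of the difference (MAX_PIPES and the names are placeholders):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define MAX_PIPES 6

/* Passed as a parameter, the array decays to a pointer: sizeof(split)
 * is the pointer size and sizeof(*split) is one element, so neither
 * clears the whole array here.
 */
static void clear_one_element(bool *split)
{
	memset(split, 0, sizeof(*split));	/* clears a single bool */
	printf("callee: sizeof(split)=%zu sizeof(*split)=%zu\n",
	       sizeof(split), sizeof(*split));
}

int main(void)
{
	bool split[MAX_PIPES];

	/* In the declaring scope, sizeof(split) is the full array size,
	 * so this memset clears all MAX_PIPES entries.
	 */
	memset(split, 0, sizeof(split));
	printf("caller: sizeof(split)=%zu\n", sizeof(split));

	clear_one_element(split);
	return 0;
}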


+   vlevel = dcn20_validate_apply_pipe_split_flags(dc, 
context, vlevel, split, merge);
+   }
+   }
 
dml_log_mode_support_params(&context->bw_ctx.dml);
 
@@ -1938,8 +1971,6 @@ static bool dcn30_fast_validate_bw(
pipe_idx++;
}
 
-   vlevel = dcn20_validate_apply_pipe_split_flags(dc, context, vlevel, 
split, merge);
-
/* merge pipes if necessary */
for (i = 0; i < dc->res_pool->pipe_count; i++) {
struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; @@ 
-2187,7 +2218,8 @@ static void dcn30_calculate_wm(
}
 }
 
-bool dcn30_validate_bandwidth(struct dc *dc, struct dc_state *context,
+bool dcn30_validate_bandwidth(struct dc *dc,
+   struct dc_state *context,
bool fast_validate)
 {
bool out = false;
@@ -2201,7 +2233,7 @@ bool dcn30_validate_bandwidth(struct dc *dc, struct 
dc_state *context,
 
BW_VAL_TRACE_COUNT();
 
-   out = dcn30_fast_validate_bw(dc, context, pipes, &pipe_cnt, &vlevel);
+   out = dcn30_internal_validate_bw(dc, context, p

RE: [PATCH v2] drm/amdgpu: fix system hang issue during GPU reset

2020-07-11 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]



-Original Message-
From: Grodzovsky, Andrey  
Sent: Friday, July 10, 2020 10:33 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhou1, Tao ; Zhang, 
Hawking ; Chen, Guchun 
Subject: Re: [PATCH v2] drm/amdgpu: fix system hang issue during GPU reset


On 7/8/20 3:48 AM, Dennis Li wrote:
> During GPU reset, driver should hold on all external access to
> GPU, otherwise psp will randomly fail to do post, and then cause
> system hang.
>
> v2:
> 1. add rwlock for some ioctls, debugfs and file-close function.
> 2. change to use dqm->is_resetting and dqm_lock for protection in kfd
> driver.
> 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid
> re-enter GPU recovery for the same GPU hang.
>
> Signed-off-by: Dennis Li 
> Change-Id: I7f77a72795462587ed7d5f51fe53a594a0f1f708
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 80f32b3beb88..f235492799d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -963,9 +963,9 @@ struct amdgpu_device {
>   boolin_suspend;
>   boolin_hibernate;
>   
> - boolin_gpu_reset;
> + atomic_tin_gpu_reset;
>   enum pp_mp1_state   mp1_state;
> - struct mutex  lock_reset;
> + struct rw_semaphore reset_sem;
>   struct amdgpu_doorbell_index doorbell_index;
>   
>   struct mutexnotifier_lock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..af71d8e93081 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>   uint32_t temp;
>   struct v10_compute_mqd *m = get_mqd(mqd);
>   
> - if (adev->in_gpu_reset)
> + if (atomic_read(&adev->in_gpu_reset))
>   return -EIO;
>   
>   #if 0
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> index 0b7e78748540..750a8308c868 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> @@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>   unsigned long flags, end_jiffies;
>   int retry;
>   
> - if (adev->in_gpu_reset)
> + if (atomic_read(&adev->in_gpu_reset))
>   return -EIO;
>   
>   acquire_queue(kgd, pipe_id, queue_id);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> index ccd635b812b5..027793e0c1ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> @@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>   int retry;
>   struct vi_mqd *m = get_mqd(mqd);
>   
> - if (adev->in_gpu_reset)
> + if (atomic_read(&adev->in_gpu_reset))
>   return -EIO;
>   
>   acquire_queue(kgd, pipe_id, queue_id);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index df841c2ac5e7..e4a77f7a4c2d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -541,7 +541,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>   uint32_t temp;
>   struct v9_mqd *m = get_mqd(mqd);
>   
> - if (adev->in_gpu_reset)
> + if (atomic_read(&adev->in_gpu_reset))
>   return -EIO;
>   
>   acquire_queue(kgd, pipe_id, queue_id);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index ffbcaf4bfb8b..a94b3f862fc2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1292,6 +1292,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
> struct drm_file *filp)
>   parser.adev = adev;
>   parser.filp = filp;
>   
> + down_read(&adev->reset_sem);
> +
>   r = amdgpu_cs_parser_init(&parser, data);
>   if (r) {
>   DRM_ERROR("Failed to initialize parser %d!\n", r);
> @@ -1331,6 +1333,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
> struct drm_file *filp)
&

RE: [PATCH v2] drm/amdgpu: fix system hang issue during GPU reset

2020-07-10 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]



-Original Message-
From: Grodzovsky, Andrey  
Sent: Saturday, July 11, 2020 1:54 AM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhou1, Tao ; Zhang, 
Hawking ; Chen, Guchun ; Koenig, 
Christian 
Subject: Re: [PATCH v2] drm/amdgpu: fix system hang issue during GPU reset


On 7/10/20 1:24 PM, Li, Dennis wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Andrey,
>Please see my below comments.
>
> Best Regards
> Dennis Li
> -Original Message-
> From: Grodzovsky, Andrey 
> Sent: Friday, July 10, 2020 11:08 PM
> To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
> Alexander ; Zhou1, Tao ; Zhang, 
> Hawking ; Chen, Guchun ; Koenig, 
> Christian 
> Subject: Re: [PATCH v2] drm/amdgpu: fix system hang issue during GPU reset
>
>
> On 7/8/20 3:48 AM, Dennis Li wrote:
>> During GPU reset, driver should hold on all external access to
>> GPU, otherwise psp will randomly fail to do post, and then cause
>> system hang.
>>
>> v2:
>> 1. add rwlock for some ioctls, debugfs and file-close function.
>> 2. change to use dqm->is_resetting and dqm_lock for protection in kfd
>> driver.
>> 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid
>> re-enter GPU recovery for the same GPU hang.
>
> Still adev->in_gpu_reset is prone to race, no ? I mean if the reset starts 
> right AFTER we checked for the flag value then we still have a problem.
> [Dennis]: right, adev->in_gpu_reset can't solve race issue, so I introduced 
> adev->reset_sem.


Then why keep in_gpu_reset and not convert all those places it's used to using 
reset_sem ?

[Dennis Li] Because some functions are also called by 
amdgpu_device_gpu_recover, adev->in_gpu_reset helps these functions skip some 
steps. 
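
A rough sketch of the scheme being described: a one-shot atomic so a second hang
report cannot re-enter recovery, plus the rw_semaphore held for write across the
whole reset. The field names follow the patch; the two helpers themselves are
illustrative only, not the patch code.

static int amdgpu_device_lock_adev_sketch(struct amdgpu_device *adev)
{
	/* only the first caller wins; later hang reports bail out */
	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
		return -EBUSY;

	/* waits for in-flight readers (ioctls, debugfs, KFD calls) */
	down_write(&adev->reset_sem);
	return 0;
}

static void amdgpu_device_unlock_adev_sketch(struct amdgpu_device *adev)
{
	atomic_set(&adev->in_gpu_reset, 0);
	up_write(&adev->reset_sem);
}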

Also, below there was a comment on amdgpu_device_lock_adev; please take a look.

Andrey

>
> I recently worked (and still do) on device unplug support, I was thinking - 
> can we use something alike drm_dev_enter/drm_dev_exit and drm_dev_unplug for 
> our cause and this would replace both adev->in_gpu_reset and the lock_reset 
> mutex(or rw_semaphote) ? In our case we also would need a  
> 'drm_dev_plug_back' function too. Maybe we can even use the existing drm 
> helpers as is as because while device is being reset it's effectively the 
> same as if it's unplugged i think...
> [Dennis]: When GPU do baco reset, bus is still active which is different 
> unplug. I prefer to use adev->reset_sem.
>
> Some more bellow
>
>> Signed-off-by: Dennis Li 
>> Change-Id: I7f77a72795462587ed7d5f51fe53a594a0f1f708
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 80f32b3beb88..f235492799d7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -963,9 +963,9 @@ struct amdgpu_device {
>>  boolin_suspend;
>>  boolin_hibernate;
>>
>> -boolin_gpu_reset;
>> +atomic_tin_gpu_reset;
>>  enum pp_mp1_state   mp1_state;
>> -struct mutex  lock_reset;
>> +struct rw_semaphore reset_sem;
>>  struct amdgpu_doorbell_index doorbell_index;
>>
>>  struct mutexnotifier_lock;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> index 691c89705bcd..af71d8e93081 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> @@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void 
>> *mqd,
>>  uint32_t temp;
>>  struct v10_compute_mqd *m = get_mqd(mqd);
>>
>> -if (adev->in_gpu_reset)
>> +if (atomic_read(&adev->in_gpu_reset))
>>  return -EIO;
>>
>>#if 0
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
>> index 0b7e78748540..750a8308c868 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
>> @@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void 
>> *mqd,
>>  unsigned long flags, end_jiffies;
>>  int retry;
>>
>> -if (adev->in_gpu_reset)
>> +if (atomic_read(&adev->in_gpu_reset))
>&

RE: [PATCH v2] drm/amdgpu: fix system hang issue during GPU reset

2020-07-10 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Andrey,
  Please see my below comments.

Best Regards
Dennis Li
-Original Message-
From: Grodzovsky, Andrey  
Sent: Friday, July 10, 2020 11:08 PM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhou1, Tao ; Zhang, 
Hawking ; Chen, Guchun ; Koenig, 
Christian 
Subject: Re: [PATCH v2] drm/amdgpu: fix system hang issue during GPU reset


On 7/8/20 3:48 AM, Dennis Li wrote:
> During GPU reset, driver should hold on all external access to
> GPU, otherwise psp will randomly fail to do post, and then cause
> system hang.
>
> v2:
> 1. add rwlock for some ioctls, debugfs and file-close function.
> 2. change to use dqm->is_resetting and dqm_lock for protection in kfd
> driver.
> 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid
> re-enter GPU recovery for the same GPU hang.


Still adev->in_gpu_reset is prone to race, no ? I mean if the reset starts 
right AFTER we checked for the flag value then we still have a problem.
[Dennis]: right, adev->in_gpu_reset can't solve race issue, so I introduced 
adev->reset_sem. 

I recently worked (and still do) on device unplug support, I was thinking - can 
we use something alike drm_dev_enter/drm_dev_exit and drm_dev_unplug for our 
cause and this would replace both adev->in_gpu_reset and the lock_reset 
mutex(or rw_semaphote) ? In our case we also would need a  'drm_dev_plug_back' 
function too. Maybe we can even use the existing drm helpers as is as because 
while device is being reset it's effectively the same as if it's unplugged i 
think...
[Dennis]: When the GPU does a BACO reset, the bus is still active, which is 
different from unplug. I prefer to use adev->reset_sem. 

Some more bellow

>
> Signed-off-by: Dennis Li 
> Change-Id: I7f77a72795462587ed7d5f51fe53a594a0f1f708
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 80f32b3beb88..f235492799d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -963,9 +963,9 @@ struct amdgpu_device {
>   boolin_suspend;
>   boolin_hibernate;
>   
> - boolin_gpu_reset;
> + atomic_tin_gpu_reset;
>   enum pp_mp1_state   mp1_state;
> - struct mutex  lock_reset;
> + struct rw_semaphore reset_sem;
>   struct amdgpu_doorbell_index doorbell_index;
>   
>   struct mutexnotifier_lock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..af71d8e93081 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>   uint32_t temp;
>   struct v10_compute_mqd *m = get_mqd(mqd);
>   
> - if (adev->in_gpu_reset)
> + if (atomic_read(&adev->in_gpu_reset))
>   return -EIO;
>   
>   #if 0
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> index 0b7e78748540..750a8308c868 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> @@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>   unsigned long flags, end_jiffies;
>   int retry;
>   
> - if (adev->in_gpu_reset)
> + if (atomic_read(&adev->in_gpu_reset))
>   return -EIO;
>   
>   acquire_queue(kgd, pipe_id, queue_id);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> index ccd635b812b5..027793e0c1ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> @@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>   int retry;
>   struct vi_mqd *m = get_mqd(mqd);
>   
> - if (adev->in_gpu_reset)
> + if (atomic_read(&adev->in_gpu_reset))
>   return -EIO;
>   
>   acquire_queue(kgd, pipe_id, queue_id);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index df841c2ac5e7..e4a77f7a4c2d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -541,7 +541,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>   uint32_t temp;
>   

RE: [PATCH v2] drm/amdgpu: fix system hang issue during GPU reset

2020-07-10 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Paul,
  I used our internal tool to make the GPU hang and ran stress tests. In the kernel, 
when the GPU hangs, the driver has multiple paths into amdgpu_device_gpu_recover; the 
atomic adev->in_gpu_reset is used to avoid re-entering GPU recovery. During GPU reset 
and resume, it is unsafe for other threads to access the GPU, which may cause the GPU 
reset to fail. Therefore the new rw_semaphore adev->reset_sem is introduced, which 
protects the GPU from being accessed by external threads while recovery is in 
progress.

Best Regards
Dennis Li
-Original Message-
From: Paul Menzel  
Sent: Wednesday, July 8, 2020 7:42 PM
To: Li, Dennis 
Cc: amd-gfx@lists.freedesktop.org; Alex Deucher ; Zhou1, 
Tao ; Zhang, Hawking ; Chen, Guchun 

Subject: Re: [PATCH v2] drm/amdgpu: fix system hang issue during GPU reset

Dear Dennis,


Thank you for you patch.

On 2020-07-08 09:48, Dennis Li wrote:
> During GPU reset, driver should hold on all external access to
> GPU, otherwise psp will randomly fail to do post, and then cause
> system hang.

Maybe update the commit message summary to read:

> Avoid external GPU access on GPU reset to fix system hang

As I am also experiencing system hangs, it would be great to have more
details. What systems are affected? What PSP firmware version? Will the
PSP firmware be fixed, or is the Linux driver violating the API.

How can the hang be reproduced?

Lastly, please explain your changes? Why does `atomic_read()` help for
example?

> v2:
> 1. add rwlock for some ioctls, debugfs and file-close function.
> 2. change to use dqm->is_resetting and dqm_lock for protection in kfd
> driver.
> 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid
> re-enter GPU recovery for the same GPU hang.
> 
> Signed-off-by: Dennis Li 
> Change-Id: I7f77a72795462587ed7d5f51fe53a594a0f1f708

[…]


Kind regards,

Paul
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: fix system hang issue during GPU reset

2020-07-06 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Felix,
  Do you mean that KFD should use dqm->is_resetting and dqm_lock together to do 
this protection, just like the function below?  If so, there is a new problem: 
when KFD finds that dqm->is_resetting has been set, how should it handle that? 
  function () {
dqm_lock(dqm);
if (dqm->is_resetting)
// What to do here? Return an error, or something else? 
dqm_unlock(dqm);
  }
  In my solution, the current thread will block, waiting for the GPU reset to 
finish, and then continue its work. BTW, I couldn't find the lock dependency issue 
in my patch; can you show me the details? The reset_sem is only used in the AMDGPU 
driver, and if it is locked and unlocked in pairs, it is safe. 

Best Regards
Dennis Li
-Original Message-
From: Kuehling, Felix  
Sent: Tuesday, July 7, 2020 9:38 AM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhou1, Tao ; Zhang, 
Hawking ; Chen, Guchun 
Subject: Re: [PATCH] drm/amdgpu: fix system hang issue during GPU reset

Am 2020-07-06 um 9:16 p.m. schrieb Li, Dennis:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Felix,
>   Driver should use the same lock to protect hardware from being accessed 
> during GPU reset. The flag dqm->is_resetting couldn't prevent calls that 
> access hardware in multi-threads case. 

The flag is serialized by the DQM lock. That should handle the multi-threaded 
case. Just make sure you don't start the reset until after all the pre_reset 
calls are done.

Regards,
  Felix
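
A minimal sketch of that pattern (not actual KFD code): the flag is only checked
under the DQM lock, and the function bails out before touching hardware. Returning
-EIO mirrors what the kgd_hqd_destroy callbacks in this patch already do; whether
that is the right answer for every caller is exactly the open question above.

static int dqm_touch_hw_example(struct device_queue_manager *dqm)
{
	int r = 0;

	dqm_lock(dqm);
	if (dqm->is_resetting) {
		/* GPU reset in flight: skip the hardware access */
		r = -EIO;
		goto out;
	}

	/* ... program the hardware here ... */

out:
	dqm_unlock(dqm);
	return r;
}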

>
> Best Regards
> Dennis Li
> -Original Message-
> From: Kuehling, Felix 
> Sent: Tuesday, July 7, 2020 5:43 AM
> To: Li, Dennis ; amd-gfx@lists.freedesktop.org; 
> Deucher, Alexander ; Zhou1, Tao 
> ; Zhang, Hawking ; Chen, 
> Guchun 
> Subject: Re: [PATCH] drm/amdgpu: fix system hang issue during GPU 
> reset
>
>
> Am 2020-07-06 um 6:01 a.m. schrieb Dennis Li:
>> During GPU reset, driver should hold on all external access to GPU, 
>> otherwise psp will randomly fail to do post, and then cause system 
>> hang.
>>
>> Signed-off-by: Dennis Li 
>> Change-Id: I7d5d41f9c4198b917d7b49606ba3850988e5b936
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 6c7dd0a707c9..34bfc2a147ff 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -965,7 +965,7 @@ struct amdgpu_device {
>>  
>>  boolin_gpu_reset;
>>  enum pp_mp1_state   mp1_state;
>> -struct mutex  lock_reset;
>> +struct rw_semaphore reset_sem;
>>  struct amdgpu_doorbell_index doorbell_index;
>>  
>>  struct mutexnotifier_lock;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index ad59ac4423b8..4139c81389a4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -611,7 +611,9 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
>> kgd_engine_type engine,
>>  /* This works for NO_HWS. TODO: need to handle without knowing VMID */
>>  job->vmid = vmid;
>>  
>> +down_read(&adev->reset_sem);
> This (and other instances below) will introduce some lock dependency issues. 
> Any lock that you take under KFD's DQM lock will inherit the problem that you 
> can't reclaim memory while holding it because the DQM lock is taken in MMU 
> notifiers. That will affect any attempts of allocating memory while holding 
> the reset_sem.
>
> DQM already has an internal flag dqm->is_resetting that is set in the KFD 
> pre_reset callback. It would be better to use that in DQM to prevent any 
> calls that access hardware.
>
> Regards,
>   Felix
>
>
>>  ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
>> +up_read(&adev->reset_sem);
>>  if (ret) {
>>  DRM_ERROR("amdgpu: failed to schedule IB.\n");
>>  goto err_ib_sched;
>> @@ -649,6 +651,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct
>> kgd_dev *kgd, uint16_t vmid)  {
>>  struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>>  
>> +down_read(&adev->reset_sem);
>> +
>>  if (adev->family == AMDGPU_FAMILY_AI) {
>>  int i;
>>  
>> @@ -658,6 +662,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev 
>> *kgd, uint16_t vmid)
>>  amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
>>  }
>>  
>> +up_read(&

RE: [PATCH] drm/amdgpu: fix system hang issue during GPU reset

2020-07-06 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Felix,
  The driver should use the same lock to protect the hardware from being accessed 
during GPU reset. The flag dqm->is_resetting cannot prevent calls that access the 
hardware in the multi-threaded case. 

Best Regards
Dennis Li
-Original Message-
From: Kuehling, Felix  
Sent: Tuesday, July 7, 2020 5:43 AM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhou1, Tao ; Zhang, 
Hawking ; Chen, Guchun 
Subject: Re: [PATCH] drm/amdgpu: fix system hang issue during GPU reset


Am 2020-07-06 um 6:01 a.m. schrieb Dennis Li:
> During GPU reset, driver should hold on all external access to GPU, 
> otherwise psp will randomly fail to do post, and then cause system 
> hang.
>
> Signed-off-by: Dennis Li 
> Change-Id: I7d5d41f9c4198b917d7b49606ba3850988e5b936
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 6c7dd0a707c9..34bfc2a147ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -965,7 +965,7 @@ struct amdgpu_device {
>  
>   boolin_gpu_reset;
>   enum pp_mp1_state   mp1_state;
> - struct mutex  lock_reset;
> + struct rw_semaphore reset_sem;
>   struct amdgpu_doorbell_index doorbell_index;
>  
>   struct mutexnotifier_lock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index ad59ac4423b8..4139c81389a4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -611,7 +611,9 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
> kgd_engine_type engine,
>   /* This works for NO_HWS. TODO: need to handle without knowing VMID */
>   job->vmid = vmid;
>  
> + down_read(&adev->reset_sem);

This (and other instances below) will introduce some lock dependency issues. 
Any lock that you take under KFD's DQM lock will inherit the problem that you 
can't reclaim memory while holding it because the DQM lock is taken in MMU 
notifiers. That will affect any attempts of allocating memory while holding the 
reset_sem.

DQM already has an internal flag dqm->is_resetting that is set in the KFD 
pre_reset callback. It would be better to use that in DQM to prevent any calls 
that access hardware.

Regards,
  Felix


>   ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
> + up_read(&adev->reset_sem);
>   if (ret) {
>   DRM_ERROR("amdgpu: failed to schedule IB.\n");
>   goto err_ib_sched;
> @@ -649,6 +651,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct 
> kgd_dev *kgd, uint16_t vmid)  {
>   struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>  
> + down_read(&adev->reset_sem);
> +
>   if (adev->family == AMDGPU_FAMILY_AI) {
>   int i;
>  
> @@ -658,6 +662,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, 
> uint16_t vmid)
>   amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
>   }
>  
> + up_read(&adev->reset_sem);
> +
>   return 0;
>  }
>  
> @@ -666,11 +672,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev 
> *kgd, uint16_t pasid)
>   struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>   const uint32_t flush_type = 0;
>   bool all_hub = false;
> + int ret = 0;
>  
>   if (adev->family == AMDGPU_FAMILY_AI)
>   all_hub = true;
>  
> - return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
> + down_read(&adev->reset_sem);
> +
> + ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, 
> +all_hub);
> +
> + up_read(&adev->reset_sem);
> +
> + return ret;
>  }
>  
>  bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd) diff 
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..db5d533dd406 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -542,6 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>   unsigned long end_jiffies;
>   uint32_t temp;
>   struct v10_compute_mqd *m = get_mqd(mqd);
> + int ret = 0;
>  
>   if (adev->in_gpu_reset)
>   return -EIO;
> @@ -551,6 +552,8 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>   int retry;
>  #endif
>  
> + down_read(&adev->reset_sem);
> +
>   acquire

RE: [PATCH] drm/amdgpu: return an error for hw access in INFO ioctl when in reset

2020-07-01 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian and Alex
  Not only amdgpu ioctls, but amdkfd ioctls also have the same issue. 

Best Regards
Dennis Li
-Original Message-
From: amd-gfx  On Behalf Of Christian 
König
Sent: Wednesday, July 1, 2020 4:20 PM
To: Alex Deucher ; amd-gfx list 

Cc: Deucher, Alexander 
Subject: Re: [PATCH] drm/amdgpu: return an error for hw access in INFO ioctl 
when in reset

I don't think this is a good idea, we should probably rather wait for the GPU 
reset to finish by taking the appropriate lock.

Christian.
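
What "taking the appropriate lock" could look like, assuming the reset_sem
rw_semaphore proposed in the related "fix system hang" series: the ioctl path
simply waits for the reset to finish instead of failing. The helper below is a
sketch with an invented name, not the actual ioctl code.

static int amdgpu_info_read_mmr_sketch(struct amdgpu_device *adev,
				       uint32_t offset, uint32_t *value)
{
	/* the reset path holds reset_sem for write for the whole reset,
	 * so this read blocks until the reset is done
	 */
	down_read(&adev->reset_sem);
	*value = RREG32(offset);
	up_read(&adev->reset_sem);

	return 0;
}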

Am 01.07.20 um 07:33 schrieb Alex Deucher:
> ping?
>
> On Fri, Jun 26, 2020 at 10:04 AM Alex Deucher  wrote:
>> When the GPU is in reset, accessing the hw is unreliable and could 
>> interfere with the reset.  Return an error in those cases.
>>
>> Signed-off-by: Alex Deucher 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 ++
>>   1 file changed, 6 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index 341d072edd95..fd51d6554ee2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -684,6 +684,9 @@ static int amdgpu_info_ioctl(struct drm_device *dev, 
>> void *data, struct drm_file
>>  if (info->read_mmr_reg.count > 128)
>>  return -EINVAL;
>>
>> +   if (adev->in_gpu_reset)
>> +   return -EPERM;
>> +
>>  regs = kmalloc_array(info->read_mmr_reg.count, 
>> sizeof(*regs), GFP_KERNEL);
>>  if (!regs)
>>  return -ENOMEM; @@ -854,6 +857,9 @@ static 
>> int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file
>>  if (!adev->pm.dpm_enabled)
>>  return -ENOENT;
>>
>> +   if (adev->in_gpu_reset)
>> +   return -EPERM;
>> +
>>  switch (info->sensor_info.type) {
>>  case AMDGPU_INFO_SENSOR_GFX_SCLK:
>>  /* get sclk in Mhz */
>> --
>> 2.25.4
>>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: correct ras query as part of ctx query

2020-06-11 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Guchun,
 The ras_manager obj saves the error counters on every query, so a previous 
query shouldn't affect the result of the current query. Please check the 
function amdgpu_ras_error_query. 
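
In other words, the read-to-clear hardware counters are folded into software totals
kept per block, so nothing is lost between queries. A much simplified illustration
of that accumulation (the structure below is not the real ras_manager layout):

struct ras_err_counts {
	unsigned long ue_count;	/* uncorrectable errors seen so far */
	unsigned long ce_count;	/* correctable errors seen so far */
};

/* each query reads the (read-to-clear) hardware registers and adds the
 * fresh values to the totals kept in software
 */
static void ras_obj_accumulate(struct ras_err_counts *obj,
			       unsigned long new_ue, unsigned long new_ce)
{
	obj->ue_count += new_ue;
	obj->ce_count += new_ce;
}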
 
Best Regards
Dennis Li
-Original Message-
From: Chen, Guchun  
Sent: Thursday, June 11, 2020 6:24 PM
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Zhou1, Tao ; Pan, Xinhui ; Li, Dennis 
; Clements, John 
Cc: Chen, Guchun 
Subject: [PATCH] drm/amdgpu: correct ras query as part of ctx query

Almost all error count registers are automatically cleared after being read once, 
so both the CE and UE counts need to be read in one loop.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 16 +++-  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 ++--
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c06cb06398b1..29fa6b6b9d3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -335,7 +335,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,  {
struct amdgpu_ctx *ctx;
struct amdgpu_ctx_mgr *mgr;
-   unsigned long ras_counter;
+   unsigned long ras_counter_ue, ras_counter_ce;
 
if (!fpriv)
return -EINVAL;
@@ -360,19 +360,17 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
if (atomic_read(&ctx->guilty))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
 
-   /*query ue count*/
-   ras_counter = amdgpu_ras_query_error_count(adev, false);
+   /*query both ue and ce count*/
+   amdgpu_ras_query_error_count(adev, &ras_counter_ue, &ras_counter_ce);
/*ras counter is monotonic increasing*/
-   if (ras_counter != ctx->ras_counter_ue) {
+   if (ras_counter_ue != ctx->ras_counter_ue) {
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
-   ctx->ras_counter_ue = ras_counter;
+   ctx->ras_counter_ue = ras_counter_ue;
}
 
-   /*query ce count*/
-   ras_counter = amdgpu_ras_query_error_count(adev, true);
-   if (ras_counter != ctx->ras_counter_ce) {
+   if (ras_counter_ce != ctx->ras_counter_ce) {
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
-   ctx->ras_counter_ce = ras_counter;
+   ctx->ras_counter_ce = ras_counter_ce;
}
 
mutex_unlock(&mgr->lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 337bf2da7bdc..109eff2869b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -861,15 +861,18 @@ int amdgpu_ras_error_cure(struct amdgpu_device *adev,  }
 
 /* get the total error counts on all IPs */ -unsigned long 
amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-   bool is_ce)
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+   unsigned long *ue_cnt, unsigned long *ce_cnt)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
struct ras_err_data data = {0, 0};
 
+   *ue_cnt = 0;
+   *ce_cnt = 0;
+
if (!con)
-   return 0;
+   return;
 
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
@@ -877,13 +880,14 @@ unsigned long amdgpu_ras_query_error_count(struct 
amdgpu_device *adev,
};
 
if (amdgpu_ras_error_query(adev, &info))
-   return 0;
+   continue;
 
data.ce_count += info.ce_count;
data.ue_count += info.ue_count;
}
 
-   return is_ce ? data.ce_count : data.ue_count;
+   *ue_cnt = data.ue_count;
+   *ce_cnt = data.ce_count;
 }
 /* query/inject/cure end */
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index e7df5d8429f8..733eab5bc512 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -487,8 +487,8 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device 
*adev,  void amdgpu_ras_resume(struct amdgpu_device *adev);  void 
amdgpu_ras_suspend(struct amdgpu_device *adev);
 
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-   bool is_ce);
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+   unsigned long *ue_cnt, unsigned long *ce_cnt);
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

2020-05-14 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Jiange,
  How do we handle the case where multiple apps do the auto dump? This patch 
does not seem to be multi-process safe.

Best Regards
Dennis Li
From: amd-gfx  On Behalf Of Christian 
König
Sent: Thursday, May 14, 2020 4:29 PM
To: Zhao, Jiange ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Pelloux-prayer, Pierre-eric 
; Kuehling, Felix ; 
Liu, Monk ; Zhang, Hawking 
Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

Hi Jiange,

it probably won't hurt, but I would just drop that. You need roughly 4 billion 
GPU resets until the UINT_MAX-1 becomes zero again.

Christian.

Am 14.05.20 um 09:14 schrieb Zhao, Jiange:

[AMD Official Use Only - Internal Distribution Only]

Hi Christian,

wait_for_completion_interruptible_timeout() would decrease 
autodump.dumping.done to UINT_MAX-1.

complete_all() here would restore autodump.dumping to the state as in 
amdgpu_debugfs_autodump_init().

I want to make sure every open() deals with the same situation.

Jiange
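
One common way to address the multi-process concern is to let only a single
listener hold the node open at a time. A rough sketch under that assumption; the
autodump.in_use field does not exist in the patch and is invented here, and
rejecting a second open with -EBUSY is just one possible policy.

static int autodump_open_sketch(struct inode *inode, struct file *file)
{
	struct amdgpu_device *adev = inode->i_private;

	/* hypothetical atomic flag: only one listener at a time */
	if (atomic_cmpxchg(&adev->autodump.in_use, 0, 1) != 0)
		return -EBUSY;

	file->private_data = adev;
	reinit_completion(&adev->autodump.dumping);
	return 0;
}

static int autodump_release_sketch(struct inode *inode, struct file *file)
{
	struct amdgpu_device *adev = file->private_data;

	complete_all(&adev->autodump.dumping);
	atomic_set(&adev->autodump.in_use, 0);
	return 0;
}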

From: Christian König 

Sent: Thursday, May 14, 2020 3:01 PM
To: Zhao, Jiange ; 
amd-gfx@lists.freedesktop.org 

Cc: Pelloux-prayer, Pierre-eric 
;
 Zhao, Jiange ; Kuehling, 
Felix ; Deucher, 
Alexander ; 
Koenig, Christian ; 
Liu, Monk ; Zhang, Hawking 

Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

Am 14.05.20 um 07:29 schrieb jia...@amd.com:
> From: Jiange Zhao 
>
> When GPU got timeout, it would notify an interested part
> of an opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
>
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
>
> v2: (1) changed 'registered' to 'app_listening'
>  (2) add a mutex in open() to prevent race condition
>
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>rename debugfs file to amdgpu_autodump,
>provide autodump_read as well,
>style and code cleanups
>
> v4: add 'bool app_listening' to differentiate situations, so that
>  the node can be reopened; also, there is no need to wait for
>  completion when no app is waiting for a dump.
>
> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>  add 'app_state_mutex' for race conditions:
>(1)Only 1 user can open this file node
>(2)wait_dump() can only take effect after poll() executed.
>(3)eliminated the race condition between release() and
>   wait_dump()
>
> v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>  removed state checking in amdgpu_debugfs_wait_dump
>  Improve on top of version 3 so that the node can be reopened.
>
> v7: move reinit_completion into open() so that only one user
>  can open it.
>
> Signed-off-by: Jiange Zhao 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 79 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>   4 files changed, 88 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2a806cb55b78..9e8eeddfe7ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,8 @@ struct amdgpu_device {
>charproduct_number[16];
>charproduct_name[32];
>charserial[16];
> +
> + struct amdgpu_autodump  autodump;
>   };
>
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device 
> *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..efee3f1adecf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>   #include 
>   #include 
>   #include 
> -
> +#includ

RE: [PATCH] drm/amdgpu: Print CU information by default during initialization

2020-04-17 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Dennis Li 

-Original Message-
From: amd-gfx  On Behalf Of Yong Zhao
Sent: Saturday, April 18, 2020 5:46 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhao, Yong 
Subject: [PATCH] drm/amdgpu: Print CU information by default during 
initialization

This is convenient for multiple teams to obtain the information. Also, add 
device info by using dev_info().

Signed-off-by: Yong Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 71ea56e220ae..423eed223aa5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3170,7 +3170,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
goto failed;
}
 
-   DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
+   dev_info(adev->dev,
+   "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
adev->gfx.config.max_shader_engines,
adev->gfx.config.max_sh_per_se,
adev->gfx.config.max_cu_per_sh,
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: stop disable the scheduler during HW fini

2020-02-28 Thread Li, Dennis
[AMD Public Use]

Looks good to me

Tested-by: Dennis Li mailto:dennis...@amd.com>>

Best Regards
Dennis Li
From: amd-gfx  On Behalf Of Deucher, 
Alexander
Sent: Thursday, February 27, 2020 11:18 PM
To: Christian König ; Das, Nirmoy 
; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: stop disable the scheduler during HW fini


[AMD Public Use]

Looks good to me.
Reviewed-by: Alex Deucher 
mailto:alexander.deuc...@amd.com>>

From: Christian König 
mailto:ckoenig.leichtzumer...@gmail.com>>
Sent: Thursday, February 27, 2020 9:50 AM
To: Das, Nirmoy mailto:nirmoy@amd.com>>; 
amd-gfx@lists.freedesktop.org 
mailto:amd-gfx@lists.freedesktop.org>>; Deucher, 
Alexander mailto:alexander.deuc...@amd.com>>
Subject: Re: [PATCH] drm/amdgpu: stop disable the scheduler during HW fini

Alex any comment on this?

Am 25.02.20 um 14:16 schrieb Nirmoy:
> Acked-by: Nirmoy Das mailto:nirmoy@amd.com>>
>
> On 2/25/20 2:07 PM, Christian König wrote:
>> When we stop the HW for example for GPU reset we should not stop the
>> front-end scheduler. Otherwise we run into intermediate failures during
>> command submission.
>>
>> The scheduler should only be stopped in very few cases:
>> 1. We can't get the hardware working in ring or IB test after a GPU
>> reset.
>> 2. The KIQ scheduler is not used in the front-end and should be
>> disabled during GPU reset.
>> 3. In amdgpu_ring_fini() when the driver unloads.
>>
>> Signed-off-by: Christian König 
>> mailto:christian.koe...@amd.com>>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  2 --
>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c |  8 
>>   drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c  |  5 -
>>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c  | 25 +
>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  |  7 ---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  |  9 -
>>   drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c |  3 ---
>>   drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  2 --
>>   drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  2 --
>>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 
>>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c |  3 ---
>>   drivers/gpu/drm/amd/amdgpu/si_dma.c|  1 -
>>   drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c  |  3 ---
>>   drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c  |  3 ---
>>   drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c  |  3 ---
>>   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c  |  7 ---
>>   drivers/gpu/drm/amd/amdgpu/vce_v4_0.c  |  4 
>>   drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c  |  3 ---
>>   drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c  |  9 -
>>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c  | 11 +--
>>   20 files changed, 10 insertions(+), 104 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> index 4274ccf765de..cb3b3a0a1348 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> @@ -320,8 +320,6 @@ static void cik_sdma_gfx_stop(struct
>> amdgpu_device *adev)
>>   WREG32(mmSDMA0_GFX_RB_CNTL + sdma_offsets[i], rb_cntl);
>>   WREG32(mmSDMA0_GFX_IB_CNTL + sdma_offsets[i], 0);
>>   }
>> -sdma0->sched.ready = false;
>> -sdma1->sched.ready = false;
>>   }
>> /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 7b6158320400..36ce67ce4800 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -2391,10 +2391,6 @@ static int gfx_v10_0_cp_gfx_enable(struct
>> amdgpu_device *adev, bool enable)
>>   tmp = REG_SET_FIELD(tmp, CP_ME_CNTL, ME_HALT, enable ? 0 : 1);
>>   tmp = REG_SET_FIELD(tmp, CP_ME_CNTL, PFP_HALT, enable ? 0 : 1);
>>   tmp = REG_SET_FIELD(tmp, CP_ME_CNTL, CE_HALT, enable ? 0 : 1);
>> -if (!enable) {
>> -for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>> -adev->gfx.gfx_ring[i].sched.ready = false;
>> -}
>>   WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>> for (i = 0; i < adev->usec_timeout; i++) {
>> @@ -2869,16 +2865,12 @@ static int gfx_v10_0_cp_gfx_resume(struct
>> amdgpu_device *adev)
>> static void gfx_v10_0_cp_compute_enable(struct amdgpu_device
>> *adev, bool enable)
>>   {
>> -int i;
>> -
>>   if (enable) {
>>   WREG32_SOC15(GC, 0, mmCP_MEC_CNTL, 0);
>>   } else {
>>   WREG32_SOC15(GC, 0, mmCP_MEC_CNTL,
>>(CP_MEC_CNTL__MEC_ME1_HALT_MASK |
>> CP_MEC_CNTL__MEC_ME2_HALT_MASK));
>> -for (i = 0; i < adev->gfx.num_compute_rings; i++)
>> -adev->gfx.compute_ring[i].sched.ready = false;
>>   adev->gfx.kiq.ring.sched.ready = false;
>>   }
>>   udelay(50);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> index 31f44d05e606..e462a099dbda 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> +++ b/drivers/gpu/drm/a

RE: [PATCH] drm/amdgpu: fix a bug NULL pointer dereference

2020-02-20 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Christian and Monk,
  When doing an SDMA copy, a RAS uncorrectable error happens, which causes this 
issue. The RAS uncorrectable error event triggers the driver to do a BACO reset, 
which sets the SDMA scheduler's status to not ready. Then 
drm_sched_entity_get_free_sched returns NULL in drm_sched_entity_select_rq, which 
causes entity->rq to be NULL. 

Best Regards
Dennis Li
-Original Message-
From: Liu, Monk  
Sent: Wednesday, February 19, 2020 7:30 PM
To: Koenig, Christian ; Zhang, Hawking 
; Li, Dennis ; 
amd-gfx@lists.freedesktop.org; Deucher, Alexander ; 
Zhou1, Tao ; Chen, Guchun 
Subject: 回复: [PATCH] drm/amdgpu: fix a bug NULL pointer dereference

> + if (!entity->rq)
> + return 0;
> +

Yes, supposedly we shouldn't get the 'entity->rq == NULL' case; that looks like 
the true bug 

-Original Message-
From: amd-gfx  On Behalf Of Christian König
Sent: February 19, 2020 18:50
To: Zhang, Hawking ; Li, Dennis ; 
amd-gfx@lists.freedesktop.org; Deucher, Alexander ; 
Zhou1, Tao ; Chen, Guchun 
Subject: Re: [PATCH] drm/amdgpu: fix a bug NULL pointer dereference

Well, offhand this patch looks like a clear NAK to me.

Returning without raising an error is certainly the wrong thing to do here 
because we just drop the necessary page table updates.

How does entity->rq end up as NULL in the first place?

Regards,
Christian.
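
To make the objection concrete, a sketch of what propagating a failure could look
like in amdgpu_vm_sdma_commit instead of the silent "return 0" (whether -ENODEV is
the right error code was not settled in this thread):

	entity = p->direct ? &p->vm->direct : &p->vm->delayed;
	if (!entity->rq) {
		/* scheduler torn down (e.g. by a reset): tell the caller the
		 * page table update was not submitted instead of faking success
		 */
		return -ENODEV;
	}

	ring = container_of(entity->rq->sched, struct amdgpu_ring, sched);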

Am 19.02.20 um 07:26 schrieb Zhang, Hawking:
> [AMD Official Use Only - Internal Distribution Only]
>
> Reviewed-by: Hawking Zhang 
>
> Regards,
> Hawking
> -Original Message-
> From: Dennis Li 
> Sent: Wednesday, February 19, 2020 12:05
> To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
> ; Zhou1, Tao ; Zhang, 
> Hawking ; Chen, Guchun 
> Cc: Li, Dennis 
> Subject: [PATCH] drm/amdgpu: fix a bug NULL pointer dereference
>
> Check whether the entity's run queue is NULL to avoid a NULL pointer dereference.
>
> Change-Id: I08d56774012cf229ba2fe7a011c1359e8d1e2781
> Signed-off-by: Dennis Li 
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index 4cc7881f438c..67cca463ddcc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -95,6 +95,9 @@ static int amdgpu_vm_sdma_commit(struct 
> amdgpu_vm_update_params *p,
>   int r;
>   
>   entity = p->direct ? &p->vm->direct : &p->vm->delayed;
> + if (!entity->rq)
> + return 0;
> +
>   ring = container_of(entity->rq->sched, struct amdgpu_ring, sched);
>   
>   WARN_ON(ib->length_dw == 0);
> --
> 2.17.1
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 4/4] drm/amdgpu: add RAS support for the gfx block of Arcturus

2020-01-18 Thread Li, Dennis
[AMD Public Use]

Hi, Guchun,
  adev->grbm_idx_mutex is only used to protect access to registers 
whose instance switching is indexed by the GRBM index. 
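
The usual shape of that protection, sketched for illustration only (the loop body
is a placeholder; the point is that the per-instance select and the counter reads
stay inside the mutex, and broadcast mode is restored before the lock is dropped):

static void gfx_edc_walk_sketch(struct amdgpu_device *adev)
{
	u32 se, sh;

	mutex_lock(&adev->grbm_idx_mutex);
	for (se = 0; se < adev->gfx.config.max_shader_engines; se++) {
		for (sh = 0; sh < adev->gfx.config.max_sh_per_se; sh++) {
			/* route register access to one SE/SH instance */
			gfx_v9_0_select_se_sh(adev, se, sh, 0xffffffff);
			/* ... read per-instance EDC counters here ... */
		}
	}
	/* back to broadcast mode before releasing the lock */
	gfx_v9_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
	mutex_unlock(&adev->grbm_idx_mutex);
}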

Best Regards
Dennis Li
-Original Message-
From: Chen, Guchun  
Sent: Sunday, January 19, 2020 11:40 AM
To: Li, Dennis ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhou1, Tao ; Zhang, 
Hawking 
Cc: Li, Dennis 
Subject: RE: [PATCH 4/4] drm/amdgpu: add RAS support for the gfx block of 
Arcturus

[AMD Public Use]

+   switch (adev->asic_type)
+   {
Please correct the coding style: '{' should stay on the same line as the switch.
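
i.e. kernel coding style puts the opening brace on the switch line; with the
cases from the patch below, that would read:

	switch (adev->asic_type) {
	case CHIP_VEGA20:
		gfx_v9_0_clear_ras_edc_counter(adev);
		break;
	case CHIP_ARCTURUS:
		gfx_v9_4_clear_ras_edc_counter(adev);
		break;
	default:
		break;
	}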

+   mutex_unlock(&adev->grbm_idx_mutex);
+
+   gfx_v9_4_query_utc_edc_status(adev, err_data);
Is it necessary to move the gfx_v9_4_query_utc_edc_status call up so that it is 
protected by the lock? That could avoid problems with concurrent queries.
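
If so, the reordering would roughly look like this (illustrative only, based on
the hunk above):

	mutex_lock(&adev->grbm_idx_mutex);
	/* ... per-SE/SH EDC counter reads ... */
	/* keep the UTC/EDC status query under the same lock so two concurrent
	 * queries cannot interleave their GRBM index selections */
	gfx_v9_4_query_utc_edc_status(adev, err_data);
	mutex_unlock(&adev->grbm_idx_mutex);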

With the above two comments fixed, the series is: Reviewed-by: Guchun Chen 


Regards,
Guchun

-Original Message-
From: Dennis Li 
Sent: Sunday, January 19, 2020 10:46 AM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Zhou1, Tao ; Zhang, Hawking 
; Chen, Guchun 
Cc: Li, Dennis 
Subject: [PATCH 4/4] drm/amdgpu: add RAS support for the gfx block of Arcturus

Implement functions to do the RAS error injection and query EDC counter.

Change-Id: I4d947511331a19c1967551b9d42997698073f795
Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |   1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  26 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 978 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h |  35 +
 4 files changed, 1039 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 83ee1c676e3a..ccfdcfc6a526 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -120,6 +120,7 @@ amdgpu-y += \
amdgpu_rlc.o \
gfx_v8_0.o \
gfx_v9_0.o \
+   gfx_v9_4.o \
gfx_v10_0.o
 
 # add async DMA block
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 35b5ca7a9272..7c5b3ad25d51 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -48,6 +48,8 @@
 
 #include "amdgpu_ras.h"
 
+#include "gfx_v9_4.h"
+
 #define GFX9_NUM_GFX_RINGS 1
 #define GFX9_MEC_HPD_SIZE 4096
 #define RLCG_UCODE_LOADING_START_ADDRESS 0x2000L
@@ -1822,6 +1824,17 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
	.query_ras_error_count = &gfx_v9_0_query_ras_error_count
 };
 
+static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = {
+   .get_gpu_clock_counter = &gfx_v9_0_get_gpu_clock_counter,
+   .select_se_sh = &gfx_v9_0_select_se_sh,
+   .read_wave_data = &gfx_v9_0_read_wave_data,
+   .read_wave_sgprs = &gfx_v9_0_read_wave_sgprs,
+   .read_wave_vgprs = &gfx_v9_0_read_wave_vgprs,
+   .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
+   .ras_error_inject = &gfx_v9_4_ras_error_inject,
+   .query_ras_error_count = &gfx_v9_4_query_ras_error_count };
+
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)  {
u32 gb_addr_config;
@@ -1873,6 +1886,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device 
*adev)
gb_addr_config = RAVEN_GB_ADDR_CONFIG_GOLDEN;
break;
case CHIP_ARCTURUS:
+   adev->gfx.funcs = &gfx_v9_4_gfx_funcs;
adev->gfx.config.max_hw_contexts = 8;
adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
	adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
@@ -4232,7 +4246,17 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
goto fail;
}
 
-   gfx_v9_0_clear_ras_edc_counter(adev);
+   switch (adev->asic_type)
+   {
+   case CHIP_VEGA20:
+   gfx_v9_0_clear_ras_edc_counter(adev);
+   break;
+   case CHIP_ARCTURUS:
+   gfx_v9_4_clear_ras_edc_counter(adev);
+   break;
+   default:
+   break;
+   }
 
 fail:
amdgpu_ib_free(adev, &ib, NULL);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
new file mode 100644
index ..e19d275f3f7d
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
@@ -0,0 +1,978 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person 
+obtaining a
+ * copy of this software and associated documentation files (the 
+"Software"),
+ * to deal in the Software without restriction, including without 
+limitation
+ * the rights to use, copy, modify, merge

RE: [PATCH] drm/amdgpu: attempt to enable gfxoff on more raven1 boards

2020-01-15 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Alex,
  it would be better to refactor this into a common function, rather than one 
used only for Raven.
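
For example (just a sketch with a hypothetical name, reusing the table walk
from the patch below): the check could take the quirk list as a parameter so
other ASICs can reuse it:

	/* Hypothetical shared helper (illustrative only): match a device
	 * against any per-ASIC quirk table instead of hard-coding Raven. */
	static bool amdgpu_gfx_is_gfxoff_quirked(struct pci_dev *pdev,
						 const struct amdgpu_gfxoff_quirk *p)
	{
		while (p && p->chip_device != 0) {
			if (pdev->vendor == p->chip_vendor &&
			    pdev->device == p->chip_device &&
			    pdev->subsystem_vendor == p->subsys_vendor &&
			    pdev->subsystem_device == p->subsys_device &&
			    pdev->revision == p->revision)
				return true;
			++p;
		}
		return false;
	}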

Best Regards
Dennis Li
-Original Message-
From: amd-gfx  On Behalf Of Alex Deucher
Sent: Thursday, January 16, 2020 1:32 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: [PATCH] drm/amdgpu: attempt to enable gfxoff on more raven1 boards

Switch to a blacklist so we can disable specific boards that are problematic.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 42 ---
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index e3d466bd5c4e..b48b07bcd0fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1031,6 +1031,37 @@ static void gfx_v9_0_check_fw_write_wait(struct 
amdgpu_device *adev)
}
 }
 
+struct amdgpu_gfxoff_quirk {
+   u16 chip_vendor;
+   u16 chip_device;
+   u16 subsys_vendor;
+   u16 subsys_device;
+   u8 revision;
+};
+
+static const struct amdgpu_gfxoff_quirk amdgpu_gfxoff_quirk_list[] = {
+   /* https://bugzilla.kernel.org/show_bug.cgi?id=204689 */
+   { 0x1002, 0x15dd, 0x1002, 0x15dd, 0xc8 },
+   { 0, 0, 0, 0, 0 },
+};
+
+static bool gfx_v9_0_raven_check_disable_gfxoff(struct pci_dev *pdev) {
+   const struct amdgpu_gfxoff_quirk *p = amdgpu_gfxoff_quirk_list;
+
+   while (p && p->chip_device != 0) {
+   if (pdev->vendor == p->chip_vendor &&
+   pdev->device == p->chip_device &&
+   pdev->subsystem_vendor == p->subsys_vendor &&
+   pdev->subsystem_device == p->subsys_device &&
+   pdev->revision == p->revision) {
+   return true;
+   }
+   ++p;
+   }
+   return false;
+}
+
 static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev)  {
switch (adev->asic_type) {
@@ -1039,10 +1070,13 @@ static void gfx_v9_0_check_if_need_gfxoff(struct 
amdgpu_device *adev)
case CHIP_VEGA20:
break;
case CHIP_RAVEN:
-   if (!(adev->rev_id >= 0x8 ||
- adev->pdev->device == 0x15d8) &&
-   (adev->pm.fw_version < 0x41e2b || /* not raven1 fresh */
-!adev->gfx.rlc.is_rlc_v2_1)) /* without rlc save restore 
ucodes */
+   if (!(adev->rev_id >= 0x8 || adev->pdev->device == 0x15d8) &&
+   ((adev->gfx.rlc_fw_version != 106 &&
+ adev->gfx.rlc_fw_version < 531) ||
+(adev->gfx.rlc_fw_version == 53815) ||
+(adev->gfx.rlc_feature_version < 1) ||
+!adev->gfx.rlc.is_rlc_v2_1) &&
+   !gfx_v9_0_raven_check_disable_gfxoff(adev->pdev))
adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
 
if (adev->pm.pp_feature & PP_GFXOFF_MASK)
--
2.24.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 2/4] drm/amd/powerplay: Add EEPROM I2C read/write support to Arcturus.

2019-10-20 Thread Li, Dennis
To make the function's behavior clearer, it's better to change the function 
declaration from 

+ static void arcturus_fill_eeprom_i2c_req(SwI2cRequest_t  *req, bool write,
+ uint8_t address, uint32_t numbytes,
+ uint8_t *data)

to

+static void arcturus_fill_eeprom_i2c_req(SwI2cRequest_t  *req, bool 
write_or_read,
+ uint8_t address, uint32_t numbytes,
+ uint8_t *data)
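
Either way, a short comment on the flag would help at call sites, where a bare
"false" does not obviously mean "read". An illustrative kernel-doc sketch (not
part of the patch):

	/**
	 * arcturus_fill_eeprom_i2c_req - build a software I2C request for the SMU
	 * @req:      request structure to fill
	 * @write:    true for an EEPROM write transaction, false for a read
	 * @address:  I2C slave address
	 * @numbytes: number of command entries, including the two EEPROM address bytes
	 * @data:     bytes copied into the request (the first two are the EEPROM address)
	 */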

Best Regards
Dennis Li
-Original Message-
From: amd-gfx  On Behalf Of Andrey 
Grodzovsky
Sent: Saturday, October 19, 2019 4:48 AM
To: amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey ; Chen, Guchun 
; Zhou1, Tao ; 
noreply-conflue...@amd.com; Deucher, Alexander ; 
Quan, Evan 
Subject: [PATCH 2/4] drm/amd/powerplay: Add EEPROM I2C read/write support to 
Arcturus.

The communication is done through SMU table and hence the code is in powerplay.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 229 +++
 1 file changed, 229 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c 
b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
index 90d871a..53d08de5 100644
--- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
@@ -36,6 +36,11 @@
 #include "smu_v11_0_pptable.h"
 #include "arcturus_ppsmc.h"
 #include "nbio/nbio_7_4_sh_mask.h"
+#include 
+#include 
+#include "amdgpu_ras.h"
+
+#define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, 
+eeprom_control.eeprom_accessor))->adev
 
 #define CTF_OFFSET_EDGE 5
 #define CTF_OFFSET_HOTSPOT 5
@@ -171,6 +176,7 @@ static struct smu_11_0_cmn2aisc_mapping 
arcturus_table_map[SMU_TABLE_COUNT] = {
TAB_MAP(SMU_METRICS),
TAB_MAP(DRIVER_SMU_CONFIG),
TAB_MAP(OVERDRIVE),
+   TAB_MAP(I2C_COMMANDS),
 };
 
 static struct smu_11_0_cmn2aisc_mapping arcturus_pwr_src_map[SMU_POWER_SOURCE_COUNT] = {
@@ -293,6 +299,9 @@ static int arcturus_tables_init(struct smu_context *smu, struct smu_table *table
SMU_TABLE_INIT(tables, SMU_TABLE_SMU_METRICS, sizeof(SmuMetrics_t),
   PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
 
+   SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS, sizeof(SwI2cRequest_t),
+  PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
if (!smu_table->metrics_table)
return -ENOMEM;
@@ -1927,6 +1936,224 @@ static int arcturus_dpm_set_uvd_enable(struct 
smu_context *smu, bool enable)
return ret;
 }
 
+
+static void arcturus_fill_eeprom_i2c_req(SwI2cRequest_t  *req, bool write,
+ uint8_t address, uint32_t numbytes,
+ uint8_t *data)
+{
+   int i;
+
+   BUG_ON(numbytes > MAX_SW_I2C_COMMANDS);
+
+   req->I2CcontrollerPort = 0;
+   req->I2CSpeed = 2;
+   req->SlaveAddress = address;
+   req->NumCmds = numbytes;
+
+   for (i = 0; i < numbytes; i++) {
+   SwI2cCmd_t *cmd =  &req->SwI2cCmds[i];
+
+   /* First 2 bytes are always write for lower 2b EEPROM address */
+   if (i < 2)
+   cmd->Cmd = 1;
+   else
+   cmd->Cmd = write;
+
+
+   /* Add RESTART for read  after address filled */
+   cmd->CmdConfig |= (i == 2 && !write) ? CMDCONFIG_RESTART_MASK : 
0;
+
+   /* Add STOP in the end */
+   cmd->CmdConfig |= (i == (numbytes - 1)) ? CMDCONFIG_STOP_MASK : 
0;
+
+   /* Fill with data regardless if read or write to simplify code 
*/
+   cmd->RegisterAddr = data[i];
+   }
+}
+
+static int arcturus_i2c_eeprom_read_data(struct i2c_adapter *control,
+  uint8_t address,
+  uint8_t *data,
+  uint32_t numbytes)
+{
+   uint32_t  i, ret = 0;
+   SwI2cRequest_t req;
+   struct amdgpu_device *adev = to_amdgpu_device(control);
+   struct smu_table_context *smu_table = &adev->smu.smu_table;
+   struct smu_table *table = &smu_table->tables[SMU_TABLE_I2C_COMMANDS];
+
+   memset(&req, 0, sizeof(req));
+   arcturus_fill_eeprom_i2c_req(&req, false, address, numbytes, data);
+
+   mutex_lock(&adev->smu.mutex);
+   /* Now read data starting with that address */
+   ret = smu_update_table(&adev->smu, SMU_TABLE_I2C_COMMANDS, 0, &req,
+   true);
+   mutex_unlock(&adev->smu.mutex);
+
+   if (!ret) {
+   SwI2cRequest_t *res = (SwI2cRequest_t *)table->cpu_addr;
+
+   /* Assume SMU  fills res.SwI2cCmds[i].Data with read bytes */
+   for (i = 0; i < numbytes; i++)
+   data