[PATCH] drm/amdgpu: enable support for error injection broadcast to all instances

2021-06-11 Thread Dennis Li
When the address is -1, TA will do error injection for all instances of
the special SRAM.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 885a78301bbf..c828ce9525d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -402,8 +402,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
ret = amdgpu_ras_feature_enable(adev, , 1);
break;
case 2:
-   if ((data.inject.address >= adev->gmc.mc_vram_size) ||
-   (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+   if ((data.inject.address != (uint64_t)-1) &&
+   ((data.inject.address >= adev->gmc.mc_vram_size) ||
+   (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT))) {
dev_warn(adev->dev, "RAS WARN: input address "
"0x%llx is invalid.",
data.inject.address);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdkfd: fix a resource leakage issue

2021-05-18 Thread Dennis Li
The function kfd_lookup_process_by_pasid will increase the reference
count of the kfd_process object, so its caller should call
kfd_unref_process to decrease the reference count. Otherwise a resource
leak will happen.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 3c9fe078334a..6cc6afb96a45 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1100,4 +1100,6 @@ void kfd_signal_poison_consumed_event(struct kfd_dev 
*dev, u32 pasid)
 
/* user application will handle SIGBUS signal */
send_sig(SIGBUS, p->lead_thread, 0);
+
+   kfd_unref_process(p);
 }
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdkfd: refine the poison data consumption handling

2021-05-11 Thread Dennis Li
User applications may register the KFD_EVENT_TYPE_HW_EXCEPTION and
KFD_EVENT_TYPE_MEMORY events, so the driver can notify them when poison
data is consumed. Besides that, some applications may register a SIGBUS
signal handler. These applications will handle poison data by themselves,
exiting or re-creating their context to re-dispatch their work.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index ba2c2ce0c55a..4d210f23c33c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1050,3 +1050,42 @@ void kfd_signal_reset_event(struct kfd_dev *dev)
}
srcu_read_unlock(_processes_srcu, idx);
 }
+
+void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid)
+{
+   struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+   struct kfd_hsa_memory_exception_data memory_exception_data;
+   struct kfd_hsa_hw_exception_data hw_exception_data;
+   struct kfd_event *ev;
+   uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+
+   if (!p)
+   return; /* Presumably process exited. */
+
+   memset(_exception_data, 0, sizeof(hw_exception_data));
+   hw_exception_data.gpu_id = dev->id;
+   hw_exception_data.memory_lost = 1;
+   hw_exception_data.reset_cause = KFD_HW_EXCEPTION_ECC;
+
+   memset(_exception_data, 0, sizeof(memory_exception_data));
+   memory_exception_data.ErrorType = KFD_MEM_ERR_POISON_CONSUMED;
+   memory_exception_data.gpu_id = dev->id;
+   memory_exception_data.failure.imprecise = true;
+
+   mutex_lock(>event_mutex);
+   idr_for_each_entry_continue(>event_idr, ev, id) {
+   if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
+   ev->hw_exception_data = hw_exception_data;
+   set_event(ev);
+   }
+
+   if (ev->type == KFD_EVENT_TYPE_MEMORY) {
+   ev->memory_exception_data = memory_exception_data;
+   set_event(ev);
+   }
+   }
+   mutex_unlock(>event_mutex);
+
+   /* user application will handle SIGBUS signal */
+   send_sig(SIGBUS, p->lead_thread, 0);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 97c36e3c8c80..9f9b1dfb9c37 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -230,7 +230,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
sq_intr_err);
if (sq_intr_err != 
SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != 
SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-   kfd_signal_hw_exception_event(pasid);
+   kfd_signal_poison_consumed_event(dev, 
pasid);
amdgpu_amdkfd_gpu_reset(dev->kgd);
return;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 64552f6b8ba4..daa9d47514c6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1144,6 +1144,8 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 
pasid,
 
 void kfd_signal_reset_event(struct kfd_dev *dev);
 
+void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid);
+
 void kfd_flush_tlb(struct kfd_process_device *pdd);
 
 int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: add synchronization among waves in the same threadgroup

2021-05-11 Thread Dennis Li
It is possible that the previous waves have exited before others are
created, so the other waves may reuse physical resources left by
previous ones. Therefore add a barrier instruction to synchronize waves
within the same threadgroup.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index fdd65589f06b..dbad9ef002d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -93,98 +93,99 @@ static const struct soc15_reg_golden 
golden_settings_gc_9_4_2_alde[] = {
 static const u32 vgpr_init_compute_shader_aldebaran[] = {
0xb8840904, 0xb8851a04, 0xb8861344, 0xb8831804, 0x9208ff06, 0x0280,
0x9209a805, 0x920a8a04, 0x81080908, 0x81080a08, 0x81080308, 0x8e078208,
-   0x81078407, 0xc0410080, 0x0007, 0xbf8c, 0xd3d94000, 0x1880,
-   0xd3d94001, 0x1880, 0xd3d94002, 0x1880, 0xd3d94003, 0x1880,
-   0xd3d94004, 0x1880, 0xd3d94005, 0x1880, 0xd3d94006, 0x1880,
-   0xd3d94007, 0x1880, 0xd3d94008, 0x1880, 0xd3d94009, 0x1880,
-   0xd3d9400a, 0x1880, 0xd3d9400b, 0x1880, 0xd3d9400c, 0x1880,
-   0xd3d9400d, 0x1880, 0xd3d9400e, 0x1880, 0xd3d9400f, 0x1880,
-   0xd3d94010, 0x1880, 0xd3d94011, 0x1880, 0xd3d94012, 0x1880,
-   0xd3d94013, 0x1880, 0xd3d94014, 0x1880, 0xd3d94015, 0x1880,
-   0xd3d94016, 0x1880, 0xd3d94017, 0x1880, 0xd3d94018, 0x1880,
-   0xd3d94019, 0x1880, 0xd3d9401a, 0x1880, 0xd3d9401b, 0x1880,
-   0xd3d9401c, 0x1880, 0xd3d9401d, 0x1880, 0xd3d9401e, 0x1880,
-   0xd3d9401f, 0x1880, 0xd3d94020, 0x1880, 0xd3d94021, 0x1880,
-   0xd3d94022, 0x1880, 0xd3d94023, 0x1880, 0xd3d94024, 0x1880,
-   0xd3d94025, 0x1880, 0xd3d94026, 0x1880, 0xd3d94027, 0x1880,
-   0xd3d94028, 0x1880, 0xd3d94029, 0x1880, 0xd3d9402a, 0x1880,
-   0xd3d9402b, 0x1880, 0xd3d9402c, 0x1880, 0xd3d9402d, 0x1880,
-   0xd3d9402e, 0x1880, 0xd3d9402f, 0x1880, 0xd3d94030, 0x1880,
-   0xd3d94031, 0x1880, 0xd3d94032, 0x1880, 0xd3d94033, 0x1880,
-   0xd3d94034, 0x1880, 0xd3d94035, 0x1880, 0xd3d94036, 0x1880,
-   0xd3d94037, 0x1880, 0xd3d94038, 0x1880, 0xd3d94039, 0x1880,
-   0xd3d9403a, 0x1880, 0xd3d9403b, 0x1880, 0xd3d9403c, 0x1880,
-   0xd3d9403d, 0x1880, 0xd3d9403e, 0x1880, 0xd3d9403f, 0x1880,
-   0xd3d94040, 0x1880, 0xd3d94041, 0x1880, 0xd3d94042, 0x1880,
-   0xd3d94043, 0x1880, 0xd3d94044, 0x1880, 0xd3d94045, 0x1880,
-   0xd3d94046, 0x1880, 0xd3d94047, 0x1880, 0xd3d94048, 0x1880,
-   0xd3d94049, 0x1880, 0xd3d9404a, 0x1880, 0xd3d9404b, 0x1880,
-   0xd3d9404c, 0x1880, 0xd3d9404d, 0x1880, 0xd3d9404e, 0x1880,
-   0xd3d9404f, 0x1880, 0xd3d94050, 0x1880, 0xd3d94051, 0x1880,
-   0xd3d94052, 0x1880, 0xd3d94053, 0x1880, 0xd3d94054, 0x1880,
-   0xd3d94055, 0x1880, 0xd3d94056, 0x1880, 0xd3d94057, 0x1880,
-   0xd3d94058, 0x1880, 0xd3d94059, 0x1880, 0xd3d9405a, 0x1880,
-   0xd3d9405b, 0x1880, 0xd3d9405c, 0x1880, 0xd3d9405d, 0x1880,
-   0xd3d9405e, 0x1880, 0xd3d9405f, 0x1880, 0xd3d94060, 0x1880,
-   0xd3d94061, 0x1880, 0xd3d94062, 0x1880, 0xd3d94063, 0x1880,
-   0xd3d94064, 0x1880, 0xd3d94065, 0x1880, 0xd3d94066, 0x1880,
-   0xd3d94067, 0x1880, 0xd3d94068, 0x1880, 0xd3d94069, 0x1880,
-   0xd3d9406a, 0x1880, 0xd3d9406b, 0x1880, 0xd3d9406c, 0x1880,
-   0xd3d9406d, 0x1880, 0xd3d9406e, 0x1880, 0xd3d9406f, 0x1880,
-   0xd3d94070, 0x1880, 0xd3d94071, 0x1880, 0xd3d94072, 0x1880,
-   0xd3d94073, 0x1880, 0xd3d94074, 0x1880, 0xd3d94075, 0x1880,
-   0xd3d94076, 0x1880, 0xd3d94077, 0x1880, 0xd3d94078, 0x1880,
-   0xd3d94079, 0x1880, 0xd3d9407a, 0x1880, 0xd3d9407b, 0x1880,
-   0xd3d9407c, 0x1880, 0xd3d9407d, 0x1880, 0xd3d9407e, 0x1880,
-   0xd3d9407f, 0x1880, 0xd3d94080, 0x1880, 0xd3d94081, 0x1880,
-   0xd3d94082, 0x1880, 0xd3d94083, 0x1880, 0xd3d94084, 0x1880,
-   0xd3d94085, 0x1880, 0xd3d94086, 0x1880, 0xd3d94087, 0x1880,
-   0xd3d94088, 0x1880, 0xd3d94089, 0x1880, 0xd3d9408a, 0x1880,
-   0xd3d9408b, 0x1880, 0xd3d9408c, 0x1880, 0xd3d9408d, 0x1880,
-   0xd3d9408e, 0x1880, 0xd3d9408f, 0x1880, 0xd3d94090, 0x1880,
-   0xd3d94091, 0x1880, 0xd3d94092, 0x1880, 0xd3d94093, 0x1880,
-   0xd3d94094, 0x1880, 0xd3d94095, 0x1880, 0xd3d94096, 0x1880,
-   0xd3d94097, 0x1880, 0xd3d94098, 0x1880, 0xd3d94099, 0x1880,
-   0xd3d9409a, 0x1880

[PATCH] drm/amdgpu: add function to clear MMEA error status for aldebaran

2021-05-10 Thread Dennis Li
For Aldebaran, hardware will not clear the error status automatically
when reading the error status register; instead, the driver should
explicitly set the clear bit of the error status register to clear the
error status.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 11aa29933c1f..b27fcbccce2b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -28,6 +28,7 @@ struct amdgpu_mmhub_ras_funcs {
  void *ras_error_status);
void (*query_ras_error_status)(struct amdgpu_device *adev);
void (*reset_ras_error_count)(struct amdgpu_device *adev);
+   void (*reset_ras_error_status)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_mmhub_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4eebb97994d6..a324dc2da101 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -938,6 +938,10 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
if (adev->mmhub.ras_funcs &&
adev->mmhub.ras_funcs->reset_ras_error_count)
adev->mmhub.ras_funcs->reset_ras_error_count(adev);
+
+   if (adev->mmhub.ras_funcs &&
+   adev->mmhub.ras_funcs->reset_ras_error_status)
+   adev->mmhub.ras_funcs->reset_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->reset_ras_error_count)
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
index e1500be4a208..998e674f9369 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
@@ -1314,12 +1314,31 @@ static void mmhub_v1_7_query_ras_error_status(struct 
amdgpu_device *adev)
}
 }
 
+static void mmhub_v1_7_reset_ras_error_status(struct amdgpu_device *adev)
+{
+   int i;
+   uint32_t reg_value;
+
+   if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB))
+   return;
+
+   for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_ea_err_status_regs); i++) {
+   reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
+   mmhub_v1_7_ea_err_status_regs[i]));
+   reg_value = REG_SET_FIELD(reg_value, MMEA0_ERR_STATUS,
+ CLEAR_ERROR_STATUS, 0x01);
+   WREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_ea_err_status_regs[i]),
+  reg_value);
+   }
+}
+
 const struct amdgpu_mmhub_ras_funcs mmhub_v1_7_ras_funcs = {
.ras_late_init = amdgpu_mmhub_ras_late_init,
.ras_fini = amdgpu_mmhub_ras_fini,
.query_ras_error_count = mmhub_v1_7_query_ras_error_count,
.reset_ras_error_count = mmhub_v1_7_reset_ras_error_count,
.query_ras_error_status = mmhub_v1_7_query_ras_error_status,
+   .reset_ras_error_status = mmhub_v1_7_reset_ras_error_status,
 };
 
 const struct amdgpu_mmhub_funcs mmhub_v1_7_funcs = {
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: correct the function to clear GCEA error status

2021-05-10 Thread Dennis Li
The bit 11 of GCEA_ERR_STATUS register is used to clear GCEA error
status.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index e943cd2923ac..c63599686708 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1674,13 +1674,16 @@ static void gfx_v9_4_2_reset_utc_err_status(struct 
amdgpu_device *adev)
 static void gfx_v9_4_2_reset_ea_err_status(struct amdgpu_device *adev)
 {
uint32_t i, j;
+   uint32_t value;
+
+   value = REG_SET_FIELD(0, GCEA_ERR_STATUS, CLEAR_ERROR_STATUS, 0x1);
 
mutex_lock(>grbm_idx_mutex);
for (i = 0; i < gfx_v9_4_2_ea_err_status_regs.se_num; i++) {
for (j = 0; j < gfx_v9_4_2_ea_err_status_regs.instance;
 j++) {
gfx_v9_4_2_select_se_sh(adev, i, 0, j);
-   
WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), 0x10);
+   
WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), value);
}
}
gfx_v9_4_2_select_se_sh(adev, 0x, 0x, 0x);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: covert ras status to kernel errno

2021-05-09 Thread Dennis Li
The original code uses RAS status and kernel errno together in the same
function, which is a wrong code style.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 17b728d2c1f2..231479b67b33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1114,6 +1114,28 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
return ret;
 }
 
+static int psp_ras_status_to_errno(struct amdgpu_device *adev,
+enum ta_ras_status ras_status)
+{
+   int ret = -EINVAL;
+
+   switch (ras_status) {
+   case TA_RAS_STATUS__SUCCESS:
+   ret = 0;
+   break;
+   case TA_RAS_STATUS__RESET_NEEDED:
+   ret = -EAGAIN;
+   break;
+   case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE:
+   dev_warn(adev->dev, "RAS WARN: ras function unavailable\n");
+   break;
+   default:
+   dev_err(adev->dev, "RAS ERROR: ras function failed ret 0x%X\n", 
ret);
+   }
+
+   return ret;
+}
+
 int psp_ras_enable_features(struct psp_context *psp,
union ta_ras_cmd_input *info, bool enable)
 {
@@ -1137,7 +1159,7 @@ int psp_ras_enable_features(struct psp_context *psp,
if (ret)
return -EINVAL;
 
-   return ras_cmd->ras_status;
+   return psp_ras_status_to_errno(psp->adev, ras_cmd->ras_status);
 }
 
 static int psp_ras_terminate(struct psp_context *psp)
@@ -1220,7 +1242,7 @@ int psp_ras_trigger_error(struct psp_context *psp,
if (amdgpu_ras_intr_triggered())
return 0;
 
-   return ras_cmd->ras_status;
+   return psp_ras_status_to_errno(psp->adev, ras_cmd->ras_status);
 }
 // ras end
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ebbe2c5190c4..9b06cb58cff2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -586,29 +586,6 @@ struct ras_manager *amdgpu_ras_find_obj(struct 
amdgpu_device *adev,
 }
 /* obj end */
 
-static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev,
-const char* invoke_type,
-const char* block_name,
-enum ta_ras_status ret)
-{
-   switch (ret) {
-   case TA_RAS_STATUS__SUCCESS:
-   return;
-   case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE:
-   dev_warn(adev->dev,
-   "RAS WARN: %s %s currently unavailable\n",
-   invoke_type,
-   block_name);
-   break;
-   default:
-   dev_err(adev->dev,
-   "RAS ERROR: %s %s error failed ret 0x%X\n",
-   invoke_type,
-   block_name,
-   ret);
-   }
-}
-
 /* feature ctl begin */
 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
struct ras_common_if *head)
@@ -705,15 +682,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
if (!amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(>psp, info, enable);
if (ret) {
-   amdgpu_ras_parse_status_code(adev,
-enable ? 
"enable":"disable",
-ras_block_str(head->block),
-   (enum ta_ras_status)ret);
-   if (ret == TA_RAS_STATUS__RESET_NEEDED)
-   ret = -EAGAIN;
-   else
-   ret = -EINVAL;
-
+   dev_err(adev->dev, "ras %s %s failed %d\n",
+   enable ? "enable":"disable",
+   ras_block_str(head->block),
+   ret);
goto out;
}
}
@@ -1058,10 +1030,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
ret = -EINVAL;
}
 
-   amdgpu_ras_parse_status_code(adev,
-"inject",
-ras_block_str(info->head.block),
-(enum ta_ras_status)ret);
+   if (ret)
+   dev_err(adev->dev, "ras inject %s failed %d\n",
+   ras_block_str(info->head.block), ret);
 
return ret;
 }
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: update the shader to clear specific SGPRs

2021-05-06 Thread Dennis Li
Add shader code to explicitly clear specific SGPRs, such as
flat_scratch_lo, flat_scratch_hi and so on. Also correct the
allocation size of SGPRs in PGM_RSRC1.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 025b1e42e31b..8ad6717e67d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -220,23 +220,24 @@ static const u32 sgpr112_init_compute_shader_aldebaran[] 
= {
0x9209a805, 0x920a8a04, 0x81080908, 0x81080a08, 0x81080308, 0x8e078208,
0x81078407, 0xc0410080, 0x0007, 0xbf8c, 0xbf8e003f, 0xc0030200,
0x, 0xbf8c, 0xbf06ff08, 0xdeadbeaf, 0xbf84fff9, 0x81028102,
-   0xc0410080, 0x0007, 0xbf8c, 0xbefc0080, 0xbe880080, 0xbe890080,
-   0xbe8a0080, 0xbe8b0080, 0xbe8c0080, 0xbe8d0080, 0xbe8e0080, 0xbe8f0080,
-   0xbe900080, 0xbe910080, 0xbe920080, 0xbe930080, 0xbe940080, 0xbe950080,
-   0xbe960080, 0xbe970080, 0xbe980080, 0xbe990080, 0xbe9a0080, 0xbe9b0080,
-   0xbe9c0080, 0xbe9d0080, 0xbe9e0080, 0xbe9f0080, 0xbea00080, 0xbea10080,
-   0xbea20080, 0xbea30080, 0xbea40080, 0xbea50080, 0xbea60080, 0xbea70080,
-   0xbea80080, 0xbea90080, 0xbeaa0080, 0xbeab0080, 0xbeac0080, 0xbead0080,
-   0xbeae0080, 0xbeaf0080, 0xbeb00080, 0xbeb10080, 0xbeb20080, 0xbeb30080,
-   0xbeb40080, 0xbeb50080, 0xbeb60080, 0xbeb70080, 0xbeb80080, 0xbeb90080,
-   0xbeba0080, 0xbebb0080, 0xbebc0080, 0xbebd0080, 0xbebe0080, 0xbebf0080,
-   0xbec00080, 0xbec10080, 0xbec20080, 0xbec30080, 0xbec40080, 0xbec50080,
-   0xbec60080, 0xbec70080, 0xbec80080, 0xbec90080, 0xbeca0080, 0xbecb0080,
-   0xbecc0080, 0xbecd0080, 0xbece0080, 0xbecf0080, 0xbed00080, 0xbed10080,
-   0xbed20080, 0xbed30080, 0xbed40080, 0xbed50080, 0xbed60080, 0xbed70080,
-   0xbed80080, 0xbed90080, 0xbeda0080, 0xbedb0080, 0xbedc0080, 0xbedd0080,
-   0xbede0080, 0xbedf0080, 0xbee00080, 0xbee10080, 0xbee20080, 0xbee30080,
-   0xbee40080, 0xbee50080, 0xbf81
+   0xc0410080, 0x0007, 0xbf8c, 0xbefc0080, 0xbeea0080, 0xbeeb0080,
+   0xbf00f280, 0xbee60080, 0xbee70080, 0xbee80080, 0xbee90080, 0xbefe0080,
+   0xbeff0080, 0xbe880080, 0xbe890080, 0xbe8a0080, 0xbe8b0080, 0xbe8c0080,
+   0xbe8d0080, 0xbe8e0080, 0xbe8f0080, 0xbe900080, 0xbe910080, 0xbe920080,
+   0xbe930080, 0xbe940080, 0xbe950080, 0xbe960080, 0xbe970080, 0xbe980080,
+   0xbe990080, 0xbe9a0080, 0xbe9b0080, 0xbe9c0080, 0xbe9d0080, 0xbe9e0080,
+   0xbe9f0080, 0xbea00080, 0xbea10080, 0xbea20080, 0xbea30080, 0xbea40080,
+   0xbea50080, 0xbea60080, 0xbea70080, 0xbea80080, 0xbea90080, 0xbeaa0080,
+   0xbeab0080, 0xbeac0080, 0xbead0080, 0xbeae0080, 0xbeaf0080, 0xbeb00080,
+   0xbeb10080, 0xbeb20080, 0xbeb30080, 0xbeb40080, 0xbeb50080, 0xbeb60080,
+   0xbeb70080, 0xbeb80080, 0xbeb90080, 0xbeba0080, 0xbebb0080, 0xbebc0080,
+   0xbebd0080, 0xbebe0080, 0xbebf0080, 0xbec00080, 0xbec10080, 0xbec20080,
+   0xbec30080, 0xbec40080, 0xbec50080, 0xbec60080, 0xbec70080, 0xbec80080,
+   0xbec90080, 0xbeca0080, 0xbecb0080, 0xbecc0080, 0xbecd0080, 0xbece0080,
+   0xbecf0080, 0xbed00080, 0xbed10080, 0xbed20080, 0xbed30080, 0xbed40080,
+   0xbed50080, 0xbed60080, 0xbed70080, 0xbed80080, 0xbed90080, 0xbeda0080,
+   0xbedb0080, 0xbedc0080, 0xbedd0080, 0xbede0080, 0xbedf0080, 0xbee00080,
+   0xbee10080, 0xbee20080, 0xbee30080, 0xbee40080, 0xbee50080, 0xbf81,
 };
 
 const struct soc15_reg_entry sgpr112_init_regs_aldebaran[] = {
@@ -244,7 +245,7 @@ const struct soc15_reg_entry sgpr112_init_regs_aldebaran[] 
= {
{ SOC15_REG_ENTRY(GC, 0, regCOMPUTE_NUM_THREAD_X), 0x40 },
{ SOC15_REG_ENTRY(GC, 0, regCOMPUTE_NUM_THREAD_Y), 8 },
{ SOC15_REG_ENTRY(GC, 0, regCOMPUTE_NUM_THREAD_Z), 1 },
-   { SOC15_REG_ENTRY(GC, 0, regCOMPUTE_PGM_RSRC1), 0x2c0 },
+   { SOC15_REG_ENTRY(GC, 0, regCOMPUTE_PGM_RSRC1), 0x340 },
{ SOC15_REG_ENTRY(GC, 0, regCOMPUTE_PGM_RSRC2), 0x6 },
{ SOC15_REG_ENTRY(GC, 0, regCOMPUTE_PGM_RSRC3), 0x0 },
{ SOC15_REG_ENTRY(GC, 0, regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0x 
},
@@ -262,21 +263,22 @@ static const u32 sgpr96_init_compute_shader_aldebaran[] = 
{
0x9209a805, 0x920a8a04, 0x81080908, 0x81080a08, 0x81080308, 0x8e078208,
0x81078407, 0xc0410080, 0x0007, 0xbf8c, 0xbf8e003f, 0xc0030200,
0x, 0xbf8c, 0xbf06ff08, 0xdeadbeaf, 0xbf84fff9, 0x81028102,
-   0xc0410080, 0x0007, 0xbf8c, 0xbefc0080, 0xbe880080, 0xbe890080,
-   0xbe8a0080, 0xbe8b0080, 0xbe8c0080, 0xbe8d0080, 0xbe8e0080, 0xbe8f0080,
-   0xbe900080, 0xbe910080, 0xbe920080, 0xbe930080, 0xbe940080, 0xbe950080,
-   0xbe960080, 0xbe970080, 0xbe980080, 0xbe990080, 0xbe9a0080, 0xbe9b0080,
-   0xbe9c0080, 0xbe9d0080, 0xbe9e0080, 0xbe9f0080, 0xbea00080, 0xbea10080,
-   0xbea20080, 0xbea30080, 0xbea40080, 0xbea50080, 0xbea60080, 0xbea70080

[PATCH] drm/amdgpu: fix no full coverage issue for gprs initialization

2021-04-27 Thread Dennis Li
The number of waves is changed to 8, so it is impossible to use the old
solution to cover all SGPRs.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index a2fe2dac32c1..2e6789a7dc46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -328,7 +328,7 @@ int amdgpu_ib_pool_init(struct amdgpu_device *adev)
 
for (i = 0; i < AMDGPU_IB_POOL_MAX; i++) {
if (i == AMDGPU_IB_POOL_DIRECT)
-   size = PAGE_SIZE * 2;
+   size = PAGE_SIZE * 6;
else
size = AMDGPU_IB_POOL_SIZE;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index d17e57dea178..77948c033c45 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -32,6 +32,11 @@
 #include "amdgpu_ras.h"
 #include "amdgpu_gfx.h"
 
+#define SE_ID_MAX 8
+#define CU_ID_MAX 16
+#define SIMD_ID_MAX 4
+#define WAVE_ID_MAX 10
+
 enum gfx_v9_4_2_utc_type {
VML2_MEM,
VML2_WALKER_MEM,
@@ -81,100 +86,100 @@ static const struct soc15_reg_golden 
golden_settings_gc_9_4_2_alde[] = {
 };
 
 static const u32 vgpr_init_compute_shader_aldebaran[] = {
-   0xb8840904, 0xb8851a04, 0xb8861344, 0x9207c006, 0x92088405, 0x81070807,
-   0x81070407, 0x8e078207, 0xbe88008f, 0xc0410200, 0x0007, 0xd3d94000,
-   0x1880, 0xd3d94001, 0x1880, 0xd3d94002, 0x1880, 0xd3d94003,
-   0x1880, 0xd3d94004, 0x1880, 0xd3d94005, 0x1880, 0xd3d94006,
-   0x1880, 0xd3d94007, 0x1880, 0xd3d94008, 0x1880, 0xd3d94009,
-   0x1880, 0xd3d9400a, 0x1880, 0xd3d9400b, 0x1880, 0xd3d9400c,
-   0x1880, 0xd3d9400d, 0x1880, 0xd3d9400e, 0x1880, 0xd3d9400f,
-   0x1880, 0xd3d94010, 0x1880, 0xd3d94011, 0x1880, 0xd3d94012,
-   0x1880, 0xd3d94013, 0x1880, 0xd3d94014, 0x1880, 0xd3d94015,
-   0x1880, 0xd3d94016, 0x1880, 0xd3d94017, 0x1880, 0xd3d94018,
-   0x1880, 0xd3d94019, 0x1880, 0xd3d9401a, 0x1880, 0xd3d9401b,
-   0x1880, 0xd3d9401c, 0x1880, 0xd3d9401d, 0x1880, 0xd3d9401e,
-   0x1880, 0xd3d9401f, 0x1880, 0xd3d94020, 0x1880, 0xd3d94021,
-   0x1880, 0xd3d94022, 0x1880, 0xd3d94023, 0x1880, 0xd3d94024,
-   0x1880, 0xd3d94025, 0x1880, 0xd3d94026, 0x1880, 0xd3d94027,
-   0x1880, 0xd3d94028, 0x1880, 0xd3d94029, 0x1880, 0xd3d9402a,
-   0x1880, 0xd3d9402b, 0x1880, 0xd3d9402c, 0x1880, 0xd3d9402d,
-   0x1880, 0xd3d9402e, 0x1880, 0xd3d9402f, 0x1880, 0xd3d94030,
-   0x1880, 0xd3d94031, 0x1880, 0xd3d94032, 0x1880, 0xd3d94033,
-   0x1880, 0xd3d94034, 0x1880, 0xd3d94035, 0x1880, 0xd3d94036,
-   0x1880, 0xd3d94037, 0x1880, 0xd3d94038, 0x1880, 0xd3d94039,
-   0x1880, 0xd3d9403a, 0x1880, 0xd3d9403b, 0x1880, 0xd3d9403c,
-   0x1880, 0xd3d9403d, 0x1880, 0xd3d9403e, 0x1880, 0xd3d9403f,
-   0x1880, 0xd3d94040, 0x1880, 0xd3d94041, 0x1880, 0xd3d94042,
-   0x1880, 0xd3d94043, 0x1880, 0xd3d94044, 0x1880, 0xd3d94045,
-   0x1880, 0xd3d94046, 0x1880, 0xd3d94047, 0x1880, 0xd3d94048,
-   0x1880, 0xd3d94049, 0x1880, 0xd3d9404a, 0x1880, 0xd3d9404b,
-   0x1880, 0xd3d9404c, 0x1880, 0xd3d9404d, 0x1880, 0xd3d9404e,
-   0x1880, 0xd3d9404f, 0x1880, 0xd3d94050, 0x1880, 0xd3d94051,
-   0x1880, 0xd3d94052, 0x1880, 0xd3d94053, 0x1880, 0xd3d94054,
-   0x1880, 0xd3d94055, 0x1880, 0xd3d94056, 0x1880, 0xd3d94057,
-   0x1880, 0xd3d94058, 0x1880, 0xd3d94059, 0x1880, 0xd3d9405a,
-   0x1880, 0xd3d9405b, 0x1880, 0xd3d9405c, 0x1880, 0xd3d9405d,
-   0x1880, 0xd3d9405e, 0x1880, 0xd3d9405f, 0x1880, 0xd3d94060,
-   0x1880, 0xd3d94061, 0x1880, 0xd3d94062, 0x1880, 0xd3d94063,
-   0x1880, 0xd3d94064, 0x1880, 0xd3d94065, 0x1880, 0xd3d94066,
-   0x1880, 0xd3d94067, 0x1880, 0xd3d94068, 0x1880, 0xd3d94069,
-   0x1880, 0xd3d9406a, 0x1880, 0xd3d9406b, 0x1880, 0xd3d9406c,
-   0x1880, 0xd3d9406d, 0x1880, 0xd3d9406e, 0x1880, 0xd3d9406f,
-   0x1880, 0xd3d94070, 0x1880, 0xd3d94071, 0x1880, 0xd3d94072,
-   0x1880, 0xd3d94073, 0x1880, 0xd3d94074, 0x1880, 0xd3d94075,
-   0x1880, 0xd3d94076, 0x1880, 0xd3d94077, 0x1880, 0xd3d94078,
-   0x1880, 0xd3d94079, 0x1880, 0xd3d9407a, 0x1880, 0xd3d9407b,
-   0x1880, 0xd3d9407c, 0x1880, 0xd3d9407d, 0x1880, 0xd3d9407e,
-   0x1880, 0xd3d9407f, 0x1880, 0xd3d94080, 0x1880, 0xd3d94081,
-   0x1880, 0xd3d94082, 0x1880, 0xd3d94083,

[PATCH] drm/amdgpu: refine gprs init shaders to check coverage

2021-04-20 Thread Dennis Li
Add code to check whether all SIMDs are covered, to make sure that all
GPRs are initialized.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9889bd495ba5..9e629f239288 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4656,8 +4656,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct 
amdgpu_device *adev)
if (!ring->sched.ready)
return 0;
 
-   if (adev->asic_type == CHIP_ARCTURUS ||
-   adev->asic_type == CHIP_ALDEBARAN) {
+   if (adev->asic_type == CHIP_ARCTURUS) {
vgpr_init_shader_ptr = vgpr_init_compute_shader_arcturus;
vgpr_init_shader_size = 
sizeof(vgpr_init_compute_shader_arcturus);
vgpr_init_regs_ptr = vgpr_init_regs_arcturus;
@@ -4924,7 +4923,11 @@ static int gfx_v9_0_ecc_late_init(void *handle)
}
 
/* requires IBs so do in late init after IB pool is initialized */
-   r = gfx_v9_0_do_edc_gpr_workarounds(adev);
+   if (adev->asic_type == CHIP_ALDEBARAN)
+   r = gfx_v9_4_2_do_edc_gpr_workarounds(adev);
+   else
+   r = gfx_v9_0_do_edc_gpr_workarounds(adev);
+
if (r)
return r;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 9ca76a3ac38c..798c0e178201 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -22,6 +22,7 @@
  */
 #include "amdgpu.h"
 #include "soc15.h"
+#include "soc15d.h"
 
 #include "gc/gc_9_4_2_offset.h"
 #include "gc/gc_9_4_2_sh_mask.h"
@@ -79,6 +80,377 @@ static const struct soc15_reg_golden 
golden_settings_gc_9_4_2_alde[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, regTCI_CNTL_3, 0xff, 0x20),
 };
 
+static const u32 vgpr_init_compute_shader_aldebaran[] = {
+   0xb8840904, 0xb8851a04, 0xb8861344, 0x9207c006, 0x92088405, 0x81070807,
+   0x81070407, 0x8e078207, 0xbe88008f, 0xc0410200, 0x0007, 0xd3d94000,
+   0x1880, 0xd3d94001, 0x1880, 0xd3d94002, 0x1880, 0xd3d94003,
+   0x1880, 0xd3d94004, 0x1880, 0xd3d94005, 0x1880, 0xd3d94006,
+   0x1880, 0xd3d94007, 0x1880, 0xd3d94008, 0x1880, 0xd3d94009,
+   0x1880, 0xd3d9400a, 0x1880, 0xd3d9400b, 0x1880, 0xd3d9400c,
+   0x1880, 0xd3d9400d, 0x1880, 0xd3d9400e, 0x1880, 0xd3d9400f,
+   0x1880, 0xd3d94010, 0x1880, 0xd3d94011, 0x1880, 0xd3d94012,
+   0x1880, 0xd3d94013, 0x1880, 0xd3d94014, 0x1880, 0xd3d94015,
+   0x1880, 0xd3d94016, 0x1880, 0xd3d94017, 0x1880, 0xd3d94018,
+   0x1880, 0xd3d94019, 0x1880, 0xd3d9401a, 0x1880, 0xd3d9401b,
+   0x1880, 0xd3d9401c, 0x1880, 0xd3d9401d, 0x1880, 0xd3d9401e,
+   0x1880, 0xd3d9401f, 0x1880, 0xd3d94020, 0x1880, 0xd3d94021,
+   0x1880, 0xd3d94022, 0x1880, 0xd3d94023, 0x1880, 0xd3d94024,
+   0x1880, 0xd3d94025, 0x1880, 0xd3d94026, 0x1880, 0xd3d94027,
+   0x1880, 0xd3d94028, 0x1880, 0xd3d94029, 0x1880, 0xd3d9402a,
+   0x1880, 0xd3d9402b, 0x1880, 0xd3d9402c, 0x1880, 0xd3d9402d,
+   0x1880, 0xd3d9402e, 0x1880, 0xd3d9402f, 0x1880, 0xd3d94030,
+   0x1880, 0xd3d94031, 0x1880, 0xd3d94032, 0x1880, 0xd3d94033,
+   0x1880, 0xd3d94034, 0x1880, 0xd3d94035, 0x1880, 0xd3d94036,
+   0x1880, 0xd3d94037, 0x1880, 0xd3d94038, 0x1880, 0xd3d94039,
+   0x1880, 0xd3d9403a, 0x1880, 0xd3d9403b, 0x1880, 0xd3d9403c,
+   0x1880, 0xd3d9403d, 0x1880, 0xd3d9403e, 0x1880, 0xd3d9403f,
+   0x1880, 0xd3d94040, 0x1880, 0xd3d94041, 0x1880, 0xd3d94042,
+   0x1880, 0xd3d94043, 0x1880, 0xd3d94044, 0x1880, 0xd3d94045,
+   0x1880, 0xd3d94046, 0x1880, 0xd3d94047, 0x1880, 0xd3d94048,
+   0x1880, 0xd3d94049, 0x1880, 0xd3d9404a, 0x1880, 0xd3d9404b,
+   0x1880, 0xd3d9404c, 0x1880, 0xd3d9404d, 0x1880, 0xd3d9404e,
+   0x1880, 0xd3d9404f, 0x1880, 0xd3d94050, 0x1880, 0xd3d94051,
+   0x1880, 0xd3d94052, 0x1880, 0xd3d94053, 0x1880, 0xd3d94054,
+   0x1880, 0xd3d94055, 0x1880, 0xd3d94056, 0x1880, 0xd3d94057,
+   0x1880, 0xd3d94058, 0x1880, 0xd3d94059, 0x1880, 0xd3d9405a,
+   0x1880, 0xd3d9405b, 0x1880, 0xd3d9405c, 0x1880, 0xd3d9405d,
+   0x1880, 0xd3d9405e, 0x1880, 0xd3d9405f, 0x1880, 0xd3d94060,
+   0x1880, 0xd3d94061, 0x1880, 0xd3d94062, 0x1880, 0xd3d94063,
+   0x1880, 0xd3d94064, 0x1880, 0xd3d94065, 0x1880, 0xd3d94066,
+   0x1880, 0xd3d94067, 0x1880, 0xd3d94068, 0x1880, 0xd3d94069,
+   0x1880, 0xd3d9406a, 0x1880, 0xd3d9406b, 0x1880, 0

[PATCH] drm/amdgpu: fix a error injection failed issue

2021-04-16 Thread Dennis Li
Because "sscanf(str, "retire_page")" always returns 0, if an application
uses the raw data for error injection, it always wrongly falls into
"op == 3". Change to use strstr instead.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 38a691a3b600..7438d4e84776 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -221,7 +221,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file 
*f,
op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2;
-   else if (sscanf(str, "retire_page") == 0)
+   else if (strstr(str, "retire_page") != NULL)
op = 3;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdkfd: add edc error interrupt handle for poison propagate mode

2021-04-15 Thread Dennis Li
In poison propagate mode, when the driver receives the EDC error
interrupt from SQ, it should kill the process (looked up by PASID) which
is using the poison data, and then trigger a GPU reset.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 1c20458f3962..696944fa0177 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -25,6 +25,70 @@
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_smi_events.h"
+#include "amdgpu.h"
+
+enum SQ_INTERRUPT_WORD_ENCODING {
+   SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0,
+   SQ_INTERRUPT_WORD_ENCODING_INST,
+   SQ_INTERRUPT_WORD_ENCODING_ERROR,
+};
+
+enum SQ_INTERRUPT_ERROR_TYPE {
+   SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0,
+   SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST,
+   SQ_INTERRUPT_ERROR_TYPE_MEMVIOL,
+   SQ_INTERRUPT_ERROR_TYPE_EDC_FED,
+};
+
+/* SQ_INTERRUPT_WORD_AUTO_CTXID */
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE__SHIFT 0
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT__SHIFT 1
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL__SHIFT 2
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP__SHIFT 3
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP__SHIFT 4
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW__SHIFT 5
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW__SHIFT 6
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW__SHIFT 7
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR__SHIFT 8
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID__SHIFT 24
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING__SHIFT 26
+
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_MASK 0x0001
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT_MASK 0x0002
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL_MASK 0x0004
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP_MASK 0x0008
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP_MASK 0x0010
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW_MASK 0x0020
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW_MASK 0x0040
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW_MASK 0x0080
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR_MASK 0x0100
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID_MASK 0x0300
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING_MASK 0x0c00
+
+/* SQ_INTERRUPT_WORD_WAVE_CTXID */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA__SHIFT 0
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID__SHIFT 12
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV__SHIFT 13
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID__SHIFT 14
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID__SHIFT 18
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID__SHIFT 20
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID__SHIFT 24
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING__SHIFT 26
+
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA_MASK 0x0fff
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID_MASK 0x1000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK 0x2000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID_MASK 0x0003c000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID_MASK 0x000c
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID_MASK 0x00f0
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID_MASK 0x0300
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING_MASK 0x0c00
+
+#define KFD_CONTEXT_ID_GET_SQ_INT_DATA(ctx0, ctx1) 
\
+   ((ctx0 & 0xfff) | ((ctx0 >> 16) & 0xf000) | ((ctx1 << 16) & 0xff))
+
+#define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF0
+#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
 
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -108,13 +172,15 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
 {
uint16_t source_id, client_id, pasid, vmid;
-   uint32_t context_id;
+   uint32_t context_id0, context_id1;
+   uint32_t sq_intr_err, sq_int_data, encoding;
 
source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
-   context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+   context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+   context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry);
 
if (client_id == SOC15_IH_CLIENTID_GRBM_CP ||
client_id == SOC15_IH_CLIENTID_SE0SH ||
@@ -122,10 +188,59 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
client_id == SOC15_IH_CLIENTID_SE2SH ||
client_id == SOC15_IH_CLIENTID_SE3SH

[PATCH 4/4] drm/amdkfd: add reset lock protection for kfd entry functions

2021-03-18 Thread Dennis Li
When doing GPU reset, try to block all kfd functions, including
kfd ioctls and the file close function, which may access hardware.

v2: fix a potential recursive locking issue

kfd_ioctl_dbg_register has chance called into pqm_create_queue, which
will cause recursive locking. So remove locking read_lock from process
queue manager, and add read_lock into related ioctls instead.

v3: put pqm_query_dev_by_qid under the protection of p->mutex

Signed-off-by: Dennis Li 
Acked-by: Christian König 

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 6802c616e10e..283ba9435233 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -40,6 +40,7 @@
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
+#include "amdgpu.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -298,6 +299,9 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
}
 
mutex_lock(>mutex);
+   err = amdgpu_read_lock(dev->ddev, true);
+   if (err)
+   goto err_read_lock;
 
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
@@ -326,6 +330,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
 */
args->doorbell_offset |= doorbell_offset_in_process;
 
+   amdgpu_read_unlock(dev->ddev);
mutex_unlock(>mutex);
 
pr_debug("Queue id %d was created successfully\n", args->queue_id);
@@ -343,6 +348,8 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
 
 err_create_queue:
 err_bind_process:
+   amdgpu_read_unlock(dev->ddev);
+err_read_lock:
mutex_unlock(>mutex);
return err;
 }
@@ -352,6 +359,7 @@ static int kfd_ioctl_destroy_queue(struct file *filp, 
struct kfd_process *p,
 {
int retval;
struct kfd_ioctl_destroy_queue_args *args = data;
+   struct kfd_dev *dev;
 
pr_debug("Destroying queue id %d for pasid 0x%x\n",
args->queue_id,
@@ -359,8 +367,20 @@ static int kfd_ioctl_destroy_queue(struct file *filp, 
struct kfd_process *p,
 
mutex_lock(>mutex);
 
+   dev = pqm_query_dev_by_qid(>pqm, args->queue_id);
+   if (!dev) {
+   retval = -EINVAL;
+   goto err_query_dev;
+   }
+
+   retval = amdgpu_read_lock(dev->ddev, true);
+   if (retval)
+   goto err_read_lock;
retval = pqm_destroy_queue(>pqm, args->queue_id);
+   amdgpu_read_unlock(dev->ddev);
 
+err_read_lock:
+err_query_dev:
mutex_unlock(>mutex);
return retval;
 }
@@ -371,6 +391,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct 
kfd_process *p,
int retval;
struct kfd_ioctl_update_queue_args *args = data;
struct queue_properties properties;
+   struct kfd_dev *dev;
 
if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) {
pr_err("Queue percentage must be between 0 to 
KFD_MAX_QUEUE_PERCENTAGE\n");
@@ -404,10 +425,21 @@ static int kfd_ioctl_update_queue(struct file *filp, 
struct kfd_process *p,
 
mutex_lock(>mutex);
 
+   dev = pqm_query_dev_by_qid(>pqm, args->queue_id);
+   if (!dev) {
+   retval = -EINVAL;
+   goto err_query_dev;
+   }
+
+   retval = amdgpu_read_lock(dev->ddev, true);
+   if (retval)
+   goto err_read_lock;
retval = pqm_update_queue(>pqm, args->queue_id, );
+   amdgpu_read_unlock(dev->ddev);
 
+err_read_lock:
+err_query_dev:
mutex_unlock(>mutex);
-
return retval;
 }
 
@@ -420,6 +452,7 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct 
kfd_process *p,
struct queue_properties properties;
uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr;
size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32);
+   struct kfd_dev *dev;
 
if ((args->num_cu_mask % 32) != 0) {
pr_debug("num_cu_mask 0x%x must be a multiple of 32",
@@ -456,8 +489,20 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct 
kfd_process *p,
 
mutex_lock(>mutex);
 
+   dev = pqm_query_dev_by_qid(>pqm, args->queue_id);
+   if (!dev) {
+   retval = -EINVAL;
+   goto err_query_dev;
+   }
+
+   retval = amdgpu_read_lock(dev->ddev, true);
+   if (retval)
+   goto err_read_lock;
retval = pqm_set_cu_mask(>pqm, args->queue_id, );
+   amdgpu_read_unlock(dev->ddev);
 
+err_read_lock:
+err_query_dev:
mutex_unlock(>mutex);
 
if (retval

[PATCH 3/4] drm/amdgpu: instead of using down/up_read directly

2021-03-18 Thread Dennis Li
change to use amdgpu_read_lock/unlock which could handle more cases

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index bcaf271b39bf..66dec0f49c4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -59,11 +59,12 @@ int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
 static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
 {
struct amdgpu_device *adev = inode->i_private;
+   struct drm_device *dev = adev_to_drm(adev);
int ret;
 
file->private_data = adev;
 
-   ret = down_read_killable(>reset_sem);
+   ret = amdgpu_read_lock(dev, true);
if (ret)
return ret;
 
@@ -74,7 +75,7 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, 
struct file *file)
ret = -EBUSY;
}
 
-   up_read(>reset_sem);
+   amdgpu_read_unlock(dev);
 
return ret;
 }
@@ -1206,7 +1207,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   r = down_read_killable(>reset_sem);
+   r = amdgpu_read_lock(dev, true);
if (r)
return r;
 
@@ -1235,7 +1236,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
kthread_unpark(ring->sched.thread);
}
 
-   up_read(>reset_sem);
+   amdgpu_read_unlock(dev);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1427,6 +1428,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
struct amdgpu_ring *ring;
struct dma_fence **fences = NULL;
struct amdgpu_device *adev = (struct amdgpu_device *)data;
+   struct drm_device *dev = adev_to_drm(adev);
 
if (val >= AMDGPU_MAX_RINGS)
return -EINVAL;
@@ -1446,7 +1448,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   r = down_read_killable(>reset_sem);
+   r = amdgpu_read_lock(dev, true);
if (r)
goto pro_end;
 
@@ -1489,7 +1491,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
 
-   up_read(>reset_sem);
+   amdgpu_read_unlock(dev);
 
ttm_bo_unlock_delayed_workqueue(>mman.bdev, resched);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 3ee481557fc9..113c63bf187f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -247,12 +247,13 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, 
virt);
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
+   struct drm_device *dev = adev_to_drm(adev);
 
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 * otherwise the mailbox msg will be ruined/reseted by
 * the VF FLR.
 */
-   if (!down_read_trylock(>reset_sem))
+   if (amdgpu_read_lock(dev, true))
return;
 
amdgpu_virt_fini_data_exchange(adev);
@@ -268,7 +269,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
 
 flr_done:
atomic_set(>in_gpu_reset, 0);
-   up_read(>reset_sem);
+   amdgpu_read_unlock(dev);
 
/* Trigger recovery for world switch failure if no TDR */
if (amdgpu_device_should_recover_gpu(adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 48e588d3c409..2cd910e5caa7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -268,12 +268,13 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, 
virt);
int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
+   struct drm_device *dev = adev_to_drm(adev);
 
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 * otherwise the mailbox msg will be ruined/reseted by
 * the VF FLR.
 */
-   if (!down_read_trylock(>reset_sem))
+   if (amdgpu_read_lock(dev, true))
return;
 
amdgpu_virt_fini_data_exchange(adev);
@@ -289,7 +290,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
 
 flr_done:
atomic_set(>in_gpu_reset, 0);
-   up_read(>reset_sem);
+   amdgpu_read_unlock(

[PATCH 2/4] drm/amdgpu: refine the GPU recovery sequence

2021-03-18 Thread Dennis Li
Changed to only set in_gpu_reset to 1 when the recovery thread begins,
and to delay holding reset_sem until after pre-reset but before reset. This makes
sure that other threads have exited or been blocked before doing the GPU reset.
Compared with the old code, this lets some threads exit earlier,
without waiting for a timeout.

Introduce an event, recovery_fini_event, which is used to block new threads
when the recovery thread has begun. These threads are only woken up when the recovery
thread exits.

v2: remove codes to check the usage of adev->reset_sem, because lockdep
will show all locks held in the system, when system detect hung timeout
in the recovery thread.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 02a34f9a26aa..67c716e5ee8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1044,6 +1044,8 @@ struct amdgpu_device {
atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
struct rw_semaphore reset_sem;
+   wait_queue_head_t recovery_fini_event;
+
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
@@ -1406,4 +1408,8 @@ static inline int amdgpu_in_reset(struct amdgpu_device 
*adev)
 {
return atomic_read(>in_gpu_reset);
 }
+
+int amdgpu_read_lock(struct drm_device *dev, bool interruptible);
+void amdgpu_read_unlock(struct drm_device *dev);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 24ff5992cb02..15235610cc54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -211,6 +211,60 @@ static ssize_t amdgpu_device_get_serial_number(struct 
device *dev,
 static DEVICE_ATTR(serial_number, S_IRUGO,
amdgpu_device_get_serial_number, NULL);
 
+int amdgpu_read_lock(struct drm_device *dev, bool interruptible)
+{
+   struct amdgpu_device *adev = drm_to_adev(dev);
+   int ret = 0;
+
+   /**
+* if a thread hold the read lock, but recovery thread has started,
+* it should release the read lock and wait for recovery thread finished
+* Because pre-reset functions have begun, which stops old threads but 
no
+* include the current thread.
+   */
+   if (interruptible) {
+   while (!(ret = down_read_killable(>reset_sem)) &&
+   amdgpu_in_reset(adev)) {
+   up_read(>reset_sem);
+   ret = 
wait_event_interruptible(adev->recovery_fini_event,
+   !amdgpu_in_reset(adev));
+   if (ret)
+   break;
+   }
+   } else {
+   down_read(>reset_sem);
+   while (amdgpu_in_reset(adev)) {
+   up_read(>reset_sem);
+   wait_event(adev->recovery_fini_event,
+  !amdgpu_in_reset(adev));
+   down_read(>reset_sem);
+   }
+   }
+
+   return ret;
+}
+
+void amdgpu_read_unlock(struct drm_device *dev)
+{
+   struct amdgpu_device *adev = drm_to_adev(dev);
+
+   up_read(>reset_sem);
+}
+
+static void amdgpu_write_lock(struct amdgpu_device *adev, struct 
amdgpu_hive_info *hive)
+{
+   if (hive) {
+   down_write_nest_lock(>reset_sem, >hive_lock);
+   } else {
+   down_write(>reset_sem);
+   }
+}
+
+static void amdgpu_write_unlock(struct amdgpu_device *adev)
+{
+   up_write(>reset_sem);
+}
+
 /**
  * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control
  *
@@ -3280,6 +3334,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
hash_init(adev->mn_hash);
atomic_set(>in_gpu_reset, 0);
init_rwsem(>reset_sem);
+   init_waitqueue_head(>recovery_fini_event);
mutex_init(>psp.mutex);
mutex_init(>notifier_lock);
 
@@ -4509,39 +4564,18 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
return r;
 }
 
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
-   struct amdgpu_hive_info *hive)
+static bool amdgpu_device_recovery_enter(struct amdgpu_device *adev)
 {
if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0)
return false;
 
-   if (hive) {
-   down_write_nest_lock(>reset_sem, >hive_lock);
-   } else {
-   down_write(>reset_sem);
-   }
-
-   switch (amdgpu_asic_reset_method(adev)) {
-   case AMD_RESET_METHOD_MODE1:
-   adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
-   break;
-   case AMD_RESET_METHOD_MODE2:
-   adev->mp1_state = PP_MP1_STATE_RESET;
-   break;
-

[PATCH 1/4] drm/amdgpu: remove reset lock from low level functions

2021-03-18 Thread Dennis Li
It is easy to cause performance drop issue when using lock in low level
functions.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0b1e0127056f..24ff5992cb02 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -374,13 +374,10 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
-   amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(>reset_sem)) {
+   amdgpu_sriov_runtime(adev))
ret = amdgpu_kiq_rreg(adev, reg);
-   up_read(>reset_sem);
-   } else {
+   else
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
-   }
} else {
ret = adev->pcie_rreg(adev, reg * 4);
}
@@ -459,13 +456,10 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
-   amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(>reset_sem)) {
+   amdgpu_sriov_runtime(adev))
amdgpu_kiq_wreg(adev, reg, v);
-   up_read(>reset_sem);
-   } else {
+   else
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
-   }
} else {
adev->pcie_wreg(adev, reg * 4, v);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index a05dbbbd9803..9f6eaca107ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -155,11 +155,7 @@ static int __update_table_header(struct 
amdgpu_ras_eeprom_control *control,
 
msg.addr = control->i2c_address;
 
-   /* i2c may be unstable in gpu reset */
-   down_read(>reset_sem);
ret = i2c_transfer(>pm.smu_i2c, , 1);
-   up_read(>reset_sem);
-
if (ret < 1)
DRM_ERROR("Failed to write EEPROM table header, ret:%d", ret);
 
@@ -546,11 +542,7 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
control->next_addr += EEPROM_TABLE_RECORD_SIZE;
}
 
-   /* i2c may be unstable in gpu reset */
-   down_read(>reset_sem);
ret = i2c_transfer(>pm.smu_i2c, msgs, num);
-   up_read(>reset_sem);
-
if (ret < 1) {
DRM_ERROR("Failed to process EEPROM table records, ret:%d", 
ret);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 33e54eed2eec..690f368ce378 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -317,8 +317,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
 * Directly use kiq to do the vm invalidation instead
 */
if (adev->gfx.kiq.ring.sched.ready &&
-   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-   down_read_trylock(>reset_sem)) {
+   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) {
struct amdgpu_vmhub *hub = >vmhub[vmhub];
const unsigned eng = 17;
u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, 
flush_type);
@@ -328,7 +327,6 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
1 << vmid);
 
-   up_read(>reset_sem);
return;
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 1567dd227f51..ec3c05360776 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -757,14 +757,12 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
 * as GFXOFF under bare metal
 */
if (adev->gfx.kiq.ring.sched.ready &&
-   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-   down_read_trylock(>reset_sem)) {
+   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) {
uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
 
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
   1 << vmid);
-   up

[PATCH 0/4] Refine GPU recovery sequence to enhance its stability

2021-03-18 Thread Dennis Li
We have defined two variables, in_gpu_reset and reset_sem, in the adev object. The
atomic variable in_gpu_reset is used to prevent the recovery thread from reentering and
to make lower-level functions return earlier when recovery starts, but it couldn't
block other threads from accessing hardware. The r/w semaphore reset_sem is
used to solve these synchronization issues between the recovery thread and other
threads.

The original solution locked register access in lower-level functions, which
introduces the following issues:

1) many lower-level functions are used in both the recovery thread and others. Firstly, we
must audit all of these functions, and it is easy to miss some. Secondly, these
functions need to select which lock (read lock or write lock) to use,
according to the thread they run in. If the thread context isn't
considered, the added lock will easily introduce a deadlock. Besides that, most of the
time, developers easily forget to add locks for new functions.

2) performance drops, because lower-level functions are called more frequently.

3) it easily introduces false-positive lockdep complaints, because the write lock has
a big range in the recovery thread, while the read locks held by low-level functions may
be protected by other locks in other threads.

Therefore the new solution tries to add lock protection around the kfd ioctls.
Its goal is that no threads except for the recovery thread or its
children (for xgmi) access hardware while doing GPU reset and resume. So
refine the recovery thread as follows:

Step 0: atomic_cmpxchg(&adev->in_gpu_reset, 0, 1)
   1). if it fails, it means the system already has a recovery thread running, and the current
thread exits directly;
   2). if it succeeds, enter the recovery thread;

Step 1: cancel all delayed works, stop the drm scheduler, complete all unreceived
fences and so on. This step tries to stop or pause other threads.

Step 2: call down_write(&adev->reset_sem) to take the write lock, which will block the
recovery thread until other threads release their read locks.

Step 3: normally, only the recovery thread is running to access hardware now, so it
is safe to do the gpu reset.

Step 4: do post gpu reset work, such as calling all IPs' resume functions;

Step 5: atomically set adev->in_gpu_reset to 0, wake up other threads and release the
write lock. The recovery thread exits normally.

Other threads call amdgpu_read_lock to synchronize with the recovery thread. If
a thread finds that in_gpu_reset is 1, it releases the read lock if it holds
one, and then blocks itself to wait for the recovery-finished event. If the thread
successfully takes the read lock and in_gpu_reset is 0, it continues. It will exit
normally or be stopped by the recovery thread in step 1.

Dennis Li (4):
  drm/amdgpu: remove reset lock from low level functions
  drm/amdgpu: refine the GPU recovery sequence
  drm/amdgpu: instead of using down/up_read directly
  drm/amdkfd: add reset lock protection for kfd entry functions

 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   |  14 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 173 +-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c|   8 -
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|   4 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |   9 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |   5 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |   5 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 172 -
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   4 +
 .../amd/amdkfd/kfd_process_queue_manager.c|  17 ++
 12 files changed, 345 insertions(+), 75 deletions(-)

-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: block hardware accessed by other threads when doing gpu recovery

2021-03-01 Thread Dennis Li
When the GPU recovery thread is doing a GPU reset, it is unsafe for other
threads to access hardware concurrently, which could cause the GPU reset
to hang randomly.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 1624c2bc8285..c71d3bba5f69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1033,6 +1033,7 @@ struct amdgpu_device {
atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
struct rw_semaphore reset_sem;
+   struct thread_info *recovery_thread;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
@@ -1385,4 +1386,13 @@ static inline int amdgpu_in_reset(struct amdgpu_device 
*adev)
 {
return atomic_read(>in_gpu_reset);
 }
+
+static inline bool amdgpu_in_recovery_thread(struct amdgpu_device *adev)
+{
+   if (unlikely(adev->recovery_thread != NULL) &&
+   adev->recovery_thread == current_thread_info())
+   return true;
+
+   return false;
+}
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 71805dfd9e25..7c17a5468d43 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -401,13 +401,22 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, 
uint32_t offset)
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t 
value)
 {
+   bool locked;
+
if (adev->in_pci_err_recovery)
return;
 
+   locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+   if (locked)
+   down_read(>reset_sem);
+
if (offset < adev->rmmio_size)
writeb(value, adev->rmmio + offset);
else
BUG();
+
+   if (locked)
+   up_read(>reset_sem);
 }
 
 /**
@@ -424,15 +433,19 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
uint32_t reg, uint32_t v,
uint32_t acc_flags)
 {
+   bool locked;
+
if (adev->in_pci_err_recovery)
return;
 
+   locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+   if (locked)
+   down_read(>reset_sem);
+
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
-   amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(>reset_sem)) {
+   amdgpu_sriov_runtime(adev)) {
amdgpu_kiq_wreg(adev, reg, v);
-   up_read(>reset_sem);
} else {
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
}
@@ -440,6 +453,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
adev->pcie_wreg(adev, reg * 4, v);
}
 
+   if (locked)
+   up_read(>reset_sem);
+
trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 }
 
@@ -451,9 +467,15 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 uint32_t reg, uint32_t v)
 {
+   bool locked;
+
if (adev->in_pci_err_recovery)
return;
 
+   locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+   if (locked)
+   down_read(>reset_sem);
+
if (amdgpu_sriov_fullaccess(adev) &&
adev->gfx.rlc.funcs &&
adev->gfx.rlc.funcs->is_rlcg_access_range) {
@@ -462,6 +484,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
} else {
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
}
+
+   if (locked)
+   up_read(>reset_sem);
 }
 
 /**
@@ -496,15 +521,24 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
+   bool locked;
+
if (adev->in_pci_err_recovery)
return;
 
+   locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+   if (locked)
+   down_read(>reset_sem);
+
if ((reg * 4) < adev->rio_mem_size)
iowrite32(v, adev->rio_mem + (reg * 4));
else {
iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
}
+
+   if (locked)
+   up_read(>reset_sem);
 }
 
 /**
@@ -679,6 +713,11 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device 
*adev,
unsigned long flags;
void __iomem *pcie_index_offset;
void __iomem *pcie_data_offset;
+   bool locked;
+
+   locked

[PATCH v2] drm/amdgpu: remove unnecessary read of the EEPROM header

2021-02-25 Thread Dennis Li
If the number of bad-page records exceeds the threshold, the driver has
updated both the EEPROM header and control->tbl_hdr.header before the GPU reset,
therefore the GPU recovery thread has no need to read the EEPROM header directly.

v2: merge amdgpu_ras_check_err_threshold into 
amdgpu_ras_eeprom_check_err_threshold

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f0f7ed42ee7f..f2ff10403d93 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4397,7 +4397,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
 * bad_page_threshold value to fix this once
 * probing driver again.
 */
-   if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+   if 
(!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
/* must succeed. */
amdgpu_ras_resume(tmp_adev);
} else {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 09546dec40ff..c669435ccc74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2189,19 +2189,3 @@ bool amdgpu_ras_need_emergency_restart(struct 
amdgpu_device *adev)
 
return false;
 }
-
-bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
-{
-   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-   bool exc_err_limit = false;
-
-   if (con && (amdgpu_bad_page_threshold != 0))
-   amdgpu_ras_eeprom_check_err_threshold(>eeprom_control,
-   _err_limit);
-
-   /*
-* We are only interested in variable exc_err_limit,
-* as it says if GPU is in bad state or not.
-*/
-   return exc_err_limit;
-}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index aed0716efa5a..42aab9adc263 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -491,8 +491,6 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);
 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
bool is_ce);
 
-bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev);
-
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 19d9aa76cfbf..7f527f8bbdb1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -434,47 +434,21 @@ static uint32_t __correct_eeprom_dest_address(uint32_t 
curr_address)
return curr_address;
 }
 
-int amdgpu_ras_eeprom_check_err_threshold(
-   struct amdgpu_ras_eeprom_control *control,
-   bool *exceed_err_limit)
+bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
 {
-   struct amdgpu_device *adev = to_amdgpu_device(control);
-   unsigned char buff[EEPROM_ADDRESS_SIZE +
-   EEPROM_TABLE_HEADER_SIZE] = { 0 };
-   struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr;
-   struct i2c_msg msg = {
-   .addr = control->i2c_address,
-   .flags = I2C_M_RD,
-   .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
-   .buf = buff,
-   };
-   int ret;
-
-   *exceed_err_limit = false;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
if (!__is_ras_eeprom_supported(adev))
-   return 0;
-
-   /* read EEPROM table header */
-   mutex_lock(>tbl_mutex);
-   ret = i2c_transfer(>pm.smu_i2c, , 1);
-   if (ret < 1) {
-   dev_err(adev->dev, "Failed to read EEPROM table header.\n");
-   goto err;
-   }
-
-   __decode_table_header_from_buff(hdr, [2]);
+   return false;
 
-   if (hdr->header == EEPROM_TABLE_HDR_BAD) {
+   if (con->eeprom_control.tbl_hdr.header == EEPROM_TABLE_HDR_BAD) {
dev_warn(adev->dev, "This GPU is in BAD status.");
dev_warn(adev->dev, "Please retire it or setting one bigger "
"threshold value when reloading driver.\n");
-   *exceed_err_limit = true;
+   return true;
}
 
-err:
-   mutex_unlock(>tbl_mutex);
-   return 0;
+   return false;
 }
 
 int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
di

[PATCH] drm/amdgpu: remove unnecessary reading for epprom header

2021-02-25 Thread Dennis Li
If the number of badpage records exceeds the threshold, the driver has
updated both the eeprom header and control->tbl_hdr.header before gpu reset,
therefore the GPU recovery thread no longer needs to read the eeprom header directly.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 19d9aa76cfbf..4310ad63890c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -439,41 +439,19 @@ int amdgpu_ras_eeprom_check_err_threshold(
bool *exceed_err_limit)
 {
struct amdgpu_device *adev = to_amdgpu_device(control);
-   unsigned char buff[EEPROM_ADDRESS_SIZE +
-   EEPROM_TABLE_HEADER_SIZE] = { 0 };
-   struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr;
-   struct i2c_msg msg = {
-   .addr = control->i2c_address,
-   .flags = I2C_M_RD,
-   .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
-   .buf = buff,
-   };
-   int ret;
 
*exceed_err_limit = false;
 
if (!__is_ras_eeprom_supported(adev))
return 0;
 
-   /* read EEPROM table header */
-   mutex_lock(>tbl_mutex);
-   ret = i2c_transfer(>pm.smu_i2c, , 1);
-   if (ret < 1) {
-   dev_err(adev->dev, "Failed to read EEPROM table header.\n");
-   goto err;
-   }
-
-   __decode_table_header_from_buff(hdr, [2]);
-
-   if (hdr->header == EEPROM_TABLE_HDR_BAD) {
+   if (control->tbl_hdr.header == EEPROM_TABLE_HDR_BAD) {
dev_warn(adev->dev, "This GPU is in BAD status.");
dev_warn(adev->dev, "Please retire it or setting one bigger "
"threshold value when reloading driver.\n");
*exceed_err_limit = true;
}
 
-err:
-   mutex_unlock(>tbl_mutex);
return 0;
 }
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH v2] drm/amdgpu: reserve backup pages for bad page retirment

2021-02-22 Thread Dennis Li
it's not user friendly that the unused memory visible to users is
decreased when bad pages are retired. Therefore reserve a limited number of
backup pages at init time, and return them when bad pages are retired, so that
the unused memory size does not change.

v2: refine code that calculates the bad pages threshold

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index b7ee587484b2..ff4387bbfb1e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -170,7 +170,7 @@ struct amdgpu_mgpu_info mgpu_info = {
 };
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0x;
-int amdgpu_bad_page_threshold = -1;
+int amdgpu_bad_page_threshold = 100;
 
 /**
  * DOC: vramlimit (int)
@@ -804,7 +804,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 
0444);
  * faulty pages by ECC exceed threshold value and leave it for user's further
  * check.
  */
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default 
value), 0 = disable bad page retirement)");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = 
disable bad page retirement, 100 = default value");
 module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
 
 MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup 
(8 if set to greater than 8 or less than 0, only affect gfx 8+)");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 93699ea4860c..09546dec40ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1747,13 +1747,14 @@ static bool amdgpu_ras_check_bad_page(struct 
amdgpu_device *adev,
return ret;
 }
 
-static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
-   uint32_t max_length)
+static uint32_t
+amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
 {
-   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int tmp_threshold = amdgpu_bad_page_threshold;
u64 val;
+   uint32_t max_length = 0;
 
+   max_length = amdgpu_ras_eeprom_get_record_max_length();
/*
 * Justification of value bad_page_cnt_threshold in ras structure
 *
@@ -1779,20 +1780,18 @@ static void amdgpu_ras_validate_threshold(struct 
amdgpu_device *adev,
tmp_threshold = max_length;
 
if (tmp_threshold == -1) {
-   val = adev->gmc.mc_vram_size;
+   val = adev->gmc.real_vram_size;
do_div(val, RAS_BAD_PAGE_RATE);
-   con->bad_page_cnt_threshold = min(lower_32_bits(val),
-   max_length);
-   } else {
-   con->bad_page_cnt_threshold = tmp_threshold;
+   tmp_threshold = min(lower_32_bits(val), max_length);
}
+
+   return tmp_threshold;
 }
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
-   uint32_t max_eeprom_records_len = 0;
bool exc_err_limit = false;
int ret;
 
@@ -1812,8 +1811,16 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(>in_recovery, 0);
con->adev = adev;
 
-   max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
-   amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+   if (!con->bad_page_cnt_threshold) {
+   con->bad_page_cnt_threshold =
+   amdgpu_ras_calculate_badpags_threshold(adev);
+
+   ret = amdgpu_vram_mgr_reserve_backup_pages(
+   ttm_manager_type(>mman.bdev, TTM_PL_VRAM),
+   con->bad_page_cnt_threshold);
+   if (ret)
+   goto out;
+   }
 
ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit);
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 69ba8dd4f3ee..927d33d75c22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -52,6 +52,8 @@ struct amdgpu_vram_mgr {
spinlock_t lock;
struct list_head reservations_pending;
struct list_head reserved_pages;
+   struct list_head backup_pages;
+   uint32_t num_backup_pages;
atomic64_t usage;
atomic64_t vis_usage;
 };
@@ -127,6 +129,8 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager 
*man);
 uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man);
 int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
  uint64_t start, uint64_t size);
+int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
+  

[PATCH] drm/amdgpu: reserve backup pages for bad page retirment

2021-02-22 Thread Dennis Li
it's not user friendly that the unused memory visible to users is
decreased when bad pages are retired. Therefore reserve a limited number of
backup pages at init time, and return them when bad pages are retired, so that
the unused memory size does not change.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index b7ee587484b2..ff4387bbfb1e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -170,7 +170,7 @@ struct amdgpu_mgpu_info mgpu_info = {
 };
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0x;
-int amdgpu_bad_page_threshold = -1;
+int amdgpu_bad_page_threshold = 100;
 
 /**
  * DOC: vramlimit (int)
@@ -804,7 +804,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 
0444);
  * faulty pages by ECC exceed threshold value and leave it for user's further
  * check.
  */
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default 
value), 0 = disable bad page retirement)");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = 
disable bad page retirement, 100 = default value");
 module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
 
 MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup 
(8 if set to greater than 8 or less than 0, only affect gfx 8+)");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 93699ea4860c..fb1c3f6cef29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1779,7 +1779,7 @@ static void amdgpu_ras_validate_threshold(struct 
amdgpu_device *adev,
tmp_threshold = max_length;
 
if (tmp_threshold == -1) {
-   val = adev->gmc.mc_vram_size;
+   val = adev->gmc.real_vram_size;
do_div(val, RAS_BAD_PAGE_RATE);
con->bad_page_cnt_threshold = min(lower_32_bits(val),
max_length);
@@ -1812,8 +1812,16 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(>in_recovery, 0);
con->adev = adev;
 
-   max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
-   amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+   if (!con->bad_page_cnt_threshold) {
+   max_eeprom_records_len = 
amdgpu_ras_eeprom_get_record_max_length();
+   amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+
+   ret = amdgpu_vram_mgr_reserve_backup_pages(
+   ttm_manager_type(>mman.bdev, TTM_PL_VRAM),
+   con->bad_page_cnt_threshold);
+   if (ret)
+   goto out;
+   }
 
ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit);
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 69ba8dd4f3ee..927d33d75c22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -52,6 +52,8 @@ struct amdgpu_vram_mgr {
spinlock_t lock;
struct list_head reservations_pending;
struct list_head reserved_pages;
+   struct list_head backup_pages;
+   uint32_t num_backup_pages;
atomic64_t usage;
atomic64_t vis_usage;
 };
@@ -127,6 +129,8 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager 
*man);
 uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man);
 int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
  uint64_t start, uint64_t size);
+int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
+uint32_t num_pages);
 int amdgpu_vram_mgr_query_page_status(struct ttm_resource_manager *man,
  uint64_t start);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 21d18efca277..b325b067926b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -28,6 +28,9 @@
 #include "amdgpu_atomfirmware.h"
 #include "atom.h"
 
+static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
+uint32_t num_pages);
+
 static inline struct amdgpu_vram_mgr *to_vram_mgr(struct ttm_resource_manager 
*man)
 {
return container_of(man, struct amdgpu_vram_mgr, manager);
@@ -189,6 +192,7 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
spin_lock_init(>lock);
INIT_LIST_HEAD(>reservations_pending);
INIT_LIST_HEAD(>reserved_pages);
+   INIT_LIST_HEAD(>backup_pages);
 
/* Add the two VRAM-related sysfs files */
ret = sysfs_create_files(>d

[PATCH] drm/amdgpu: Fix issue no bad_pages after umc ue injection

2021-01-04 Thread Dennis Li
old code wrongly used the bad page status as the function return value,
which caused amdgpu_ras_badpages_read to always return failure.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c136bd449744..82e952696d24 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1518,7 +1518,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device 
*adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
int i = 0;
-   int ret = 0;
+   int ret = 0, status;
 
if (!con || !con->eh_data || !bps || !count)
return -EINVAL;
@@ -1543,12 +1543,12 @@ static int amdgpu_ras_badpages_read(struct 
amdgpu_device *adev,
.size = AMDGPU_GPU_PAGE_SIZE,
.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
};
-   ret = amdgpu_vram_mgr_query_page_status(
+   status = amdgpu_vram_mgr_query_page_status(
ttm_manager_type(>mman.bdev, TTM_PL_VRAM),
data->bps[i].retired_page);
-   if (ret == -EBUSY)
+   if (status == -EBUSY)
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
-   else if (ret == -ENOENT)
+   else if (status == -ENOENT)
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
}
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH v2] drm/amdgpu: fix a GPU hang issue when remove device

2020-12-30 Thread Dennis Li
When GFXOFF is enabled and the GPU is idle, the driver will fail to access some
registers. Therefore change to disable power gating before any register
access via MMIO.

Dmesg log is as following:
amdgpu :03:00.0: amdgpu: amdgpu: finishing device.
amdgpu: cp queue pipe 4 queue 0 preemption failed
amdgpu :03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2
amdgpu :03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706
amdgpu :03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2
amdgpu :03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706

Signed-off-by: Dennis Li 
Change-Id: I42431f5d0bf54909e1df888a0d72fc009d8e196c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1cb7d73f7317..b69c34074d8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2548,11 +2548,11 @@ static int amdgpu_device_ip_fini(struct amdgpu_device 
*adev)
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_remove_device(adev);
 
-   amdgpu_amdkfd_device_fini(adev);
-
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 
+   amdgpu_amdkfd_device_fini(adev);
+
/* need to disable SMC first */
for (i = 0; i < adev->num_ip_blocks; i++) {
if (!adev->ip_blocks[i].status.hw)
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: fix a GPU hang issue when remove device

2020-12-30 Thread Dennis Li
When GFXOFF is enabled and the GPU is idle, the driver will fail to access some
registers. Therefore disable GFXOFF before unloading the device.

amdgpu :03:00.0: amdgpu: amdgpu: finishing device.
amdgpu: cp queue pipe 4 queue 0 preemption failed
amdgpu :03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2
amdgpu :03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706
amdgpu :03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2
amdgpu :03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706

Signed-off-by: Dennis Li 
Change-Id: I42431f5d0bf54909e1df888a0d72fc009d8e196c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index e365c4fdcfe3..47d1291d5053 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -83,6 +83,8 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
if (adev == NULL)
return;
 
+   amdgpu_gfx_off_ctrl(adev, false);
+
amdgpu_unregister_gpu_instance(adev);
 
if (adev->rmmio == NULL)
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: fix a memory protection fault when remove amdgpu device

2020-12-29 Thread Dennis Li
ASD and TA share the same firmware in SIENNA_CICHLID and only the TA
firmware is requested during boot, so only the TA firmware needs to be
released when removing the device.

[   83.877150] general protection fault, probably for non-canonical address 
0x1269f97e6ed04095:  [#1] SMP PTI
[   83.888076] CPU: 0 PID: 1312 Comm: modprobe Tainted: GW  OE 
5.9.0-rc5-deli-amd-vangogh-0.0.6.6-114-gdd99d5669a96-dirty #2
[   83.901160] Hardware name: System manufacturer System Product Name/TUF 
Z370-PLUS GAMING II, BIOS 0411 09/21/2018
[   83.912353] RIP: 0010:free_fw_priv+0xc/0x120
[   83.917531] Code: e8 99 cd b0 ff b8 a1 ff ff ff eb 9f 4c 89 f7 e8 8a cd b0 
ff b8 f4 ff ff ff eb 90 0f 1f 00 0f 1f 44 00 00 55 48 89 e5 41 54 53 <4c> 8b 67 
18 48 89 fb 4c 89 e7 e8 45 94 41 00 b8 ff ff ff ff f0 0f
[   83.937576] RSP: 0018:bc34c13a3ce0 EFLAGS: 00010206
[   83.943699] RAX: bb681850 RBX: a047f117eb60 RCX: 80800055
[   83.951879] RDX: bc34c1d5f000 RSI: 80800055 RDI: 1269f97e6ed04095
[   83.959955] RBP: bc34c13a3cf0 R08:  R09: 0001
[   83.968107] R10: bc34c13a3cc8 R11: ff00 R12: a047d6b23378
[   83.976166] R13: a047d6b23338 R14: a047d6b240c8 R15: 
[   83.984295] FS:  7f74f6712540() GS:a047fbe0() 
knlGS:
[   83.993323] CS:  0010 DS:  ES:  CR0: 80050033
[   84.56] CR2: 556a1cca4e18 CR3: 00021faa8004 CR4: 003706f0
[   84.008128] DR0:  DR1:  DR2: 
[   84.016155] DR3:  DR6: fffe0ff0 DR7: 0400
[   84.024174] Call Trace:
[   84.027514]  release_firmware.part.11+0x4b/0x70
[   84.033017]  release_firmware+0x13/0x20
[   84.037803]  psp_sw_fini+0x77/0xb0 [amdgpu]
[   84.042857]  amdgpu_device_fini+0x38c/0x5d0 [amdgpu]
[   84.048815]  amdgpu_driver_unload_kms+0x43/0x70 [amdgpu]
[   84.055055]  drm_dev_unregister+0x73/0xb0 [drm]
[   84.060499]  drm_dev_unplug+0x28/0x30 [drm]
[   84.065598]  amdgpu_dev_uninit+0x1b/0x40 [amdgpu]
[   84.071223]  amdgpu_pci_remove+0x4e/0x70 [amdgpu]
[   84.076835]  pci_device_remove+0x3e/0xc0
[   84.081609]  device_release_driver_internal+0xfb/0x1c0
[   84.087558]  driver_detach+0x4d/0xa0
[   84.092041]  bus_remove_driver+0x5f/0xe0
[   84.096854]  driver_unregister+0x2f/0x50
[   84.101594]  pci_unregister_driver+0x22/0xa0
[   84.106806]  amdgpu_exit+0x15/0x2b [amdgpu]

Signed-off-by: Dennis Li 
Change-Id: Icc981a421499dff844855d5a662e91d1730c2754

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index eb19ae734396..b44b46dd60f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -564,7 +564,7 @@ static int psp_asd_load(struct psp_context *psp)
 * add workaround to bypass it for sriov now.
 * TODO: add version check to make it common
 */
-   if (amdgpu_sriov_vf(psp->adev) || !psp->asd_fw)
+   if (amdgpu_sriov_vf(psp->adev) || !psp->asd_ucode_size)
return 0;
 
cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
@@ -2779,11 +2779,10 @@ static int parse_ta_bin_descriptor(struct psp_context 
*psp,
 
switch (desc->fw_type) {
case TA_FW_TYPE_PSP_ASD:
-   psp->asd_fw_version= le32_to_cpu(desc->fw_version);
+   psp->asd_fw_version= le32_to_cpu(desc->fw_version);
psp->asd_feature_version   = le32_to_cpu(desc->fw_version);
-   psp->asd_ucode_size= le32_to_cpu(desc->size_bytes);
+   psp->asd_ucode_size= le32_to_cpu(desc->size_bytes);
psp->asd_start_addr= ucode_start_addr;
-   psp->asd_fw= psp->ta_fw;
break;
case TA_FW_TYPE_PSP_XGMI:
psp->ta_xgmi_ucode_version = le32_to_cpu(desc->fw_version);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/3] drm/amdgpu: remove redundant GPU reset

2020-10-27 Thread Dennis Li
Because bad pages saving has been moved to UMC error interrupt callback,
which will trigger a new GPU reset after saving.

Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 16 
 2 files changed, 1 insertion(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 0926c0770d7a..7c39d706e6d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -33,7 +33,6 @@
 
 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS  (0x1 << 0)
 #define AMDGPU_RAS_FLAG_INIT_NEED_RESET(0x1 << 1)
-#define AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV (0x1 << 2)
 
 enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
@@ -513,14 +512,7 @@ static inline int amdgpu_ras_reset_gpu(struct 
amdgpu_device *adev)
 {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-   /*
-* Save bad page to eeprom before gpu reset, i2c may be unstable
-* in gpu reset.
-*
-* Also, exclude the case when ras recovery issuer is
-* eeprom page write itself.
-*/
-   if (!(ras->flags & AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV) && in_task())
+   if (in_task())
amdgpu_ras_reserve_bad_pages(adev);
 
if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 695bcfc5c983..c3710c591b55 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -479,7 +479,6 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
int i, ret = 0;
struct i2c_msg *msgs, *msg;
unsigned char *buffs, *buff;
-   bool sched_ras_recovery = false;
struct eeprom_table_record *record;
struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
@@ -517,7 +516,6 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
"Saved bad pages(%d) reaches threshold value(%d).\n",
control->num_recs + num, ras->bad_page_cnt_threshold);
control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
-   sched_ras_recovery = true;
}
 
/* In case of overflow just start from beginning to not lose newest 
records */
@@ -603,20 +601,6 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
__update_tbl_checksum(control, records, num, old_hdr_byte_sum);
 
__update_table_header(control, buffs);
-
-   if (sched_ras_recovery) {
-   /*
-* Before scheduling ras recovery, assert the related
-* flag first, which shall bypass common bad page
-* reservation execution in amdgpu_ras_reset_gpu.
-*/
-   amdgpu_ras_get_context(adev)->flags |=
-   AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV;
-
-   dev_warn(adev->dev, "Conduct ras recovery due to bad "
-   "page threshold reached.\n");
-   amdgpu_ras_reset_gpu(adev);
-   }
} else if (!__validate_tbl_checksum(control, records, num)) {
DRM_WARN("EEPROM Table checksum mismatch!");
/* TODO Uncomment when EEPROM read/write is relliable */
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 3/3] drm/amdgpu: fix the issue of reserving bad pages failed

2020-10-27 Thread Dennis Li
In amdgpu_ras_reset_gpu, because bad pages may not have been freed yet,
reserving them has a high probability of failing.

Change to reserve bad pages when freeing VRAM.

v2:
1. avoid allocating the drm_mm node outside of amdgpu_vram_mgr.c
2. move bad page reserving into amdgpu_ras_add_bad_pages, if vram mgr
   reserve bad page failed, it will put it into pending list, otherwise
   put it into processed list;
3. remove amdgpu_ras_release_bad_pages, because retired page's info has
   been moved into amdgpu_vram_mgr

v3:
1. formate code style;
2. rename amdgpu_vram_reserve_scope as amdgpu_vram_reservation;
3. rename scope_pending as reservations_pending;
4. rename scope_processed as reserved_pages;
5. change to iterate over all the pending ones and try to insert them
   with drm_mm_reserve_node();

v4:
1. rename amdgpu_vram_mgr_reserve_scope as
amdgpu_vram_mgr_reserve_range;
2. remove unused include "amdgpu_ras.h";
3. rename amdgpu_vram_mgr_check_and_reserve as
amdgpu_vram_mgr_do_reserve;
4. refine amdgpu_vram_mgr_reserve_range to call
amdgpu_vram_mgr_do_reserve.

Signed-off-by: Dennis Li 
Signed-off-by: Wenhui Sheng 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 150 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |   8 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h  |   4 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 115 ++
 4 files changed, 158 insertions(+), 119 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0f57a0003df6..84bb55ab6ac5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -80,6 +80,8 @@ enum amdgpu_ras_retire_page_reservation {
 
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
+static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+   uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr);
 
@@ -1573,10 +1575,12 @@ static int amdgpu_ras_badpages_read(struct 
amdgpu_device *adev,
.size = AMDGPU_GPU_PAGE_SIZE,
.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
};
-
-   if (data->last_reserved <= i)
+   ret = amdgpu_vram_mgr_query_page_status(
+   >mman.bdev.man[TTM_PL_VRAM],
+   data->bps[i].retired_page);
+   if (ret == -EBUSY)
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
-   else if (data->bps_bo[i] == NULL)
+   else if (ret == -ENOENT)
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
}
 
@@ -1628,12 +1632,9 @@ static int amdgpu_ras_realloc_eh_data_space(struct 
amdgpu_device *adev,
unsigned int new_space = old_space + pages;
unsigned int align_space = ALIGN(new_space, 512);
void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-   struct amdgpu_bo **bps_bo =
-   kmalloc(align_space * sizeof(*data->bps_bo), 
GFP_KERNEL);
 
-   if (!bps || !bps_bo) {
+   if (!bps) {
kfree(bps);
-   kfree(bps_bo);
return -ENOMEM;
}
 
@@ -1642,14 +1643,8 @@ static int amdgpu_ras_realloc_eh_data_space(struct 
amdgpu_device *adev,
data->count * sizeof(*data->bps));
kfree(data->bps);
}
-   if (data->bps_bo) {
-   memcpy(bps_bo, data->bps_bo,
-   data->count * sizeof(*data->bps_bo));
-   kfree(data->bps_bo);
-   }
 
data->bps = bps;
-   data->bps_bo = bps_bo;
data->space_left += align_space - old_space;
return 0;
 }
@@ -1661,6 +1656,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
int ret = 0;
+   uint32_t i;
 
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
@@ -1670,16 +1666,26 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
if (!data)
goto out;
 
-   if (data->space_left <= pages)
-   if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
+   for (i = 0; i < pages; i++) {
+   if (amdgpu_ras_check_bad_page_unlock(con,
+   bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+   continue;
+
+   if (!data->space_left &&
+   amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
ret = -ENOMEM;
goto out;
}
 
-   memcpy(>bps[data->count], bps, pages * sizeof(*data->bps));
-

[PATCH 1/3] drm/amdgpu: change to save bad pages in UMC error interrupt callback

2020-10-27 Thread Dennis Li
Save bad pages in the UMC error interrupt callback instead of in
amdgpu_ras_reset_gpu; this reduces unnecessary calls to amdgpu_ras_save_bad_pages.

Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 7 ---
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7f79d25fbccc..0f57a0003df6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1690,7 +1690,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
  * write error record array to eeprom, the function should be
  * protected by recovery_lock
  */
-static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
@@ -1863,9 +1863,6 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device 
*adev)
data->last_reserved = i + 1;
bo = NULL;
}
-
-   /* continue to save bad pages to eeprom even reesrve_vram fails */
-   ret = amdgpu_ras_save_bad_pages(adev);
 out:
mutex_unlock(>recovery_lock);
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6b8d7bb83bb3..0926c0770d7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -506,6 +506,7 @@ bool amdgpu_ras_check_err_threshold(struct amdgpu_device 
*adev);
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages);
 
+int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
 
 static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 262baf0f61ea..a2975c8092a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -126,10 +126,11 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device 
*adev,
err_data->ue_count);
 
if ((amdgpu_bad_page_threshold != 0) &&
-   err_data->err_addr_cnt &&
+   err_data->err_addr_cnt) {
amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
-   err_data->err_addr_cnt))
-   dev_warn(adev->dev, "Failed to add ras bad page!\n");
+   err_data->err_addr_cnt);
+   amdgpu_ras_save_bad_pages(adev);
+   }
 
amdgpu_ras_reset_gpu(adev);
}
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 0/3] Refine the codes about reseving bad pages.

2020-10-27 Thread Dennis Li
Besides UMC, other blocks' UE interrupt callbacks could enter amdgpu_ras_reset_gpu,
so the first patch changes to save bad pages in the UMC error interrupt callback.

When a bad page error happens, the bad page is usually still held by some
process, therefore the driver will fail to reserve the bad page. The third
patch reserves the bad page when it is freed, so the system has no
chance to allocate it to another process.

Dennis Li (3):
  drm/amdgpu: change to save bad pages in UMC error interrupt callback
  drm/amdgpu: remove redundant GPU reset
  drm/amdgpu: fix the issue of reserving bad pages failed

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 155 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  17 +-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c|  16 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |   4 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   |   7 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 115 +
 6 files changed, 164 insertions(+), 150 deletions(-)

-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: protect eeprom update from GPU reset

2020-10-14 Thread Dennis Li
Because i2c may be unstable during GPU reset, the driver needs to protect
eeprom updates from GPU reset, so that no bad page record is missed.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 0e64c39a2372..695bcfc5c983 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -149,7 +149,11 @@ static int __update_table_header(struct 
amdgpu_ras_eeprom_control *control,
 
msg.addr = control->i2c_address;
 
+   /* i2c may be unstable in gpu reset */
+   down_read(>reset_sem);
ret = i2c_transfer(>pm.smu_i2c, , 1);
+   up_read(>reset_sem);
+
if (ret < 1)
DRM_ERROR("Failed to write EEPROM table header, ret:%d", ret);
 
@@ -557,7 +561,11 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
control->next_addr += EEPROM_TABLE_RECORD_SIZE;
}
 
+   /* i2c may be unstable in gpu reset */
+   down_read(>reset_sem);
ret = i2c_transfer(>pm.smu_i2c, msgs, num);
+   up_read(>reset_sem);
+
if (ret < 1) {
DRM_ERROR("Failed to process EEPROM table records, ret:%d", 
ret);
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdkfd: fix a memory leak issue

2020-09-02 Thread Dennis Li
In the resume stage of GPU recovery, start_cpsch will call pm_init,
which sets pm->allocated to false, so the next pm_release_ib has
no chance to release the IB memory.

Add pm_release_ib in stop_cpsch, which will be called in the suspend
stage of GPU recovery.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 069ba4be1e8f..20ef048d6a03 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1192,6 +1192,8 @@ static int stop_cpsch(struct device_queue_manager *dqm)
dqm->sched_running = false;
dqm_unlock(dqm);
 
+   pm_release_ib(>packets);
+
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
pm_uninit(>packets, hanging);
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH v2] drm/kfd: fix a system crash issue during GPU recovery

2020-09-01 Thread Dennis Li
The crash log as the below:

[Thu Aug 20 23:18:14 2020] general protection fault:  [#1] SMP NOPTI
[Thu Aug 20 23:18:14 2020] CPU: 152 PID: 1837 Comm: kworker/152:1 Tainted: G
   OE 5.4.0-42-generic #46~18.04.1-Ubuntu
[Thu Aug 20 23:18:14 2020] Hardware name: GIGABYTE G482-Z53-YF/MZ52-G40-00, 
BIOS R12 05/13/2020
[Thu Aug 20 23:18:14 2020] Workqueue: events amdgpu_ras_do_recovery [amdgpu]
[Thu Aug 20 23:18:14 2020] RIP: 0010:evict_process_queues_cpsch+0xc9/0x130 
[amdgpu]
[Thu Aug 20 23:18:14 2020] Code: 49 8d 4d 10 48 39 c8 75 21 eb 44 83 fa 03 74 
36 80 78 72 00 74 0c 83 ab 68 01 00 00 01 41 c6 45 41 00 48 8b 00 48 39 c8 74 
25 <80> 78 70 00 c6 40 6d 01 74 ee 8b 50 28 c6 40 70 00 83 ab 60 01 00
[Thu Aug 20 23:18:14 2020] RSP: 0018:b29b52f6fc90 EFLAGS: 00010213
[Thu Aug 20 23:18:14 2020] RAX: 1c884edb0a118914 RBX: 8a0d45ff3c00 RCX: 
8a2d83e41038
[Thu Aug 20 23:18:14 2020] RDX:  RSI: 0082 RDI: 
8a0e2e4178c0
[Thu Aug 20 23:18:14 2020] RBP: b29b52f6fcb0 R08: 1b64 R09: 
0004
[Thu Aug 20 23:18:14 2020] R10: b29b52f6fb78 R11: 0001 R12: 
8a0d45ff3d28
[Thu Aug 20 23:18:14 2020] R13: 8a2d83e41028 R14:  R15: 

[Thu Aug 20 23:18:14 2020] FS:  () 
GS:8a0e2e40() knlGS:
[Thu Aug 20 23:18:14 2020] CS:  0010 DS:  ES:  CR0: 80050033
[Thu Aug 20 23:18:14 2020] CR2: 55c783c0e6a8 CR3: 0034a1284000 CR4: 
00340ee0
[Thu Aug 20 23:18:14 2020] Call Trace:
[Thu Aug 20 23:18:14 2020]  kfd_process_evict_queues+0x43/0xd0 [amdgpu]
[Thu Aug 20 23:18:14 2020]  kfd_suspend_all_processes+0x60/0xf0 [amdgpu]
[Thu Aug 20 23:18:14 2020]  kgd2kfd_suspend.part.7+0x43/0x50 [amdgpu]
[Thu Aug 20 23:18:14 2020]  kgd2kfd_pre_reset+0x46/0x60 [amdgpu]
[Thu Aug 20 23:18:14 2020]  amdgpu_amdkfd_pre_reset+0x1a/0x20 [amdgpu]
[Thu Aug 20 23:18:14 2020]  amdgpu_device_gpu_recover+0x377/0xf90 [amdgpu]
[Thu Aug 20 23:18:14 2020]  ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu]
[Thu Aug 20 23:18:14 2020]  amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[Thu Aug 20 23:18:14 2020]  process_one_work+0x20f/0x400
[Thu Aug 20 23:18:14 2020]  worker_thread+0x34/0x410

When the GPU hangs, a user process will fail to create a compute queue whose
struct object will be freed later, but the driver wrongly adds this queue to
the queue list of the process. Then kfd_process_evict_queues will
access freed memory, which causes a system crash.

v2:
The failure to execute_queues should probably not be reported to
the caller of create_queue, because the queue was already created.
Therefore change to ignore the return value from execute_queues.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 560adc57a050..069ba4be1e8f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1302,7 +1302,7 @@ static int create_queue_cpsch(struct device_queue_manager 
*dqm, struct queue *q,
if (q->properties.is_active) {
increment_queue_count(dqm, q->properties.type);
 
-   retval = execute_queues_cpsch(dqm,
+   execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
}
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH v2] drm/amdgpu: block ring buffer access during GPU recovery

2020-09-01 Thread Dennis Li
When the GPU is in reset, its status isn't stable and the ring buffer also needs
to be reset when resuming. Therefore the driver should protect the GPU recovery
thread from ring buffer accesses by other threads. Otherwise the GPU will
randomly hang during recovery.

v2: correct indent

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 172dc47b7f39..9b586bc80c38 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -319,8 +319,12 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, 
uint32_t reg,
 {
uint32_t ret;
 
-   if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-   return amdgpu_kiq_rreg(adev, reg);
+   if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
+   down_read_trylock(>reset_sem)) {
+   ret = amdgpu_kiq_rreg(adev, reg);
+   up_read(>reset_sem);
+   return ret;
+   }
 
if ((reg * 4) < adev->rmmio_size)
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
@@ -332,6 +336,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, 
uint32_t reg,
ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
spin_unlock_irqrestore(>mmio_idx_lock, flags);
}
+
trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
return ret;
 }
@@ -407,8 +412,12 @@ void static inline amdgpu_mm_wreg_mmio(struct 
amdgpu_device *adev, uint32_t reg,
 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
uint32_t acc_flags)
 {
-   if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-   return amdgpu_kiq_wreg(adev, reg, v);
+   if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
+   down_read_trylock(>reset_sem)) {
+   amdgpu_kiq_wreg(adev, reg, v);
+   up_read(>reset_sem);
+   return;
+   }
 
amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index ad9ad622ccce..31359e519d69 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -287,8 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
 */
if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-   !amdgpu_in_reset(adev)) {
-
+   down_read_trylock(>reset_sem)) {
struct amdgpu_vmhub *hub = >vmhub[vmhub];
const unsigned eng = 17;
u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, 
flush_type);
@@ -297,6 +296,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
 
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
1 << vmid);
+
+   up_read(>reset_sem);
return;
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index e1a0ae327cf5..c602ddc68384 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -500,13 +500,14 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
 * as GFXOFF under bare metal
 */
if (adev->gfx.kiq.ring.sched.ready &&
-   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) 
&&
-   !amdgpu_in_reset(adev)) {
+   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
+   down_read_trylock(>reset_sem)) {
uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
 
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
   1 << vmid);
+   up_read(>reset_sem);
return;
}
 
@@ -599,7 +600,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
if (amdgpu_in_reset(adev))
return -EIO;
 
-   if (ring->sched.ready) {
+   if (ring->sched.ready && down_read_trylock(>reset_sem)) {
/* Vega20+XGMI caches PTEs in TC and TLB. Add a
 * heavy-weight TLB flush (type 2), which flushes
 * both. Due to a race condition with concurrent
@@ -626,6 +627,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
if (r) {
amdgpu_ring

[PATCH] drm/kfd: fix a system crash issue during GPU recovery

2020-08-31 Thread Dennis Li
The crash log as the below:

[Thu Aug 20 23:18:14 2020] general protection fault:  [#1] SMP NOPTI
[Thu Aug 20 23:18:14 2020] CPU: 152 PID: 1837 Comm: kworker/152:1 Tainted: G
   OE 5.4.0-42-generic #46~18.04.1-Ubuntu
[Thu Aug 20 23:18:14 2020] Hardware name: GIGABYTE G482-Z53-YF/MZ52-G40-00, 
BIOS R12 05/13/2020
[Thu Aug 20 23:18:14 2020] Workqueue: events amdgpu_ras_do_recovery [amdgpu]
[Thu Aug 20 23:18:14 2020] RIP: 0010:evict_process_queues_cpsch+0xc9/0x130 
[amdgpu]
[Thu Aug 20 23:18:14 2020] Code: 49 8d 4d 10 48 39 c8 75 21 eb 44 83 fa 03 74 
36 80 78 72 00 74 0c 83 ab 68 01 00 00 01 41 c6 45 41 00 48 8b 00 48 39 c8 74 
25 <80> 78 70 00 c6 40 6d 01 74 ee 8b 50 28 c6 40 70 00 83 ab 60 01 00
[Thu Aug 20 23:18:14 2020] RSP: 0018:b29b52f6fc90 EFLAGS: 00010213
[Thu Aug 20 23:18:14 2020] RAX: 1c884edb0a118914 RBX: 8a0d45ff3c00 RCX: 
8a2d83e41038
[Thu Aug 20 23:18:14 2020] RDX:  RSI: 0082 RDI: 
8a0e2e4178c0
[Thu Aug 20 23:18:14 2020] RBP: b29b52f6fcb0 R08: 1b64 R09: 
0004
[Thu Aug 20 23:18:14 2020] R10: b29b52f6fb78 R11: 0001 R12: 
8a0d45ff3d28
[Thu Aug 20 23:18:14 2020] R13: 8a2d83e41028 R14:  R15: 

[Thu Aug 20 23:18:14 2020] FS:  () 
GS:8a0e2e40() knlGS:
[Thu Aug 20 23:18:14 2020] CS:  0010 DS:  ES:  CR0: 80050033
[Thu Aug 20 23:18:14 2020] CR2: 55c783c0e6a8 CR3: 0034a1284000 CR4: 
00340ee0
[Thu Aug 20 23:18:14 2020] Call Trace:
[Thu Aug 20 23:18:14 2020]  kfd_process_evict_queues+0x43/0xd0 [amdgpu]
[Thu Aug 20 23:18:14 2020]  kfd_suspend_all_processes+0x60/0xf0 [amdgpu]
[Thu Aug 20 23:18:14 2020]  kgd2kfd_suspend.part.7+0x43/0x50 [amdgpu]
[Thu Aug 20 23:18:14 2020]  kgd2kfd_pre_reset+0x46/0x60 [amdgpu]
[Thu Aug 20 23:18:14 2020]  amdgpu_amdkfd_pre_reset+0x1a/0x20 [amdgpu]
[Thu Aug 20 23:18:14 2020]  amdgpu_device_gpu_recover+0x377/0xf90 [amdgpu]
[Thu Aug 20 23:18:14 2020]  ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu]
[Thu Aug 20 23:18:14 2020]  amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[Thu Aug 20 23:18:14 2020]  process_one_work+0x20f/0x400
[Thu Aug 20 23:18:14 2020]  worker_thread+0x34/0x410

When the GPU hangs, a user process will fail to create a compute queue whose
struct object will be freed later, but the driver wrongly adds this queue to
the queue list of the process. Then kfd_process_evict_queues will
access freed memory, which causes a system crash.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 560adc57a050..d5e6b07ffb27 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1296,16 +1296,18 @@ static int create_queue_cpsch(struct 
device_queue_manager *dqm, struct queue *q,
mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj,
>gart_mqd_addr, >properties);
 
-   list_add(>list, >queues_list);
-   qpd->queue_count++;
-
if (q->properties.is_active) {
increment_queue_count(dqm, q->properties.type);
 
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+   if (retval)
+   goto out_execute_cpsch;
}
 
+   list_add(>list, >queues_list);
+   qpd->queue_count++;
+
/*
 * Unconditionally increment this counter, regardless of the queue's
 * type or whether the queue is active.
@@ -1318,6 +1320,9 @@ static int create_queue_cpsch(struct device_queue_manager 
*dqm, struct queue *q,
dqm_unlock(dqm);
return retval;
 
+out_execute_cpsch:
+   decrement_queue_count(dqm, q->properties.type);
+   dqm_unlock(dqm);
 out_deallocate_doorbell:
deallocate_doorbell(qpd, q);
 out_deallocate_sdma_queue:
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: block ring buffer access during GPU recovery

2020-08-31 Thread Dennis Li
When the GPU is in reset, its status isn't stable and the ring buffer also needs
to be reset when resuming. Therefore the driver should protect the GPU recovery
thread from ring buffer accesses by other threads. Otherwise the GPU will
randomly hang during recovery.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 172dc47b7f39..8db56a22cd1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, 
uint32_t reg,
 {
uint32_t ret;
 
-   if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-   return amdgpu_kiq_rreg(adev, reg);
+   if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
+   amdgpu_sriov_runtime(adev) &&
+   down_read_trylock(>reset_sem)) {
+   ret = amdgpu_kiq_rreg(adev, reg);
+   up_read(>reset_sem);
+   return ret;
+   }
 
if ((reg * 4) < adev->rmmio_size)
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
@@ -332,6 +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, 
uint32_t reg,
ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
spin_unlock_irqrestore(>mmio_idx_lock, flags);
}
+
trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
return ret;
 }
@@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct 
amdgpu_device *adev, uint32_t reg,
 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
uint32_t acc_flags)
 {
-   if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-   return amdgpu_kiq_wreg(adev, reg, v);
+   if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
+   amdgpu_sriov_runtime(adev) &&
+   down_read_trylock(>reset_sem)) {
+   amdgpu_kiq_wreg(adev, reg, v);
+   up_read(>reset_sem);
+   return;
+   }
 
amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index ad9ad622ccce..4ea2a065daa9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
 */
if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-   !amdgpu_in_reset(adev)) {
+   down_read_trylock(>reset_sem)) {
 
struct amdgpu_vmhub *hub = >vmhub[vmhub];
const unsigned eng = 17;
@@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
 
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
1 << vmid);
+
+   up_read(>reset_sem);
return;
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index e1a0ae327cf5..33b7cf1c79ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -501,12 +501,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
 */
if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) 
&&
-   !amdgpu_in_reset(adev)) {
+   down_read_trylock(>reset_sem)) {
uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
 
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
   1 << vmid);
+   up_read(>reset_sem);
return;
}
 
@@ -599,7 +600,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
if (amdgpu_in_reset(adev))
return -EIO;
 
-   if (ring->sched.ready) {
+   if (ring->sched.ready &&
+down_read_trylock(>reset_sem)) {
/* Vega20+XGMI caches PTEs in TC and TLB. Add a
 * heavy-weight TLB flush (type 2), which flushes
 * both. Due to a race condition with concurrent
@@ -626,6 +628,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
if (r) {
amdgpu_ring_undo(ring);
spin_unlock(>gfx.kiq.ring_lock);
+   up_read(>reset_sem);
return -ETIME;

[PATCH] drm/amdgpu: skip scheduling IBs when GPU recovery

2020-08-21 Thread Dennis Li
If the GPU begins recovery, skip scheduling IBs. Otherwise
GPU recovery randomly fails.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index dcfe8a3b03ff..054d7b0357fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -212,6 +212,7 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
struct dma_fence *fence = NULL, *finished;
struct amdgpu_job *job;
int r = 0;
+   int locked;
 
job = to_amdgpu_job(sched_job);
finished = >base.s_fence->finished;
@@ -220,6 +221,10 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
 
trace_amdgpu_sched_run_job(job);
 
+   locked = down_read_trylock(>adev->reset_sem);
+   if (!locked)
+   dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if 
GPU recovery */
+
if (job->vram_lost_counter != 
atomic_read(>adev->vram_lost_counter))
dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if 
VRAM lost */
 
@@ -231,6 +236,10 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
if (r)
DRM_ERROR("Error scheduling IBs (%d)\n", r);
}
+
+   if (locked)
+   up_read(>adev->reset_sem);
+
/* if gpu reset, hw fence will be replaced here */
dma_fence_put(job->fence);
job->fence = dma_fence_get(fence);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH v3] drm/amdgpu: change reset lock from mutex to rw_semaphore

2020-08-20 Thread Dennis Li
Clients don't need the reset lock for synchronization when no
GPU recovery is in progress.

v2:
change to return the return value of down_read_killable.

v3:
if GPU recovery begin, VF ignore FLR notification.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c8aec832b244..ec11ed2a9ca4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -954,7 +954,7 @@ struct amdgpu_device {
 
atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct mutex  lock_reset;
+   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 79b397800cbc..cc5c7f81c540 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -101,14 +101,18 @@ static int amdgpu_debugfs_autodump_open(struct inode 
*inode, struct file *file)
 
file->private_data = adev;
 
-   mutex_lock(>lock_reset);
+   ret = down_read_killable(>reset_sem);
+   if (ret)
+   return ret;
+
if (adev->autodump.dumping.done) {
reinit_completion(>autodump.dumping);
ret = 0;
} else {
ret = -EBUSY;
}
-   mutex_unlock(>lock_reset);
+
+   up_read(>reset_sem);
 
return ret;
 }
@@ -1242,7 +1246,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, 
void *data)
}
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   mutex_lock(>lock_reset);
+   r = down_read_killable(>reset_sem);
+   if (r)
+   return r;
 
/* hold on the scheduler */
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1269,7 +1275,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, 
void *data)
kthread_unpark(ring->sched.thread);
}
 
-   mutex_unlock(>lock_reset);
+   up_read(>reset_sem);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1459,7 +1465,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   mutex_lock(>lock_reset);
+   r = down_read_killable(>reset_sem);
+   if (r)
+   goto pro_end;
 
/* stop the scheduler */
kthread_park(ring->sched.thread);
@@ -1500,13 +1508,14 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
val)
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
 
-   mutex_unlock(>lock_reset);
+   up_read(>reset_sem);
 
ttm_bo_unlock_delayed_workqueue(>mman.bdev, resched);
 
+pro_end:
kfree(fences);
 
-   return 0;
+   return r;
 }
 
 static int amdgpu_debugfs_sclk_set(void *data, u64 val)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 78fd2c9a7b7d..82242e2f5658 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(>virt.vf_errors.lock);
hash_init(adev->mn_hash);
atomic_set(>in_gpu_reset, 0);
-   mutex_init(>lock_reset);
+   init_rwsem(>reset_sem);
mutex_init(>psp.mutex);
mutex_init(>notifier_lock);
 
@@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device 
*adev)
if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0)
return false;
 
-   mutex_lock(>lock_reset);
+   down_write(>reset_sem);
 
atomic_inc(>gpu_reset_counter);
switch (amdgpu_asic_reset_method(adev)) {
@@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct 
amdgpu_device *adev)
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
atomic_set(>in_gpu_reset, 0);
-   mutex_unlock(>lock_reset);
+   up_write(>reset_sem);
 }
 
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f27d83f2de78..9c07014d9bd6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -238,19 +238,15 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, 
virt);
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-   int locked;
 
/* block

[PATCH v2] drm/amdgpu: change reset lock from mutex to rw_semaphore

2020-08-20 Thread Dennis Li
Clients don't need the reset lock for synchronization when no
GPU recovery is in progress.

v2:
change to return the return value of down_read_killable.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c8aec832b244..ec11ed2a9ca4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -954,7 +954,7 @@ struct amdgpu_device {
 
atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct mutex  lock_reset;
+   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 79b397800cbc..cc5c7f81c540 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -101,14 +101,18 @@ static int amdgpu_debugfs_autodump_open(struct inode 
*inode, struct file *file)
 
file->private_data = adev;
 
-   mutex_lock(>lock_reset);
+   ret = down_read_killable(>reset_sem);
+   if (ret)
+   return ret;
+
if (adev->autodump.dumping.done) {
reinit_completion(>autodump.dumping);
ret = 0;
} else {
ret = -EBUSY;
}
-   mutex_unlock(>lock_reset);
+
+   up_read(>reset_sem);
 
return ret;
 }
@@ -1242,7 +1246,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, 
void *data)
}
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   mutex_lock(>lock_reset);
+   r = down_read_killable(>reset_sem);
+   if (r)
+   return r;
 
/* hold on the scheduler */
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1269,7 +1275,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, 
void *data)
kthread_unpark(ring->sched.thread);
}
 
-   mutex_unlock(>lock_reset);
+   up_read(>reset_sem);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1459,7 +1465,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   mutex_lock(>lock_reset);
+   r = down_read_killable(>reset_sem);
+   if (r)
+   goto pro_end;
 
/* stop the scheduler */
kthread_park(ring->sched.thread);
@@ -1500,13 +1508,14 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
val)
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
 
-   mutex_unlock(>lock_reset);
+   up_read(>reset_sem);
 
ttm_bo_unlock_delayed_workqueue(>mman.bdev, resched);
 
+pro_end:
kfree(fences);
 
-   return 0;
+   return r;
 }
 
 static int amdgpu_debugfs_sclk_set(void *data, u64 val)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 78fd2c9a7b7d..82242e2f5658 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(>virt.vf_errors.lock);
hash_init(adev->mn_hash);
atomic_set(>in_gpu_reset, 0);
-   mutex_init(>lock_reset);
+   init_rwsem(>reset_sem);
mutex_init(>psp.mutex);
mutex_init(>notifier_lock);
 
@@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device 
*adev)
if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0)
return false;
 
-   mutex_lock(>lock_reset);
+   down_write(>reset_sem);
 
atomic_inc(>gpu_reset_counter);
switch (amdgpu_asic_reset_method(adev)) {
@@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct 
amdgpu_device *adev)
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
atomic_set(>in_gpu_reset, 0);
-   mutex_unlock(>lock_reset);
+   up_write(>reset_sem);
 }
 
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f27d83f2de78..8ac63f13fc6f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -238,19 +238,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, 
virt);
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-   int locked;
 
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 *

[PATCH] drm/amdgpu: refine message print for devices of hive

2020-08-19 Thread Dennis Li
Use dev_xxx instead of DRM_xxx/pr_xxx to indicate which device
of a hive the message is for.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 81b1d9a1dca0..08548e051cc0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3370,7 +3370,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 {
int r;
 
-   DRM_INFO("amdgpu: finishing device.\n");
+   dev_info(adev->dev, "amdgpu: finishing device.\n");
flush_delayed_work(>delayed_init_work);
adev->shutdown = true;
 
@@ -3555,12 +3555,12 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
fbcon)
if (amdgpu_device_need_post(adev)) {
r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
if (r)
-   DRM_ERROR("amdgpu asic init failed\n");
+   dev_err(adev->dev, "amdgpu asic init failed\n");
}
 
r = amdgpu_device_ip_resume(adev);
if (r) {
-   DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
+   dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
return r;
}
amdgpu_fence_driver_resume(adev);
@@ -3584,7 +3584,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
fbcon)
if (r == 0) {
r = amdgpu_bo_pin(aobj, 
AMDGPU_GEM_DOMAIN_VRAM);
if (r != 0)
-   DRM_ERROR("Failed to pin cursor 
BO (%d)\n", r);
+   dev_err(adev->dev, "Failed to 
pin cursor BO (%d)\n", r);
amdgpu_crtc->cursor_addr = 
amdgpu_bo_gpu_offset(aobj);
amdgpu_bo_unreserve(aobj);
}
@@ -3674,7 +3674,7 @@ static bool amdgpu_device_ip_check_soft_reset(struct 
amdgpu_device *adev)
adev->ip_blocks[i].status.hang =

adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
if (adev->ip_blocks[i].status.hang) {
-   DRM_INFO("IP block:%s is hung!\n", 
adev->ip_blocks[i].version->funcs->name);
+   dev_info(adev->dev, "IP block:%s is hung!\n", 
adev->ip_blocks[i].version->funcs->name);
asic_hang = true;
}
}
@@ -3735,7 +3735,7 @@ static bool amdgpu_device_ip_need_full_reset(struct 
amdgpu_device *adev)
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 
||
 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 
{
if (adev->ip_blocks[i].status.hang) {
-   DRM_INFO("Some block need full reset!\n");
+   dev_info(adev->dev, "Some block need full 
reset!\n");
return true;
}
}
@@ -3823,7 +3823,7 @@ static int amdgpu_device_recover_vram(struct 
amdgpu_device *adev)
else
tmo = msecs_to_jiffies(100);
 
-   DRM_INFO("recover vram bo from shadow start\n");
+   dev_info(adev->dev, "recover vram bo from shadow start\n");
mutex_lock(>shadow_list_lock);
list_for_each_entry(shadow, >shadow_list, shadow_list) {
 
@@ -3859,11 +3859,11 @@ static int amdgpu_device_recover_vram(struct 
amdgpu_device *adev)
dma_fence_put(fence);
 
if (r < 0 || tmo <= 0) {
-   DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is 
%ld\n", r, tmo);
+   dev_err(adev->dev, "recover vram bo from shadow failed, r is 
%ld, tmo is %ld\n", r, tmo);
return -EIO;
}
 
-   DRM_INFO("recover vram bo from shadow done\n");
+   dev_info(adev->dev, "recover vram bo from shadow done\n");
return 0;
 }
 
@@ -3962,7 +3962,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device 
*adev)
 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
 {
if (!amdgpu_device_ip_check_soft_reset(adev)) {
-   DRM_INFO("Timeout, but no hardware hang detected.\n");
+   dev_info(adev->dev, "Timeout, but no hardware hang 
detected.\n");
return false;
}
 
@@ -4002,7 +4002,7 @@ bool amdgpu_device_should_recover_gpu(struct 
amdgpu_device *adev)
return true;
 
 disabled:
-   DRM_INFO("GPU recovery disabled.\n");
+  

[PATCH] drm/amdgpu: fix the nullptr issue when reenter GPU recovery

2020-08-19 Thread Dennis Li
In a single-GPU system, if the driver re-enters GPU recovery,
amdgpu_device_lock_adev will return false, but hive is
NULL at that point.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 82242e2f5658..81b1d9a1dca0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4371,8 +4371,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (!amdgpu_device_lock_adev(tmp_adev)) {
DRM_INFO("Bailing on TDR for s_job:%llx, as another 
already in progress",
  job ? job->base.id : -1);
-   mutex_unlock(>hive_lock);
-   return 0;
+   r = 0;
+   goto skip_recovery;
}
 
/*
@@ -4505,6 +4505,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
amdgpu_device_unlock_adev(tmp_adev);
}
 
+skip_recovery:
if (hive) {
atomic_set(&hive->in_reset, 0);
mutex_unlock(>hive_lock);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: change reset lock from mutex to rw_semaphore

2020-08-19 Thread Dennis Li
clients don't need reset-lock for synchronization when no
GPU recovery.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c8aec832b244..ec11ed2a9ca4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -954,7 +954,7 @@ struct amdgpu_device {
 
atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct mutex  lock_reset;
+   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 79b397800cbc..0090e850eab9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -101,14 +101,17 @@ static int amdgpu_debugfs_autodump_open(struct inode 
*inode, struct file *file)
 
file->private_data = adev;
 
-   mutex_lock(>lock_reset);
+   if (down_read_killable(&adev->reset_sem))
+   return -EINTR;
+
if (adev->autodump.dumping.done) {
reinit_completion(&adev->autodump.dumping);
ret = 0;
} else {
ret = -EBUSY;
}
-   mutex_unlock(&adev->lock_reset);
+
+   up_read(&adev->reset_sem);
 
return ret;
 }
@@ -1242,7 +1245,8 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, 
void *data)
}
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   mutex_lock(&adev->lock_reset);
+   if (down_read_killable(&adev->reset_sem))
+   return -EINTR;
 
/* hold on the scheduler */
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1269,7 +1273,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, 
void *data)
kthread_unpark(ring->sched.thread);
}
 
-   mutex_unlock(&adev->lock_reset);
+   up_read(&adev->reset_sem);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1459,7 +1463,10 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   mutex_lock(&adev->lock_reset);
+   if (down_read_killable(&adev->reset_sem)) {
+   kfree(fences);
+   return -EINTR;
+   }
 
/* stop the scheduler */
kthread_park(ring->sched.thread);
@@ -1500,7 +1507,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
 
-   mutex_unlock(&adev->lock_reset);
+   up_read(&adev->reset_sem);
 
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 78fd2c9a7b7d..82242e2f5658 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
atomic_set(&adev->in_gpu_reset, 0);
-   mutex_init(&adev->lock_reset);
+   init_rwsem(&adev->reset_sem);
mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock);
 
@@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device 
*adev)
if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
return false;
 
-   mutex_lock(&adev->lock_reset);
+   down_write(&adev->reset_sem);
 
atomic_inc(&adev->gpu_reset_counter);
switch (amdgpu_asic_reset_method(adev)) {
@@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct 
amdgpu_device *adev)
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
atomic_set(&adev->in_gpu_reset, 0);
-   mutex_unlock(&adev->lock_reset);
+   up_write(&adev->reset_sem);
 }
 
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f27d83f2de78..8ac63f13fc6f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -238,19 +238,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, 
virt);
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-   int locked;
 
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 * otherwise the mailbox msg will be ruined/reseted by
 * the VF FLR.
-*
-* we can unlock the lock_reset to allow "amdgpu_job_timedout"
-* to run gpu_recover() aft

[PATCH v2] drm/amdgpu: refine codes to avoid reentering GPU recovery

2020-08-19 Thread Dennis Li
if other threads have holden the reset lock, recovery will
fail to try_lock. Therefore we introduce atomic hive->in_reset
and adev->in_gpu_reset, to avoid reentering GPU recovery.

v2:
drop "? true : false" in the definition of amdgpu_in_reset

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 8ba389780001..c8aec832b244 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -952,7 +952,7 @@ struct amdgpu_device {
boolin_suspend;
boolin_hibernate;
 
-   boolin_gpu_reset;
+   atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
struct mutex  lock_reset;
struct amdgpu_doorbell_index doorbell_index;
@@ -1270,4 +1270,8 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device 
*adev)
return adev->gmc.tmz_enabled;
 }
 
+static inline int amdgpu_in_reset(struct amdgpu_device *adev)
+{
+   return atomic_read(&adev->in_gpu_reset);
+}
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..b872cdb0b705 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
 #if 0
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index 0b7e78748540..832a200bb62f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long flags, end_jiffies;
int retry;
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index ccd635b812b5..d0940121a6a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
int retry;
struct vi_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 4cd851fc5c82..64fdb6a81c47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -554,7 +554,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v9_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 35fed75a4397..79b397800cbc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct 
file *file, struct poll_
 
poll_wait(file, &adev->autodump.gpu_hang, poll_table);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return POLLIN | POLLRDNORM | POLLWRNORM;
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6573e1112462..78fd2c9a7b7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device 
*adev)
if (adev->ip_blocks[i].status.hw == true)
break;
 
-   if (adev->in_gpu_reset || adev->in_suspend) {
+   if (amdgpu_in_reset(adev) || adev->in_suspend) {
r = 
adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
DRM_ERROR("resume of IP block <%s> 
failed %d\n",
@@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct 
amdgpu_device *adev)
AMDGPU_RESET_MAGIC_NUM))
return true;
 
-   if (!adev->in_gpu_reset)
+   if (!amdgpu_in_r

[PATCH] drm/amdgpu: refine codes to avoid reentering GPU recovery

2020-08-19 Thread Dennis Li
if other threads have holden the reset lock, recovery will
fail to try_lock. Therefore we introduce atomic hive->in_reset
and adev->in_gpu_reset, to avoid reentering GPU recovery.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 8ba389780001..0fba65efdb48 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -952,7 +952,7 @@ struct amdgpu_device {
boolin_suspend;
boolin_hibernate;
 
-   boolin_gpu_reset;
+   atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
struct mutex  lock_reset;
struct amdgpu_doorbell_index doorbell_index;
@@ -1270,4 +1270,8 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device 
*adev)
return adev->gmc.tmz_enabled;
 }
 
+static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
+{
+   return atomic_read(&adev->in_gpu_reset) ? true : false;
+}
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..b872cdb0b705 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
 #if 0
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index 0b7e78748540..832a200bb62f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long flags, end_jiffies;
int retry;
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index ccd635b812b5..d0940121a6a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
int retry;
struct vi_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 4cd851fc5c82..64fdb6a81c47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -554,7 +554,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v9_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 35fed75a4397..79b397800cbc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct 
file *file, struct poll_
 
poll_wait(file, &adev->autodump.gpu_hang, poll_table);
 
-   if (adev->in_gpu_reset)
+   if (amdgpu_in_reset(adev))
return POLLIN | POLLRDNORM | POLLWRNORM;
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6573e1112462..78fd2c9a7b7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device 
*adev)
if (adev->ip_blocks[i].status.hw == true)
break;
 
-   if (adev->in_gpu_reset || adev->in_suspend) {
+   if (amdgpu_in_reset(adev) || adev->in_suspend) {
r = 
adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
DRM_ERROR("resume of IP block <%s> 
failed %d\n",
@@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct 
amdgpu_device *adev)
AMDGPU_RESET_MAGIC_NUM))
return true;
 
-   if (!adev->in_gpu_reset)
+   if (!amdgpu_in_reset(adev))
return false;
 
/*
@

[PATCH v3] drm/amdgpu: refine create and release logic of hive info

2020-08-18 Thread Dennis Li
Change to dynamically create and release hive info object,
which help driver support more hives in the future.

v2:
Change to save hive object pointer in adev, to avoid locking
xgmi_mutex every time when calling amdgpu_get_xgmi_hive.

v3:
1. Change type of hive object pointer in adev from void* to
amdgpu_hive_info*.
2. remove unnecessary variable initialization.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 98d0c6e5ab3c..e25f952d8836 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -251,6 +251,7 @@ struct amdgpu_fpriv;
 struct amdgpu_bo_va_mapping;
 struct amdgpu_atif;
 struct kfd_vm_fault_info;
+struct amdgpu_hive_info;
 
 enum amdgpu_cp_irq {
AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP = 0,
@@ -730,7 +731,7 @@ struct amdgpu_device {
 #ifdef CONFIG_DRM_AMD_ACP
struct amdgpu_acp   acp;
 #endif
-
+   struct amdgpu_hive_info *hive;
/* ASIC */
enum amd_asic_type  asic_type;
uint32_tfamily;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f323281c82b0..bc6ef0caf157 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2857,7 +2857,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
 {
struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work);
-   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
/* It's a bug to not have a hive within this function */
if (WARN_ON(!hive))
@@ -2895,6 +2895,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 adev->asic_reset_res, adev->ddev->unique);
+   amdgpu_put_xgmi_hive(hive);
 }
 
 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
@@ -4315,7 +4316,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 * We always reset all schedulers for device and all devices for XGMI
 * hive so that should take care of them too.
 */
-   hive = amdgpu_get_xgmi_hive(adev, false);
+   hive = amdgpu_get_xgmi_hive(adev);
if (hive) {
if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as 
another already in progress",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bf71f0a58786..18cdd259d568 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1555,9 +1555,10 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
struct amdgpu_device *remote_adev = NULL;
struct amdgpu_device *adev = ras->adev;
struct list_head device_list, *device_list_handle =  NULL;
-   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
 
if (!ras->disable_ras_err_cnt_harvest) {
+   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+
/* Build list of devices to query RAS related errors */
if  (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
device_list_handle = >device_list;
@@ -1570,6 +1571,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
list_for_each_entry(remote_adev,
device_list_handle, gmc.xgmi.head)
amdgpu_ras_log_on_err_counter(remote_adev);
+
+   amdgpu_put_xgmi_hive(hive);
}
 
if (amdgpu_device_should_recover_gpu(ras->adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 7a61dc6738eb..08ed4dddfaf1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -35,11 +35,9 @@
 
 static DEFINE_MUTEX(xgmi_mutex);
 
-#define AMDGPU_MAX_XGMI_HIVE   8
 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4
 
-static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
-static unsigned hive_count = 0;
+static LIST_HEAD(xgmi_hive_list);
 
 static const int xgmi_pcs_err_status_reg_vg20[] = {
smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
@@ -171,59 +169,47 @@ static const struct amdgpu_pcs_ras_field 
wafl_pcs_ras_fields[] = {
  *
  */
 
+static struct attribute amdgpu_xgmi_hive_id = {
+   .name = "xgmi_hive_id",
+   .mode = S_IRUGO
+};
 
-static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
-   struct device_attribute *attr, char *buf)
-{
-

[PATCH v2] drm/amdgpu: refine create and release logic of hive info

2020-08-18 Thread Dennis Li
Change to dynamically create and release hive info object,
which help driver support more hives in the future.

v2:
Change to save hive object pointer in adev, to avoid locking
xgmi_mutex every time when calling amdgpu_get_xgmi_hive.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 98d0c6e5ab3c..894886d6381b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -730,7 +730,7 @@ struct amdgpu_device {
 #ifdef CONFIG_DRM_AMD_ACP
struct amdgpu_acp   acp;
 #endif
-
+   void*hive;
/* ASIC */
enum amd_asic_type  asic_type;
uint32_tfamily;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f323281c82b0..bc6ef0caf157 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2857,7 +2857,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
 {
struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work);
-   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
/* It's a bug to not have a hive within this function */
if (WARN_ON(!hive))
@@ -2895,6 +2895,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 adev->asic_reset_res, adev->ddev->unique);
+   amdgpu_put_xgmi_hive(hive);
 }
 
 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
@@ -4315,7 +4316,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 * We always reset all schedulers for device and all devices for XGMI
 * hive so that should take care of them too.
 */
-   hive = amdgpu_get_xgmi_hive(adev, false);
+   hive = amdgpu_get_xgmi_hive(adev);
if (hive) {
if (atomic_cmpxchg(>in_reset, 0, 1) != 0) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as 
another already in progress",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bf71f0a58786..18cdd259d568 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1555,9 +1555,10 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
struct amdgpu_device *remote_adev = NULL;
struct amdgpu_device *adev = ras->adev;
struct list_head device_list, *device_list_handle =  NULL;
-   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
 
if (!ras->disable_ras_err_cnt_harvest) {
+   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+
/* Build list of devices to query RAS related errors */
if  (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
device_list_handle = >device_list;
@@ -1570,6 +1571,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
list_for_each_entry(remote_adev,
device_list_handle, gmc.xgmi.head)
amdgpu_ras_log_on_err_counter(remote_adev);
+
+   amdgpu_put_xgmi_hive(hive);
}
 
if (amdgpu_device_should_recover_gpu(ras->adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 7a61dc6738eb..c6bd5f0c1339 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -35,11 +35,9 @@
 
 static DEFINE_MUTEX(xgmi_mutex);
 
-#define AMDGPU_MAX_XGMI_HIVE   8
 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4
 
-static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
-static unsigned hive_count = 0;
+static LIST_HEAD(xgmi_hive_list);
 
 static const int xgmi_pcs_err_status_reg_vg20[] = {
smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
@@ -171,59 +169,47 @@ static const struct amdgpu_pcs_ras_field 
wafl_pcs_ras_fields[] = {
  *
  */
 
+static struct attribute amdgpu_xgmi_hive_id = {
+   .name = "xgmi_hive_id",
+   .mode = S_IRUGO
+};
 
-static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
-   struct device_attribute *attr, char *buf)
-{
-   struct amdgpu_hive_info *hive =
-   container_of(attr, struct amdgpu_hive_info, dev_attr);
-
-   return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
-}
+static struct attribute *amdgpu_xgmi_hive_attrs[] = {
+   &amdgpu_xgmi_hive_id,
+   NULL
+};
 
-static int amdgpu_xgmi_sysfs_create(struct am

[PATCH] drm/amdgpu: refine create and release logic of hive info

2020-08-17 Thread Dennis Li
Change to dynamically create and release hive info object,
which help driver support more hives in the future.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8a55b0bc044a..fdfdc2f678c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2840,7 +2840,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
 {
struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work);
-   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
/* It's a bug to not have a hive within this function */
if (WARN_ON(!hive))
@@ -2878,6 +2878,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 adev->asic_reset_res, adev->ddev->unique);
+   amdgpu_put_xgmi_hive(hive);
 }
 
 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
@@ -4286,11 +4287,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
*adev,
 * We always reset all schedulers for device and all devices for XGMI
 * hive so that should take care of them too.
 */
-   hive = amdgpu_get_xgmi_hive(adev, false);
+   hive = amdgpu_get_xgmi_hive(adev);
if (hive) {
if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as 
another already in progress",
job ? job->base.id : -1, hive->hive_id);
+   amdgpu_put_xgmi_hive(hive);
return 0;
}
mutex_lock(>hive_lock);
@@ -4456,6 +4458,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (hive) {
atomic_set(&hive->in_reset, 0);
mutex_unlock(&hive->hive_lock);
+   amdgpu_put_xgmi_hive(hive);
}
 
if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5680f7eafcb1..e18606e322e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1514,7 +1514,7 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
struct amdgpu_device *remote_adev = NULL;
struct amdgpu_device *adev = ras->adev;
struct list_head device_list, *device_list_handle =  NULL;
-   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
+   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
/* Build list of devices to query RAS related errors */
if  (hive && adev->gmc.xgmi.num_physical_nodes > 1)
@@ -1525,6 +1525,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
device_list_handle = _list;
}
 
+   amdgpu_put_xgmi_hive(hive);
+
list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) {
amdgpu_ras_log_on_err_counter(remote_adev);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 67a756f4337b..5315d16539f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -35,11 +35,9 @@
 
 static DEFINE_MUTEX(xgmi_mutex);
 
-#define AMDGPU_MAX_XGMI_HIVE   8
 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4
 
-static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
-static unsigned hive_count = 0;
+static LIST_HEAD(xgmi_hive_list);
 
 static const int xgmi_pcs_err_status_reg_vg20[] = {
smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
@@ -171,59 +169,47 @@ static const struct amdgpu_pcs_ras_field 
wafl_pcs_ras_fields[] = {
  *
  */
 
+static struct attribute amdgpu_xgmi_hive_id = {
+   .name = "xgmi_hive_id",
+   .mode = S_IRUGO
+};
 
-static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
-   struct device_attribute *attr, char *buf)
-{
-   struct amdgpu_hive_info *hive =
-   container_of(attr, struct amdgpu_hive_info, dev_attr);
-
-   return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
-}
+static struct attribute *amdgpu_xgmi_hive_attrs[] = {
+   &amdgpu_xgmi_hive_id,
+   NULL
+};
 
-static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
-   struct amdgpu_hive_info *hive)
+static ssize_t amdgpu_xgmi_show_hive_id(struct kobject *kobj,
+   struct attribute *attr, char *buf)
 {
-   int ret = 0;
-
-   if (WARN_ON(hive->kobj))
-   return -EINVAL;
-
-   hive->kobj = kobject_create_and_add(&q

[PATCH] drm/amdgpu: fix a potential circular locking dependency

2020-08-11 Thread Dennis Li
[  653.902305] ==
[  653.902928] WARNING: possible circular locking dependency detected
[  653.903517] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
[  653.904098] --
[  653.904675] amdgpu_test/3975 is trying to acquire lock:
[  653.905241] 97848f8647a0 (>reset_sem){.+.+}, at: 
amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu]
[  653.905953]
   but task is already holding lock:
[  653.907087] 9744adbee1f8 (reservation_ww_class_mutex){+.+.}, at: 
ttm_eu_reserve_buffers+0x1ae/0x520 [ttm]
[  653.907694]
   which lock already depends on the new lock.

[  653.909423]
   the existing dependency chain (in reverse order) is:
[  653.910594]
   -> #1 (reservation_ww_class_mutex){+.+.}:
[  653.911759]__ww_mutex_lock.constprop.15+0xca/0x1120
[  653.912350]ww_mutex_lock+0x73/0x80
[  653.913044]amdgpu_amdkfd_alloc_gtt_mem+0xde/0x380 [amdgpu]
[  653.913724]kgd2kfd_device_init+0x13f/0x5e0 [amdgpu]
[  653.914388]amdgpu_amdkfd_device_init+0x155/0x190 [amdgpu]
[  653.915033]amdgpu_device_init+0x1303/0x1e10 [amdgpu]
[  653.915685]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
[  653.916349]amdgpu_pci_probe+0x11d/0x200 [amdgpu]
[  653.916959]local_pci_probe+0x47/0xa0
[  653.917570]work_for_cpu_fn+0x1a/0x30
[  653.918184]process_one_work+0x29e/0x630
[  653.918803]worker_thread+0x22b/0x3f0
[  653.919427]kthread+0x12f/0x150
[  653.920047]ret_from_fork+0x3a/0x50
[  653.920661]
   -> #0 (>reset_sem){.+.+}:
[  653.921893]__lock_acquire+0x13ec/0x16e0
[  653.922531]lock_acquire+0xb8/0x1c0
[  653.923174]down_read+0x48/0x230
[  653.923886]amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu]
[  653.924588]drm_ioctl_kernel+0xb6/0x100 [drm]
[  653.925283]drm_ioctl+0x389/0x450 [drm]
[  653.926013]amdgpu_drm_ioctl+0x4f/0x80 [amdgpu]
[  653.926686]ksys_ioctl+0x98/0xb0
[  653.927357]__x64_sys_ioctl+0x1a/0x20
[  653.928030]do_syscall_64+0x5f/0x250
[  653.928697]entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  653.929373]
   other info that might help us debug this:

[  653.931356]  Possible unsafe locking scenario:

[  653.932647]CPU0CPU1
[  653.933287]
[  653.933911]   lock(reservation_ww_class_mutex);
[  653.934530]lock(>reset_sem);
[  653.935154]lock(reservation_ww_class_mutex);
[  653.935766]   lock(>reset_sem);
[  653.936360]
*** DEADLOCK ***

[  653.938028] 2 locks held by amdgpu_test/3975:
[  653.938574]  #0: b2a862d6bcd0 (reservation_ww_class_acquire){+.+.}, at: 
amdgpu_gem_va_ioctl+0x39b/0x4f0 [amdgpu]
[  653.939233]  #1: 9744adbee1f8 (reservation_ww_class_mutex){+.+.}, at: 
ttm_eu_reserve_buffers+0x1ae/0x520 [ttm]

change the order of reservation_ww_class_mutex and adev->reset_sem in
amdgpu_gem_va_ioctl the same as ones in amdgpu_amdkfd_alloc_gtt_mem, to
avoid potential dead lock.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index ee1e8fff83b2..fc889c477696 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -652,6 +652,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
abo = NULL;
}
 
+   down_read(&adev->reset_sem);
+
amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd);

r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates);
@@ -670,8 +672,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
bo_va = NULL;
}
 
-   down_read(&adev->reset_sem);
-
switch (args->operation) {
case AMDGPU_VA_OP_MAP:
va_flags = amdgpu_gem_va_map_flags(adev, args->flags);
@@ -701,12 +701,11 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void 
*data,
amdgpu_gem_va_update_vm(adev, >vm, bo_va,
args->operation);
 
-   up_read(&adev->reset_sem);
-
 error_backoff:
ttm_eu_backoff_reservation(&ticket, &list);
 
 error_unref:
+   up_read(&adev->reset_sem);
drm_gem_object_put_unlocked(gobj);
return r;
 }
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: refine create and release logic of hive info

2020-08-11 Thread Dennis Li
Change to dynamically create and release hive info object,
which help driver support more hives in the future.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8a55b0bc044a..fdfdc2f678c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2840,7 +2840,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
 {
struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work);
-   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
/* It's a bug to not have a hive within this function */
if (WARN_ON(!hive))
@@ -2878,6 +2878,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 adev->asic_reset_res, adev->ddev->unique);
+   amdgpu_put_xgmi_hive(hive);
 }
 
 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
@@ -4286,11 +4287,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
*adev,
 * We always reset all schedulers for device and all devices for XGMI
 * hive so that should take care of them too.
 */
-   hive = amdgpu_get_xgmi_hive(adev, false);
+   hive = amdgpu_get_xgmi_hive(adev);
if (hive) {
if (atomic_cmpxchg(>in_reset, 0, 1) != 0) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as 
another already in progress",
job ? job->base.id : -1, hive->hive_id);
+   amdgpu_put_xgmi_hive(hive);
return 0;
}
mutex_lock(>hive_lock);
@@ -4456,6 +4458,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (hive) {
atomic_set(>in_reset, 0);
mutex_unlock(>hive_lock);
+   amdgpu_put_xgmi_hive(hive);
}
 
if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5680f7eafcb1..e18606e322e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1514,7 +1514,7 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
struct amdgpu_device *remote_adev = NULL;
struct amdgpu_device *adev = ras->adev;
struct list_head device_list, *device_list_handle =  NULL;
-   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
+   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
/* Build list of devices to query RAS related errors */
if  (hive && adev->gmc.xgmi.num_physical_nodes > 1)
@@ -1525,6 +1525,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
device_list_handle = _list;
}
 
+   amdgpu_put_xgmi_hive(hive);
+
list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) {
amdgpu_ras_log_on_err_counter(remote_adev);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 67a756f4337b..5315d16539f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -35,11 +35,9 @@
 
 static DEFINE_MUTEX(xgmi_mutex);
 
-#define AMDGPU_MAX_XGMI_HIVE   8
 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4
 
-static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
-static unsigned hive_count = 0;
+static LIST_HEAD(xgmi_hive_list);
 
 static const int xgmi_pcs_err_status_reg_vg20[] = {
smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
@@ -171,59 +169,47 @@ static const struct amdgpu_pcs_ras_field 
wafl_pcs_ras_fields[] = {
  *
  */
 
+static struct attribute amdgpu_xgmi_hive_id = {
+   .name = "xgmi_hive_id",
+   .mode = S_IRUGO
+};
 
-static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
-   struct device_attribute *attr, char *buf)
-{
-   struct amdgpu_hive_info *hive =
-   container_of(attr, struct amdgpu_hive_info, dev_attr);
-
-   return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
-}
+static struct attribute *amdgpu_xgmi_hive_attrs[] = {
+   _xgmi_hive_id,
+   NULL
+};
 
-static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
-   struct amdgpu_hive_info *hive)
+static ssize_t amdgpu_xgmi_show_hive_id(struct kobject *kobj,
+   struct attribute *attr, char *buf)
 {
-   int ret = 0;
-
-   if (WARN_ON(hive->kobj))
-   return -EINVAL;
-
-   hive->kobj = kobject_create_and_add(&q

[PATCH v3] drm/amdgpu: annotate a false positive recursive locking

2020-08-10 Thread Dennis Li
[  584.110304] 
[  584.110590] WARNING: possible recursive locking detected
[  584.110876] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
[  584.64] 
[  584.111456] kworker/38:1/553 is trying to acquire lock:
[  584.111721] 9b15ff0a47a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.112112]
   but task is already holding lock:
[  584.112673] 9b1603d247a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.113068]
   other info that might help us debug this:
[  584.113689]  Possible unsafe locking scenario:

[  584.114350]CPU0
[  584.114685]
[  584.115014]   lock(>reset_sem);
[  584.115349]   lock(>reset_sem);
[  584.115678]
*** DEADLOCK ***

[  584.116624]  May be due to missing lock nesting notation

[  584.117284] 4 locks held by kworker/38:1/553:
[  584.117616]  #0: 9ad635c1d348 ((wq_completion)events){+.+.}, at: 
process_one_work+0x21f/0x630
[  584.117967]  #1: ac708e1c3e58 
((work_completion)(>recovery_work)){+.+.}, at: process_one_work+0x21f/0x630
[  584.118358]  #2: c1c2a5d0 (>hive_lock){+.+.}, at: 
amdgpu_device_gpu_recover+0xae/0x1030 [amdgpu]
[  584.118786]  #3: 9b1603d247a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.119222]
   stack backtrace:
[  584.119990] CPU: 38 PID: 553 Comm: kworker/38:1 Kdump: loaded Tainted: G 
  OE 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1
[  584.120782] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 
05/23/2019
[  584.121223] Workqueue: events amdgpu_ras_do_recovery [amdgpu]
[  584.121638] Call Trace:
[  584.122050]  dump_stack+0x98/0xd5
[  584.122499]  __lock_acquire+0x1139/0x16e0
[  584.122931]  ? trace_hardirqs_on+0x3b/0xf0
[  584.123358]  ? cancel_delayed_work+0xa6/0xc0
[  584.123771]  lock_acquire+0xb8/0x1c0
[  584.124197]  ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.124599]  down_write+0x49/0x120
[  584.125032]  ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.125472]  amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.125910]  ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu]
[  584.126367]  amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[  584.126789]  process_one_work+0x29e/0x630
[  584.127208]  worker_thread+0x3c/0x3f0
[  584.127621]  ? __kthread_parkme+0x61/0x90
[  584.128014]  kthread+0x12f/0x150
[  584.128402]  ? process_one_work+0x630/0x630
[  584.128790]  ? kthread_park+0x90/0x90
[  584.129174]  ret_from_fork+0x3a/0x50

Each adev owns its own lock_class_key to avoid a false positive
recursive locking warning.

v2:
1. register adev->lock_key into lockdep, otherwise lockdep will
report the below warning

[ 1216.705820] BUG: key 890183b647d0 has not been registered!
[ 1216.705924] [ cut here ]
[ 1216.705972] DEBUG_LOCKS_WARN_ON(1)
[ 1216.705997] WARNING: CPU: 20 PID: 541 at kernel/locking/lockdep.c:3743 
lockdep_init_map+0x150/0x210

v3:
change to use down_write_nest_lock to annotate the false dead-lock
warning.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62ecac97fbd2..8a55b0bc044a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4145,12 +4145,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
return r;
 }
 
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev)
+static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, struct 
amdgpu_hive_info *hive)
 {
if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0)
return false;
 
-   down_write(>reset_sem);
+   if (hive) {
+   down_write_nest_lock(>reset_sem, >hive_lock);
+   } else
+   down_write(>reset_sem);
 
atomic_inc(>gpu_reset_counter);
switch (amdgpu_asic_reset_method(adev)) {
@@ -4312,7 +4315,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-   if (!amdgpu_device_lock_adev(tmp_adev)) {
+   if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
DRM_INFO("Bailing on TDR for s_job:%llx, as another 
already in progress",
  job ? job->base.id : -1);
r = 0;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH v2] drm/amdgpu: annotate a false positive recursive locking

2020-08-07 Thread Dennis Li
[  584.110304] 
[  584.110590] WARNING: possible recursive locking detected
[  584.110876] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
[  584.64] 
[  584.111456] kworker/38:1/553 is trying to acquire lock:
[  584.111721] 9b15ff0a47a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.112112]
   but task is already holding lock:
[  584.112673] 9b1603d247a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.113068]
   other info that might help us debug this:
[  584.113689]  Possible unsafe locking scenario:

[  584.114350]CPU0
[  584.114685]
[  584.115014]   lock(>reset_sem);
[  584.115349]   lock(>reset_sem);
[  584.115678]
*** DEADLOCK ***

[  584.116624]  May be due to missing lock nesting notation

[  584.117284] 4 locks held by kworker/38:1/553:
[  584.117616]  #0: 9ad635c1d348 ((wq_completion)events){+.+.}, at: 
process_one_work+0x21f/0x630
[  584.117967]  #1: ac708e1c3e58 
((work_completion)(>recovery_work)){+.+.}, at: process_one_work+0x21f/0x630
[  584.118358]  #2: c1c2a5d0 (>hive_lock){+.+.}, at: 
amdgpu_device_gpu_recover+0xae/0x1030 [amdgpu]
[  584.118786]  #3: 9b1603d247a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.119222]
   stack backtrace:
[  584.119990] CPU: 38 PID: 553 Comm: kworker/38:1 Kdump: loaded Tainted: G 
  OE 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1
[  584.120782] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 
05/23/2019
[  584.121223] Workqueue: events amdgpu_ras_do_recovery [amdgpu]
[  584.121638] Call Trace:
[  584.122050]  dump_stack+0x98/0xd5
[  584.122499]  __lock_acquire+0x1139/0x16e0
[  584.122931]  ? trace_hardirqs_on+0x3b/0xf0
[  584.123358]  ? cancel_delayed_work+0xa6/0xc0
[  584.123771]  lock_acquire+0xb8/0x1c0
[  584.124197]  ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.124599]  down_write+0x49/0x120
[  584.125032]  ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.125472]  amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.125910]  ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu]
[  584.126367]  amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[  584.126789]  process_one_work+0x29e/0x630
[  584.127208]  worker_thread+0x3c/0x3f0
[  584.127621]  ? __kthread_parkme+0x61/0x90
[  584.128014]  kthread+0x12f/0x150
[  584.128402]  ? process_one_work+0x630/0x630
[  584.128790]  ? kthread_park+0x90/0x90
[  584.129174]  ret_from_fork+0x3a/0x50

Each adev owns its own lock_class_key to avoid a false positive
recursive locking warning.

v2:
1. register adev->lock_key into lockdep, otherwise lockdep will
report the below warning

[ 1216.705820] BUG: key 890183b647d0 has not been registered!
[ 1216.705924] [ cut here ]
[ 1216.705972] DEBUG_LOCKS_WARN_ON(1)
[ 1216.705997] WARNING: CPU: 20 PID: 541 at kernel/locking/lockdep.c:3743 
lockdep_init_map+0x150/0x210

Signed-off-by: Dennis Li 
Change-Id: I7571efeccbf15483982031d00504a353031a854a

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e97c088d03b3..766dc8f8c8a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -967,6 +967,7 @@ struct amdgpu_device {
atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
struct rw_semaphore reset_sem;
+   struct lock_class_key lock_key;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62ecac97fbd2..ae0a576f9895 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3037,6 +3037,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(>virt.vf_errors.lock);
hash_init(adev->mn_hash);
init_rwsem(>reset_sem);
+   lockdep_register_key(>lock_key);
+   lockdep_set_class(>reset_sem, >lock_key);
atomic_set(>in_gpu_reset, 0);
mutex_init(>psp.mutex);
mutex_init(>notifier_lock);
@@ -3411,6 +3413,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
amdgpu_pmu_fini(adev);
if (adev->discovery_bin)
amdgpu_discovery_fini(adev);
+
+   lockdep_unregister_key(>lock_key);
 }
 
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: annotate a false positive recursive locking

2020-08-06 Thread Dennis Li
[  584.110304] 
[  584.110590] WARNING: possible recursive locking detected
[  584.110876] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
[  584.64] 
[  584.111456] kworker/38:1/553 is trying to acquire lock:
[  584.111721] 9b15ff0a47a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.112112]
   but task is already holding lock:
[  584.112673] 9b1603d247a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.113068]
   other info that might help us debug this:
[  584.113689]  Possible unsafe locking scenario:

[  584.114350]CPU0
[  584.114685]
[  584.115014]   lock(>reset_sem);
[  584.115349]   lock(>reset_sem);
[  584.115678]
*** DEADLOCK ***

[  584.116624]  May be due to missing lock nesting notation

[  584.117284] 4 locks held by kworker/38:1/553:
[  584.117616]  #0: 9ad635c1d348 ((wq_completion)events){+.+.}, at: 
process_one_work+0x21f/0x630
[  584.117967]  #1: ac708e1c3e58 
((work_completion)(>recovery_work)){+.+.}, at: process_one_work+0x21f/0x630
[  584.118358]  #2: c1c2a5d0 (>hive_lock){+.+.}, at: 
amdgpu_device_gpu_recover+0xae/0x1030 [amdgpu]
[  584.118786]  #3: 9b1603d247a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.119222]
   stack backtrace:
[  584.119990] CPU: 38 PID: 553 Comm: kworker/38:1 Kdump: loaded Tainted: G 
  OE 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1
[  584.120782] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 
05/23/2019
[  584.121223] Workqueue: events amdgpu_ras_do_recovery [amdgpu]
[  584.121638] Call Trace:
[  584.122050]  dump_stack+0x98/0xd5
[  584.122499]  __lock_acquire+0x1139/0x16e0
[  584.122931]  ? trace_hardirqs_on+0x3b/0xf0
[  584.123358]  ? cancel_delayed_work+0xa6/0xc0
[  584.123771]  lock_acquire+0xb8/0x1c0
[  584.124197]  ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.124599]  down_write+0x49/0x120
[  584.125032]  ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.125472]  amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu]
[  584.125910]  ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu]
[  584.126367]  amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[  584.126789]  process_one_work+0x29e/0x630
[  584.127208]  worker_thread+0x3c/0x3f0
[  584.127621]  ? __kthread_parkme+0x61/0x90
[  584.128014]  kthread+0x12f/0x150
[  584.128402]  ? process_one_work+0x630/0x630
[  584.128790]  ? kthread_park+0x90/0x90
[  584.129174]  ret_from_fork+0x3a/0x50

Each adev owns its own lock_class_key to avoid a false positive
recursive locking warning.

Signed-off-by: Dennis Li 
Change-Id: I7571efeccbf15483982031d00504a353031a854a

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e97c088d03b3..766dc8f8c8a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -967,6 +967,7 @@ struct amdgpu_device {
atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
struct rw_semaphore reset_sem;
+   struct lock_class_key lock_key;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6c572db42d92..d78df9312d34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3037,6 +3037,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(>virt.vf_errors.lock);
hash_init(adev->mn_hash);
init_rwsem(>reset_sem);
+   lockdep_set_class(>reset_sem, >lock_key);
atomic_set(>in_gpu_reset, 0);
mutex_init(>psp.mutex);
mutex_init(>notifier_lock);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH v2] drm/amdgpu: unlock mutex on error

2020-08-05 Thread Dennis Li
Make sure to unlock the mutex when an error happens

v2:
1. correct syntax error in the commit comment
2. remove change-Id

Acked-by: Nirmoy Das 
Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index a0ea663ecdbc..5e5369abc6fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -632,13 +632,14 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
kgd_engine_type engine,
}
 
ret = amdgpu_ib_schedule(ring, 1, ib, job, );
+
+   up_read(>reset_sem);
+
if (ret) {
DRM_ERROR("amdgpu: failed to schedule IB.\n");
goto err_ib_sched;
}
 
-   up_read(>reset_sem);
-
ret = dma_fence_wait(f, false);
 
 err_ib_sched:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 4e017f379eb6..67a756f4337b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -545,7 +545,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
}
ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
if (ret)
-   goto exit;
+   goto exit_unlock;
}
 
/* get latest topology info for each device from psp */
@@ -558,7 +558,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
tmp_adev->gmc.xgmi.node_id,
tmp_adev->gmc.xgmi.hive_id, ret);
/* To do : continue with some node failed or 
disable the whole hive */
-   goto exit;
+   goto exit_unlock;
}
}
}
@@ -566,7 +566,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
if (!ret)
ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
 
-
+exit_unlock:
mutex_unlock(>hive_lock);
 exit:
if (!ret)
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: annotate a false positive locking dependency

2020-08-05 Thread Dennis Li
[  264.483189] ==
[  264.483487] WARNING: possible circular locking dependency detected
[  264.483781] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G   OE
[  264.484076] --
[  264.484370] kworker/39:1/567 is trying to acquire lock:
[  264.484663] c15df4b0 (mgpu_info.mutex){+.+.}, at: 
amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu]
[  264.485081]
   but task is already holding lock:
[  264.485670] 965fd31647a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]
[  264.486074]
   which lock already depends on the new lock.

[  264.487043]
   the existing dependency chain (in reverse order) is:
[  264.487710]
   -> #3 (>reset_sem){}:
[  264.488400]down_write+0x49/0x120
[  264.488783]amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]
[  264.489179]amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[  264.489544]process_one_work+0x29e/0x630
[  264.489910]worker_thread+0x3c/0x3f0
[  264.490279]kthread+0x12f/0x150
[  264.490649]ret_from_fork+0x3a/0x50
[  264.491020]
   -> #2 (>hive_lock){+.+.}:
[  264.491764]__mutex_lock+0x95/0xa20
[  264.492137]mutex_lock_nested+0x1b/0x20
[  264.492553]amdgpu_get_xgmi_hive+0x352/0x400 [amdgpu]
[  264.492972]amdgpu_xgmi_add_device+0xb8/0x460 [amdgpu]
[  264.493387]amdgpu_device_init+0x12fb/0x1e10 [amdgpu]
[  264.493807]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
[  264.494226]amdgpu_pci_probe+0x11d/0x200 [amdgpu]
[  264.494617]local_pci_probe+0x47/0xa0
[  264.494998]work_for_cpu_fn+0x1a/0x30
[  264.495369]process_one_work+0x29e/0x630
[  264.495746]worker_thread+0x22b/0x3f0
[  264.496124]kthread+0x12f/0x150
[  264.496504]ret_from_fork+0x3a/0x50
[  264.496876]
   -> #1 (xgmi_mutex){+.+.}:
[  264.497596]__mutex_lock+0x95/0xa20
[  264.497954]mutex_lock_nested+0x1b/0x20
[  264.498346]amdgpu_get_xgmi_hive+0x38/0x400 [amdgpu]
[  264.498741]amdgpu_xgmi_set_pstate+0x10/0x20 [amdgpu]
[  264.499126]amdgpu_device_ip_late_init+0x219/0x230 [amdgpu]
[  264.499506]amdgpu_device_init+0x1401/0x1e10 [amdgpu]
[  264.499886]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
[  264.500264]amdgpu_pci_probe+0x11d/0x200 [amdgpu]
[  264.500608]local_pci_probe+0x47/0xa0
[  264.500945]work_for_cpu_fn+0x1a/0x30
[  264.501276]process_one_work+0x29e/0x630
[  264.501603]worker_thread+0x22b/0x3f0
[  264.501927]kthread+0x12f/0x150
[  264.502239]ret_from_fork+0x3a/0x50
[  264.502541]
   -> #0 (mgpu_info.mutex){+.+.}:
[  264.503126]__lock_acquire+0x13ec/0x16e0
[  264.503411]lock_acquire+0xb8/0x1c0
[  264.503693]__mutex_lock+0x95/0xa20
[  264.504019]mutex_lock_nested+0x1b/0x20
[  264.504354]amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu]
[  264.504691]amdgpu_device_gpu_recover+0x360/0x1030 [amdgpu]
[  264.505029]amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[  264.505334]process_one_work+0x29e/0x630
[  264.505617]worker_thread+0x3c/0x3f0
[  264.505897]kthread+0x12f/0x150
[  264.506176]ret_from_fork+0x3a/0x50
[  264.506453]
   other info that might help us debug this:

[  264.507267] Chain exists of:
 mgpu_info.mutex --> >hive_lock --> >reset_sem

[  264.508102]  Possible unsafe locking scenario:

[  264.508664]CPU0CPU1
[  264.508945]
[  264.509221]   lock(>reset_sem);
[  264.509524]lock(>hive_lock);
[  264.509818]lock(>reset_sem);
[  264.510111]   lock(mgpu_info.mutex);
[  264.510401]
*** DEADLOCK ***

[  264.511224] 4 locks held by kworker/39:1/567:
[  264.511499]  #0: 961ff5c1d348 ((wq_completion)events){+.+.}, at: 
process_one_work+0x21f/0x630
[  264.511793]  #1: afa90e233e58 
((work_completion)(>recovery_work)){+.+.}, at: process_one_work+0x21f/0x630
[  264.512100]  #2: c16245d0 (>hive_lock){+.+.}, at: 
amdgpu_device_gpu_recover+0xb0/0x1030 [amdgpu]
[  264.512450]  #3: 965fd31647a0 (>reset_sem){}, at: 
amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]

Remove the lock(>hive_lock) out of amdgpu_get_xgmi_hive,
to disable its locking dependency on xgmi_mutex.

Signed-off-by: Dennis Li 
Change-Id: I2d9d80ee23f9f9ac6ce9e1b9e5e1b2b3530f5bdd

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62ecac97fbd2..6c572db42d92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2840,7 +2840,7 @@ static 

[PATCH] drm/amdgpu: unlock mutex on error

2020-08-05 Thread Dennis Li
Make sure to unlock the mutex when an error happens

Signed-off-by: Dennis Li 
Change-Id: I6c36a193df5fe70516282d8136b4eadf32d20915

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index a0ea663ecdbc..5e5369abc6fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -632,13 +632,14 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
kgd_engine_type engine,
}
 
ret = amdgpu_ib_schedule(ring, 1, ib, job, );
+
+   up_read(>reset_sem);
+
if (ret) {
DRM_ERROR("amdgpu: failed to schedule IB.\n");
goto err_ib_sched;
}
 
-   up_read(>reset_sem);
-
ret = dma_fence_wait(f, false);
 
 err_ib_sched:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 4e017f379eb6..67a756f4337b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -545,7 +545,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
}
ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
if (ret)
-   goto exit;
+   goto exit_unlock;
}
 
/* get latest topology info for each device from psp */
@@ -558,7 +558,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
tmp_adev->gmc.xgmi.node_id,
tmp_adev->gmc.xgmi.hive_id, ret);
/* To do : continue with some node failed or 
disable the whole hive */
-   goto exit;
+   goto exit_unlock;
}
}
}
@@ -566,7 +566,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
if (!ret)
ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
 
-
+exit_unlock:
mutex_unlock(>hive_lock);
 exit:
if (!ret)
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH v5] drm/amdgpu: fix system hang issue during GPU reset

2020-07-18 Thread Dennis Li
When the GPU hangs, the driver has multiple paths to enter
amdgpu_device_gpu_recover; the atomic adev->in_gpu_reset and
hive->in_reset are used to avoid re-entering GPU recovery.

During GPU reset and resume, it is unsafe for other threads to access
the GPU, which may cause the GPU reset to fail. Therefore the new
rw_semaphore adev->reset_sem is introduced, which protects the GPU from
being accessed by external threads during recovery.

v2:
1. add rwlock for some ioctls, debugfs and file-close function.
2. change to use dqm->is_resetting and dqm_lock for protection in kfd
driver.
3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid
re-enter GPU recovery for the same GPU hang.

v3:
1. change back to use adev->reset_sem to protect kfd callback
functions, because dqm_lock couldn't protect all codes, for example:
free_mqd must be called outside of dqm_lock;

[ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 
05/23/2019
[ 1230.177221] Call Trace:
[ 1230.178249]  dump_stack+0x98/0xd5
[ 1230.179443]  amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu]
[ 1230.180673]  gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu]
[ 1230.181882]  amdgpu_gart_unbind+0xa9/0xe0 [amdgpu]
[ 1230.183098]  amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu]
[ 1230.184239]  ? ttm_bo_put+0x171/0x5f0 [ttm]
[ 1230.185394]  ttm_tt_unbind+0x21/0x40 [ttm]
[ 1230.186558]  ttm_tt_destroy.part.12+0x12/0x60 [ttm]
[ 1230.187707]  ttm_tt_destroy+0x13/0x20 [ttm]
[ 1230.188832]  ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm]
[ 1230.189979]  ttm_bo_put+0x1be/0x5f0 [ttm]
[ 1230.191230]  amdgpu_bo_unref+0x1e/0x30 [amdgpu]
[ 1230.192522]  amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu]
[ 1230.193833]  free_mqd+0x25/0x40 [amdgpu]
[ 1230.195143]  destroy_queue_cpsch+0x1a7/0x270 [amdgpu]
[ 1230.196475]  pqm_destroy_queue+0x105/0x260 [amdgpu]
[ 1230.197819]  kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu]
[ 1230.199154]  kfd_ioctl+0x277/0x500 [amdgpu]
[ 1230.200458]  ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu]
[ 1230.201656]  ? tomoyo_file_ioctl+0x19/0x20
[ 1230.202831]  ksys_ioctl+0x98/0xb0
[ 1230.204004]  __x64_sys_ioctl+0x1a/0x20
[ 1230.205174]  do_syscall_64+0x5f/0x250
[ 1230.206339]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

2. remove try_lock and introduce atomic hive->in_reset, to avoid
re-enter GPU recovery.

v4:
1. remove an unnecessary whitespace change in kfd_chardev.c
2. remove comment codes in amdgpu_device.c
3. add more detailed comment in commit message
4. define a wrap function amdgpu_in_reset

v5:
1. Fix some style issues.

Signed-off-by: Dennis Li 
Reviewed-by: Andrey Grodzovsky 
Reviewed-by: Christian König 
Reviewed-by: Felix Kuehling 
Reviewed-by: Lijo Lazar 
Suggested-by: Luben Tukov 

Change-Id: I7f77a72795462587ed7d5f51fe53a594a0f1f708

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 80f32b3beb88..be8bd3ae783a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -963,9 +963,9 @@ struct amdgpu_device {
boolin_suspend;
boolin_hibernate;
 
-   boolin_gpu_reset;
+   atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct mutex  lock_reset;
+   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
@@ -1280,4 +1280,9 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device 
*adev)
return adev->gmc.tmz_enabled;
 }
 
+static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
+{
+   return atomic_read(>in_gpu_reset) ? true : false;
+}
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1b865fed74ca..a0ea663ecdbc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -244,11 +244,14 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, 
size_t size,
if (cp_mqd_gfx9)
bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;
 
+   if (!down_read_trylock(>reset_sem))
+   return -EIO;
+
r = amdgpu_bo_create(adev, , );
if (r) {
dev_err(adev->dev,
"failed to allocate BO for amdkfd (%d)\n", r);
-   return r;
+   goto err;
}
 
/* map the buffer */
@@ -283,6 +286,7 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t 
size,
 
amdgpu_bo_unreserve(bo);
 
+   up_read(>reset_sem);
return 0;
 
 allocate_mem_kmap_bo_failed:
@@ -291,19 +295,25 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, 
size_t size,
amdgpu_bo_unreserve(bo);
 allocate_mem_reserve_bo_failed:
amdgpu_bo_unref();
-
+err:
+   up_read(>reset_sem);
return r;
 }
 
 void amdgpu_amdkf

[PATCH v4] drm/amdgpu: fix system hang issue during GPU reset

2020-07-16 Thread Dennis Li
When the GPU hangs, the driver has multiple paths to enter
amdgpu_device_gpu_recover; the atomic adev->in_gpu_reset and
hive->in_reset are used to avoid re-entering GPU recovery.

During GPU reset and resume, it is unsafe for other threads to access
the GPU, which may cause the GPU reset to fail. Therefore the new
rw_semaphore adev->reset_sem is introduced, which protects the GPU from
being accessed by external threads during recovery.

v2:
1. add rwlock for some ioctls, debugfs and file-close function.
2. change to use dqm->is_resetting and dqm_lock for protection in kfd
driver.
3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid
re-enter GPU recovery for the same GPU hang.

v3:
1. change back to use adev->reset_sem to protect kfd callback
functions, because dqm_lock couldn't protect all codes, for example:
free_mqd must be called outside of dqm_lock;

[ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 
05/23/2019
[ 1230.177221] Call Trace:
[ 1230.178249]  dump_stack+0x98/0xd5
[ 1230.179443]  amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu]
[ 1230.180673]  gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu]
[ 1230.181882]  amdgpu_gart_unbind+0xa9/0xe0 [amdgpu]
[ 1230.183098]  amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu]
[ 1230.184239]  ? ttm_bo_put+0x171/0x5f0 [ttm]
[ 1230.185394]  ttm_tt_unbind+0x21/0x40 [ttm]
[ 1230.186558]  ttm_tt_destroy.part.12+0x12/0x60 [ttm]
[ 1230.187707]  ttm_tt_destroy+0x13/0x20 [ttm]
[ 1230.188832]  ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm]
[ 1230.189979]  ttm_bo_put+0x1be/0x5f0 [ttm]
[ 1230.191230]  amdgpu_bo_unref+0x1e/0x30 [amdgpu]
[ 1230.192522]  amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu]
[ 1230.193833]  free_mqd+0x25/0x40 [amdgpu]
[ 1230.195143]  destroy_queue_cpsch+0x1a7/0x270 [amdgpu]
[ 1230.196475]  pqm_destroy_queue+0x105/0x260 [amdgpu]
[ 1230.197819]  kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu]
[ 1230.199154]  kfd_ioctl+0x277/0x500 [amdgpu]
[ 1230.200458]  ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu]
[ 1230.201656]  ? tomoyo_file_ioctl+0x19/0x20
[ 1230.202831]  ksys_ioctl+0x98/0xb0
[ 1230.204004]  __x64_sys_ioctl+0x1a/0x20
[ 1230.205174]  do_syscall_64+0x5f/0x250
[ 1230.206339]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

2. remove try_lock and introduce atomic hive->in_reset, to avoid
re-enter GPU recovery.

v4:
1. remove an unnecessary whitespace change in kfd_chardev.c
2. remove comment codes in amdgpu_device.c
3. add more detailed comment in commit message
4. define a wrap function amdgpu_in_reset

Signed-off-by: Dennis Li 
Reviewed-by: Andrey Grodzovsky 
Reviewed-by: Christian König 
Reviewed-by: Felix Kuehling 
Reviewed-by: Lijo Lazar 

Change-Id: I7f77a72795462587ed7d5f51fe53a594a0f1f708

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 80f32b3beb88..be8bd3ae783a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -963,9 +963,9 @@ struct amdgpu_device {
boolin_suspend;
boolin_hibernate;
 
-   boolin_gpu_reset;
+   atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct mutex  lock_reset;
+   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
@@ -1280,4 +1280,9 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device 
*adev)
return adev->gmc.tmz_enabled;
 }
 
+static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
+{
+   return atomic_read(>in_gpu_reset) ? true : false;
+}
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1b865fed74ca..3fc2229fc533 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -244,11 +244,14 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, 
size_t size,
if (cp_mqd_gfx9)
bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;
 
+   if(!down_read_trylock(>reset_sem))
+   return -EIO;
+
r = amdgpu_bo_create(adev, , );
if (r) {
dev_err(adev->dev,
"failed to allocate BO for amdkfd (%d)\n", r);
-   return r;
+   goto err;
}
 
/* map the buffer */
@@ -283,6 +286,7 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t 
size,
 
amdgpu_bo_unreserve(bo);
 
+   up_read(>reset_sem);
return 0;
 
 allocate_mem_kmap_bo_failed:
@@ -291,19 +295,25 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, 
size_t size,
amdgpu_bo_unreserve(bo);
 allocate_mem_reserve_bo_failed:
amdgpu_bo_unref();
-
+err:
+   up_read(>reset_sem);
return r;
 }
 
 void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
 {
+   s

[PATCH v2] drm/amdgpu: fix system hang issue during GPU reset

2020-07-08 Thread Dennis Li
During GPU reset, the driver should hold off all external access to the
GPU; otherwise PSP will randomly fail to do POST, which then causes a
system hang.

v2:
1. add rwlock for some ioctls, debugfs and file-close function.
2. change to use dqm->is_resetting and dqm_lock for protection in kfd
driver.
3. remove try_lock and change adev->in_gpu_reset to atomic, to avoid
re-entering GPU recovery for the same GPU hang.

Signed-off-by: Dennis Li 
Change-Id: I7f77a72795462587ed7d5f51fe53a594a0f1f708

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 80f32b3beb88..f235492799d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -963,9 +963,9 @@ struct amdgpu_device {
boolin_suspend;
boolin_hibernate;
 
-   boolin_gpu_reset;
+   atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct mutex  lock_reset;
+   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..af71d8e93081 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (atomic_read(>in_gpu_reset))
return -EIO;
 
 #if 0
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index 0b7e78748540..750a8308c868 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long flags, end_jiffies;
int retry;
 
-   if (adev->in_gpu_reset)
+   if (atomic_read(>in_gpu_reset))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index ccd635b812b5..027793e0c1ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
int retry;
struct vi_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (atomic_read(>in_gpu_reset))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index df841c2ac5e7..e4a77f7a4c2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -541,7 +541,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v9_mqd *m = get_mqd(mqd);
 
-   if (adev->in_gpu_reset)
+   if (atomic_read(>in_gpu_reset))
return -EIO;
 
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index ffbcaf4bfb8b..a94b3f862fc2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1292,6 +1292,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
parser.adev = adev;
parser.filp = filp;
 
+   down_read(>reset_sem);
+
r = amdgpu_cs_parser_init(, data);
if (r) {
DRM_ERROR("Failed to initialize parser %d!\n", r);
@@ -1331,6 +1333,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
 out:
amdgpu_cs_parser_fini(, r, reserved_buffers);
 
+   up_read(>reset_sem);
+
return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 8842c55d4490..d85d13f7a043 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -358,6 +358,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
if (atomic_read(>guilty))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
 
+   down_read(>reset_sem);
+
/*query ue count*/
ras_counter = amdgpu_ras_query_error_count(adev, false);
/*ras counter is monotonic increasing*/
@@ -373,6 +375,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
ctx->ras_counter_ce = ras_counter;
}
 
+   up_read(>reset_sem);
+
m

[PATCH] drm/amdgpu: fix system hang issue during GPU reset

2020-07-06 Thread Dennis Li
During GPU reset, the driver should hold off all external access to the
GPU; otherwise PSP will randomly fail to do POST, which then causes a
system hang.

Signed-off-by: Dennis Li 
Change-Id: I7d5d41f9c4198b917d7b49606ba3850988e5b936

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6c7dd0a707c9..34bfc2a147ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -965,7 +965,7 @@ struct amdgpu_device {
 
boolin_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct mutex  lock_reset;
+   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index ad59ac4423b8..4139c81389a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -611,7 +611,9 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
kgd_engine_type engine,
/* This works for NO_HWS. TODO: need to handle without knowing VMID */
job->vmid = vmid;
 
+   down_read(>reset_sem);
ret = amdgpu_ib_schedule(ring, 1, ib, job, );
+   up_read(>reset_sem);
if (ret) {
DRM_ERROR("amdgpu: failed to schedule IB.\n");
goto err_ib_sched;
@@ -649,6 +651,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, 
uint16_t vmid)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
 
+   down_read(>reset_sem);
+
if (adev->family == AMDGPU_FAMILY_AI) {
int i;
 
@@ -658,6 +662,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, 
uint16_t vmid)
amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
}
 
+   up_read(>reset_sem);
+
return 0;
 }
 
@@ -666,11 +672,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev 
*kgd, uint16_t pasid)
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
const uint32_t flush_type = 0;
bool all_hub = false;
+   int ret = 0;
 
if (adev->family == AMDGPU_FAMILY_AI)
all_hub = true;
 
-   return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
+   down_read(>reset_sem);
+
+   ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
+
+   up_read(>reset_sem);
+
+   return ret;
 }
 
 bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..db5d533dd406 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -542,6 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long end_jiffies;
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);
+   int ret = 0;
 
if (adev->in_gpu_reset)
return -EIO;
@@ -551,6 +552,8 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
int retry;
 #endif
 
+   down_read(>reset_sem);
+
acquire_queue(kgd, pipe_id, queue_id);
 
if (m->cp_hqd_vmid == 0)
@@ -633,14 +636,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
break;
if (time_after(jiffies, end_jiffies)) {
pr_err("cp queue preemption time out.\n");
-   release_queue(kgd);
-   return -ETIME;
+   ret = -ETIME;
+   goto pro_end;
}
usleep_range(500, 1000);
}
 
+pro_end:
release_queue(kgd);
-   return 0;
+   up_read(>reset_sem);
+   return ret;
 }
 
 static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index 0b7e78748540..cf27fe5091aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -424,10 +424,13 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
enum hqd_dequeue_request_type type;
unsigned long flags, end_jiffies;
int retry;
+   int ret = 0;
 
if (adev->in_gpu_reset)
return -EIO;
 
+   down_read(>reset_sem);
+
acquire_queue(kgd, pipe_id, queue_id);
WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0);
 
@@ -506,14 +509,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
break;
if (time_after(jiffies, end_jiffies)) {
pr_err("cp queue preemption time out\n");
- 

[PATCH] drm/amdkfd: change to return status when flush tlb

2020-07-06 Thread Dennis Li
If the GPU hangs, the driver will fail to flush the TLB. Return the hang
error to callers so that they have a chance to handle the error.

Signed-off-by: Dennis Li 
Change-Id: Ie305ad0a77675f6eab7d5b8f68e279b7f4e7a8b9

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e9b96ad3d9a5..18e243183b5e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1488,7 +1488,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
peer_pdd = kfd_get_process_device_data(peer, p);
if (WARN_ON_ONCE(!peer_pdd))
continue;
-   kfd_flush_tlb(peer_pdd);
+   err = kfd_flush_tlb(peer_pdd);
}
 
kfree(devices_arr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 21eb0998c4ae..d636cbf7d32f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -263,6 +263,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
struct queue *q)
 {
int allocated_vmid = -1, i;
+   int ret = 0;
 
for (i = dqm->dev->vm_info.first_vmid_kfd;
i <= dqm->dev->vm_info.last_vmid_kfd; i++) {
@@ -295,13 +296,26 @@ static int allocate_vmid(struct device_queue_manager *dqm,
qpd->vmid,
qpd->page_table_base);
/* invalidate the VM context after pasid and vmid mapping is set up */
-   kfd_flush_tlb(qpd_to_pdd(qpd));
+   ret = kfd_flush_tlb(qpd_to_pdd(qpd));
+   if (ret) {
+   pr_err("Failed to flush tlb\n");
+   goto pro_failed;
+   }
 
if (dqm->dev->kfd2kgd->set_scratch_backing_va)
dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->kgd,
qpd->sh_hidden_private_base, qpd->vmid);
 
return 0;
+
+pro_failed:
+   /* Release the vmid mapping */
+   set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
+   dqm->vmid_pasid[qpd->vmid] = 0;
+
+   qpd->vmid = 0;
+   q->properties.vmid = 0;
+   return ret;
 }
 
 static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
@@ -326,12 +340,17 @@ static void deallocate_vmid(struct device_queue_manager 
*dqm,
struct qcm_process_device *qpd,
struct queue *q)
 {
+   int ret = 0;
+
/* On GFX v7, CP doesn't flush TC at dequeue */
if (q->device->device_info->asic_family == CHIP_HAWAII)
if (flush_texture_cache_nocpsch(q->device, qpd))
pr_err("Failed to flush TC\n");
 
-   kfd_flush_tlb(qpd_to_pdd(qpd));
+   ret = kfd_flush_tlb(qpd_to_pdd(qpd));
+   if (ret) {
+   pr_err("Failed to flush tlb\n");
+   }
 
/* Release the vmid mapping */
set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
@@ -795,7 +814,9 @@ static int restore_process_queues_nocpsch(struct 
device_queue_manager *dqm,
dqm->dev->kgd,
qpd->vmid,
qpd->page_table_base);
-   kfd_flush_tlb(pdd);
+   ret = kfd_flush_tlb(pdd);
+   if (ret)
+   goto out;
}
 
/* Take a safe reference to the mm_struct, which may otherwise
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 51ba2020732e..31ea72946d06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1081,7 +1081,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, 
unsigned int pasid,
 
 void kfd_signal_reset_event(struct kfd_dev *dev);
 
-void kfd_flush_tlb(struct kfd_process_device *pdd);
+int kfd_flush_tlb(struct kfd_process_device *pdd);
 
 int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 8616a204e4c3..3919cc88813c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1444,21 +1444,24 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct 
kfd_process *process,
   KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
 }
 
-void kfd_flush_tlb(struct kfd_process_device *pdd)
+int kfd_flush_tlb(struct kfd_process_device *pdd)
 {
struct kfd_dev *dev = pdd->dev;
+   int ret = 0;
 
if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
/* Nothing to flush until a VMID is assigned, which
 * only happens when the first que

[PATCH v2] drm/amdgpu: set error query ready after all IPs late init

2020-04-21 Thread Dennis Li
If error query ready is set in amdgpu_ras_late_init, some IP blocks
may not be initialized yet even though their error query is already
marked as ready.

v2: change the prefix of title to "drm/amdgpu" and remove
the unnecessary "{}".

Change-Id: I5087527261cb1b462afd82ad7592cf1ef73b15bd
Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 423eed223aa5..e37e0982cd46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2218,6 +2218,8 @@ static int amdgpu_device_ip_late_init(struct 
amdgpu_device *adev)
adev->ip_blocks[i].status.late_initialized = true;
}
 
+   amdgpu_ras_set_error_query_ready(adev, true);
+
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 68b82f7b0b80..8b14aee370c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1921,10 +1921,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
}
 
/* in resume phase, no need to create ras fs node */
-   if (adev->in_suspend || adev->in_gpu_reset) {
-   amdgpu_ras_set_error_query_ready(adev, true);
+   if (adev->in_suspend || adev->in_gpu_reset)
return 0;
-   }
 
if (ih_info->cb) {
r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
@@ -1936,8 +1934,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
if (r)
goto sysfs;
 
-   amdgpu_ras_set_error_query_ready(adev, true);
-
return 0;
 cleanup:
amdgpu_ras_sysfs_remove(adev, ras_block);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amd/amdgpu: set error query ready after all IPs late init

2020-04-21 Thread Dennis Li
If error query ready is set in amdgpu_ras_late_init, some IP blocks
may not be initialized yet even though their error query is already
marked as ready.

Change-Id: I5087527261cb1b462afd82ad7592cf1ef73b15bd
Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
old mode 100644
new mode 100755
index 423eed223aa5..e37e0982cd46
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2218,6 +2218,8 @@ static int amdgpu_device_ip_late_init(struct 
amdgpu_device *adev)
adev->ip_blocks[i].status.late_initialized = true;
}
 
+   amdgpu_ras_set_error_query_ready(adev, true);
+
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 68b82f7b0b80..060866d372a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1922,7 +1922,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 
/* in resume phase, no need to create ras fs node */
if (adev->in_suspend || adev->in_gpu_reset) {
-   amdgpu_ras_set_error_query_ready(adev, true);
return 0;
}
 
@@ -1936,8 +1935,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
if (r)
goto sysfs;
 
-   amdgpu_ras_set_error_query_ready(adev, true);
-
return 0;
 cleanup:
amdgpu_ras_sysfs_remove(adev, ras_block);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: replace DRM prefix with PCI device info for gfx/mmhub

2020-04-17 Thread Dennis Li
Prefix RAS message printing in gfx/mmhub with PCI device info,
which assists the debug in multiple GPU case.

Change-Id: Iceba7cafd5aac7d0251d9f871503745cc617fba2
Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
old mode 100644
new mode 100755
index dce945ef21a5..46351db36922
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
@@ -732,7 +732,8 @@ static int gfx_v9_4_query_utc_edc_status(struct 
amdgpu_device *adev,
sec_count = REG_GET_FIELD(data, VML2_WALKER_MEM_ECC_CNTL,
  SEC_COUNT);
if (sec_count) {
-   DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i,
+   dev_info(adev->dev,
+"Instance[%d]: SubBlock %s, SEC %d\n", i,
 vml2_walker_mems[i], sec_count);
err_data->ce_count += sec_count;
}
@@ -740,7 +741,8 @@ static int gfx_v9_4_query_utc_edc_status(struct 
amdgpu_device *adev,
ded_count = REG_GET_FIELD(data, VML2_WALKER_MEM_ECC_CNTL,
  DED_COUNT);
if (ded_count) {
-   DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i,
+   dev_info(adev->dev,
+"Instance[%d]: SubBlock %s, DED %d\n", i,
 vml2_walker_mems[i], ded_count);
err_data->ue_count += ded_count;
}
@@ -752,14 +754,16 @@ static int gfx_v9_4_query_utc_edc_status(struct 
amdgpu_device *adev,
 
sec_count = REG_GET_FIELD(data, UTCL2_MEM_ECC_CNTL, SEC_COUNT);
if (sec_count) {
-   DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i,
+   dev_info(adev->dev,
+"Instance[%d]: SubBlock %s, SEC %d\n", i,
 utcl2_router_mems[i], sec_count);
err_data->ce_count += sec_count;
}
 
ded_count = REG_GET_FIELD(data, UTCL2_MEM_ECC_CNTL, DED_COUNT);
if (ded_count) {
-   DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i,
+   dev_info(adev->dev,
+"Instance[%d]: SubBlock %s, DED %d\n", i,
 utcl2_router_mems[i], ded_count);
err_data->ue_count += ded_count;
}
@@ -772,7 +776,8 @@ static int gfx_v9_4_query_utc_edc_status(struct 
amdgpu_device *adev,
sec_count = REG_GET_FIELD(data, ATC_L2_CACHE_2M_DSM_CNTL,
  SEC_COUNT);
if (sec_count) {
-   DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i,
+   dev_info(adev->dev,
+"Instance[%d]: SubBlock %s, SEC %d\n", i,
 atc_l2_cache_2m_mems[i], sec_count);
err_data->ce_count += sec_count;
}
@@ -780,7 +785,8 @@ static int gfx_v9_4_query_utc_edc_status(struct 
amdgpu_device *adev,
ded_count = REG_GET_FIELD(data, ATC_L2_CACHE_2M_DSM_CNTL,
  DED_COUNT);
if (ded_count) {
-   DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i,
+   dev_info(adev->dev,
+"Instance[%d]: SubBlock %s, DED %d\n", i,
 atc_l2_cache_2m_mems[i], ded_count);
err_data->ue_count += ded_count;
}
@@ -793,7 +799,8 @@ static int gfx_v9_4_query_utc_edc_status(struct 
amdgpu_device *adev,
sec_count = REG_GET_FIELD(data, ATC_L2_CACHE_4K_DSM_CNTL,
  SEC_COUNT);
if (sec_count) {
-   DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i,
+   dev_info(adev->dev,
+"Instance[%d]: SubBlock %s, SEC %d\n", i,
 atc_l2_cache_4k_mems[i], sec_count);
err_data->ce_count += sec_count;
}
@@ -801,7 +808,8 @@ static int gfx_v9_4_query_utc_edc_status(struct 
amdgpu_device *adev,
ded_count = REG_GET_FIELD(data, ATC_L2_CACHE_4K_DSM_CNTL,
  DED_COUNT);
if (ded_count) {
-   DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i,
+   dev_info(adev->dev,
+"Instance[%d]: Su

[PATCH] drm/amdgpu: fix the coverage issue to clear AccVGPRs

2020-03-22 Thread Dennis Li
Set ComputePGMRSRC1.VGPRS to 0x3f to clear all AccVGPRs.

Change-Id: I296c3a162c0d5c7b84d4b48dc2002340a5c22e2a
Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
old mode 100644
new mode 100755
index 324838baa71c..44fb64460c1f
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4264,7 +4264,7 @@ static const struct soc15_reg_entry 
vgpr_init_regs_arcturus[] = {
{ SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_X), 0x40 },
{ SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Y), 4 },
{ SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Z), 1 },
-   { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0x81 },
+   { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0xbf },
{ SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC2), 0x40 },  /* 64KB LDS */
{ SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE0), 0x },
{ SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE1), 0x },
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH v2] drm/amdgpu: add codes to clear AccVGPR for arcturus

2020-03-12 Thread Dennis Li
AccVGPRs are newly added in Arcturus. Before reading these
registers, they should be initialized; otherwise an EDC error
occurs when RAS is enabled.

v2: reuse the existing logical to calculate register size

Change-Id: I4ed384f0cc4b781a10cfd6ad1e3a132445bdc261
Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
old mode 100644
new mode 100755
index c78ffdc51373..324838baa71c
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4144,6 +4144,101 @@ static const u32 sgpr_init_compute_shader[] =
0xbe800080, 0xbf81,
 };
 
+static const u32 vgpr_init_compute_shader_arcturus[] = {
+   0xd3d94000, 0x1880, 0xd3d94001, 0x1880, 0xd3d94002, 0x1880,
+   0xd3d94003, 0x1880, 0xd3d94004, 0x1880, 0xd3d94005, 0x1880,
+   0xd3d94006, 0x1880, 0xd3d94007, 0x1880, 0xd3d94008, 0x1880,
+   0xd3d94009, 0x1880, 0xd3d9400a, 0x1880, 0xd3d9400b, 0x1880,
+   0xd3d9400c, 0x1880, 0xd3d9400d, 0x1880, 0xd3d9400e, 0x1880,
+   0xd3d9400f, 0x1880, 0xd3d94010, 0x1880, 0xd3d94011, 0x1880,
+   0xd3d94012, 0x1880, 0xd3d94013, 0x1880, 0xd3d94014, 0x1880,
+   0xd3d94015, 0x1880, 0xd3d94016, 0x1880, 0xd3d94017, 0x1880,
+   0xd3d94018, 0x1880, 0xd3d94019, 0x1880, 0xd3d9401a, 0x1880,
+   0xd3d9401b, 0x1880, 0xd3d9401c, 0x1880, 0xd3d9401d, 0x1880,
+   0xd3d9401e, 0x1880, 0xd3d9401f, 0x1880, 0xd3d94020, 0x1880,
+   0xd3d94021, 0x1880, 0xd3d94022, 0x1880, 0xd3d94023, 0x1880,
+   0xd3d94024, 0x1880, 0xd3d94025, 0x1880, 0xd3d94026, 0x1880,
+   0xd3d94027, 0x1880, 0xd3d94028, 0x1880, 0xd3d94029, 0x1880,
+   0xd3d9402a, 0x1880, 0xd3d9402b, 0x1880, 0xd3d9402c, 0x1880,
+   0xd3d9402d, 0x1880, 0xd3d9402e, 0x1880, 0xd3d9402f, 0x1880,
+   0xd3d94030, 0x1880, 0xd3d94031, 0x1880, 0xd3d94032, 0x1880,
+   0xd3d94033, 0x1880, 0xd3d94034, 0x1880, 0xd3d94035, 0x1880,
+   0xd3d94036, 0x1880, 0xd3d94037, 0x1880, 0xd3d94038, 0x1880,
+   0xd3d94039, 0x1880, 0xd3d9403a, 0x1880, 0xd3d9403b, 0x1880,
+   0xd3d9403c, 0x1880, 0xd3d9403d, 0x1880, 0xd3d9403e, 0x1880,
+   0xd3d9403f, 0x1880, 0xd3d94040, 0x1880, 0xd3d94041, 0x1880,
+   0xd3d94042, 0x1880, 0xd3d94043, 0x1880, 0xd3d94044, 0x1880,
+   0xd3d94045, 0x1880, 0xd3d94046, 0x1880, 0xd3d94047, 0x1880,
+   0xd3d94048, 0x1880, 0xd3d94049, 0x1880, 0xd3d9404a, 0x1880,
+   0xd3d9404b, 0x1880, 0xd3d9404c, 0x1880, 0xd3d9404d, 0x1880,
+   0xd3d9404e, 0x1880, 0xd3d9404f, 0x1880, 0xd3d94050, 0x1880,
+   0xd3d94051, 0x1880, 0xd3d94052, 0x1880, 0xd3d94053, 0x1880,
+   0xd3d94054, 0x1880, 0xd3d94055, 0x1880, 0xd3d94056, 0x1880,
+   0xd3d94057, 0x1880, 0xd3d94058, 0x1880, 0xd3d94059, 0x1880,
+   0xd3d9405a, 0x1880, 0xd3d9405b, 0x1880, 0xd3d9405c, 0x1880,
+   0xd3d9405d, 0x1880, 0xd3d9405e, 0x1880, 0xd3d9405f, 0x1880,
+   0xd3d94060, 0x1880, 0xd3d94061, 0x1880, 0xd3d94062, 0x1880,
+   0xd3d94063, 0x1880, 0xd3d94064, 0x1880, 0xd3d94065, 0x1880,
+   0xd3d94066, 0x1880, 0xd3d94067, 0x1880, 0xd3d94068, 0x1880,
+   0xd3d94069, 0x1880, 0xd3d9406a, 0x1880, 0xd3d9406b, 0x1880,
+   0xd3d9406c, 0x1880, 0xd3d9406d, 0x1880, 0xd3d9406e, 0x1880,
+   0xd3d9406f, 0x1880, 0xd3d94070, 0x1880, 0xd3d94071, 0x1880,
+   0xd3d94072, 0x1880, 0xd3d94073, 0x1880, 0xd3d94074, 0x1880,
+   0xd3d94075, 0x1880, 0xd3d94076, 0x1880, 0xd3d94077, 0x1880,
+   0xd3d94078, 0x1880, 0xd3d94079, 0x1880, 0xd3d9407a, 0x1880,
+   0xd3d9407b, 0x1880, 0xd3d9407c, 0x1880, 0xd3d9407d, 0x1880,
+   0xd3d9407e, 0x1880, 0xd3d9407f, 0x1880, 0xd3d94080, 0x1880,
+   0xd3d94081, 0x1880, 0xd3d94082, 0x1880, 0xd3d94083, 0x1880,
+   0xd3d94084, 0x1880, 0xd3d94085, 0x1880, 0xd3d94086, 0x1880,
+   0xd3d94087, 0x1880, 0xd3d94088, 0x1880, 0xd3d94089, 0x1880,
+   0xd3d9408a, 0x1880, 0xd3d9408b, 0x1880, 0xd3d9408c, 0x1880,
+   0xd3d9408d, 0x1880, 0xd3d9408e, 0x1880, 0xd3d9408f, 0x1880,
+   0xd3d94090, 0x1880, 0xd3d94091, 0x1880, 0xd3d94092, 0x1880,
+   0xd3d94093, 0x1880, 0xd3d94094, 0x1880, 0xd3d94095, 0x1880,
+   0xd3d94096, 0x1880, 0xd3d94097, 0x1880, 0xd3d94098, 0x1880,
+   0xd3d94099, 0x1880, 0xd3d9409a, 0x1880, 0xd3d9409b, 0x1880,
+   0xd3d9409c, 0x1880, 0xd3d9409d, 0x1880, 0xd3d9409e, 0x1880,
+   0xd3d9409f, 0x1880, 0xd3d940a0, 0x1880

[PATCH] drm/amdgpu: add codes to clear AccVGPR for arcturus

2020-03-12 Thread Dennis Li
AccVGPRs are newly added in Arcturus. Before reading these
registers, they should be initialized; otherwise an EDC error
occurs when RAS is enabled.

Change-Id: I4ed384f0cc4b781a10cfd6ad1e3a132445bdc261
Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
old mode 100644
new mode 100755
index c78ffdc51373..d5dd754bfb85
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4144,6 +4144,101 @@ static const u32 sgpr_init_compute_shader[] =
0xbe800080, 0xbf81,
 };
 
+static const u32 vgpr_init_compute_shader_arcturus[] = {
+   0xd3d94000, 0x1880, 0xd3d94001, 0x1880, 0xd3d94002, 0x1880,
+   0xd3d94003, 0x1880, 0xd3d94004, 0x1880, 0xd3d94005, 0x1880,
+   0xd3d94006, 0x1880, 0xd3d94007, 0x1880, 0xd3d94008, 0x1880,
+   0xd3d94009, 0x1880, 0xd3d9400a, 0x1880, 0xd3d9400b, 0x1880,
+   0xd3d9400c, 0x1880, 0xd3d9400d, 0x1880, 0xd3d9400e, 0x1880,
+   0xd3d9400f, 0x1880, 0xd3d94010, 0x1880, 0xd3d94011, 0x1880,
+   0xd3d94012, 0x1880, 0xd3d94013, 0x1880, 0xd3d94014, 0x1880,
+   0xd3d94015, 0x1880, 0xd3d94016, 0x1880, 0xd3d94017, 0x1880,
+   0xd3d94018, 0x1880, 0xd3d94019, 0x1880, 0xd3d9401a, 0x1880,
+   0xd3d9401b, 0x1880, 0xd3d9401c, 0x1880, 0xd3d9401d, 0x1880,
+   0xd3d9401e, 0x1880, 0xd3d9401f, 0x1880, 0xd3d94020, 0x1880,
+   0xd3d94021, 0x1880, 0xd3d94022, 0x1880, 0xd3d94023, 0x1880,
+   0xd3d94024, 0x1880, 0xd3d94025, 0x1880, 0xd3d94026, 0x1880,
+   0xd3d94027, 0x1880, 0xd3d94028, 0x1880, 0xd3d94029, 0x1880,
+   0xd3d9402a, 0x1880, 0xd3d9402b, 0x1880, 0xd3d9402c, 0x1880,
+   0xd3d9402d, 0x1880, 0xd3d9402e, 0x1880, 0xd3d9402f, 0x1880,
+   0xd3d94030, 0x1880, 0xd3d94031, 0x1880, 0xd3d94032, 0x1880,
+   0xd3d94033, 0x1880, 0xd3d94034, 0x1880, 0xd3d94035, 0x1880,
+   0xd3d94036, 0x1880, 0xd3d94037, 0x1880, 0xd3d94038, 0x1880,
+   0xd3d94039, 0x1880, 0xd3d9403a, 0x1880, 0xd3d9403b, 0x1880,
+   0xd3d9403c, 0x1880, 0xd3d9403d, 0x1880, 0xd3d9403e, 0x1880,
+   0xd3d9403f, 0x1880, 0xd3d94040, 0x1880, 0xd3d94041, 0x1880,
+   0xd3d94042, 0x1880, 0xd3d94043, 0x1880, 0xd3d94044, 0x1880,
+   0xd3d94045, 0x1880, 0xd3d94046, 0x1880, 0xd3d94047, 0x1880,
+   0xd3d94048, 0x1880, 0xd3d94049, 0x1880, 0xd3d9404a, 0x1880,
+   0xd3d9404b, 0x1880, 0xd3d9404c, 0x1880, 0xd3d9404d, 0x1880,
+   0xd3d9404e, 0x1880, 0xd3d9404f, 0x1880, 0xd3d94050, 0x1880,
+   0xd3d94051, 0x1880, 0xd3d94052, 0x1880, 0xd3d94053, 0x1880,
+   0xd3d94054, 0x1880, 0xd3d94055, 0x1880, 0xd3d94056, 0x1880,
+   0xd3d94057, 0x1880, 0xd3d94058, 0x1880, 0xd3d94059, 0x1880,
+   0xd3d9405a, 0x1880, 0xd3d9405b, 0x1880, 0xd3d9405c, 0x1880,
+   0xd3d9405d, 0x1880, 0xd3d9405e, 0x1880, 0xd3d9405f, 0x1880,
+   0xd3d94060, 0x1880, 0xd3d94061, 0x1880, 0xd3d94062, 0x1880,
+   0xd3d94063, 0x1880, 0xd3d94064, 0x1880, 0xd3d94065, 0x1880,
+   0xd3d94066, 0x1880, 0xd3d94067, 0x1880, 0xd3d94068, 0x1880,
+   0xd3d94069, 0x1880, 0xd3d9406a, 0x1880, 0xd3d9406b, 0x1880,
+   0xd3d9406c, 0x1880, 0xd3d9406d, 0x1880, 0xd3d9406e, 0x1880,
+   0xd3d9406f, 0x1880, 0xd3d94070, 0x1880, 0xd3d94071, 0x1880,
+   0xd3d94072, 0x1880, 0xd3d94073, 0x1880, 0xd3d94074, 0x1880,
+   0xd3d94075, 0x1880, 0xd3d94076, 0x1880, 0xd3d94077, 0x1880,
+   0xd3d94078, 0x1880, 0xd3d94079, 0x1880, 0xd3d9407a, 0x1880,
+   0xd3d9407b, 0x1880, 0xd3d9407c, 0x1880, 0xd3d9407d, 0x1880,
+   0xd3d9407e, 0x1880, 0xd3d9407f, 0x1880, 0xd3d94080, 0x1880,
+   0xd3d94081, 0x1880, 0xd3d94082, 0x1880, 0xd3d94083, 0x1880,
+   0xd3d94084, 0x1880, 0xd3d94085, 0x1880, 0xd3d94086, 0x1880,
+   0xd3d94087, 0x1880, 0xd3d94088, 0x1880, 0xd3d94089, 0x1880,
+   0xd3d9408a, 0x1880, 0xd3d9408b, 0x1880, 0xd3d9408c, 0x1880,
+   0xd3d9408d, 0x1880, 0xd3d9408e, 0x1880, 0xd3d9408f, 0x1880,
+   0xd3d94090, 0x1880, 0xd3d94091, 0x1880, 0xd3d94092, 0x1880,
+   0xd3d94093, 0x1880, 0xd3d94094, 0x1880, 0xd3d94095, 0x1880,
+   0xd3d94096, 0x1880, 0xd3d94097, 0x1880, 0xd3d94098, 0x1880,
+   0xd3d94099, 0x1880, 0xd3d9409a, 0x1880, 0xd3d9409b, 0x1880,
+   0xd3d9409c, 0x1880, 0xd3d9409d, 0x1880, 0xd3d9409e, 0x1880,
+   0xd3d9409f, 0x1880, 0xd3d940a0, 0x1880, 0xd3d940a1, 0x1880,
+   0xd3d940a2, 0x1880

[PATCH] drm/amdgpu: fix a bug NULL pointer dereference

2020-02-18 Thread Dennis Li
Check whether the entity's queue is NULL to avoid a NULL
pointer dereference.

Change-Id: I08d56774012cf229ba2fe7a011c1359e8d1e2781
Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
index 4cc7881f438c..67cca463ddcc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
@@ -95,6 +95,9 @@ static int amdgpu_vm_sdma_commit(struct 
amdgpu_vm_update_params *p,
int r;
 
entity = p->direct ? >vm->direct : >vm->delayed;
+   if (!entity->rq)
+   return 0;
+
ring = container_of(entity->rq->sched, struct amdgpu_ring, sched);
 
WARN_ON(ib->length_dw == 0);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 0/2] query edc counter for more mmhub sub-blocks of Arcturus

2020-01-18 Thread Dennis Li
1. Add RAS support for MAM D(0~3)_MEM in mmhub.
2. Add RAS support for other mmhub ranges from 2 to 7.

Dennis Li (2):
drm/amdgpu: update mmhub 9.4.1 header files for Arcturus
  drm/amdgpu: enable RAS feature for more mmhub sub-blocks of Arcturus

 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c   | 701 +-
 .../asic_reg/mmhub/mmhub_9_4_1_sh_mask.h  | 128 
 2 files changed, 823 insertions(+), 6 deletions(-)

-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/2] drm/amdgpu: update mmhub 9.4.1 header files for Arcturus

2020-01-18 Thread Dennis Li
Add mask & shift definition of MAM_D(0~3)MEM for all mmhub
ranges.

Change-Id: I65c8a3040611198273a4b6da77c1a1ad2ffe7fd3
Signed-off-by: Dennis Li 
---
 .../asic_reg/mmhub/mmhub_9_4_1_sh_mask.h  | 128 ++
 1 file changed, 128 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_1_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_1_sh_mask.h
index 40dfbf16bd34..111a71b434e2 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_1_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_1_sh_mask.h
@@ -11185,6 +11185,14 @@
 #define MMEA0_EDC_CNT2__GMIWR_DATAMEM_DED_COUNT__SHIFT 
   0xa
 #define MMEA0_EDC_CNT2__GMIRD_PAGEMEM_SED_COUNT__SHIFT 
   0xc
 #define MMEA0_EDC_CNT2__GMIWR_PAGEMEM_SED_COUNT__SHIFT 
   0xe
+#define MMEA0_EDC_CNT2__MAM_D0MEM_SED_COUNT__SHIFT 
   0x10
+#define MMEA0_EDC_CNT2__MAM_D1MEM_SED_COUNT__SHIFT 
   0x12
+#define MMEA0_EDC_CNT2__MAM_D2MEM_SED_COUNT__SHIFT 
   0x14
+#define MMEA0_EDC_CNT2__MAM_D3MEM_SED_COUNT__SHIFT 
   0x16
+#define MMEA0_EDC_CNT2__MAM_D0MEM_DED_COUNT__SHIFT 
   0x18
+#define MMEA0_EDC_CNT2__MAM_D1MEM_DED_COUNT__SHIFT 
   0x1a
+#define MMEA0_EDC_CNT2__MAM_D2MEM_DED_COUNT__SHIFT 
   0x1c
+#define MMEA0_EDC_CNT2__MAM_D3MEM_DED_COUNT__SHIFT 
   0x1e
 #define MMEA0_EDC_CNT2__GMIRD_CMDMEM_SEC_COUNT_MASK
   0x0003L
 #define MMEA0_EDC_CNT2__GMIRD_CMDMEM_DED_COUNT_MASK
   0x000CL
 #define MMEA0_EDC_CNT2__GMIWR_CMDMEM_SEC_COUNT_MASK
   0x0030L
@@ -11193,6 +11201,14 @@
 #define MMEA0_EDC_CNT2__GMIWR_DATAMEM_DED_COUNT_MASK   
   0x0C00L
 #define MMEA0_EDC_CNT2__GMIRD_PAGEMEM_SED_COUNT_MASK   
   0x3000L
 #define MMEA0_EDC_CNT2__GMIWR_PAGEMEM_SED_COUNT_MASK   
   0xC000L
+#define MMEA0_EDC_CNT2__MAM_D0MEM_SED_COUNT_MASK   
   0x0003L
+#define MMEA0_EDC_CNT2__MAM_D1MEM_SED_COUNT_MASK   
   0x000CL
+#define MMEA0_EDC_CNT2__MAM_D2MEM_SED_COUNT_MASK   
   0x0030L
+#define MMEA0_EDC_CNT2__MAM_D3MEM_SED_COUNT_MASK   
   0x00C0L
+#define MMEA0_EDC_CNT2__MAM_D0MEM_DED_COUNT_MASK   
   0x0300L
+#define MMEA0_EDC_CNT2__MAM_D1MEM_DED_COUNT_MASK   
   0x0C00L
+#define MMEA0_EDC_CNT2__MAM_D2MEM_DED_COUNT_MASK   
   0x3000L
+#define MMEA0_EDC_CNT2__MAM_D3MEM_DED_COUNT_MASK   
   0xC000L
 //MMEA0_DSM_CNTL
 #define MMEA0_DSM_CNTL__DRAMRD_CMDMEM_DSM_IRRITATOR_DATA__SHIFT
   0x0
 #define MMEA0_DSM_CNTL__DRAMRD_CMDMEM_ENABLE_SINGLE_WRITE__SHIFT   
   0x2
@@ -14197,6 +14213,14 @@
 #define MMEA1_EDC_CNT2__GMIWR_DATAMEM_DED_COUNT__SHIFT 
   0xa
 #define MMEA1_EDC_CNT2__GMIRD_PAGEMEM_SED_COUNT__SHIFT 
   0xc
 #define MMEA1_EDC_CNT2__GMIWR_PAGEMEM_SED_COUNT__SHIFT 
   0xe
+#define MMEA1_EDC_CNT2__MAM_D0MEM_SED_COUNT__SHIFT 
   0x10
+#define MMEA1_EDC_CNT2__MAM_D1MEM_SED_COUNT__SHIFT 
   0x12
+#define MMEA1_EDC_CNT2__MAM_D2MEM_SED_COUNT__SHIFT 
   0x14
+#define MMEA1_EDC_CNT2__MAM_D3MEM_SED_COUNT__SHIFT 
   0x16
+#define MMEA1_EDC_CNT2__MAM_D0MEM_DED_COUNT__SHIFT 
   0x18
+#define MMEA1_EDC_CNT2__MAM_D1MEM_DED_COUNT__SHIFT 
   0x1a
+#de

[PATCH 2/2] drm/amdgpu: enable RAS feature for more mmhub sub-blocks of Acrturus

2020-01-18 Thread Dennis Li
Compared with Vg20, the number of mmhub ranges is changed from 2 to 8.

Change-Id: I529c0ff0aaed200e5b102d482563ed9dc2278260
Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 701 +++-
 1 file changed, 695 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 5c42387c9274..e8bb0a91ed65 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -663,6 +663,7 @@ void mmhub_v9_4_get_clockgating(struct amdgpu_device *adev, 
u32 *flags)
 }
 
 static const struct soc15_ras_field_entry mmhub_v9_4_ras_fields[] = {
+   /* MMHUB Range 0 */
{ "MMEA0_DRAMRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMRD_CMDMEM_SEC_COUNT),
SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMRD_CMDMEM_DED_COUNT),
@@ -751,6 +752,24 @@ static const struct soc15_ras_field_entry 
mmhub_v9_4_ras_fields[] = {
0, 0,
SOC15_REG_FIELD(MMEA0_EDC_CNT3, GMIWR_PAGEMEM_DED_COUNT),
},
+   { "MMEA0_MAM_D0MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D0MEM_SED_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D0MEM_DED_COUNT),
+   },
+   { "MMEA0_MAM_D1MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D1MEM_SED_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D1MEM_DED_COUNT),
+   },
+   { "MMEA0_MAM_D2MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D2MEM_SED_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D2MEM_DED_COUNT),
+   },
+   { "MMEA0_MAM_D3MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D3MEM_SED_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D3MEM_DED_COUNT),
+   },
+
+   /* MMHUB Range 1 */
{ "MMEA1_DRAMRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT),
SOC15_REG_FIELD(MMEA1_EDC_CNT, DRAMRD_CMDMEM_SEC_COUNT),
SOC15_REG_FIELD(MMEA1_EDC_CNT, DRAMRD_CMDMEM_DED_COUNT),
@@ -838,16 +857,686 @@ static const struct soc15_ras_field_entry 
mmhub_v9_4_ras_fields[] = {
{ "MMEA1_GMIWR_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT3),
0, 0,
SOC15_REG_FIELD(MMEA1_EDC_CNT3, GMIWR_PAGEMEM_DED_COUNT),
+   },
+   { "MMEA1_MAM_D0MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D0MEM_SED_COUNT),
+   SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D0MEM_DED_COUNT),
+   },
+   { "MMEA1_MAM_D1MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D1MEM_SED_COUNT),
+   SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D1MEM_DED_COUNT),
+   },
+   { "MMEA1_MAM_D2MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D2MEM_SED_COUNT),
+   SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D2MEM_DED_COUNT),
+   },
+   { "MMEA1_MAM_D3MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D3MEM_SED_COUNT),
+   SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D3MEM_DED_COUNT),
+   },
+
+   /* MMHUB Range 2*/
+   { "MMEA2_DRAMRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMRD_CMDMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMRD_CMDMEM_DED_COUNT),
+   },
+   { "MMEA2_DRAMWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMWR_CMDMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMWR_CMDMEM_DED_COUNT),
+   },
+   { "MMEA2_DRAMWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMWR_DATAMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMWR_DATAMEM_DED_COUNT),
+   },
+   { "MMEA2_RRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, RRET_TAGMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, RRET_TAGMEM_DED_COUNT),
+   },
+   { "MMEA2_WRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, WRET_TAGMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, WRET_TAGMEM_DED_COUNT),
+   },
+   { "MMEA2_DRAMRD_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMRD_PAGEMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA2_DRAMWR_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT),
+   SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMWR_PAGEMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA2_IORD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT),
+   SO

[PATCH 4/4] drm/amdgpu: add RAS support for the gfx block of Arcturus

2020-01-18 Thread Dennis Li
Implement functions to do the RAS error injection and
query EDC counter.

Change-Id: I4d947511331a19c1967551b9d42997698073f795
Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |   1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  26 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 978 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h |  35 +
 4 files changed, 1039 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 83ee1c676e3a..ccfdcfc6a526 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -120,6 +120,7 @@ amdgpu-y += \
amdgpu_rlc.o \
gfx_v8_0.o \
gfx_v9_0.o \
+   gfx_v9_4.o \
gfx_v10_0.o
 
 # add async DMA block
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 35b5ca7a9272..7c5b3ad25d51 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -48,6 +48,8 @@
 
 #include "amdgpu_ras.h"
 
+#include "gfx_v9_4.h"
+
 #define GFX9_NUM_GFX_RINGS 1
 #define GFX9_MEC_HPD_SIZE 4096
 #define RLCG_UCODE_LOADING_START_ADDRESS 0x2000L
@@ -1822,6 +1824,17 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs 
= {
.query_ras_error_count = _v9_0_query_ras_error_count
 };
 
+static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = {
+   .get_gpu_clock_counter = _v9_0_get_gpu_clock_counter,
+   .select_se_sh = _v9_0_select_se_sh,
+   .read_wave_data = _v9_0_read_wave_data,
+   .read_wave_sgprs = _v9_0_read_wave_sgprs,
+   .read_wave_vgprs = _v9_0_read_wave_vgprs,
+   .select_me_pipe_q = _v9_0_select_me_pipe_q,
+   .ras_error_inject = _v9_4_ras_error_inject,
+   .query_ras_error_count = _v9_4_query_ras_error_count
+};
+
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
 {
u32 gb_addr_config;
@@ -1873,6 +1886,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device 
*adev)
gb_addr_config = RAVEN_GB_ADDR_CONFIG_GOLDEN;
break;
case CHIP_ARCTURUS:
+   adev->gfx.funcs = _v9_4_gfx_funcs;
adev->gfx.config.max_hw_contexts = 8;
adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
@@ -4232,7 +4246,17 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct 
amdgpu_device *adev)
goto fail;
}
 
-   gfx_v9_0_clear_ras_edc_counter(adev);
+   switch (adev->asic_type)
+   {
+   case CHIP_VEGA20:
+   gfx_v9_0_clear_ras_edc_counter(adev);
+   break;
+   case CHIP_ARCTURUS:
+   gfx_v9_4_clear_ras_edc_counter(adev);
+   break;
+   default:
+   break;
+   }
 
 fail:
amdgpu_ib_free(adev, , NULL);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
new file mode 100644
index ..e19d275f3f7d
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
@@ -0,0 +1,978 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include 
+
+#include "amdgpu.h"
+#include "amdgpu_gfx.h"
+#include "soc15.h"
+#include "soc15d.h"
+#include "amdgpu_atomfirmware.h"
+#include "amdgpu_pm.h"
+
+#include "gc/gc_9_4_1_offset.h"
+#include "gc/gc_9_4_1_sh_mask.h"
+#include "soc15_common.h"
+
+#include "gfx_v9_4.h"
+#include "amdgpu_ras.h"
+
+static const struct soc15_reg_entry gfx_v9_4_edc_counter_regs[] = {
+   /* CPC */
+   { SOC15_R

[PATCH 0/4] Enable RAS feature for the gc of Arcturus

2020-01-18 Thread Dennis Li
Refactor the RAS-related code of vega20:
1. refine the security check for RAS functions.
2. abstract clearing edc counters to a separated function.
3. add ip prefix to ip related codes.

Implementation of RAS feature for Arcturus gfx:
1. add new register head files for gfx v9.4.1.
2. add codes to support querying of EDC counter and error injection.

Dennis Li (4):
  drm/amdgpu: refine the security check for RAS functions
  drm/amdgpu: abstract EDC counter clear to a separated function
  drm/amdgpu: add EDC counter registers of gc for Arcturus
  drm/amdgpu: add RAS support for the gfx block of Arcturus

 drivers/gpu/drm/amd/amdgpu/Makefile   |   1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 138 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 978 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h |  35 +
 .../amd/include/asic_reg/gc/gc_9_4_1_offset.h | 264 +
 .../include/asic_reg/gc/gc_9_4_1_sh_mask.h| 748 ++
 6 files changed, 2128 insertions(+), 36 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
 create mode 100644 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h
 create mode 100644 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h

-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/4] drm/amdgpu: refine the security check for RAS functions

2020-01-18 Thread Dennis Li
To avoid calling RAS-related functions when the RAS feature isn't
supported in hardware, change to check the supported features instead
of checking the asic type.

v2: reuse amdgpu_ras_is_supported function, instead of introducing
a new flag for hardware ras feature.

Change-Id: Ia3f73bd9ee41ee3d0dd18d6f46e67124cf88d653
Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index e3d466bd5c4e..759d8144f9c0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -5967,7 +5967,7 @@ static int gfx_v9_0_ras_error_inject(struct amdgpu_device 
*adev,
int ret;
struct ta_ras_trigger_error_input block_info = { 0 };
 
-   if (adev->asic_type != CHIP_VEGA20)
+   if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
return -EINVAL;
 
if (info->head.sub_block_index >= ARRAY_SIZE(ras_gfx_subblocks))
@@ -6218,7 +6218,7 @@ static int gfx_v9_0_query_ras_error_count(struct 
amdgpu_device *adev,
uint32_t i, j, k;
uint32_t reg_value;
 
-   if (adev->asic_type != CHIP_VEGA20)
+   if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
return -EINVAL;
 
err_data->ue_count = 0;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 3/4] drm/amdgpu: add EDC counter registers of gc for Arcturus

2020-01-18 Thread Dennis Li
add reg headers to gc includes

v2: remove unused registers and fields in this patch set

Change-Id: If3476c0b0ed88e5d11bdb8bec1278ae10fc5af25
Signed-off-by: Dennis Li 
---
 .../amd/include/asic_reg/gc/gc_9_4_1_offset.h | 264 +++
 .../include/asic_reg/gc/gc_9_4_1_sh_mask.h| 748 ++
 2 files changed, 1012 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h
 create mode 100644 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h

diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h
new file mode 100644
index ..f41556abfbbc
--- /dev/null
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2020  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _gc_9_4_1_OFFSET_HEADER
+#define _gc_9_4_1_OFFSET_HEADER
+
+// addressBlock: gc_grbmdec
+// base address: 0x8000
+#define mmGRBM_CNTL
0x
+#define mmGRBM_CNTL_BASE_IDX   
0
+#define mmGRBM_SKEW_CNTL   
0x0001
+#define mmGRBM_SKEW_CNTL_BASE_IDX  
0
+#define mmGRBM_STATUS2 
0x0002
+#define mmGRBM_STATUS2_BASE_IDX
0
+#define mmGRBM_PWR_CNTL
0x0003
+#define mmGRBM_PWR_CNTL_BASE_IDX   
0
+#define mmGRBM_STATUS  
0x0004
+#define mmGRBM_STATUS_BASE_IDX 
0
+#define mmGRBM_STATUS_SE0  
0x0005
+#define mmGRBM_STATUS_SE0_BASE_IDX 
0
+#define mmGRBM_STATUS_SE1  
0x0006
+#define mmGRBM_STATUS_SE1_BASE_IDX 
0
+#define mmGRBM_SOFT_RESET  
0x0008
+#define mmGRBM_SOFT_RESET_BASE_IDX 
0
+#define mmGRBM_GFX_CLKEN_CNTL  
0x000c
+#define mmGRBM_GFX_CLKEN_CNTL_BASE_IDX 
0
+#define mmGRBM_WAIT_IDLE_CLOCKS
0x000d
+#define mmGRBM_WAIT_IDLE_CLOCKS_BASE_IDX   
0
+#define mmGRBM_STATUS_SE2  
0x000e
+#define mmGRBM_STATUS_SE2_BASE_IDX 
0
+#define mmGRBM_STATUS_SE3  
0x000f
+#define mmGRBM_STATUS_SE3_BASE_IDX 
0
+#define mmGRBM_READ_ERROR  
0x0016
+#define mmGRBM_READ_ERROR_BASE_IDX 
0
+#defin

[PATCH 2/4] drm/amdgpu: abstract EDC counter clear to a separated function

2020-01-18 Thread Dennis Li
1. Add an IP prefix for the IP-related code.
2. Refactor the code to clear EDC counter.

Change-Id: I1cd9ec304a7ace9a74480264d24368fd11a87833
Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 112 ++
 1 file changed, 77 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 759d8144f9c0..35b5ca7a9272 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -736,6 +736,7 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring 
*ring);
 static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
 static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
  void *ras_error_status);
+static void gfx_v9_0_clear_ras_edc_counter(struct amdgpu_device *adev);
 static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
 void *inject_if);
 
@@ -3996,7 +3997,7 @@ static const struct soc15_reg_entry sgpr2_init_regs[] = {
{ SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE7), 0xff00 },
 };
 
-static const struct soc15_reg_entry sec_ded_counter_registers[] = {
+static const struct soc15_reg_entry gfx_v9_0_edc_counter_regs[] = {
{ SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_SCRATCH_CNT), 0, 1, 1},
{ SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_UCODE_CNT), 0, 1, 1},
{ SOC15_REG_ENTRY(GC, 0, mmCPF_EDC_ROQ_CNT), 0, 1, 1},
@@ -4085,7 +4086,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct 
amdgpu_device *adev)
struct amdgpu_ring *ring = >gfx.compute_ring[0];
struct amdgpu_ib ib;
struct dma_fence *f = NULL;
-   int r, i, j, k;
+   int r, i;
unsigned total_size, vgpr_offset, sgpr_offset;
u64 gpu_addr;
 
@@ -4231,18 +4232,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct 
amdgpu_device *adev)
goto fail;
}
 
-   /* read back registers to clear the counters */
-   mutex_lock(>grbm_idx_mutex);
-   for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++) {
-   for (j = 0; j < sec_ded_counter_registers[i].se_num; j++) {
-   for (k = 0; k < sec_ded_counter_registers[i].instance; 
k++) {
-   gfx_v9_0_select_se_sh(adev, j, 0x0, k);
-   
RREG32(SOC15_REG_ENTRY_OFFSET(sec_ded_counter_registers[i]));
-   }
-   }
-   }
-   WREG32_SOC15(GC, 0, mmGRBM_GFX_INDEX, 0xe000);
-   mutex_unlock(>grbm_idx_mutex);
+   gfx_v9_0_clear_ras_edc_counter(adev);
 
 fail:
amdgpu_ib_free(adev, , NULL);
@@ -5519,7 +5509,7 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device 
*adev,
 }
 
 
-static const struct soc15_ras_field_entry gc_ras_fields_vg20[] = {
+static const struct soc15_ras_field_entry gfx_v9_0_ras_fields[] = {
{ "CPC_SCRATCH", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_SCRATCH_CNT),
  SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, SEC_COUNT),
  SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, DED_COUNT)
@@ -6092,7 +6082,7 @@ static int gfx_v9_0_query_utc_edc_status(struct 
amdgpu_device *adev,
WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);
WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_CNT, 0);
 
-   for (i = 0; i < 16; i++) {
+   for (i = 0; i < ARRAY_SIZE(vml2_mems); i++) {
WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_INDEX, i);
data = RREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_CNT);
 
@@ -6111,7 +6101,7 @@ static int gfx_v9_0_query_utc_edc_status(struct 
amdgpu_device *adev,
}
}
 
-   for (i = 0; i < 7; i++) {
+   for (i = 0; i < ARRAY_SIZE(vml2_walker_mems); i++) {
WREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_INDEX, i);
data = RREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_CNT);
 
@@ -6132,7 +6122,7 @@ static int gfx_v9_0_query_utc_edc_status(struct 
amdgpu_device *adev,
}
}
 
-   for (i = 0; i < 4; i++) {
+   for (i = 0; i < ARRAY_SIZE(atc_l2_cache_2m_mems); i++) {
WREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_INDEX, i);
data = RREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_CNT);
 
@@ -6144,7 +6134,7 @@ static int gfx_v9_0_query_utc_edc_status(struct 
amdgpu_device *adev,
}
}
 
-   for (i = 0; i < 32; i++) {
+   for (i = 0; i < ARRAY_SIZE(atc_l2_cache_4k_mems); i++) {
WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, i);
data = RREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_CNT);
 
@@ -6171,36 +6161,36 @@ static int gfx_v9_0_query_utc_edc_status(struct 
amdgpu_device *adev,
return 0;
 }
 
-static int __get_ras_error_count(const struct soc15_reg_entry *reg,
+static int gfx_v9_0_ras_error_count(const struct soc15_reg_entry *reg,
uint32_t se_id, uint32_t inst_id, u

[PATCH v2 2/3] drm/amdgpu: refine query function of mmhub EDC counter in vg20

2019-11-20 Thread Dennis Li
Add code to print the detailed EDC info for the sub-blocks of mmhub

v2: Move the EDC_CNT registers' definition from mmhub_9_4 header
files to mmhub_1_0 ones. Add mmhub_v1_0_ prefix for the local
static variable and function.

Change-Id: I1d5b3df38caa8f0b437c96b78091662aaeaf264b
Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c   | 232 
 .../include/asic_reg/mmhub/mmhub_1_0_offset.h |  16 ++
 .../asic_reg/mmhub/mmhub_1_0_sh_mask.h| 122 +
 .../asic_reg/mmhub/mmhub_9_4_0_offset.h   |  53 
 .../asic_reg/mmhub/mmhub_9_4_0_sh_mask.h  | 257 --
 5 files changed, 318 insertions(+), 362 deletions(-)
 delete mode 100644 
drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_0_offset.h
 delete mode 100644 
drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_0_sh_mask.h

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index 6965e1e6fa9e..d7575ac27038 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -27,17 +27,13 @@
 #include "mmhub/mmhub_1_0_offset.h"
 #include "mmhub/mmhub_1_0_sh_mask.h"
 #include "mmhub/mmhub_1_0_default.h"
-#include "mmhub/mmhub_9_4_0_offset.h"
 #include "vega10_enum.h"
-
+#include "soc15.h"
 #include "soc15_common.h"
 
 #define mmDAGB0_CNTL_MISC2_RV 0x008f
 #define mmDAGB0_CNTL_MISC2_RV_BASE_IDX 0
 
-#define EA_EDC_CNT_MASK 0x3
-#define EA_EDC_CNT_SHIFT 0x2
-
 u64 mmhub_v1_0_get_fb_location(struct amdgpu_device *adev)
 {
u64 base = RREG32_SOC15(MMHUB, 0, mmMC_VM_FB_LOCATION_BASE);
@@ -562,59 +558,191 @@ void mmhub_v1_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
*flags |= AMD_CG_SUPPORT_MC_LS;
 }
 
+static const struct soc15_ras_field_entry mmhub_v1_0_ras_fields[] = {
+   { "MMEA0_DRAMRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, 
mmMMEA0_EDC_CNT_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMRD_CMDMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMRD_CMDMEM_DED_COUNT),
+   },
+   { "MMEA0_DRAMWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, 
mmMMEA0_EDC_CNT_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMWR_CMDMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMWR_CMDMEM_DED_COUNT),
+   },
+   { "MMEA0_DRAMWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, 
mmMMEA0_EDC_CNT_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMWR_DATAMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMWR_DATAMEM_DED_COUNT),
+   },
+   { "MMEA0_RRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, RRET_TAGMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, RRET_TAGMEM_DED_COUNT),
+   },
+   { "MMEA0_WRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, WRET_TAGMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, WRET_TAGMEM_DED_COUNT),
+   },
+   { "MMEA0_DRAMRD_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, 
mmMMEA0_EDC_CNT_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMRD_PAGEMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_DRAMWR_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, 
mmMMEA0_EDC_CNT_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMWR_PAGEMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_IORD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, IORD_CMDMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_IOWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, IOWR_CMDMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_IOWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, IOWR_DATAMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_GMIRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, 
mmMMEA0_EDC_CNT2_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIRD_CMDMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIRD_CMDMEM_DED_COUNT),
+   },
+   { "MMEA0_GMIWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, 
mmMMEA0_EDC_CNT2_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIWR_CMDMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIWR_CMDMEM_DED_COUNT),
+   },
+   { "MMEA0_GMIWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, 
mmMMEA0_EDC_CNT2_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIWR_DATAMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIWR_DATAMEM_DED_COUNT),
+   },
+   { "MMEA0_GMIRD_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, 
mmMMEA0_EDC_CNT2_VG20),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIRD_PAGEMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_GMIWR

[PATCH v2 0/3] RAS support for mmhub

2019-11-20 Thread Dennis Li
This set of patches is a continuation of RAS enablement patches for AMDGPU. 

1. The new struct soc15_ras_field_entry will be reused by gfx, mmhub and other 
IP blocks.
2. Refine the query function of RAS error counter for VG20, add code to help 
the user locate which sub-block of mmhub causes the error.
3. Implement the query function of RAS error counter for Mi100

v2:
1. Fix some comment issues.
2. Add IP name prefix for the local static variable and function.
3. Move the EDC_CNT registers' definition from mmhub_9_4 header files to 
mmhub_1_0 ones for vg20.

Dennis Li (3):
  drm/amdgpu: define soc15_ras_field_entry for reuse
  drm/amdgpu: refine query function of mmhub EDC counter in vg20
  drm/amdgpu: implement querying ras error count for mmhub9.4

 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  34 +--
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |   3 +
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c   | 232 
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c   | 253 -
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.h   |   2 +
 drivers/gpu/drm/amd/amdgpu/soc15.h|  12 +
 .../include/asic_reg/mmhub/mmhub_1_0_offset.h |  16 ++
 .../asic_reg/mmhub/mmhub_1_0_sh_mask.h| 122 +
 .../asic_reg/mmhub/mmhub_9_4_0_offset.h   |  53 
 .../asic_reg/mmhub/mmhub_9_4_0_sh_mask.h  | 257 --
 10 files changed, 598 insertions(+), 386 deletions(-)
 delete mode 100644 
drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_0_offset.h
 delete mode 100644 
drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_0_sh_mask.h

-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH v2 1/3] drm/amdgpu: define soc15_ras_field_entry for reuse

2019-11-20 Thread Dennis Li
The struct soc15_ras_field_entry will be reused by
other IPs, such as mmhub and gc

v2: rename ras_subblock_regs to gc_ras_fields_vg20,
because a future asic may have a different table.

Change-Id: I6c3388a09b5fbf927ad90fcd626baa448d1681a6
Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 34 +--
 drivers/gpu/drm/amd/amdgpu/soc15.h| 12 ++
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index c7ae685d6f74..8073fcd4720e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -131,18 +131,6 @@ MODULE_FIRMWARE("amdgpu/renoir_rlc.bin");
 #define mmTCP_CHAN_STEER_5_ARCT
0x0b0c
 #define mmTCP_CHAN_STEER_5_ARCT_BASE_IDX   
0
 
-struct ras_gfx_subblock_reg {
-   const char *name;
-   uint32_t hwip;
-   uint32_t inst;
-   uint32_t seg;
-   uint32_t reg_offset;
-   uint32_t sec_count_mask;
-   uint32_t sec_count_shift;
-   uint32_t ded_count_mask;
-   uint32_t ded_count_shift;
-};
-
 enum ta_ras_gfx_subblock {
/*CPC*/
TA_RAS_BLOCK__GFX_CPC_INDEX_START = 0,
@@ -5487,7 +5475,7 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device 
*adev,
 }
 
 
-static const struct ras_gfx_subblock_reg ras_subblock_regs[] = {
+static const struct soc15_ras_field_entry gc_ras_fields_vg20[] = {
{ "CPC_SCRATCH", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_SCRATCH_CNT),
  SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, SEC_COUNT),
  SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, DED_COUNT)
@@ -6146,29 +6134,29 @@ static int __get_ras_error_count(const struct 
soc15_reg_entry *reg,
uint32_t i;
uint32_t sec_cnt, ded_cnt;
 
-   for (i = 0; i < ARRAY_SIZE(ras_subblock_regs); i++) {
-   if(ras_subblock_regs[i].reg_offset != reg->reg_offset ||
-   ras_subblock_regs[i].seg != reg->seg ||
-   ras_subblock_regs[i].inst != reg->inst)
+   for (i = 0; i < ARRAY_SIZE(gc_ras_fields_vg20); i++) {
+   if(gc_ras_fields_vg20[i].reg_offset != reg->reg_offset ||
+   gc_ras_fields_vg20[i].seg != reg->seg ||
+   gc_ras_fields_vg20[i].inst != reg->inst)
continue;
 
sec_cnt = (value &
-   ras_subblock_regs[i].sec_count_mask) >>
-   ras_subblock_regs[i].sec_count_shift;
+   gc_ras_fields_vg20[i].sec_count_mask) >>
+   gc_ras_fields_vg20[i].sec_count_shift;
if (sec_cnt) {
DRM_INFO("GFX SubBlock %s, Instance[%d][%d], SEC %d\n",
-   ras_subblock_regs[i].name,
+   gc_ras_fields_vg20[i].name,
se_id, inst_id,
sec_cnt);
*sec_count += sec_cnt;
}
 
ded_cnt = (value &
-   ras_subblock_regs[i].ded_count_mask) >>
-   ras_subblock_regs[i].ded_count_shift;
+   gc_ras_fields_vg20[i].ded_count_mask) >>
+   gc_ras_fields_vg20[i].ded_count_shift;
if (ded_cnt) {
DRM_INFO("GFX SubBlock %s, Instance[%d][%d], DED %d\n",
-   ras_subblock_regs[i].name,
+   gc_ras_fields_vg20[i].name,
se_id, inst_id,
ded_cnt);
*ded_count += ded_cnt;
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h 
b/drivers/gpu/drm/amd/amdgpu/soc15.h
index 9af6c6ffbfa2..344280b869c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
@@ -60,6 +60,18 @@ struct soc15_allowed_register_entry {
bool grbm_indexed;
 };
 
+struct soc15_ras_field_entry {
+   const char *name;
+   uint32_t hwip;
+   uint32_t inst;
+   uint32_t seg;
+   uint32_t reg_offset;
+   uint32_t sec_count_mask;
+   uint32_t sec_count_shift;
+   uint32_t ded_count_mask;
+   uint32_t ded_count_shift;
+};
+
 #define SOC15_REG_ENTRY(ip, inst, reg) ip##_HWIP, inst, reg##_BASE_IDX, reg
 
 #define SOC15_REG_ENTRY_OFFSET(entry)  
(adev->reg_offset[entry.hwip][entry.inst][entry.seg] + entry.reg_offset)
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH v2 3/3] drm/amdgpu: implement querying ras error count for mmhub9.4

2019-11-20 Thread Dennis Li
Get mmhub error counter by accessing EDC_CNT registers.

v2: Add mmhub_v9_4_ prefix for local static variable and function

Change-Id: I728d4183a08707aaf0fc71d184e86322a681e725
Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |   3 +
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 253 +++-
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.h |   2 +
 3 files changed, 257 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index ee615d050837..5f4a6cdf83a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -658,6 +658,9 @@ static void gmc_v9_0_set_mmhub_funcs(struct amdgpu_device 
*adev)
case CHIP_VEGA20:
adev->mmhub.funcs = _v1_0_funcs;
break;
+   case CHIP_ARCTURUS:
+   adev->mmhub.funcs = _v9_4_funcs;
+   break;
default:
break;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 2c5adfe803a2..6fe5c39e5581 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -21,6 +21,7 @@
  *
  */
 #include "amdgpu.h"
+#include "amdgpu_ras.h"
 #include "mmhub_v9_4.h"
 
 #include "mmhub/mmhub_9_4_1_offset.h"
@@ -29,7 +30,7 @@
 #include "athub/athub_1_0_offset.h"
 #include "athub/athub_1_0_sh_mask.h"
 #include "vega10_enum.h"
-
+#include "soc15.h"
 #include "soc15_common.h"
 
 #define MMHUB_NUM_INSTANCES2
@@ -651,3 +652,253 @@ void mmhub_v9_4_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
if (data & ATCL2_0_ATC_L2_MISC_CG__MEM_LS_ENABLE_MASK)
*flags |= AMD_CG_SUPPORT_MC_LS;
 }
+
+static const struct soc15_ras_field_entry mmhub_v9_4_ras_fields[] = {
+   { "MMEA0_DRAMRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMRD_CMDMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMRD_CMDMEM_DED_COUNT),
+   },
+   { "MMEA0_DRAMWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMWR_CMDMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMWR_CMDMEM_DED_COUNT),
+   },
+   { "MMEA0_DRAMWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMWR_DATAMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMWR_DATAMEM_DED_COUNT),
+   },
+   { "MMEA0_RRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, RRET_TAGMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, RRET_TAGMEM_DED_COUNT),
+   },
+   { "MMEA0_WRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, WRET_TAGMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, WRET_TAGMEM_DED_COUNT),
+   },
+   { "MMEA0_DRAMRD_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMRD_PAGEMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_DRAMWR_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMWR_PAGEMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_IORD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, IORD_CMDMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_IOWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, IOWR_CMDMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_IOWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT, IOWR_DATAMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_GMIRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIRD_CMDMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIRD_CMDMEM_DED_COUNT),
+   },
+   { "MMEA0_GMIWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIWR_CMDMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIWR_CMDMEM_DED_COUNT),
+   },
+   { "MMEA0_GMIWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIWR_DATAMEM_SEC_COUNT),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIWR_DATAMEM_DED_COUNT),
+   },
+   { "MMEA0_GMIRD_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIRD_PAGEMEM_SED_COUNT),
+   0, 0,
+   },
+   { "MMEA0_GMIWR_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2),
+   SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIWR_PAGEMEM_SED_COUNT),
+   

[PATCH 3/3] drm/amdgpu: add RAS support for VML2 and ATCL2

2019-10-10 Thread Dennis Li
Add code to query the EDC count of VML2 & ATCL2

Change-Id: If2c251481ba0a1a34ce3405a85f86d65eecee461
Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 167 ++
 1 file changed, 167 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 2a95093b85a5..22be6177938e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6152,6 +6152,171 @@ static int gfx_v9_0_ras_error_inject(struct 
amdgpu_device *adev,
return ret;
 }
 
+static const char *vml2_mems[] = {
+   "UTC_VML2_BANK_CACHE_0_BIGK_MEM0",
+   "UTC_VML2_BANK_CACHE_0_BIGK_MEM1",
+   "UTC_VML2_BANK_CACHE_0_4K_MEM0",
+   "UTC_VML2_BANK_CACHE_0_4K_MEM1",
+   "UTC_VML2_BANK_CACHE_1_BIGK_MEM0",
+   "UTC_VML2_BANK_CACHE_1_BIGK_MEM1",
+   "UTC_VML2_BANK_CACHE_1_4K_MEM0",
+   "UTC_VML2_BANK_CACHE_1_4K_MEM1",
+   "UTC_VML2_BANK_CACHE_2_BIGK_MEM0",
+   "UTC_VML2_BANK_CACHE_2_BIGK_MEM1",
+   "UTC_VML2_BANK_CACHE_2_4K_MEM0",
+   "UTC_VML2_BANK_CACHE_2_4K_MEM1",
+   "UTC_VML2_BANK_CACHE_3_BIGK_MEM0",
+   "UTC_VML2_BANK_CACHE_3_BIGK_MEM1",
+   "UTC_VML2_BANK_CACHE_3_4K_MEM0",
+   "UTC_VML2_BANK_CACHE_3_4K_MEM1",
+};
+
+static const char *vml2_walker_mems[] = {
+   "UTC_VML2_CACHE_PDE0_MEM0",
+   "UTC_VML2_CACHE_PDE0_MEM1",
+   "UTC_VML2_CACHE_PDE1_MEM0",
+   "UTC_VML2_CACHE_PDE1_MEM1",
+   "UTC_VML2_CACHE_PDE2_MEM0",
+   "UTC_VML2_CACHE_PDE2_MEM1",
+   "UTC_VML2_RDIF_LOG_FIFO",
+};
+
+static const char *atc_l2_cache_2m_mems[] = {
+   "UTC_ATCL2_CACHE_2M_BANK0_WAY0_MEM",
+   "UTC_ATCL2_CACHE_2M_BANK0_WAY1_MEM",
+   "UTC_ATCL2_CACHE_2M_BANK1_WAY0_MEM",
+   "UTC_ATCL2_CACHE_2M_BANK1_WAY1_MEM",
+};
+
+static const char *atc_l2_cache_4k_mems[] = {
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM0",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM1",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM2",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM3",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM4",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM5",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM6",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM7",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM0",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM1",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM2",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM3",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM4",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM5",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM6",
+   "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM7",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM0",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM1",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM2",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM3",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM4",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM5",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM6",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM7",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM0",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM1",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM2",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM3",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM4",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM5",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM6",
+   "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM7",
+};
+
+static int gfx_v9_0_query_utc_edc_status(struct amdgpu_device *adev,
+struct ras_err_data *err_data)
+{
+   uint32_t i, data;
+   uint32_t sec_count, ded_count;
+
+   WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_INDEX, 255);
+   WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_CNT, 0);
+   WREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_INDEX, 255);
+   WREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_CNT, 0);
+   WREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_INDEX, 255);
+   WREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_CNT, 0);
+   WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);
+   WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_CNT, 0);
+
+   for (i = 0; i < 16; i++) {
+   WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_INDEX, i);
+   data = RREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_CNT);
+
+   sec_count = REG_GET_FIELD(data, VM_L2_MEM_ECC_CNT, SEC_COUNT);
+   if (sec_count) {
+   DRM_INFO("Instance[%d]: SubBlock %s, SEC %

[PATCH 1/3] drm/amdgpu: change to query the actual EDC counter

2019-10-10 Thread Dennis Li
To support potential future requests, change the driver to
query the actual EDC counter.

Change-Id: I783ccd76f4c65f9829f7a8967a539a23ae5484b5
Signed-off-by: Dennis Li 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 819 --
 drivers/gpu/drm/amd/amdgpu/soc15.h|   2 +
 2 files changed, 496 insertions(+), 325 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9a1f91cf0ee8..2a95093b85a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -127,6 +127,18 @@ MODULE_FIRMWARE("amdgpu/renoir_rlc.bin");
 #define mmTCP_CHAN_STEER_5_ARCT
0x0b0c
 #define mmTCP_CHAN_STEER_5_ARCT_BASE_IDX   
0
 
+struct ras_gfx_subblock_reg {
+   const char *name;
+   uint32_t hwip;
+   uint32_t inst;
+   uint32_t seg;
+   uint32_t reg_offset;
+   uint32_t sec_count_mask;
+   uint32_t sec_count_shift;
+   uint32_t ded_count_mask;
+   uint32_t ded_count_shift;
+};
+
 enum ta_ras_gfx_subblock {
/*CPC*/
TA_RAS_BLOCK__GFX_CPC_INDEX_START = 0,
@@ -4172,6 +4184,7 @@ static const struct soc15_reg_entry 
sec_ded_counter_registers[] = {
{ SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT), 0, 1, 16},
{ SOC15_REG_ENTRY(GC, 0, mmTCP_ATC_EDC_GATCL1_CNT), 0, 4, 16},
{ SOC15_REG_ENTRY(GC, 0, mmTCP_EDC_CNT), 0, 4, 16},
+   { SOC15_REG_ENTRY(GC, 0, mmTCP_EDC_CNT_NEW), 0, 4, 16},
{ SOC15_REG_ENTRY(GC, 0, mmTD_EDC_CNT), 0, 4, 16},
{ SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT2), 0, 4, 6},
{ SOC15_REG_ENTRY(GC, 0, mmSQ_EDC_CNT), 0, 4, 16},
@@ -5652,301 +5665,445 @@ static int gfx_v9_0_process_ras_data_cb(struct 
amdgpu_device *adev,
return AMDGPU_RAS_SUCCESS;
 }
 
-static const struct {
-   const char *name;
-   uint32_t ip;
-   uint32_t inst;
-   uint32_t seg;
-   uint32_t reg_offset;
-   uint32_t per_se_instance;
-   int32_t num_instance;
-   uint32_t sec_count_mask;
-   uint32_t ded_count_mask;
-} gfx_ras_edc_regs[] = {
-   { "CPC_SCRATCH", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_SCRATCH_CNT), 0, 1,
- REG_FIELD_MASK(CPC_EDC_SCRATCH_CNT, SEC_COUNT),
- REG_FIELD_MASK(CPC_EDC_SCRATCH_CNT, DED_COUNT) },
-   { "CPC_UCODE", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_UCODE_CNT), 0, 1,
- REG_FIELD_MASK(CPC_EDC_UCODE_CNT, SEC_COUNT),
- REG_FIELD_MASK(CPC_EDC_UCODE_CNT, DED_COUNT) },
-   { "CPF_ROQ_ME1", SOC15_REG_ENTRY(GC, 0, mmCPF_EDC_ROQ_CNT), 0, 1,
- REG_FIELD_MASK(CPF_EDC_ROQ_CNT, COUNT_ME1), 0 },
-   { "CPF_ROQ_ME2", SOC15_REG_ENTRY(GC, 0, mmCPF_EDC_ROQ_CNT), 0, 1,
- REG_FIELD_MASK(CPF_EDC_ROQ_CNT, COUNT_ME2), 0 },
-   { "CPF_TAG", SOC15_REG_ENTRY(GC, 0, mmCPF_EDC_TAG_CNT), 0, 1,
- REG_FIELD_MASK(CPF_EDC_TAG_CNT, SEC_COUNT),
- REG_FIELD_MASK(CPF_EDC_TAG_CNT, DED_COUNT) },
-   { "CPG_DMA_ROQ", SOC15_REG_ENTRY(GC, 0, mmCPG_EDC_DMA_CNT), 0, 1,
- REG_FIELD_MASK(CPG_EDC_DMA_CNT, ROQ_COUNT), 0 },
-   { "CPG_DMA_TAG", SOC15_REG_ENTRY(GC, 0, mmCPG_EDC_DMA_CNT), 0, 1,
- REG_FIELD_MASK(CPG_EDC_DMA_CNT, TAG_SEC_COUNT),
- REG_FIELD_MASK(CPG_EDC_DMA_CNT, TAG_DED_COUNT) },
-   { "CPG_TAG", SOC15_REG_ENTRY(GC, 0, mmCPG_EDC_TAG_CNT), 0, 1,
- REG_FIELD_MASK(CPG_EDC_TAG_CNT, SEC_COUNT),
- REG_FIELD_MASK(CPG_EDC_TAG_CNT, DED_COUNT) },
-   { "DC_CSINVOC", SOC15_REG_ENTRY(GC, 0, mmDC_EDC_CSINVOC_CNT), 0, 1,
- REG_FIELD_MASK(DC_EDC_CSINVOC_CNT, COUNT_ME1), 0 },
-   { "DC_RESTORE", SOC15_REG_ENTRY(GC, 0, mmDC_EDC_RESTORE_CNT), 0, 1,
- REG_FIELD_MASK(DC_EDC_RESTORE_CNT, COUNT_ME1), 0 },
-   { "DC_STATE", SOC15_REG_ENTRY(GC, 0, mmDC_EDC_STATE_CNT), 0, 1,
- REG_FIELD_MASK(DC_EDC_STATE_CNT, COUNT_ME1), 0 },
-   { "GDS_MEM", SOC15_REG_ENTRY(GC, 0, mmGDS_EDC_CNT), 0, 1,
- REG_FIELD_MASK(GDS_EDC_CNT, GDS_MEM_SEC),
- REG_FIELD_MASK(GDS_EDC_CNT, GDS_MEM_DED) },
-   { "GDS_INPUT_QUEUE", SOC15_REG_ENTRY(GC, 0, mmGDS_EDC_CNT), 0, 1,
- REG_FIELD_MASK(GDS_EDC_CNT, GDS_INPUT_QUEUE_SED), 0 },
+static const struct ras_gfx_subblock_reg ras_subblock_regs[] = {
+   { "CPC_SCRATCH", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_SCRATCH_CNT),
+ SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, SEC_COUNT),
+ SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, DED_COUNT)
+   },
+   { "CPC_UCODE", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_UCODE_CNT),
+ SOC15_REG_FIELD(CPC_EDC_UCODE_CNT, SEC_COUNT),
+ SOC15_REG_FIELD(CPC_EDC_UCODE_CNT, DED_COUNT)
+   },
+   { "CPF_ROQ_ME1", SOC15_REG_ENTRY(GC, 0, mmCPF_EDC_ROQ_CNT),
+ SOC15_REG_FIELD(CPF_EDC_ROQ_CNT, COUNT_ME1),
+ 0, 0
+   },
+   { "CPF_R

[PATCH 2/3] drm/amd/include: add register define for VML2 and ATCL2

2019-10-10 Thread Dennis Li
Add VML2 and ATCL2 ECC registers to support VEGA20 RAS

Change-Id: I8860f2e37fa7afd8d6123290fb7b9dcee56edd6e
Signed-off-by: Dennis Li 
---
 .../amd/include/asic_reg/gc/gc_9_0_offset.h| 18 --
 .../amd/include/asic_reg/gc/gc_9_0_sh_mask.h   | 18 --
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
index ca16d9125fbc..2bfaaa8157d0 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
@@ -1146,7 +1146,14 @@
 #define mmATC_L2_MEM_POWER_LS_BASE_IDX 
0
 #define mmATC_L2_CGTT_CLK_CTRL 
0x080c
 #define mmATC_L2_CGTT_CLK_CTRL_BASE_IDX
0
-
+#define mmATC_L2_CACHE_4K_EDC_INDEX
0x080e
+#define mmATC_L2_CACHE_4K_EDC_INDEX_BASE_IDX   
0
+#define mmATC_L2_CACHE_2M_EDC_INDEX
0x080f
+#define mmATC_L2_CACHE_2M_EDC_INDEX_BASE_IDX   
0
+#define mmATC_L2_CACHE_4K_EDC_CNT  
0x0810
+#define mmATC_L2_CACHE_4K_EDC_CNT_BASE_IDX 
0
+#define mmATC_L2_CACHE_2M_EDC_CNT  
0x0811
+#define mmATC_L2_CACHE_2M_EDC_CNT_BASE_IDX 
0
 
 // addressBlock: gc_utcl2_vml2pfdec
 // base address: 0xa100
@@ -1206,7 +1213,14 @@
 #define mmVM_L2_CACHE_PARITY_CNTL_BASE_IDX 
0
 #define mmVM_L2_CGTT_CLK_CTRL  
0x085e
 #define mmVM_L2_CGTT_CLK_CTRL_BASE_IDX 
0
-
+#define mmVM_L2_MEM_ECC_INDEX  
0x0860
+#define mmVM_L2_MEM_ECC_INDEX_BASE_IDX 
0
+#define mmVM_L2_WALKER_MEM_ECC_INDEX   
0x0861
+#define mmVM_L2_WALKER_MEM_ECC_INDEX_BASE_IDX  
0
+#define mmVM_L2_MEM_ECC_CNT
0x0862
+#define mmVM_L2_MEM_ECC_CNT_BASE_IDX   
0
+#define mmVM_L2_WALKER_MEM_ECC_CNT 
0x0863
+#define mmVM_L2_WALKER_MEM_ECC_CNT_BASE_IDX
0
 
 // addressBlock: gc_utcl2_vml2vcdec
 // base address: 0xa200
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
index 064c4bb1dc62..d4c613a85352 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
@@ -6661,7 +6661,6 @@
 #define ATC_L2_CGTT_CLK_CTRL__SOFT_STALL_OVERRIDE_MASK 
   0x00FFL
 #define ATC_L2_CGTT_CLK_CTRL__SOFT_OVERRIDE_MASK   
   0xFF00L
 
-
 // addressBlock: gc_utcl2_vml2pfdec
 //VM_L2_CNTL
 #define VM_L2_CNTL__ENABLE_L2_CACHE__SHIFT 
   0x0
@@ -6991,7 +6990,22 @@
 #define VM_L2_CGTT_CLK_CTRL__MGLS_OVERRIDE_MASK
   0x8000L
 #define VM_L2_CGTT_CLK_CTRL__SOFT_STALL_OVERRIDE_MASK  
   0x00FFL
 #define VM_L2_CGTT_CLK_CTRL__SOFT_OVERRIDE_MASK
   0xFF00L
-
+//VM_L2_MEM_ECC_INDEX
+#define VM_L2_MEM_ECC_INDEX__INDEX__SHIFT  
   0x0
+#define VM_L2_MEM_ECC_INDEX__INDEX_MASK
   0x00FFL
+//VM_L2_WALKER_MEM_ECC_INDEX
+#define VM_L2_WALKER_MEM_ECC_INDEX__INDEX__SHIFT   
   0x0
+#define VM_L2_WALKER_MEM_ECC_INDEX__INDEX_MASK 
   0x00FFL
+//VM_L2_MEM_ECC_CNT
+#define VM_L2_MEM_ECC_CNT__SEC_COUNT__SHIFT

[PATCH 0/3] RAS Support for GFX blocks

2019-10-10 Thread Dennis Li
1. Add the EDC count from hardware.
2. Add RAS support for VML2 and ATCL2 sub blocks.

Dennis Li (3):
  drm/amdgpu: change to query the actual EDC counter
  drm/amd/include: add register define for VML2 and ATCL2
  drm/amdgpu: add RAS support for VML2 and ATCL2

 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 986 --
 drivers/gpu/drm/amd/amdgpu/soc15.h|   2 +
 .../amd/include/asic_reg/gc/gc_9_0_offset.h   |  18 +-
 .../amd/include/asic_reg/gc/gc_9_0_sh_mask.h  |  18 +-
 4 files changed, 695 insertions(+), 329 deletions(-)

-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx