from:"Hawking Zhang"

[PATCH] drm/amdkfd: Select reset method for poison handling

2024-09-06 Thread Hawking Zhang

Driver mode-2 is only supported by relatively new
smc firmware.

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 40 +++
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index fecdbbab9894..d46a13156ee9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -167,11 +167,23 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
block = AMDGPU_RAS_BLOCK__GFX;
-   if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 
4, 3) ||
-   amdgpu_ip_version(dev->adev, GC_HWIP, 0) == 
IP_VERSION(9, 4, 4))
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
-   else
+   if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 
4, 3)) {
+   /* driver mode-2 for gfx poison is only supported by
+* pmfw 0x00557300 and onwards */
+   if (dev->adev->pm.fw_version < 0x00557300)
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   else
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   } else if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == 
IP_VERSION(9, 4, 4)) {
+   /* driver mode-2 for gfx poison is only supported by
+* pmfw 0x05550C00 and onwards */
+   if (dev->adev->pm.fw_version < 0x05550C00)
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   else
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   } else {
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   }
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
@@ -184,11 +196,23 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
-   if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 
4, 3) ||
-   amdgpu_ip_version(dev->adev, GC_HWIP, 0) == 
IP_VERSION(9, 4, 4))
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
-   else
+   if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == 
IP_VERSION(4, 4, 2)) {
+   /* driver mode-2 for gfx poison is only supported by
+* pmfw 0x00557300 and onwards */
+   if (dev->adev->pm.fw_version < 0x00557300)
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   else
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   } else if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == 
IP_VERSION(4, 4, 5)) {
+   /* driver mode-2 for gfx poison is only supported by
+* pmfw 0x05550C00 and onwards */
+   if (dev->adev->pm.fw_version < 0x05550C00)
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   else
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   } else {
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   }
break;
default:
dev_warn(dev->adev->dev,
-- 
2.17.1

[PATCH] drm/amd/pm: Do not support swSMU if SMU IP is disabled

2024-08-19 Thread Hawking Zhang

When SMU IP is disabled by ip_block_mask, driver
should not refer to any dpm/swSMU callback. Instead,
any driver call into swSMU/dpm callback needs to
return error code EOPNOTSUPP.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index c803c903e47f..31943b4db276 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -549,7 +549,8 @@ bool is_support_sw_smu(struct amdgpu_device *adev)
if (adev->asic_type == CHIP_VEGA20)
return false;
 
-   if (amdgpu_ip_version(adev, MP1_HWIP, 0) >= IP_VERSION(11, 0, 0))
+   if ((amdgpu_ip_version(adev, MP1_HWIP, 0) >= IP_VERSION(11, 0, 0)) &&
+   amdgpu_device_ip_is_valid(adev, AMD_IP_BLOCK_TYPE_SMC))
return true;
 
return false;
-- 
2.17.1

[PATCH v2 1/3] drm/amdkfd: Check int source id for utcl2 poison event

2024-08-19 Thread Hawking Zhang

Traditional utcl2 fault_status polling does not
work in SRIOV environment. The polling of fault
status register from guest side will be dropped
by hardware.

Driver should switch to check utcl2 interrupt
source id to identify utcl2 poison event. It is
set to 1 when poisoned data interrupts are
signaled.

v2: drop the unused local variable (Tao)

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c| 18 +-
 drivers/gpu/drm/amd/amdkfd/soc15_int.h |  1 +
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index a9c3580be8c9..fecdbbab9894 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -431,25 +431,9 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
   client_id == SOC15_IH_CLIENTID_UTCL2) {
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
-   uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
-   uint32_t vmid_type = 
SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
-   int hub_inst = 0;
struct kfd_hsa_memory_exception_data exception_data;
 
-   /* gfxhub */
-   if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) 
{
-   hub_inst = 
dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
-   node_id);
-   if (hub_inst < 0)
-   hub_inst = 0;
-   }
-
-   /* mmhub */
-   if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
-   hub_inst = node_id / 4;
-
-   if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev,
-   hub_inst, vmid_type)) {
+   if (source_id == SOC15_INTSRC_VMC_UTCL2_POISON) {
event_interrupt_poison_consumption_v9(dev, pasid, 
client_id);
return;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h 
b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
index 10138676f27f..e5c0205f2618 100644
--- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h
+++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
@@ -29,6 +29,7 @@
 #define SOC15_INTSRC_CP_BAD_OPCODE 183
 #define SOC15_INTSRC_SQ_INTERRUPT_MSG  239
 #define SOC15_INTSRC_VMC_FAULT 0
+#define SOC15_INTSRC_VMC_UTCL2_POISON  1
 #define SOC15_INTSRC_SDMA_TRAP 224
 #define SOC15_INTSRC_SDMA_ECC  220
 #define SOC21_INTSRC_SDMA_TRAP 49
-- 
2.17.1

[PATCH 1/3] drm/amdkfd: Check int source id for utcl2 poison event

2024-08-19 Thread Hawking Zhang

Traditional utcl2 fault_status polling does not
work in SRIOV environment. The polling of fault
status register from guest side will be dropped
by hardware.

Driver should switch to check utcl2 interrupt
source id to identify utcl2 poison event. It is
set to 1 when poisoned data interrupts are
signaled.

v2: drop the unused local variable (Tao)

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c| 18 +-
 drivers/gpu/drm/amd/amdkfd/soc15_int.h |  1 +
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index a9c3580be8c9..fecdbbab9894 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -431,25 +431,9 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
   client_id == SOC15_IH_CLIENTID_UTCL2) {
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
-   uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
-   uint32_t vmid_type = 
SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
-   int hub_inst = 0;
struct kfd_hsa_memory_exception_data exception_data;
 
-   /* gfxhub */
-   if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) 
{
-   hub_inst = 
dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
-   node_id);
-   if (hub_inst < 0)
-   hub_inst = 0;
-   }
-
-   /* mmhub */
-   if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
-   hub_inst = node_id / 4;
-
-   if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev,
-   hub_inst, vmid_type)) {
+   if (source_id == SOC15_INTSRC_VMC_UTCL2_POISON) {
event_interrupt_poison_consumption_v9(dev, pasid, 
client_id);
return;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h 
b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
index 10138676f27f..e5c0205f2618 100644
--- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h
+++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
@@ -29,6 +29,7 @@
 #define SOC15_INTSRC_CP_BAD_OPCODE 183
 #define SOC15_INTSRC_SQ_INTERRUPT_MSG  239
 #define SOC15_INTSRC_VMC_FAULT 0
+#define SOC15_INTSRC_VMC_UTCL2_POISON  1
 #define SOC15_INTSRC_SDMA_TRAP 224
 #define SOC15_INTSRC_SDMA_ECC  220
 #define SOC21_INTSRC_SDMA_TRAP 49
-- 
2.17.1

[PATCH 3/3] drm/amdgpu: Retire query_utcl2_poison_status callback

2024-08-19 Thread Hawking Zhang

Driver switches to interrupt source id to identify
utcl2 poison event. The polling interface is no longer needed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 16 
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h |  2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h  |  2 --
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c   | 18 --
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c   | 17 -
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c| 17 -
 7 files changed, 74 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 64a989cbc301..4f08b153cb66 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -783,22 +783,6 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct 
amdgpu_device *adev,
return 0;
 }
 
-bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
-   int hub_inst, int hub_type)
-{
-   if (!hub_type) {
-   if (adev->gfxhub.funcs->query_utcl2_poison_status)
-   return 
adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst);
-   else
-   return false;
-   } else {
-   if (adev->mmhub.funcs->query_utcl2_poison_status)
-   return 
adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst);
-   else
-   return false;
-   }
-}
-
 int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
 {
return kgd2kfd_check_and_lock_kfd();
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 825c7ffe4bc9..f9d119448442 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -350,8 +350,6 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
-bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
-   int hub_inst, int hub_type);
 int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id);
 void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
index 103a837ccc71..c7b44aeb671b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
@@ -38,8 +38,6 @@ struct amdgpu_gfxhub_funcs {
void (*mode2_save_regs)(struct amdgpu_device *adev);
void (*mode2_restore_regs)(struct amdgpu_device *adev);
void (*halt)(struct amdgpu_device *adev);
-   bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
-   int xcc_id);
 };
 
 struct amdgpu_gfxhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 95d676ee207f..1ca9d4ed8063 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -63,8 +63,6 @@ struct amdgpu_mmhub_funcs {
uint64_t page_table_base);
void (*update_power_gating)(struct amdgpu_device *adev,
 bool enable);
-   bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
-   int hub_inst);
 };
 
 struct amdgpu_mmhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
index d200310d1731..0e3ddea7b8e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
@@ -443,23 +443,6 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev)
mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32;
 }
 
-static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   u32 status = 0;
-   struct amdgpu_vmhub *hub;
-
-   if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2))
-   return false;
-
-   hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
-   status = RREG32(hub->vm_l2_pro_fault_status);
-   /* reset page fault status */
-   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
-
-   return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
-}
-
 const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
.get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset,
.setup_vm_pt_regs = gfxhub_v1_0_setup_vm_pt_regs,
@@ -468,5 +451,4 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
.set_fault_enable_default = gfxhub_v1_0_set_fault_enable_defa

[PATCH 2/3] drm/amdkfd: Drop poison handling from gfx v10

2024-08-19 Thread Hawking Zhang

Poison handling is not supported on gfx v10.

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  | 71 ---
 1 file changed, 71 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index 8e0d0356e810..bb8cbfc39b90 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -129,63 +129,6 @@ enum SQ_INTERRUPT_ERROR_TYPE {
KFD_DEBUG_CP_BAD_OP_ECODE_MASK) \
>> KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT)
 
-static void event_interrupt_poison_consumption(struct kfd_node *dev,
-   uint16_t pasid, uint16_t client_id)
-{
-   enum amdgpu_ras_block block = 0;
-   int old_poison, ret = -EINVAL;
-   uint32_t reset = 0;
-   struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
-
-   if (!p)
-   return;
-
-   /* all queues of a process will be unmapped in one time */
-   old_poison = atomic_cmpxchg(&p->poison, 0, 1);
-   kfd_unref_process(p);
-   if (old_poison)
-   return;
-
-   switch (client_id) {
-   case SOC15_IH_CLIENTID_SE0SH:
-   case SOC15_IH_CLIENTID_SE1SH:
-   case SOC15_IH_CLIENTID_SE2SH:
-   case SOC15_IH_CLIENTID_SE3SH:
-   case SOC15_IH_CLIENTID_UTCL2:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
-   block = AMDGPU_RAS_BLOCK__GFX;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
-   break;
-   case SOC15_IH_CLIENTID_SDMA0:
-   case SOC15_IH_CLIENTID_SDMA1:
-   case SOC15_IH_CLIENTID_SDMA2:
-   case SOC15_IH_CLIENTID_SDMA3:
-   case SOC15_IH_CLIENTID_SDMA4:
-   block = AMDGPU_RAS_BLOCK__SDMA;
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
-   break;
-   default:
-   break;
-   }
-
-   kfd_signal_poison_consumed_event(dev, pasid);
-
-   /* resetting queue passes, do page retirement without gpu reset
-* resetting queue fails, fallback to gpu reset solution
-*/
-   if (!ret)
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, unmap queue flow succeeded: 
client id %d\n",
-   client_id);
-   else
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, fall back to gpu reset flow: 
client id %d\n",
-   client_id);
-
-   amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
-}
-
 static bool event_interrupt_isr_v10(struct kfd_node *dev,
const uint32_t *ih_ring_entry,
uint32_t *patched_ihre,
@@ -332,11 +275,6 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
REG_GET_FIELD(context_id1, 
SQ_INTERRUPT_WORD_WAVE_CTXID1,
WGP_ID),
sq_intr_err_type);
-   if (sq_intr_err_type != 
SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
-   sq_intr_err_type != 
SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-   event_interrupt_poison_consumption(dev, 
pasid, source_id);
-   return;
-   }
break;
default:
break;
@@ -362,9 +300,6 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
   client_id == SOC15_IH_CLIENTID_SDMA7) {
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
kfd_signal_event_interrupt(pasid, context_id0 & 
0xfff, 28);
-   } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-   event_interrupt_poison_consumption(dev, pasid, 
source_id);
-   return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||
   client_id == SOC15_IH_CLIENTID_VMC1 ||
@@ -388,12 +323,6 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
hub_inst = node_id / 4;
 
-   if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev,
-   hub_inst, vmid_type)) {
-   event_interrupt_poison_consumption(dev, pasid, 
client_id);
-   return;
-   }
-
info.vmid = vmid;
info.mc_id = client_id;
info.page_addr = ih_ring_entry[4] |
-- 
2.17.1

[PATCH 1/3] drm/amdkfd: Check int source id for utcl2 poison event

2024-08-19 Thread Hawking Zhang

Traditional utcl2 fault_status polling does not
work in SRIOV environment. The polling of fault
status register from guest side will be dropped
by hardware.

Driver should switch to check utcl2 interrupt
source id to identify utcl2 poison event. It is
set to 1 when poisoned data interrupts are
signaled.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 3 +--
 drivers/gpu/drm/amd/amdkfd/soc15_int.h  | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index a9c3580be8c9..1196dccbe6bc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -448,8 +448,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
hub_inst = node_id / 4;
 
-   if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev,
-   hub_inst, vmid_type)) {
+   if (source_id == SOC15_INTSRC_VMC_UTCL2_POISON) {
event_interrupt_poison_consumption_v9(dev, pasid, 
client_id);
return;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h 
b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
index 10138676f27f..e5c0205f2618 100644
--- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h
+++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
@@ -29,6 +29,7 @@
 #define SOC15_INTSRC_CP_BAD_OPCODE 183
 #define SOC15_INTSRC_SQ_INTERRUPT_MSG  239
 #define SOC15_INTSRC_VMC_FAULT 0
+#define SOC15_INTSRC_VMC_UTCL2_POISON  1
 #define SOC15_INTSRC_SDMA_TRAP 224
 #define SOC15_INTSRC_SDMA_ECC  220
 #define SOC21_INTSRC_SDMA_TRAP 49
-- 
2.17.1

[PATCH] drm/amdgpu: Add debug option to enable mode2 for poison recovery

2024-08-11 Thread Hawking Zhang

Add debug option to enable mode2 for poison recovery
for testing purposes only.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  6 ++
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 16 ++--
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e6b641cb362a..c34819f947ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1201,6 +1201,7 @@ struct amdgpu_device {
booldebug_disable_soft_recovery;
booldebug_use_vram_fw_buf;
booldebug_enable_ras_aca;
+   booldebug_mode2_for_poison_recovery;
 };
 
 static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index afe3b8bd35a1..be6b920933d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -133,6 +133,7 @@ enum AMDGPU_DEBUG_MASK {
AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
+   AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY = BIT(5),
 };
 
 unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -2229,6 +2230,11 @@ static void amdgpu_init_debug_options(struct 
amdgpu_device *adev)
pr_info("debug: enable RAS ACA\n");
adev->debug_enable_ras_aca = true;
}
+
+   if (amdgpu_debug_mask & AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY) {
+   pr_info("debug: enable mode2 reset for poison consumption 
recovery");
+   adev->debug_mode2_for_poison_recovery = true;
+   }
 }
 
 static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long 
flags)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 816800555f7f..a355b2bc2214 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -164,10 +164,12 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
block = AMDGPU_RAS_BLOCK__GFX;
-   if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 
4, 3))
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
-   else
+   if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 
4, 3)) {
+   reset = ((dev->adev->debug_mode2_for_poison_recovery) ?
+AMDGPU_RAS_GPU_RESET_MODE2_RESET : 
AMDGPU_RAS_GPU_RESET_MODE1_RESET);
+   } else {
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   }
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
@@ -180,10 +182,12 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
-   if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 
4, 3))
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
-   else
+   if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 
4, 3)) {
+   reset = ((dev->adev->debug_mode2_for_poison_recovery) ?
+AMDGPU_RAS_GPU_RESET_MODE2_RESET : 
AMDGPU_RAS_GPU_RESET_MODE1_RESET);
+   } else {
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   }
break;
default:
dev_warn(dev->adev->dev,
-- 
2.17.1

[PATCH] drm/amdgpu: Do not init ta microcode from guest side

2024-08-11 Thread Hawking Zhang

TA should not be loaded from guest side.

Signed-off-by: Hawking Zhang 
Reviewed-by: Shiwu Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 85ec9e35690a..749d8143b1e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -132,9 +132,11 @@ static int psp_v13_0_init_microcode(struct psp_context 
*psp)
(adev->emu_flags & AMDGPU_EMU_dGPU_SIDEWINDER))
break;
/* It's not necessary to load ras ta on Guest side */
-   err = psp_init_ta_microcode(psp, ucode_prefix);
-   if (err)
-   return err;
+   if (!amdgpu_sriov_vf(adev)) {
+   err = psp_init_ta_microcode(psp, ucode_prefix);
+   if (err)
+   return err;
+   }
break;
default:
BUG();
-- 
2.17.1

[PATCH] drm/amdgpu: Add more types for boot time error reporting

2024-07-31 Thread Hawking Zhang

Data abort exception and unknown errors are supported.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 12ab48f26bd5..7aff6150898b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4769,6 +4769,16 @@ static void amdgpu_ras_boot_time_error_reporting(struct 
amdgpu_device *adev,
dev_info(adev->dev,
 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm 
bist test failed\n",
 socket_id, aid_id, hbm_id, fw_status);
+
+   if (AMDGPU_RAS_GPU_ERR_DATA_ABORT(boot_error))
+   dev_info(adev->dev,
+"socket: %d, aid: %d, fw_status: 0x%x, data abort 
exception\n",
+socket_id, aid_id, fw_status);
+
+   if (AMDGPU_RAS_GPU_ERR_UNKNOWN(boot_error))
+   dev_info(adev->dev,
+"socket: %d, aid: %d, fw_status: 0x%x, unknown boot 
time errors\n",
+socket_id, aid_id, fw_status);
 }
 
 static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 7ddd13d5c06b..0d49b74bfe5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -46,6 +46,8 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)
AMDGPU_GET_REG_FIELD(x, 10, 8)
 #define AMDGPU_RAS_GPU_ERR_AID_ID(x)   AMDGPU_GET_REG_FIELD(x, 
12, 11)
 #define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
14, 13)
+#define AMDGPU_RAS_GPU_ERR_DATA_ABORT(x)   AMDGPU_GET_REG_FIELD(x, 
29, 29)
+#define AMDGPU_RAS_GPU_ERR_UNKNOWN(x)  AMDGPU_GET_REG_FIELD(x, 
30, 30)
 
 #define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT   100
 #define AMDGPU_RAS_BOOT_STEADY_STATUS  0xBA
-- 
2.17.1

[PATCH] drm/amdgpu: Fix hbm stack id in boot error report

2024-06-28 Thread Hawking Zhang

To align with firmware, hbm id field 0x1 refers to
hbm stack 0, 0x2 refers to hbm stack 1.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4edd8e333d36..6d1f974e2987 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4565,7 +4565,7 @@ static void amdgpu_ras_boot_time_error_reporting(struct 
amdgpu_device *adev,
 
socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
-   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+   hbm_id = ((1 == AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error)) ? 0 : 1);
 
if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
dev_info(adev->dev,
-- 
2.17.1

[PATCH] drm/amdgpu: Correct register used to clear fault status

2024-06-28 Thread Hawking Zhang

Driver should write to fault_cntl registers to do
one-shot address/status clear.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index 8d7267a013d2..621761a17ac7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -569,7 +569,7 @@ static bool mmhub_v1_8_query_utcl2_poison_status(struct 
amdgpu_device *adev,
if (!amdgpu_sriov_vf(adev)) {
/* clear page fault status and address */
WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst,
-regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
+regVM_L2_PROTECTION_FAULT_CNTL), 1, ~1);
}
 
return fed;
-- 
2.17.1

[PATCH] drm/amdgpu: Fix smatch static checker warning

2024-06-21 Thread Hawking Zhang

adev->gfx.imu.funcs could be NULL

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index b4575765d7a8..5c17409439f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -4498,11 +4498,11 @@ static int gfx_v11_0_hw_init(void *handle)
/* RLC autoload sequence 1: Program rlc ram */
if (adev->gfx.imu.funcs->program_rlc_ram)
adev->gfx.imu.funcs->program_rlc_ram(adev);
+   /* rlc autoload firmware */
+   r = gfx_v11_0_rlc_backdoor_autoload_enable(adev);
+   if (r)
+   return r;
}
-   /* rlc autoload firmware */
-   r = gfx_v11_0_rlc_backdoor_autoload_enable(adev);
-   if (r)
-   return r;
} else {
if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
if (adev->gfx.imu.funcs && (amdgpu_dpm > 0)) {
-- 
2.17.1

[PATCH] drm/amdgpu: Fix smatch static checker warning

2024-06-21 Thread Hawking Zhang

adev->gfx.imu.funcs could be NULL.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 8 
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index b4575765d7a8..5c17409439f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -4498,11 +4498,11 @@ static int gfx_v11_0_hw_init(void *handle)
/* RLC autoload sequence 1: Program rlc ram */
if (adev->gfx.imu.funcs->program_rlc_ram)
adev->gfx.imu.funcs->program_rlc_ram(adev);
+   /* rlc autoload firmware */
+   r = gfx_v11_0_rlc_backdoor_autoload_enable(adev);
+   if (r)
+   return r;
}
-   /* rlc autoload firmware */
-   r = gfx_v11_0_rlc_backdoor_autoload_enable(adev);
-   if (r)
-   return r;
} else {
if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
if (adev->gfx.imu.funcs && (amdgpu_dpm > 0)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 460bf33a22b1..16fc5c5b15f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -3258,11 +3258,11 @@ static int gfx_v12_0_hw_init(void *handle)
/* RLC autoload sequence 1: Program rlc ram */
if (adev->gfx.imu.funcs->program_rlc_ram)
adev->gfx.imu.funcs->program_rlc_ram(adev);
+   /* rlc autoload firmware */
+   r = gfx_v12_0_rlc_backdoor_autoload_enable(adev);
+   if (r)
+   return r;
}
-   /* rlc autoload firmware */
-   r = gfx_v12_0_rlc_backdoor_autoload_enable(adev);
-   if (r)
-   return r;
} else {
if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
if (adev->gfx.imu.funcs && (amdgpu_dpm > 0)) {
-- 
2.17.1

[PATCH] drm/amdgpu: Fix register access violation

2024-06-20 Thread Hawking Zhang

fault_status is a read-only register. fault_cntl
is not accessible from the guest environment.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 8 +---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c| 3 ++-
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c  | 8 +---
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
index e14acab5cceb..72109abe7c86 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
@@ -629,9 +629,11 @@ static bool gfxhub_v1_2_query_utcl2_poison_status(struct 
amdgpu_device *adev,
 
status = RREG32_SOC15(GC, GET_INST(GC, xcc_id), 
regVM_L2_PROTECTION_FAULT_STATUS);
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
-   /* reset page fault status */
-   WREG32_P(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id),
-   regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
+   if (!amdgpu_sriov_vf(adev)) {
+   /* clear page fault status and address */
+   WREG32_P(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id),
+regVM_L2_PROTECTION_FAULT_CNTL), 1, ~1);
+   }
 
return fed;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 88b4644f8e96..b73136d390cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -672,7 +672,8 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
(amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)))
return 0;
 
-   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+   if (!amdgpu_sriov_vf(adev))
+   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index 7a1ff298417a..8d7267a013d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -566,9 +566,11 @@ static bool mmhub_v1_8_query_utcl2_poison_status(struct 
amdgpu_device *adev,
 
status = RREG32_SOC15(MMHUB, hub_inst, 
regVM_L2_PROTECTION_FAULT_STATUS);
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
-   /* reset page fault status */
-   WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst,
-   regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
+   if (!amdgpu_sriov_vf(adev)) {
+   /* clear page fault status and address */
+   WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst,
+regVM_L2_PROTECTION_FAULT_CNTL), 1, ~1);
+   }
 
return fed;
 }
-- 
2.17.1

[PATCH] drm/amdgpu: Update programming for boot error reporting

2024-05-30 Thread Hawking Zhang

AMDGPU_RAS_GPU_ERR_BOOT_STATUS field is no longer valid.
The polling sequence is also simplified according to
the latest firmware change.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 99 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 +-
 2 files changed, 46 insertions(+), 57 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index eedf2b613ac2..2c338d39cd45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4416,64 +4416,74 @@ int amdgpu_ras_error_statistic_de_count(struct 
ras_err_data *err_data,
 #define mmMP0_SMN_C2PMSG_920x1609C
 #define mmMP0_SMN_C2PMSG_126   0x160BE
 static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
-u32 instance, u32 boot_error)
+u32 instance)
 {
u32 socket_id, aid_id, hbm_id;
-   u32 reg_data;
+   u32 fw_status;
+   u32 boot_error;
u64 reg_addr;
 
-   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
-   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
-   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
-
/* The pattern for smn addressing in other SOC could be different from
 * the one for aqua_vanjaram. We should revisit the code if the pattern
 * is changed. In such case, replace the aqua_vanjaram implementation
 * with more common helper */
reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
   aqua_vanjaram_encode_ext_smn_addressing(instance);
+   fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+   boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
 
-   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
-   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-   socket_id, aid_id, reg_data);
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
 
if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-socket_id, aid_id, hbm_id);
+   dev_info(adev->dev,
+"socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory 
training failed\n",
+socket_id, aid_id, hbm_id, fw_status);
 
if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-socket_id, aid_id);
+   dev_info(adev->dev,
+"socket: %d, aid: %d, fw_status: 0x%x, firmware load 
failed at boot time\n",
+socket_id, aid_id, fw_status);
 
if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
-   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-socket_id, aid_id);
+   dev_info(adev->dev,
+"socket: %d, aid: %d, fw_status: 0x%x, wafl link 
training failed\n",
+socket_id, aid_id, fw_status);
 
if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
-   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-socket_id, aid_id);
+   dev_info(adev->dev,
+"socket: %d, aid: %d, fw_status: 0x%x, xgmi link 
training failed\n",
+socket_id, aid_id, fw_status);
 
if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
-   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
-socket_id, aid_id);
+   dev_info(adev->dev,
+"socket: %d, aid: %d, fw_status: 0x%x, usr cp link 
training failed\n",
+socket_id, aid_id, fw_status);
 
if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
-   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
-socket_id, aid_id);
+   dev_info(adev->dev,
+"socket: %d, aid: %d, fw_status: 0x%x, usr dp link 
training failed\n",
+socket_id, aid_id, fw_status);
 
if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm

[PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2

2024-05-27 Thread Hawking Zhang

Add estimate of how much vram we need to reserve for RAS
when calculating the total available vram.

v2: apply the change to MP0 v13_0_2 and v13_0_14

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  9 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 20 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  4 
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index e98927529f61..ad813772f8a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
 {
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
size_t system_mem_needed, ttm_mem_needed, vram_needed;
int ret = 0;
uint64_t vram_size = 0;
@@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
 kfd_mem_limit.max_ttm_mem_limit) ||
(adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
-vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size) +
+vram_size - reserved_for_pt - reserved_for_ras - 
atomic64_read(&adev->vram_pin_size) +
 atomic64_read(&adev->kfd.vram_pinned))) {
ret = -ENOMEM;
goto release;
@@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct 
amdgpu_device *adev,
 {
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
ssize_t available;
uint64_t vram_available, system_mem_available, ttm_mem_available;
 
@@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct 
amdgpu_device *adev,
- adev->kfd.vram_used_aligned[xcp_id]
- atomic64_read(&adev->vram_pin_size)
+ atomic64_read(&adev->kfd.vram_pinned)
-   - reserved_for_pt;
+   - reserved_for_pt
+   - reserved_for_ras;
 
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
system_mem_available = no_system_mem_limit ?
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ecce022c657b..f28bf5765380 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3317,6 +3317,24 @@ static void amdgpu_ras_event_mgr_init(struct 
amdgpu_device *adev)
amdgpu_put_xgmi_hive(hive);
 }
 
+static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   if (!con || (adev->flags & AMD_IS_APU))
+   return;
+
+   switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
+   case IP_VERSION(13, 0, 2):
+   case IP_VERSION(13, 0, 6):
+   case IP_VERSION(13, 0, 14):
+   con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+   break;
+   default:
+   break;
+   }
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -3422,6 +3440,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
/* Get RAS schema for particular SOC */
con->schema = amdgpu_get_ras_schema(adev);
 
+   amdgpu_ras_init_reserved_vram_size(adev);
+
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6a8c7b1609df..bd61f77a7134 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -64,6 +64,9 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29
 #define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe000
 
+/* Reserve 8 physical dram row for possible retirement.
+ * In worst cases, it will lose 8 * 2MB memory in vram domain */
+#define AMDGPU_RAS_RESERVED_VRAM_SIZE  (16ULL << 20)
 /* The high three bits indicates socketid */
 #define AMDGPU_RAS_GET_FEATURES(val)  ((val) & 
~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
 
@@ -541,6 +544,7 @@ struct amdgpu_ras {
struct ras_event_manager __event_mgr;
struct ras_event_manager *event_mgr;
 
+   uint64_t reserved_pages_in_bytes;
 };
 
 struct ras_fs_data {
-- 
2.17.1

[PATCH] drm/amdgpu: Estimate RAS reservation when report capacity

2024-05-27 Thread Hawking Zhang

Add estimate of how much vram we need to reserve for RAS
when calculating the total available vram.

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c   |  9 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  2 ++
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index e98927529f61..ad813772f8a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
 {
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
size_t system_mem_needed, ttm_mem_needed, vram_needed;
int ret = 0;
uint64_t vram_size = 0;
@@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
 kfd_mem_limit.max_ttm_mem_limit) ||
(adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
-vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size) +
+vram_size - reserved_for_pt - reserved_for_ras - 
atomic64_read(&adev->vram_pin_size) +
 atomic64_read(&adev->kfd.vram_pinned))) {
ret = -ENOMEM;
goto release;
@@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct 
amdgpu_device *adev,
 {
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
ssize_t available;
uint64_t vram_available, system_mem_available, ttm_mem_available;
 
@@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct 
amdgpu_device *adev,
- adev->kfd.vram_used_aligned[xcp_id]
- atomic64_read(&adev->vram_pin_size)
+ atomic64_read(&adev->kfd.vram_pinned)
-   - reserved_for_pt;
+   - reserved_for_pt
+   - reserved_for_ras;
 
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
system_mem_available = no_system_mem_limit ?
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ecce022c657b..a6334e0e62dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3317,6 +3317,22 @@ static void amdgpu_ras_event_mgr_init(struct 
amdgpu_device *adev)
amdgpu_put_xgmi_hive(hive);
 }
 
+static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   if (!con || (adev->flags & AMD_IS_APU))
+   return;
+
+   switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
+   case IP_VERSION(13, 0, 6):
+   con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+   break;
+   default:
+   break;
+   }
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -3422,6 +3438,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
/* Get RAS schema for particular SOC */
con->schema = amdgpu_get_ras_schema(adev);
 
+   amdgpu_ras_init_reserved_vram_size(adev);
+
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6a8c7b1609df..bee622c4268a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -64,6 +64,7 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29
 #define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe000
 
+#define AMDGPU_RAS_RESERVED_VRAM_SIZE  (16ULL << 20)
 /* The high three bits indicates socketid */
 #define AMDGPU_RAS_GET_FEATURES(val)  ((val) & 
~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
 
@@ -541,6 +542,7 @@ struct amdgpu_ras {
struct ras_event_manager __event_mgr;
struct ras_event_manager *event_mgr;
 
+   uint64_t reserved_pages_in_bytes;
 };
 
 struct ras_fs_data {
-- 
2.17.1

[PATCH] drm/amdgpu: correct hbm field in boot status

2024-05-21 Thread Hawking Zhang

The hbm field takes bit 13 and bit 14 in boot status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index c8980d5f6540..7021c4a66fb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -46,7 +46,7 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x)AMDGPU_GET_REG_FIELD(x, 
7, 7)
 #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)
AMDGPU_GET_REG_FIELD(x, 10, 8)
 #define AMDGPU_RAS_GPU_ERR_AID_ID(x)   AMDGPU_GET_REG_FIELD(x, 
12, 11)
-#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
13, 13)
+#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
14, 13)
 #define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)  AMDGPU_GET_REG_FIELD(x, 
31, 31)
 
 #define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT   1000
-- 
2.17.1

[PATCH] drm/amdgpu: Use driver mode reset for data poison

2024-04-15 Thread Hawking Zhang

mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed. mmhub requires
mode-1 reset.

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 27 ++-
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..c3beb872adf8d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
uint16_t pasid, uint16_t client_id)
 {
enum amdgpu_ras_block block = 0;
-   int old_poison, ret = -EINVAL;
+   int old_poison;
uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SE2SH:
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__MMHUB;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
@@ -184,22 +180,15 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
default:
-   break;
+   dev_warn(dev->adev->dev,
+"client %d does not support poison consumption\n", 
client_id);
+   return;
}
 
kfd_signal_poison_consumed_event(dev, pasid);
 
-   /* resetting queue passes, do page retirement without gpu reset
-* resetting queue fails, fallback to gpu reset solution
-*/
-   if (!ret)
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, unmap queue flow succeeded: 
client id %d\n",
-   client_id);
-   else
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, fall back to gpu reset flow: 
client id %d\n",
-   client_id);
+   dev_warn(dev->adev->dev,
+"poison is consumed by client %d, kick off gpu reset flow\n", 
client_id);
 
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }
-- 
2.17.1

[PATCH] drm/amdgpu: Use driver mode reset for data poison handling

2024-04-15 Thread Hawking Zhang

mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed. mmhub requires
mode-1 reset.

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 22 +++
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..94eb2493103ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
uint16_t pasid, uint16_t client_id)
 {
enum amdgpu_ras_block block = 0;
-   int old_poison, ret = -EINVAL;
+   int old_poison;
uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SE2SH:
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__MMHUB;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
@@ -189,18 +185,6 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
 
kfd_signal_poison_consumed_event(dev, pasid);
 
-   /* resetting queue passes, do page retirement without gpu reset
-* resetting queue fails, fallback to gpu reset solution
-*/
-   if (!ret)
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, unmap queue flow succeeded: 
client id %d\n",
-   client_id);
-   else
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, fall back to gpu reset flow: 
client id %d\n",
-   client_id);
-
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }
 
-- 
2.17.1

[PATCH] drm/amdgpu: Use driver mode reset for data poison handling

2024-04-15 Thread Hawking Zhang

mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed. mmhub requires
mode-1 reset.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..b6caf6eda8a0c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SE2SH:
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__MMHUB;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
-- 
2.17.1

[PATCH] drm/amdgpu: Process bif doorbell event

2024-04-01 Thread Hawking Zhang

When BACO exit is triggered by doorbell transaction,
firmware will config bif to issue msi interrupt to
indicate doorbell transaction. If bif ring is not
enabled in such case, driver needs to ack the interrupt
by clearing the interrupt status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 +
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c   | 56 
 drivers/gpu/drm/amd/amdgpu/soc21.c   | 14 +-
 3 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 7b8c03be1d9e..db341921cfc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -102,6 +102,7 @@ struct amdgpu_nbio_funcs {
u32 (*get_memory_partition_mode)(struct amdgpu_device *adev,
 u32 *supp_modes);
u64 (*get_pcie_replay_count)(struct amdgpu_device *adev);
+   u32 (*init_bif_doorbell_event)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_nbio {
@@ -111,6 +112,7 @@ struct amdgpu_nbio {
struct ras_common_if *ras_if;
const struct amdgpu_nbio_funcs *funcs;
struct amdgpu_nbio_ras  *ras;
+   struct amdgpu_irq_src bif_doorbell_irq;
 };
 
 int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
index 7f88a298ac5f..e5a331b6eee9 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
@@ -477,6 +477,61 @@ static void nbio_v4_3_program_aspm(struct amdgpu_device 
*adev)
 #endif
 }
 
+static int nbio_v4_3_set_bif_doorbell_irq_state(struct amdgpu_device *adev,
+   struct amdgpu_irq_src *src,
+   unsigned type,
+   enum amdgpu_interrupt_state 
state)
+{
+/*let firmware to config bif doorbell irq state */
+return 0;
+}
+
+static int nbio_v4_3_process_bif_doorbell_irq(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ struct amdgpu_iv_entry *entry)
+{
+   /* pmfw will config bif doorbell irq to host if baco exit
+* is triggered by doorbell transaction. In such case, driver
+* needs to clear the interrupt status */
+
+   uint32_t reg_data;
+
+   reg_data = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_RB_CNTL);
+
+   /* if bif ring is enabled, do nothing */
+   if (REG_GET_FIELD(reg_data, BIF_BX0_BIF_RB_CNTL, RB_ENABLE))
+   return 0;
+
+   /* write 1 to clear doorbell interrupt */
+   reg_data = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   if (REG_GET_FIELD(reg_data, BIF_BX0_BIF_DOORBELL_INT_CNTL, 
DOORBELL_INTERRUPT_STATUS)) {
+   reg_data = REG_SET_FIELD(reg_data,
+BIF_BX0_BIF_DOORBELL_INT_CNTL,
+DOORBELL_INTERRUPT_CLEAR, 1);
+   WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, 
reg_data);
+   }
+
+   return 0;
+}
+
+static const struct amdgpu_irq_src_funcs nbio_v4_3_bif_doorbell_irq_funcs = {
+   .set = nbio_v4_3_set_bif_doorbell_irq_state,
+   .process = nbio_v4_3_process_bif_doorbell_irq,
+};
+
+static u32 nbio_v4_3_init_bif_doorbell_event(struct amdgpu_device *adev)
+{
+   u32 r;
+
+   adev->nbio.bif_doorbell_irq.funcs = &nbio_v4_3_bif_doorbell_irq_funcs;
+   adev->nbio.bif_doorbell_irq.num_types = 1;
+
+   r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_BIF,
+ NBIF_7_4__SRCID__DOORBELL_INTERRUPT,
+ &adev->nbio.bif_doorbell_irq);
+   return r;
+}
+
 const struct amdgpu_nbio_funcs nbio_v4_3_funcs = {
.get_hdp_flush_req_offset = nbio_v4_3_get_hdp_flush_req_offset,
.get_hdp_flush_done_offset = nbio_v4_3_get_hdp_flush_done_offset,
@@ -499,6 +554,7 @@ const struct amdgpu_nbio_funcs nbio_v4_3_funcs = {
.remap_hdp_registers = nbio_v4_3_remap_hdp_registers,
.get_rom_offset = nbio_v4_3_get_rom_offset,
.program_aspm = nbio_v4_3_program_aspm,
+   .init_bif_doorbell_event = nbio_v4_3_init_bif_doorbell_event,
 };
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c 
b/drivers/gpu/drm/amd/amdgpu/soc21.c
index abe319b0f063..ee6d810589c0 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -792,6 +792,9 @@ static int soc21_common_late_init(void *handle)
 * nbio v4_3 only support fatal error hanlding
 * just enable the interrupt directly */
amdgpu_irq_get(adev, 
&adev->nbio.ras_err_event_athub_irq, 0);
+   if (adev->nbio.bif_doorbell_irq.funcs &&
+

[PATCH] drm/amdgpu: Bypass asd if display hw is not available

2024-03-30 Thread Hawking Zhang

ASD is not needed by headless GPU.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 94b310fdb719d..83bf86352267d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1053,6 +1053,11 @@ static int psp_asd_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev) || !psp->asd_context.bin_desc.size_bytes)
return 0;
 
+   /* bypass asd if display hardware is not available */
+   if (!amdgpu_device_has_display_hardware(psp->adev) &&
+   amdgpu_ip_version(psp->adev, MP0_HWIP, 0) >= IP_VERSION(13, 0, 10))
+   return 0;
+
psp->asd_context.mem_context.shared_mc_addr  = 0;
psp->asd_context.mem_context.shared_mem_size = PSP_ASD_SHARED_MEM_SIZE;
psp->asd_context.ta_load_type= GFX_CMD_ID_LOAD_ASD;
-- 
2.17.1

[PATCH] drm/amdgpu: Bypass asd if display hw is not available

2024-03-29 Thread Hawking Zhang

ASD is not needed by headless GPU.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 94b310fdb719d..063203865bbe2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1053,6 +1053,11 @@ static int psp_asd_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev) || !psp->asd_context.bin_desc.size_bytes)
return 0;
 
+   /* bypass asd if display hardware is not available */
+   if (!amdgpu_device_has_display_hardware(psp->adev) &&
+   amdgpu_ip_version(adev, MP0_HWIP, 0) >= IP_VERSION(13, 0, 10))
+   return 0;
+
psp->asd_context.mem_context.shared_mc_addr  = 0;
psp->asd_context.mem_context.shared_mem_size = PSP_ASD_SHARED_MEM_SIZE;
psp->asd_context.ta_load_type= GFX_CMD_ID_LOAD_ASD;
-- 
2.17.1

[PATCH] drm/amdgpu: Bypass display ta if display hw is not available

2024-03-14 Thread Hawking Zhang

Do not load/invoke display TA if display hardware is not
available

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 867397fe2e9d..e7d7fd2cc31d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1830,6 +1830,10 @@ static int psp_hdcp_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass hdcp initialization if dmu is harvested */
+   if (!amdgpu_device_has_display_hardware(psp->adev))
+   return 0;
+
if (!psp->hdcp_context.context.bin_desc.size_bytes ||
!psp->hdcp_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "HDCP: optional hdcp ta ucode is not 
available\n");
@@ -1862,6 +1866,9 @@ int psp_hdcp_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   if (!psp->hdcp_context.context.initialized)
+   return 0;
+
return psp_ta_invoke(psp, ta_cmd_id, &psp->hdcp_context.context);
 }
 
@@ -1897,6 +1904,10 @@ static int psp_dtm_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass dtm initialization if dmu is harvested */
+   if (!amdgpu_device_has_display_hardware(psp->adev))
+   return 0;
+
if (!psp->dtm_context.context.bin_desc.size_bytes ||
!psp->dtm_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "DTM: optional dtm ta ucode is not 
available\n");
@@ -1929,6 +1940,9 @@ int psp_dtm_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   if (!psp->dtm_context.context.initialized)
+   return 0;
+
return psp_ta_invoke(psp, ta_cmd_id, &psp->dtm_context.context);
 }
 
@@ -2063,6 +2077,10 @@ static int psp_securedisplay_initialize(struct 
psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass securedisplay initialization if dmu is harvested */
+   if (!amdgpu_device_has_display_hardware(psp->adev))
+return 0;
+
if (!psp->securedisplay_context.context.bin_desc.size_bytes ||
!psp->securedisplay_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "SECUREDISPLAY: securedisplay ta ucode 
is not available\n");
-- 
2.17.1

[PATCH] drm/amdgpu: Bypass display ta if it is harvested

2024-03-14 Thread Hawking Zhang

Display TA doesn't need to be loaded/invoked if it
is harvested.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 867397fe2e9d..bb4988c45ca9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1830,6 +1830,10 @@ static int psp_hdcp_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass hdcp initialization if dmu is harvested */
+   if (psp->adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)
+   return 0;
+
if (!psp->hdcp_context.context.bin_desc.size_bytes ||
!psp->hdcp_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "HDCP: optional hdcp ta ucode is not 
available\n");
@@ -1862,6 +1866,9 @@ int psp_hdcp_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   if (!psp->hdcp_context.context.initialized)
+   return 0;
+
return psp_ta_invoke(psp, ta_cmd_id, &psp->hdcp_context.context);
 }
 
@@ -1897,6 +1904,10 @@ static int psp_dtm_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass dtm initialization if dmu is harvested */
+   if (psp->adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)
+   return 0;
+
if (!psp->dtm_context.context.bin_desc.size_bytes ||
!psp->dtm_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "DTM: optional dtm ta ucode is not 
available\n");
@@ -1929,6 +1940,9 @@ int psp_dtm_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   if (!psp->dtm_context.context.initialized)
+   return 0;
+
return psp_ta_invoke(psp, ta_cmd_id, &psp->dtm_context.context);
 }
 
@@ -2063,6 +2077,10 @@ static int psp_securedisplay_initialize(struct 
psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass securedisplay initialization if dmu is harvested */
+   if (psp->adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)
+return 0;
+
if (!psp->securedisplay_context.context.bin_desc.size_bytes ||
!psp->securedisplay_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "SECUREDISPLAY: securedisplay ta ucode 
is not available\n");
-- 
2.17.1

[PATCH] drm/amdgpu: Do not enable/disable bif ras irq from guest

2024-02-17 Thread Hawking Zhang

Only do this from host side.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 15033efec2ba..2c8702560090 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1278,7 +1278,8 @@ static int soc15_common_hw_fini(void *handle)
if (amdgpu_sriov_vf(adev))
xgpu_ai_mailbox_put_irq(adev);
 
-   if (adev->nbio.ras_if &&
+   if ((!amdgpu_sriov_vf(adev)) &&
+   adev->nbio.ras_if &&
amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
if (adev->nbio.ras &&
adev->nbio.ras->init_ras_controller_interrupt)
-- 
2.17.1

[PATCH] drm/amdgpu: Update boot time errors polling sequence

2024-01-29 Thread Hawking Zhang

Update boot time errors polling sequence to align with
the latest firmware change.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9e67355d4718..9b7a5c1c9af5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4122,6 +4122,18 @@ static int amdgpu_ras_wait_for_boot_complete(struct 
amdgpu_device *adev,
u32 reg_data;
int retry_loop;
 
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; 
retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == 
AMDGPU_RAS_BOOT_STEADY_STATUS) {
+   *boot_error = AMDGPU_RAS_BOOT_SUCEESS;
+   return 0;
+   }
+   msleep(1);
+   }
+
/* The pattern for smn addressing in other SOC could be different from
 * the one for aqua_vanjaram. We should revisit the code if the pattern
 * is changed. In such case, replace the aqua_vanjaram implementation
@@ -4129,7 +4141,7 @@ static int amdgpu_ras_wait_for_boot_complete(struct 
amdgpu_device *adev,
reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
   aqua_vanjaram_encode_ext_smn_addressing(instance);
 
-   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; 
retry_loop++) {
reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
*boot_error = reg_data;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 0b6ffae1e8bb..d10e5bb0e52f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -46,6 +46,11 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
13, 13)
 #define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)  AMDGPU_GET_REG_FIELD(x, 
31, 31)
 
+#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT   1000
+#define AMDGPU_RAS_BOOT_STEADY_STATUS  0xBA
+#define AMDGPU_RAS_BOOT_STATUS_MASK0xFF
+#define AMDGPU_RAS_BOOT_SUCEESS0x8000
+
 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS  (0x1 << 0)
 /* position of instance value in sub_block_index of
  * ta_ras_trigger_error_input, the sub block uses lower 12 bits
-- 
2.17.1

[PATCH] drm/amdgpu: Fix null pointer dereference

2024-01-22 Thread Hawking Zhang

amdgpu_reg_state_sysfs_fini could be invoked at a
time when asic_funcs is not even initialized, e.g.,
when amdgpu_discovery_init fails for some reason.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/include/amdgpu_reg_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/include/amdgpu_reg_state.h 
b/drivers/gpu/drm/amd/include/amdgpu_reg_state.h
index be519c8edf49..335980e2afbf 100644
--- a/drivers/gpu/drm/amd/include/amdgpu_reg_state.h
+++ b/drivers/gpu/drm/amd/include/amdgpu_reg_state.h
@@ -138,7 +138,7 @@ static inline size_t amdgpu_reginst_size(uint16_t num_inst, 
size_t inst_size,
 }
 
 #define amdgpu_asic_get_reg_state_supported(adev) \
-   ((adev)->asic_funcs->get_reg_state ? 1 : 0)
+   (((adev)->asic_funcs && (adev)->asic_funcs->get_reg_state) ? 1 : 0)
 
 #define amdgpu_asic_get_reg_state(adev, state, buf, size)  \
((adev)->asic_funcs->get_reg_state ?   \
-- 
2.17.1

[PATCH v2 v2 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-07 Thread Hawking Zhang

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_330x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1

[PATCH v2 v2 3/5] drm/amdgpu: Add ras helper to query boot errors v2

2024-01-07 Thread Hawking Zhang

Add ras helper function to query boot time gpu
errors.
v2: use aqua_vanjaram smn addressing pattern

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9da14436a373..df3aa69be425 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1330,6 +1330,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
 
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fc42fb6ee191..a901b00d4949 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3763,3 +3763,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
 
return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_920x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32 boot_error)
+{
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+u32 instance, u32 *boot_error)
+{
+   u32 reg_addr;
+   u32 reg_data;
+   int retry_loop;
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+

[PATCH v2 v2 2/5] drm/amdgpu: Init pcie_index/data address as fallback (v2)

2024-01-07 Thread Hawking Zhang

Allow using this helper for indirect access when
nbio funcs are not available, for instance, in the
ip discovery phase.

v2: define macro for pcie_index/data/index_hi fallback.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 +-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index abad5773714c..05d7cdcf28b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -96,6 +96,9 @@ MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
 #define AMDGPU_RESUME_MS   2000
 #define AMDGPU_MAX_RETRY_LIMIT 2
 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) 
== -EINVAL)
+#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
+#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
+#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
 
 static const struct drm_driver amdgpu_kms_driver;
 
@@ -781,12 +784,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;
 
-   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
-   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
-   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
-   else
+   if (unlikely(!adev->nbio.funcs)) {
+   pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
+   pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
+   } else {
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   }
+
+   if (reg_addr >> 32) {
+   if (unlikely(!adev->nbio.funcs))
+   pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
+   else
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+   } else {
pcie_index_hi = 0;
+   }
 
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.17.1

[PATCH 5/5] drm/amdgpu: Query boot status if boot failed

2024-01-07 Thread Hawking Zhang

Check and report firmware boot status if it doesn't
reach steady status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 6fad451a85be..676bec2cc157 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -187,11 +187,18 @@ static int psp_v13_0_wait_for_bootloader(struct 
psp_context *psp)
 static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
+   int ret;
 
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
-   psp_v13_0_wait_for_vmbx_ready(psp);
+   ret = psp_v13_0_wait_for_vmbx_ready(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
+
+   ret = psp_v13_0_wait_for_bootloader(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
 
-   return psp_v13_0_wait_for_bootloader(psp);
+   return ret;
}
 
return 0;
-- 
2.17.1

[PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-07 Thread Hawking Zhang

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_330x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1

[PATCH 1/5] drm/amdgpu: drop psp v13 query_boot_status implementation

2024-01-07 Thread Hawking Zhang

Will replace it with a new implementation that covers
boot failures in the ip discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 15 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  4 --
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 78 --
 4 files changed, 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a39c9fea55c4..abad5773714c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1218,8 +1218,6 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
-   /* TODO: check the return val and stop device initialization if 
boot fails */
-   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 2addbdf88394..90451cabb919 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,21 +2125,6 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
-{
-   struct psp_context *psp = &adev->psp;
-   int ret = 0;
-
-   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
-   return 0;
-
-   if (psp->funcs &&
-   psp->funcs->query_boot_status)
-   ret = psp->funcs->query_boot_status(psp);
-
-   return ret;
-}
-
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index c4d9cbde55b9..09d1f8f72a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,7 +134,6 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
-   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -538,7 +537,4 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
-
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
-
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index df1844d0800f..6fad451a85be 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -763,83 +763,6 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
-
-static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
-  uint32_t inst,
-  uint32_t boot_error)
-{
-   uint32_t socket_id;
-   uint32_t aid_id;
-   uint32_t hbm_id;
-   uint32_t reg_data;
-
-   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
-   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
-   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
-
-   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-socket_id, aid_id, reg_data);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-socket_id, aid_id, hbm_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_

[PATCH v2 2/3] drm/amdgpu: Query ras capablity from psp v2

2024-01-02 Thread Hawking Zhang

Instead of traditional atomfirmware interfaces for RAS
capability, host driver can query ras capability from
psp starting from psp v13_0_6.

v2: drop redundant local variable from get_ras_capability.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 26 +
 3 files changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 0dc8686e54f7..af3bc36aef18 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2128,6 +2128,16 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp)
+{
+   if (psp->funcs &&
+   psp->funcs->get_ras_capability) {
+   return psp->funcs->get_ras_capability(psp);
+   } else {
+   return false;
+   }
+}
+
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 09d1f8f72a9c..652b0a01854a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,7 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+   bool (*get_ras_capability)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 676bec2cc157..722b6066ce07 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -27,6 +27,7 @@
 #include "amdgpu_ucode.h"
 #include "soc15_common.h"
 #include "psp_v13_0.h"
+#include "amdgpu_ras.h"
 
 #include "mp/mp_13_0_2_offset.h"
 #include "mp/mp_13_0_2_sh_mask.h"
@@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
+static bool psp_v13_0_get_ras_capability(struct psp_context *psp)
+{
+   struct amdgpu_device *adev = psp->adev;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   u32 reg_data;
+
+   /* query ras cap should be done from host side */
+   if (amdgpu_sriov_vf(adev))
+   return false;
+
+   if (!con)
+   return false;
+
+   if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) &&
+   (!(adev->flags & AMD_IS_APU))) {
+   reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127);
+   adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0));
+   con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 
24) ? true : false;
+   return true;
+   } else {
+   return false;
+   }
+}
+
 static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
.wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
@@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = {
.update_spirom = psp_v13_0_update_spirom,
.vbflash_stat = psp_v13_0_vbflash_status,
.fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk,
+   .get_ras_capability = psp_v13_0_get_ras_capability,
 };
 
 void psp_v13_0_set_psp_funcs(struct psp_context *psp)
-- 
2.17.1

[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-02 Thread Hawking Zhang

Move ras capability check to amdgpu_ras_check_supported.
Driver will query ras capability through psp interface, or
vbios interface, or specific ip callbacks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a901b00d4949..2ee82baaf7d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -39,6 +39,7 @@
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_psp.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include 
@@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
+/* Query ras capablity via atomfirmware interface */
+static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device 
*adev)
+{
+   /* mem_ecc cap */
+   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+   dev_info(adev->dev, "MEM ECC is active.\n");
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
+1 << AMDGPU_RAS_BLOCK__DF);
+   } else {
+   dev_info(adev->dev, "MEM ECC is not presented.\n");
+   }
+
+   /* sram_ecc cap */
+   if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+   dev_info(adev->dev, "SRAM ECC is active.\n");
+   if (!amdgpu_sriov_vf(adev))
+   adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+   else
+   adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__PCIE_BIF |
+1 << AMDGPU_RAS_BLOCK__SDMA |
+1 << AMDGPU_RAS_BLOCK__GFX);
+
+   /*
+* VCN/JPEG RAS can be supported on both bare metal and
+* SRIOV environment
+*/
+   if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+1 << AMDGPU_RAS_BLOCK__JPEG);
+   else
+   adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+
+   /*
+* XGMI RAS is not supported if xgmi num physical nodes
+* is zero
+*/
+   if (!adev->gmc.xgmi.num_physical_nodes)
+   adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__XGMI_WAFL);
+   } else {
+   dev_info(adev->dev, "SRAM ECC is not presented.\n");
+   }
+}
+
+/* Query poison mode from umc/df IP callbacks */
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   bool df_poison, umc_poison;
+
+   /* poison setting is useless on SRIOV guest */
+   if (amdgpu_sriov_vf(adev) || !con)
+   return;
+
+   /* Init poison supported flag, the default value is false */
+   if (adev->gmc.xgmi.connected_to_cpu ||
+   adev->gmc.is_app_apu) {
+   /* enabled by default when GPU is connected to CPU */
+   con->poison_supported = true;
+   } else if (adev->df.funcs &&
+   adev->df.funcs->query_ras_poison_mode &&
+   adev->umc.ras &&
+   adev->umc.ras->query_ras_poison_mode) {
+   df_poison =
+   adev->df.funcs->query_ras_poison_mode(adev);
+   umc_poison =
+   adev->umc.ras->query_ras_poison_mode(adev);
+
+   /* Only poison is set in both DF and UMC, we can support it */
+   if (df_poison && umc_poison)
+   con->poison_supported = true;
+   else if (df_poison != umc_poison)
+   dev_warn(adev->dev,
+   "Poison setting is inconsistent in 
DF/UMC(%d:%d)!\n",
+   df_poison, umc_poison);
+   }
+}
+
 /*
  * check hardware's ras ability which will be saved in hw_supported.
  * if hardware does not support ras, we can skip some ras initializtion and
@@ -2

[PATCH 1/3] drm/amdgpu: Align ras block enum with firmware

2024-01-02 Thread Hawking Zhang

Driver and firmware share the same ras block enum.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 5785b705c692..8b053602c5ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -70,6 +70,8 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__MCA,
AMDGPU_RAS_BLOCK__VCN,
AMDGPU_RAS_BLOCK__JPEG,
+   AMDGPU_RAS_BLOCK__IH,
+   AMDGPU_RAS_BLOCK__MPIO,
 
AMDGPU_RAS_BLOCK__LAST
 };
-- 
2.17.1

[PATCH v2 3/5] drm/amdgpu: Add ras helper to query boot errors v2

2024-01-02 Thread Hawking Zhang

Add ras helper function to query boot time gpu
errors.
v2: use aqua_vanjaram smn addressing pattern

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 616b6c911767..cd91533d641c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1328,6 +1328,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
 
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fc42fb6ee191..a901b00d4949 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3763,3 +3763,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
 
return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_920x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32 boot_error)
+{
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+u32 instance, u32 *boot_error)
+{
+   u32 reg_addr;
+   u32 reg_data;
+   int retry_loop;
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+

[PATCH 5/5] drm/amdgpu: Query boot status if boot failed

2024-01-02 Thread Hawking Zhang

Check and report firmware boot status if it doesn't
reach steady status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 6fad451a85be..676bec2cc157 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -187,11 +187,18 @@ static int psp_v13_0_wait_for_bootloader(struct 
psp_context *psp)
 static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
+   int ret;
 
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
-   psp_v13_0_wait_for_vmbx_ready(psp);
+   ret = psp_v13_0_wait_for_vmbx_ready(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
+
+   ret = psp_v13_0_wait_for_bootloader(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
 
-   return psp_v13_0_wait_for_bootloader(psp);
+   return ret;
}
 
return 0;
-- 
2.17.1

[PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-02 Thread Hawking Zhang

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_330x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1

[PATCH 2/5] drm/amdgpu: Init pcie_index/data address as fallback

2024-01-02 Thread Hawking Zhang

Allow using this helper for indirect access when
nbio funcs are not available, for instance, in the ip
discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 001a35fa0f19..873419a5b9aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -781,12 +781,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;
 
-   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
-   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
-   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
-   else
+   if (unlikely(!adev->nbio.funcs)) {
+   pcie_index = (0x38 >> 2);
+   pcie_data = (0x3C >> 2);
+   } else {
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   }
+
+   if (reg_addr >> 32) {
+   if (unlikely(!adev->nbio.funcs))
+   pcie_index_hi = (0x44 >> 2);
+   else
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+   } else {
pcie_index_hi = 0;
+   }
 
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.17.1

[PATCH 1/5] drm/amdgpu: drop psp v13 query_boot_status implementation

2024-01-02 Thread Hawking Zhang

It will be replaced with a new implementation that covers
boot failures in the ip discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 15 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  4 --
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 78 --
 4 files changed, 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4b694696930e..001a35fa0f19 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1218,8 +1218,6 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
-   /* TODO: check the return val and stop device initialization if 
boot fails */
-   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 51bfe3757c89..0dc8686e54f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2128,21 +2128,6 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
-{
-   struct psp_context *psp = &adev->psp;
-   int ret = 0;
-
-   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
-   return 0;
-
-   if (psp->funcs &&
-   psp->funcs->query_boot_status)
-   ret = psp->funcs->query_boot_status(psp);
-
-   return ret;
-}
-
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index c4d9cbde55b9..09d1f8f72a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,7 +134,6 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
-   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -538,7 +537,4 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
-
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
-
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index df1844d0800f..6fad451a85be 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -763,83 +763,6 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
-
-static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
-  uint32_t inst,
-  uint32_t boot_error)
-{
-   uint32_t socket_id;
-   uint32_t aid_id;
-   uint32_t hbm_id;
-   uint32_t reg_data;
-
-   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
-   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
-   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
-
-   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-socket_id, aid_id, reg_data);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-socket_id, aid_id, hbm_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_

[PATCH 3/3] drm/amdgpu: Replace DRM_* with dev_* in amdgpu_psp.c

2024-01-01 Thread Hawking Zhang

So that kernel messages carry the device pcie bdf information,
which helps with debugging issues, especially in multi-GPU
systems.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 144 
 1 file changed, 75 insertions(+), 69 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 8a3847d3041f..0d871479ff34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -291,21 +291,22 @@ static int psp_memory_training_init(struct psp_context 
*psp)
struct psp_memory_training_context *ctx = &psp->mem_train_ctx;
 
if (ctx->init != PSP_MEM_TRAIN_RESERVE_SUCCESS) {
-   DRM_DEBUG("memory training is not supported!\n");
+   dev_dbg(psp->adev->dev, "memory training is not supported!\n");
return 0;
}
 
ctx->sys_cache = kzalloc(ctx->train_data_size, GFP_KERNEL);
if (ctx->sys_cache == NULL) {
-   DRM_ERROR("alloc mem_train_ctx.sys_cache failed!\n");
+   dev_err(psp->adev->dev, "alloc mem_train_ctx.sys_cache 
failed!\n");
ret = -ENOMEM;
goto Err_out;
}
 
-   
DRM_DEBUG("train_data_size:%llx,p2c_train_data_offset:%llx,c2p_train_data_offset:%llx.\n",
- ctx->train_data_size,
- ctx->p2c_train_data_offset,
- ctx->c2p_train_data_offset);
+   dev_dbg(psp->adev->dev,
+   
"train_data_size:%llx,p2c_train_data_offset:%llx,c2p_train_data_offset:%llx.\n",
+   ctx->train_data_size,
+   ctx->p2c_train_data_offset,
+   ctx->c2p_train_data_offset);
ctx->init = PSP_MEM_TRAIN_INIT_SUCCESS;
return 0;
 
@@ -407,7 +408,7 @@ static int psp_sw_init(void *handle)
 
psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
if (!psp->cmd) {
-   DRM_ERROR("Failed to allocate memory to command buffer!\n");
+   dev_err(adev->dev, "Failed to allocate memory to command 
buffer!\n");
ret = -ENOMEM;
}
 
@@ -454,13 +455,13 @@ static int psp_sw_init(void *handle)
if (mem_training_ctx->enable_mem_training) {
ret = psp_memory_training_init(psp);
if (ret) {
-   DRM_ERROR("Failed to initialize memory training!\n");
+   dev_err(adev->dev, "Failed to initialize memory 
training!\n");
return ret;
}
 
ret = psp_mem_training(psp, PSP_MEM_TRAIN_COLD_BOOT);
if (ret) {
-   DRM_ERROR("Failed to process memory training!\n");
+   dev_err(adev->dev, "Failed to process memory 
training!\n");
return ret;
}
}
@@ -675,9 +676,11 @@ psp_cmd_submit_buf(struct psp_context *psp,
 */
if (!skip_unsupport && (psp->cmd_buf_mem->resp.status || !timeout) && 
!ras_intr) {
if (ucode)
-   DRM_WARN("failed to load ucode %s(0x%X) ",
- amdgpu_ucode_name(ucode->ucode_id), 
ucode->ucode_id);
-   DRM_WARN("psp gfx command %s(0x%X) failed and response status 
is (0x%X)\n",
+   dev_warn(psp->adev->dev,
+"failed to load ucode %s(0x%X) ",
+amdgpu_ucode_name(ucode->ucode_id), 
ucode->ucode_id);
+   dev_warn(psp->adev->dev,
+"psp gfx command %s(0x%X) failed and response status 
is (0x%X)\n",
 psp_gfx_cmd_name(psp->cmd_buf_mem->cmd_id), 
psp->cmd_buf_mem->cmd_id,
 psp->cmd_buf_mem->resp.status);
/* If any firmware (including CAP) load fails under SRIOV, it 
should
@@ -807,7 +810,7 @@ static int psp_tmr_init(struct psp_context *psp)
psp->fw_pri_buf) {
ret = psp_load_toc(psp, &tmr_size);
if (ret) {
-   DRM_ERROR("Failed to load toc\n");
+   dev_err(psp->adev->dev, "Failed to load toc\n");
return ret;
}
}
@@ -855,7 +858,7 @@ static int psp_tmr_load(struct psp_context *psp)
 
psp_prep_tmr_cmd_buf(psp, cmd, psp->tmr_mc_addr, psp->tmr_bo);
if (psp->tmr_bo)
-   DRM_INFO("reserve 0x%lx from 0x%llx for PSP TMR\n",
+   dev_info(psp->adev->dev, "reserve 0x%lx from 0x%llx for PSP 
TMR\n",

[PATCH 2/3] Revert "drm/amdgpu: enable mca debug mode on APU by default"

2024-01-01 Thread Hawking Zhang

Not needed any more with firmware fixes

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 842405bb8995..d6e74b4dc6d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3159,8 +3159,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
 
-   /* enable MCA debug on APU device */
-   amdgpu_ras_set_mca_debug_mode(adev, !!(adev->flags & AMD_IS_APU));
+   amdgpu_ras_set_mca_debug_mode(adev, false);
 
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
-- 
2.17.1

[PATCH 1/3] drm/amdgpu: Packed socket_id to ras feature mask

2024-01-01 Thread Hawking Zhang

Initialize RAS feature mask bit[31:29] with socket_id.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 72b6e41329b0..842405bb8995 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2936,6 +2936,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
 
+   /* Packed socket_id to ras feature mask bits[31:29] */
+   if (adev->smuio.funcs &&
+   adev->smuio.funcs->get_socket_id)
+   con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << 
29);
+
/* Get RAS schema for particular SOC */
con->schema = amdgpu_get_ras_schema(adev);
 
-- 
2.17.1

[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-01 Thread Hawking Zhang

Move the ras capability check to amdgpu_ras_check_supported.
The driver will query ras capability through the psp interface,
the vbios interface, or specific ip callbacks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5f302b7693b3..72b6e41329b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -39,6 +39,7 @@
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_psp.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include 
@@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
+/* Query ras capablity via atomfirmware interface */
+static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device 
*adev)
+{
+   /* mem_ecc cap */
+   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+   dev_info(adev->dev, "MEM ECC is active.\n");
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
+1 << AMDGPU_RAS_BLOCK__DF);
+   } else {
+   dev_info(adev->dev, "MEM ECC is not presented.\n");
+   }
+
+   /* sram_ecc cap */
+   if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+   dev_info(adev->dev, "SRAM ECC is active.\n");
+   if (!amdgpu_sriov_vf(adev))
+   adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+   else
+   adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__PCIE_BIF |
+1 << AMDGPU_RAS_BLOCK__SDMA |
+1 << AMDGPU_RAS_BLOCK__GFX);
+
+   /*
+* VCN/JPEG RAS can be supported on both bare metal and
+* SRIOV environment
+*/
+   if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+1 << AMDGPU_RAS_BLOCK__JPEG);
+   else
+   adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+
+   /*
+* XGMI RAS is not supported if xgmi num physical nodes
+* is zero
+*/
+   if (!adev->gmc.xgmi.num_physical_nodes)
+   adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__XGMI_WAFL);
+   } else {
+   dev_info(adev->dev, "SRAM ECC is not presented.\n");
+   }
+}
+
+/* Query poison mode from umc/df IP callbacks */
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   bool df_poison, umc_poison;
+
+   /* poison setting is useless on SRIOV guest */
+   if (amdgpu_sriov_vf(adev) || !con)
+   return;
+
+   /* Init poison supported flag, the default value is false */
+   if (adev->gmc.xgmi.connected_to_cpu ||
+   adev->gmc.is_app_apu) {
+   /* enabled by default when GPU is connected to CPU */
+   con->poison_supported = true;
+   } else if (adev->df.funcs &&
+   adev->df.funcs->query_ras_poison_mode &&
+   adev->umc.ras &&
+   adev->umc.ras->query_ras_poison_mode) {
+   df_poison =
+   adev->df.funcs->query_ras_poison_mode(adev);
+   umc_poison =
+   adev->umc.ras->query_ras_poison_mode(adev);
+
+   /* Only poison is set in both DF and UMC, we can support it */
+   if (df_poison && umc_poison)
+   con->poison_supported = true;
+   else if (df_poison != umc_poison)
+   dev_warn(adev->dev,
+   "Poison setting is inconsistent in 
DF/UMC(%d:%d)!\n",
+   df_poison, umc_poison);
+   }
+}
+
 /*
  * check hardware's ras ability which will be saved in hw_supported.
  * if hardware does not support ras, we can skip some ras initializtion and
@@ -2

[PATCH 2/3] drm/amdgpu: Query ras capablity from psp

2024-01-01 Thread Hawking Zhang

Instead of traditional atomfirmware interfaces for RAS
capability, host driver can query ras capability from
psp starting from psp v13_0_6.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 26 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 94b536e3cada..8a3847d3041f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,6 +2125,19 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp)
+{
+   bool ret;
+
+   if (psp->funcs &&
+   psp->funcs->get_ras_capability) {
+   ret = psp->funcs->get_ras_capability(psp);
+   return ret;
+   } else {
+   return false;
+   }
+}
+
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 09d1f8f72a9c..652b0a01854a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,7 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+   bool (*get_ras_capability)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 676bec2cc157..722b6066ce07 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -27,6 +27,7 @@
 #include "amdgpu_ucode.h"
 #include "soc15_common.h"
 #include "psp_v13_0.h"
+#include "amdgpu_ras.h"
 
 #include "mp/mp_13_0_2_offset.h"
 #include "mp/mp_13_0_2_sh_mask.h"
@@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
+static bool psp_v13_0_get_ras_capability(struct psp_context *psp)
+{
+   struct amdgpu_device *adev = psp->adev;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   u32 reg_data;
+
+   /* query ras cap should be done from host side */
+   if (amdgpu_sriov_vf(adev))
+   return false;
+
+   if (!con)
+   return false;
+
+   if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) &&
+   (!(adev->flags & AMD_IS_APU))) {
+   reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127);
+   adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0));
+   con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 
24) ? true : false;
+   return true;
+   } else {
+   return false;
+   }
+}
+
 static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
.wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
@@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = {
.update_spirom = psp_v13_0_update_spirom,
.vbflash_stat = psp_v13_0_vbflash_status,
.fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk,
+   .get_ras_capability = psp_v13_0_get_ras_capability,
 };
 
 void psp_v13_0_set_psp_funcs(struct psp_context *psp)
-- 
2.17.1

[PATCH 1/3] drm/amdgpu: Align ras block enum with firmware

2024-01-01 Thread Hawking Zhang

Driver and firmware share the same ras block enum.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 5785b705c692..8b053602c5ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -70,6 +70,8 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__MCA,
AMDGPU_RAS_BLOCK__VCN,
AMDGPU_RAS_BLOCK__JPEG,
+   AMDGPU_RAS_BLOCK__IH,
+   AMDGPU_RAS_BLOCK__MPIO,
 
AMDGPU_RAS_BLOCK__LAST
 };
-- 
2.17.1

[PATCH 0/3] Add ras cap query from psp

2024-01-01 Thread Hawking Zhang

Driver can query RAS capability through psp or bios.

Hawking Zhang (3):
  drm/amdgpu: Align ras block enum with firmware
  drm/amdgpu: Query ras capablity from psp
  drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  13 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |   2 +
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  |  26 
 5 files changed, 136 insertions(+), 77 deletions(-)

-- 
2.17.1

[PATCH 5/5] drm/amdgpu: Query boot status if boot failed

2024-01-01 Thread Hawking Zhang

Check and report the firmware boot status if the firmware
doesn't reach steady state.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 6fad451a85be..676bec2cc157 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -187,11 +187,18 @@ static int psp_v13_0_wait_for_bootloader(struct 
psp_context *psp)
 static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
+   int ret;
 
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
-   psp_v13_0_wait_for_vmbx_ready(psp);
+   ret = psp_v13_0_wait_for_vmbx_ready(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
+
+   ret = psp_v13_0_wait_for_bootloader(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
 
-   return psp_v13_0_wait_for_bootloader(psp);
+   return ret;
}
 
return 0;
-- 
2.17.1

[PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-01 Thread Hawking Zhang

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_330x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1

[PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

2024-01-01 Thread Hawking Zhang

Add ras helper function to query boot time gpu
errors.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 616b6c911767..db44ec857a31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
 
+#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32)
+#define AMDGPU_SMN_CROSS_AID (1ULL << 34)
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 39399d0f2ce5..5f302b7693b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
 
return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_920x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32 boot_error)
+{
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   if (instance)
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  AMDGPU_SMN_TARGET_AID(instance) +
+  AMDGPU_SMN_CROSS_AID;
+   else
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+u32 instance, u32 *boot_error)
+{
+   u32 reg_addr;
+   u32 reg_data;
+   int retry_loop;
+
+   if (instance)
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  AMDGPU_SMN_TARGET_AID(instance) +
+  AMDGPU_SMN_CROSS_AID;
+   else
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2);
+
+   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+   *boot_error = reg_data;
+   return 0;
+   }
+   msleep(1);
+   }
+
+   *boot_error = reg_data;
+   return -ETIME;

[PATCH 2/5] drm/amdgpu: Init pcie_index/data address as fallback

2024-01-01 Thread Hawking Zhang

Allow using this helper for indirect access when
nbio funcs are not available, for instance, in the ip
discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 001a35fa0f19..873419a5b9aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -781,12 +781,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;
 
-   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
-   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
-   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
-   else
+   if (unlikely(!adev->nbio.funcs)) {
+   pcie_index = (0x38 >> 2);
+   pcie_data = (0x3C >> 2);
+   } else {
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   }
+
+   if (reg_addr >> 32) {
+   if (unlikely(!adev->nbio.funcs))
+   pcie_index_hi = (0x44 >> 2);
+   else
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+   } else {
pcie_index_hi = 0;
+   }
 
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.17.1

[PATCH 1/5] drm/amdgpu: drop psp v13 query_boot_status implementation

2024-01-01 Thread Hawking Zhang

It will be replaced with a new implementation that covers
boot failures in the ip discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 15 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  4 --
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 78 --
 4 files changed, 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4b694696930e..001a35fa0f19 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1218,8 +1218,6 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
-   /* TODO: check the return val and stop device initialization if 
boot fails */
-   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 1bf975b8d083..94b536e3cada 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,21 +2125,6 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
-{
-   struct psp_context *psp = &adev->psp;
-   int ret = 0;
-
-   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
-   return 0;
-
-   if (psp->funcs &&
-   psp->funcs->query_boot_status)
-   ret = psp->funcs->query_boot_status(psp);
-
-   return ret;
-}
-
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index c4d9cbde55b9..09d1f8f72a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,7 +134,6 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
-   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -538,7 +537,4 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
-
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
-
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index df1844d0800f..6fad451a85be 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -763,83 +763,6 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
-
-static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
-  uint32_t inst,
-  uint32_t boot_error)
-{
-   uint32_t socket_id;
-   uint32_t aid_id;
-   uint32_t hbm_id;
-   uint32_t reg_data;
-
-   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
-   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
-   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
-
-   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-socket_id, aid_id, reg_data);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-socket_id, aid_id, hbm_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_

[PATCH 0/5] Add boot time error reporting

2024-01-01 Thread Hawking Zhang

For ASICs that support boot time error reporting, poll all
the boot time errors cached in registers and make it available
in kernel log.

Hawking Zhang (5):
  drm/amdgpu: drop psp v13 query_boot_status implementation
  drm/amdgpu: Init pcie_index/data address as fallback
  drm/amdgpu: Add ras helper to query boot errors
  drm/amdgpu: Query boot status if discovery failed
  drm/amdgpu: Query boot status if boot failed

 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 22 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 15 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h   |  4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 95 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   | 15 ++-
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c| 89 ++---
 8 files changed, 141 insertions(+), 108 deletions(-)

-- 
2.17.1

[PATCH] drm/amdgpu: Switch to aca bank for xgmi pcs err cnt

2023-12-12 Thread Hawking Zhang

Instead of software managed counters.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h  | 2 ++
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 6 --
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index e51e8918e667..b399f1b62887 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -46,6 +46,8 @@
 #define MCA_REG__STATUS__ERRORCODEEXT(x)   MCA_REG_FIELD(x, 21, 16)
 #define MCA_REG__STATUS__ERRORCODE(x)  MCA_REG_FIELD(x, 15, 0)
 
+#define MCA_REG__MISC0__ERRCNT(x)  MCA_REG_FIELD(x, 43, 32)
+
 #define MCA_REG__SYND__ERRORINFORMATION(x) MCA_REG_FIELD(x, 17, 0)
 
 enum amdgpu_mca_ip {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index ddd782fbee7a..3998c9b31d07 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2537,13 +2537,15 @@ static int mca_pcs_xgmi_mca_get_err_count(const struct 
mca_ras_info *mca_ras, st
  uint32_t *count)
 {
u32 ext_error_code;
+   u32 err_cnt;
 
ext_error_code = 
MCA_REG__STATUS__ERRORCODEEXT(entry->regs[MCA_REG_IDX_STATUS]);
+   err_cnt = MCA_REG__MISC0__ERRCNT(entry->regs[MCA_REG_IDX_MISC0]);
 
if (type == AMDGPU_MCA_ERROR_TYPE_UE && ext_error_code == 0)
-   *count = 1;
+   *count = err_cnt;
else if (type == AMDGPU_MCA_ERROR_TYPE_CE && ext_error_code == 6)
-   *count = 1;
+   *count = err_cnt;
 
return 0;
 }
-- 
2.17.1

[PATCH 3/3] drm/amdgpu: Update fw version for boot time error query

2023-11-19 Thread Hawking Zhang

Boot time error query is not available till a10109

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 3cf4684d0d3f..5f46877f78cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -821,7 +821,7 @@ static int psp_v13_0_query_boot_status(struct psp_context 
*psp)
if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(13, 0, 6))
return 0;
 
-   if (RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_59) < 0x00a10007)
+   if (RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_59) < 0x00a10109)
return 0;
 
for_each_inst(i, inst_mask) {
-- 
2.17.1

[PATCH 2/3] drm/amdgpu: Do not issue gpu reset from nbio v7_9 bif interrupt

2023-11-19 Thread Hawking Zhang

In nbio v7_9, host driver should not issue gpu reset

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index 23f26f8caad4..25a3da83e0fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -611,11 +611,6 @@ static void 
nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device
 
dev_info(adev->dev, "RAS controller interrupt triggered "
"by NBIF error\n");
-
-   /* ras_controller_int is dedicated for nbif ras error,
-* not the global interrupt for sync flood
-*/
-   amdgpu_ras_reset_gpu(adev);
}
 
amdgpu_ras_error_data_fini(&err_data);
-- 
2.17.1

[PATCH 1/3] drm/amdgpu: Retire query/reset_ras_error_status from gfx_v9_4_3

2023-11-19 Thread Hawking Zhang

Not needed anymore.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 114 
 1 file changed, 114 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 40d06d32bb74..5df727be88c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3882,32 +3882,6 @@ static void gfx_v9_4_3_inst_reset_ras_err_count(struct 
amdgpu_device *adev,
mutex_unlock(&adev->grbm_idx_mutex);
 }
 
-static void gfx_v9_4_3_inst_query_utc_err_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   uint32_t data;
-
-   data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regUTCL2_MEM_ECC_STATUS);
-   if (data) {
-   dev_warn(adev->dev, "GFX UTCL2 Mem Ecc Status: 0x%x!\n", data);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regUTCL2_MEM_ECC_STATUS, 
0x3);
-   }
-
-   data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_MEM_ECC_STATUS);
-   if (data) {
-   dev_warn(adev->dev, "GFX VML2 Mem Ecc Status: 0x%x!\n", data);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_MEM_ECC_STATUS, 
0x3);
-   }
-
-   data = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
-   regVML2_WALKER_MEM_ECC_STATUS);
-   if (data) {
-   dev_warn(adev->dev, "GFX VML2 Walker Mem Ecc Status: 0x%x!\n", 
data);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), 
regVML2_WALKER_MEM_ECC_STATUS,
-   0x3);
-   }
-}
-
 static void gfx_v9_4_3_log_cu_timeout_status(struct amdgpu_device *adev,
uint32_t status, int xcc_id)
 {
@@ -3950,82 +3924,6 @@ static void gfx_v9_4_3_log_cu_timeout_status(struct 
amdgpu_device *adev,
}
 }
 
-static void gfx_v9_4_3_inst_query_sq_timeout_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   uint32_t se_idx, sh_idx, cu_idx;
-   uint32_t status;
-
-   mutex_lock(&adev->grbm_idx_mutex);
-   for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines; 
se_idx++) {
-   for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se; 
sh_idx++) {
-   for (cu_idx = 0; cu_idx < 
adev->gfx.config.max_cu_per_sh; cu_idx++) {
-   gfx_v9_4_3_xcc_select_se_sh(adev, se_idx, 
sh_idx,
-   cu_idx, xcc_id);
-   status = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
- regSQ_TIMEOUT_STATUS);
-   if (status != 0) {
-   dev_info(
-   adev->dev,
-   "GFX Watchdog Timeout: SE %d, 
SH %d, CU %d\n",
-   se_idx, sh_idx, cu_idx);
-   gfx_v9_4_3_log_cu_timeout_status(
-   adev, status, xcc_id);
-   }
-   /* clear old status */
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id),
-   regSQ_TIMEOUT_STATUS, 0);
-   }
-   }
-   }
-   gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x,
-   xcc_id);
-   mutex_unlock(&adev->grbm_idx_mutex);
-}
-
-static void gfx_v9_4_3_inst_query_ras_err_status(struct amdgpu_device *adev,
-   void *ras_error_status, int xcc_id)
-{
-   gfx_v9_4_3_inst_query_utc_err_status(adev, xcc_id);
-   gfx_v9_4_3_inst_query_sq_timeout_status(adev, xcc_id);
-}
-
-static void gfx_v9_4_3_inst_reset_utc_err_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regUTCL2_MEM_ECC_STATUS, 0x3);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_MEM_ECC_STATUS, 0x3);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_WALKER_MEM_ECC_STATUS, 
0x3);
-}
-
-static void gfx_v9_4_3_inst_reset_sq_timeout_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   uint32_t se_idx, sh_idx, cu_idx;
-
-   mutex_lock(&adev->grbm_idx_mutex);
-   for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines; 
se_idx++) {
-   for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se; 
sh_idx++) {
-   for (cu_idx = 0; cu_idx < 
adev->gfx.config.max_cu_per_sh; cu_idx++) {
-   gfx_v9_4_3_xcc_select_se_sh(adev, se_idx, 
sh_idx,
-

[PATCH 3/3] drm/amdgpu: Query and report boot status

2023-11-01 Thread Hawking Zhang

Query boot status and report boot errors. A follow
up change is needed to stop GPU initialization if boot
fails.

v2: only invoke the call for dGPU (Le/Lijo)

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
Reviewed-by: Yang Wang 
Reviewed-by: Le Ma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c317a4869492..02d6246df938 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1070,6 +1070,8 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
+   /* TODO: check the return val and stop device initialization if 
boot fails */
+   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
-- 
2.17.1

[PATCH 2/3] drm/amdgpu: Add psp v13 function to query boot status

2023-11-01 Thread Hawking Zhang

Add psp v13 function to query boot status.

v2: limit the use case to dGPU only (Lijo)

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
Reviewed-by: Yang Wang 
Reviewed-by: Le Ma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 15 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  3 +
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 78 +
 3 files changed, 96 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index e6dc3cfbac0e..66d9c189af29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2120,6 +2120,21 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
+int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
+{
+   struct psp_context *psp = &adev->psp;
+   int ret;
+
+   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
+   return 0;
+
+   if (psp->funcs &&
+   psp->funcs->query_boot_status)
+   ret = psp->funcs->query_boot_status(psp);
+
+   return ret;
+}
+
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 7111dd32e66f..5d36ad3f48c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,7 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -537,4 +538,6 @@ int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
 
+int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 4142e2fcd866..3cf4684d0d3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -759,6 +759,83 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
+
+static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
+  uint32_t inst,
+  uint32_t boot_error)
+{
+   uint32_t socket_id;
+   uint32_t aid_id;
+   uint32_t hbm_id;
+   uint32_t reg_data;
+
+   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
+   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
+   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
+
+   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+socket_id, aid_id, reg_data);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_USR_CP_LINK_TRAINING))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_USR_DP_LINK_TRAINING))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_HBM_MEM_TEST))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_HBM_BIST_TEST))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+

[PATCH 1/3] drm/amdgpu: Add C2PMSG_109/126 reg field shift/masks

2023-11-01 Thread Hawking Zhang

Add MP0_C2PMSG_109/126 register field shift/masks
that are used to identify boot status by driver.

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
Reviewed-by: Yang Wang 
Reviewed-by: Le Ma 
---
 .../include/asic_reg/mp/mp_13_0_2_sh_mask.h   | 28 +++
 1 file changed, 28 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/mp/mp_13_0_2_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/mp/mp_13_0_2_sh_mask.h
index 6e29a185de51..765d9ca2316f 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/mp/mp_13_0_2_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/mp/mp_13_0_2_sh_mask.h
@@ -242,6 +242,34 @@
 //MP0_SMN_C2PMSG_103
 #define MP0_SMN_C2PMSG_103__CONTENT__SHIFT 
   0x0
 #define MP0_SMN_C2PMSG_103__CONTENT_MASK   
   0xL
+//MP0_SMN_C2PMSG_109
+#define MP0_SMN_C2PMSG_109__CONTENT__SHIFT 
   0x0
+#define MP0_SMN_C2PMSG_109__CONTENT_MASK   
   0xL
+//MP0_SMN_C2PMSG_126
+#define MP0_SMN_C2PMSG_126__GPU_ERR_MEM_TRAINING__SHIFT
   0x0
+#define MP0_SMN_C2PMSG_126__GPU_ERR_FW_LOAD__SHIFT 
   0x1
+#define MP0_SMN_C2PMSG_126__GPU_ERR_WAFL_LINK_TRAINING__SHIFT  
   0x2
+#define MP0_SMN_C2PMSG_126__GPU_ERR_XGMI_LINK_TRAINING__SHIFT  
   0x3
+#define MP0_SMN_C2PMSG_126__GPU_ERR_USR_CP_LINK_TRAINING__SHIFT
   0x4
+#define MP0_SMN_C2PMSG_126__GPU_ERR_USR_DP_LINK_TRAINING__SHIFT
   0x5
+#define MP0_SMN_C2PMSG_126__GPU_ERR_HBM_MEM_TEST__SHIFT
   0x6
+#define MP0_SMN_C2PMSG_126__GPU_ERR_HBM_BIST_TEST__SHIFT   
   0x7
+#define MP0_SMN_C2PMSG_126__SOCKET_ID__SHIFT   
   0x8
+#define MP0_SMN_C2PMSG_126__AID_ID__SHIFT  
   0xb
+#define MP0_SMN_C2PMSG_126__HBM_ID__SHIFT  
   0xd
+#define MP0_SMN_C2PMSG_126__BOOT_STATUS__SHIFT 
   0x1f
+#define MP0_SMN_C2PMSG_126__GPU_ERR_MEM_TRAINING_MASK  
   0x0001L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_FW_LOAD_MASK   
   0x0002L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_WAFL_LINK_TRAINING_MASK
   0x0004L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_XGMI_LINK_TRAINING_MASK
   0x0008L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_USR_CP_LINK_TRAINING_MASK  
   0x0010L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_USR_DP_LINK_TRAINING_MASK  
   0x0020L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_HBM_MEM_TEST_MASK  
   0x0040L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_HBM_BIST_TEST_MASK 
   0x0080L
+#define MP0_SMN_C2PMSG_126__SOCKET_ID_MASK 
   0x0700L
+#define MP0_SMN_C2PMSG_126__AID_ID_MASK
   0x1800L
+#define MP0_SMN_C2PMSG_126__HBM_ID_MASK
   0x2000L
+#define MP0_SMN_C2PMSG_126__BOOT_STATUS_MASK   
   0x8000L
 //MP0_SMN_IH_CREDIT
 #define MP0_SMN_IH_CREDIT__CREDIT_VALUE__SHIFT 
   0x0
 #define MP0_SMN_IH_CREDIT__CLIENT_ID__SHIFT
   0x10
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: Add UVD_VCPU_INT_EN2 to dpg sram

2023-10-18 Thread Hawking Zhang

Add RAS specific programming to dpg sram.

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index f85d18cd74ec..810bbfccd6f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -1760,6 +1760,11 @@ static void vcn_v4_0_3_enable_ras(struct amdgpu_device 
*adev,
  SOC15_DPG_MODE_OFFSET(VCN, 0, regVCN_RAS_CNTL),
  tmp, 0, indirect);
 
+   tmp = UVD_VCPU_INT_EN2__RASCNTL_VCPU_VCODEC_EN_MASK;
+   WREG32_SOC15_DPG_MODE(inst_idx,
+ SOC15_DPG_MODE_OFFSET(VCN, 0, 
regUVD_VCPU_INT_EN2),
+ tmp, 0, indirect);
+
tmp = UVD_SYS_INT_EN__RASCNTL_VCPU_VCODEC_EN_MASK;
WREG32_SOC15_DPG_MODE(inst_idx,
  SOC15_DPG_MODE_OFFSET(VCN, 0, regUVD_SYS_INT_EN),
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: Enable software RAS in vcn v4_0_3

2023-10-18 Thread Hawking Zhang

Set VCN/JPEG RAS masks to enable software RAS for
VCN and JPEG.

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 973073e07b2a..7de1eb7c959d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2606,7 +2606,9 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
IP_VERSION(2, 6, 0) ||
amdgpu_ip_version(adev, VCN_HWIP, 0) ==
-   IP_VERSION(4, 0, 0))
+   IP_VERSION(4, 0, 0) ||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) ==
+   IP_VERSION(4, 0, 3))
adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__VCN |
1 << 
AMDGPU_RAS_BLOCK__JPEG);
else
-- 
2.17.1

[PATCH] drm/amdgpu: fallback to old RAS error message for aqua_vanjaram

2023-09-08 Thread Hawking Zhang

So driver doesn't generate incorrect message until
the new format is settled down for aqua_vanjaram

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8eb6f6943778..632478874f7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1053,7 +1053,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
info->ce_count = obj->err_data.ce_count;
 
if (err_data.ce_count) {
-   if (adev->smuio.funcs &&
+   if (!adev->aid_mask &&
+   adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
dev_info(adev->dev, "socket: %d, die: %d "
@@ -1073,7 +1074,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
}
}
if (err_data.ue_count) {
-   if (adev->smuio.funcs &&
+   if (!adev->aid_mask &&
+   adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
dev_info(adev->dev, "socket: %d, die: %d "
-- 
2.17.1

[PATCH] drm/amdgpu: fallback to old RAS error message for aqua_vanjaram

2023-09-08 Thread Hawking Zhang

So driver doesn't generate incorrect message until
the new format is settled down for aqua_vanjaram

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8eb6f6943778..dee7b5b705e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1053,7 +1053,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
info->ce_count = obj->err_data.ce_count;
 
if (err_data.ce_count) {
-   if (adev->smuio.funcs &&
+   if (!adev->aid_mask &&
+   adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
dev_info(adev->dev, "socket: %d, die: %d "
-- 
2.17.1

[PATCH] drm/amdgpu: Correct se_num and reg_inst for gfx v9_4_3 ras counters

2023-09-06 Thread Hawking Zhang

gfx_v9_4_3_ue|ce_reg_list is an array per gfx core instance.
Correct the settings of se_num and reg_inst for some of the
gfx ras counters so all the available register instances
can be polled for ras status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 40 -
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 0a26a00074a6..a60d1a8405d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3653,19 +3653,19 @@ static const struct amdgpu_gfx_ras_reg_entry 
gfx_v9_4_3_ce_reg_list[] = {
AMDGPU_GFX_GC_CANE_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSPI_CE_ERR_STATUS_LO, 
regSPI_CE_ERR_STATUS_HI),
1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SPI"},
-   AMDGPU_GFX_SPI_MEM, 8},
+   AMDGPU_GFX_SPI_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP0_CE_ERR_STATUS_LO, 
regSP0_CE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SP0"},
-   AMDGPU_GFX_SP_MEM, 1},
+   AMDGPU_GFX_SP_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP1_CE_ERR_STATUS_LO, 
regSP1_CE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SP1"},
-   AMDGPU_GFX_SP_MEM, 1},
+   AMDGPU_GFX_SP_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQ_CE_ERR_STATUS_LO, 
regSQ_CE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SQ"},
-   AMDGPU_GFX_SQ_MEM, 8},
+   AMDGPU_GFX_SQ_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQC_CE_EDC_LO, regSQC_CE_EDC_HI),
5, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SQC"},
-   AMDGPU_GFX_SQC_MEM, 8},
+   AMDGPU_GFX_SQC_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCX_CE_ERR_STATUS_LO, 
regTCX_CE_ERR_STATUS_HI),
2, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TCX"},
AMDGPU_GFX_TCX_MEM, 1},
@@ -3674,22 +3674,22 @@ static const struct amdgpu_gfx_ras_reg_entry 
gfx_v9_4_3_ce_reg_list[] = {
AMDGPU_GFX_TCC_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regTA_CE_EDC_LO, regTA_CE_EDC_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TA"},
-   AMDGPU_GFX_TA_MEM, 8},
+   AMDGPU_GFX_TA_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCI_CE_EDC_LO_REG, 
regTCI_CE_EDC_HI_REG),
-   31, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TCI"},
+   27, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TCI"},
AMDGPU_GFX_TCI_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCP_CE_EDC_LO_REG, 
regTCP_CE_EDC_HI_REG),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TCP"},
-   AMDGPU_GFX_TCP_MEM, 8},
+   AMDGPU_GFX_TCP_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regTD_CE_EDC_LO, regTD_CE_EDC_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TD"},
-   AMDGPU_GFX_TD_MEM, 8},
+   AMDGPU_GFX_TD_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regGCEA_CE_ERR_STATUS_LO, 
regGCEA_CE_ERR_STATUS_HI),
16, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"GCEA"},
AMDGPU_GFX_GCEA_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regLDS_CE_ERR_STATUS_LO, 
regLDS_CE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"LDS"},
-   AMDGPU_GFX_LDS_MEM, 1},
+   AMDGPU_GFX_LDS_MEM, 4},
 };
 
 static const struct amdgpu_gfx_ras_reg_entry gfx_v9_4_3_ue_reg_list[] = {
@@ -3713,19 +3713,19 @@ static const struct amdgpu_gfx_ras_reg_entry 
gfx_v9_4_3_ue_reg_list[] = {
AMDGPU_GFX_GC_CANE_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSPI_UE_ERR_STATUS_LO, 
regSPI_UE_ERR_STATUS_HI),
1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SPI"},
-   AMDGPU_GFX_SPI_MEM, 8},
+   AMDGPU_GFX_SPI_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP0_UE_ERR_STATUS_LO, 
regSP0_UE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SP0"},
-   AMDGPU_GFX_SP_MEM, 1},
+   AMDGPU_GFX_SP_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP1_UE_ERR_STATUS_LO, 
regSP1_UE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SP1"},
-   AMDGPU_GFX_SP_MEM, 1},
+   AMDGPU_GFX_SP_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQ_UE_ERR_STATUS_LO, 
regSQ_UE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATU

[PATCH] drm/amdgpu: Free ras cmd input buffer properly

2023-08-29 Thread Hawking Zhang

Do not access the pointer for ras input cmd buffer
if it has not even been allocated.

Signed-off-by: Hawking Zhang 
Reviewed-by: Stanley Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e47600a8e88e..8eb6f6943778 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -764,7 +764,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
union ta_ras_cmd_input *info;
-   int ret = 0;
+   int ret;
 
if (!con)
return -EINVAL;
@@ -774,7 +774,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Force issue enable or disable ras feature commands */
if (head->block != AMDGPU_RAS_BLOCK__GFX &&
!amdgpu_ras_is_feature_allowed(adev, head))
-   goto out;
+   return 0;
 
/* Only enable gfx ras feature from host side */
if (head->block == AMDGPU_RAS_BLOCK__GFX &&
@@ -802,16 +802,16 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
enable ? "enable":"disable",
get_ras_block_str(head),
amdgpu_ras_is_poison_mode_supported(adev), ret);
-   goto out;
+   return ret;
}
+
+   kfree(info);
}
 
/* setup the obj */
__amdgpu_ras_feature_enable(adev, head, enable);
-out:
-   if (head->block == AMDGPU_RAS_BLOCK__GFX)
-   kfree(info);
-   return ret;
+
+   return 0;
 }
 
 /* Only used in device probe stage and called only once. */
-- 
2.17.1

[PATCH] drm/amdgpu: Free ras cmd input buffer properly

2023-08-29 Thread Hawking Zhang

Do not access the pointer for ras input cmd buffer
if it has not even been allocated.

Signed-off-by: Hawking Zhang 
Reviewed-by: Stanley Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e47600a8e88e..8eb6f6943778 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -764,7 +764,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
union ta_ras_cmd_input *info;
-   int ret = 0;
+   int ret;
 
if (!con)
return -EINVAL;
@@ -774,7 +774,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Force issue enable or disable ras feature commands */
if (head->block != AMDGPU_RAS_BLOCK__GFX &&
!amdgpu_ras_is_feature_allowed(adev, head))
-   goto out;
+   return 0;
 
/* Only enable gfx ras feature from host side */
if (head->block == AMDGPU_RAS_BLOCK__GFX &&
@@ -802,16 +802,16 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
enable ? "enable":"disable",
get_ras_block_str(head),
amdgpu_ras_is_poison_mode_supported(adev), ret);
-   goto out;
+   return ret;
}
+
+   kfree(info);
}
 
/* setup the obj */
__amdgpu_ras_feature_enable(adev, head, enable);
-out:
-   if (head->block == AMDGPU_RAS_BLOCK__GFX)
-   kfree(info);
-   return ret;
+
+   return 0;
 }
 
 /* Only used in device probe stage and called only once. */
-- 
2.17.1

[PATCH] drm/amdgpu: Free ras cmd input buffer properly

2023-08-29 Thread Hawking Zhang

Do not access the pointer for ras input cmd buffer
if it has not even been allocated.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e47600a8e88e..16c5fe487ea0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -804,13 +804,13 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
amdgpu_ras_is_poison_mode_supported(adev), ret);
goto out;
}
+
+   kfree(info);
}
 
/* setup the obj */
__amdgpu_ras_feature_enable(adev, head, enable);
-out:
-   if (head->block == AMDGPU_RAS_BLOCK__GFX)
-   kfree(info);
+
return ret;
 }
 
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: Support query ecc cap for aqua_vanjaram

2023-08-24 Thread Hawking Zhang

Driver queries umc_info v4_0 to identify ecc cap
for aqua_vanjaram

Signed-off-by: Hawking Zhang 
Reviewed-by: Candice Li 
---
 .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c   | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
index 835980e94b9e..fb2681dd6b33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
@@ -217,6 +217,7 @@ union umc_info {
struct atom_umc_info_v3_1 v31;
struct atom_umc_info_v3_2 v32;
struct atom_umc_info_v3_3 v33;
+   struct atom_umc_info_v4_0 v40;
 };
 
 union vram_info {
@@ -508,9 +509,8 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
 
if (amdgpu_atom_parse_data_header(mode_info->atom_context,
index, &size, &frev, &crev, &data_offset)) {
+   umc_info = (union umc_info *)(mode_info->atom_context->bios + 
data_offset);
if (frev == 3) {
-   umc_info = (union umc_info *)
-   (mode_info->atom_context->bios + data_offset);
switch (crev) {
case 1:
umc_config = 
le32_to_cpu(umc_info->v31.umc_config);
@@ -533,6 +533,20 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
/* unsupported crev */
return false;
}
+   } else if (frev == 4) {
+   switch (crev) {
+   case 0:
+   umc_config1 = 
le32_to_cpu(umc_info->v40.umc_config1);
+   ecc_default_enabled =
+   (umc_config1 & 
UMC_CONFIG1__ENABLE_ECC_CAPABLE) ? true : false;
+   break;
+   default:
+   /* unsupported crev */
+   return false;
+   }
+   } else {
+   /* unsupported frev */
+   return false;
}
}
 
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: Add umc_info v4_0 structure

2023-08-24 Thread Hawking Zhang

To be used by aqua_vanjaram

Signed-off-by: Hawking Zhang 
Reviewed-by: Candice Li 
---
 drivers/gpu/drm/amd/include/atomfirmware.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/atomfirmware.h 
b/drivers/gpu/drm/amd/include/atomfirmware.h
index e68c1e280322..fa7d6ced786f 100644
--- a/drivers/gpu/drm/amd/include/atomfirmware.h
+++ b/drivers/gpu/drm/amd/include/atomfirmware.h
@@ -3117,6 +3117,24 @@ enum atom_umc_config1_def {
UMC_CONFIG1__ENABLE_ECC_CAPABLE = 0x0001,
 };
 
+struct atom_umc_info_v4_0 {
+   struct atom_common_table_header table_header;
+   uint32_t ucode_reserved[5];
+   uint8_t umcip_min_ver;
+   uint8_t umcip_max_ver;
+   uint8_t vram_type;
+   uint8_t umc_config;
+   uint32_t mem_refclk_10khz;
+   uint32_t clk_reserved[4];
+   uint32_t golden_reserved;
+   uint32_t umc_config1;
+   uint32_t reserved[2];
+   uint8_t channel_num;
+   uint8_t channel_width;
+   uint8_t channel_reserve[2];
+   uint8_t umc_info_reserved[16];
+};
+
 /* 
   ***
 Data Table vram_info  structure
-- 
2.17.1

[PATCH] drm/amdgpu: Allow issue disable gfx ras cmd to firmware

2023-08-23 Thread Hawking Zhang

Disable gfx ras command is needed in some use cases
like live migration.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 378478cf9c21..7db6baa16236 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -769,9 +769,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
if (!con)
return -EINVAL;
 
-   /* Do not enable ras feature if it is not allowed */
-   if (enable &&
-   head->block != AMDGPU_RAS_BLOCK__GFX &&
+   /* For non-gfx ip, do not enable ras feature if it is not allowed.
+* For gfx ip, regardless of feature support status,
+* force issue enable or disable ras feature commands */
+   if (head->block != AMDGPU_RAS_BLOCK__GFX &&
!amdgpu_ras_is_feature_allowed(adev, head))
goto out;
 
-- 
2.17.1

[PATCH] drm/amdgpu: Fix the return for gpu mode1_reset

2023-08-18 Thread Hawking Zhang

amdgpu_device_mode1_reset returns success (ret = 0) for gpu
mode1 reset as long as the wait_for_bootloader call succeeds,
regardless of the status reported by smu or psp firmware. This
results in the driver continuing to execute recovery even if
smu or psp fails to perform mode1 reset.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5586146b8c76..533daba2accb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4701,12 +4701,12 @@ int amdgpu_device_mode1_reset(struct amdgpu_device 
*adev)
}
 
if (ret)
-   dev_err(adev->dev, "GPU mode1 reset failed\n");
+   goto mode1_reset_failed;
 
amdgpu_device_load_pci_state(adev->pdev);
ret = amdgpu_psp_wait_for_bootloader(adev);
if (ret)
-   return ret;
+   goto mode1_reset_failed;
 
/* wait for asic to come out of reset */
for (i = 0; i < adev->usec_timeout; i++) {
@@ -4717,8 +4717,17 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
udelay(1);
}
 
+   if (i >= adev->usec_timeout) {
+   ret = -ETIMEDOUT;
+   goto mode1_reset_failed;
+   }
+
amdgpu_atombios_scratch_regs_engine_hung(adev, false);
 
+   return 0;
+
+mode1_reset_failed:
+   dev_err(adev->dev, "GPU mode1 reset failed\n");
return ret;
 }
 
-- 
2.17.1

[PATCH] drm/amdgpu: Remove unnecessary ras cap check

2023-08-09 Thread Hawking Zhang

RAS global isr will only be invoked by hardware
interrupt, so there is no need to query ras capability
in the isr. In addition, amdgpu_ras_interrupt_fatal_error_handler
ensures the isr won't be called from guest linux
side by accident. The RAS cap check in the isr that was
introduced to fix an sriov crash is not needed any more.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 00658c2816dc..c58b31121fd7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2970,10 +2970,6 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 {
-   amdgpu_ras_check_supported(adev);
-   if (!adev->ras_hw_enabled)
-   return;
-
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-- 
2.17.1

[PATCH 3/3] drm/amdgpu: Issue ras enable_feature for gfx ip only

2023-07-03 Thread Hawking Zhang

For non-GFX IP blocks, set up ras obj if ras feature
is allowed. For GFX IP blocks, force issue ras
enable_feature command to firmware and only set up ras
obj if ras feature is allowed

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 +
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8524365761b6..2e9154bbec64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -761,16 +761,6 @@ static int __amdgpu_ras_feature_enable(struct 
amdgpu_device *adev,
return 0;
 }
 
-static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev,
-   struct ras_common_if *head)
-{
-   if (amdgpu_ras_is_feature_allowed(adev, head) ||
-   amdgpu_ras_is_poison_mode_supported(adev))
-   return 1;
-   else
-   return 0;
-}
-
 /* wrapper of psp_ras_enable_features */
 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
struct ras_common_if *head, bool enable)
@@ -782,7 +772,16 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
if (!con)
return -EINVAL;
 
-   if (head->block == AMDGPU_RAS_BLOCK__GFX) {
+   /* Do not enable ras feature if it is not allowed */
+   if (enable &&
+   head->block != AMDGPU_RAS_BLOCK__GFX &&
+   !amdgpu_ras_is_feature_allowed(adev, head))
+   goto out;
+
+   /* Only enable gfx ras feature from host side */
+   if (head->block == AMDGPU_RAS_BLOCK__GFX &&
+   !amdgpu_sriov_vf(adev) &&
+   !amdgpu_ras_intr_triggered()) {
info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
if (!info)
return -ENOMEM;
@@ -798,16 +797,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
.error_type = 
amdgpu_ras_error_to_ta(head->type),
};
}
-   }
 
-   /* Do not enable if it is not allowed. */
-   if (enable && !amdgpu_ras_check_feature_allowed(adev, head))
-   goto out;
-
-   /* Only enable ras feature operation handle on host side */
-   if (head->block == AMDGPU_RAS_BLOCK__GFX &&
-   !amdgpu_sriov_vf(adev) &&
-   !amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(&adev->psp, info, enable);
if (ret) {
dev_err(adev->dev, "ras %s %s failed poison:%d 
ret:%d\n",
-- 
2.17.1

[PATCH 2/3] drm/amdgpu: Remove gfx v11_0_3 ras_late_init call

2023-07-03 Thread Hawking Zhang

amdgpu_ras_late_init will invoke ras_late_init call
per IP block

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 26 --
 1 file changed, 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 3a7af59e83ca..66d38890d393 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -4650,26 +4650,6 @@ static int gfx_v11_0_early_init(void *handle)
return gfx_v11_0_init_microcode(adev);
 }
 
-static int gfx_v11_0_ras_late_init(void *handle)
-{
-   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-   struct ras_common_if *gfx_common_if;
-   int ret;
-
-   gfx_common_if = kzalloc(sizeof(struct ras_common_if), GFP_KERNEL);
-   if (!gfx_common_if)
-   return -ENOMEM;
-
-   gfx_common_if->block = AMDGPU_RAS_BLOCK__GFX;
-
-   ret = amdgpu_ras_feature_enable(adev, gfx_common_if, true);
-   if (ret)
-   dev_warn(adev->dev, "Failed to enable gfx11 ras feature\n");
-
-   kfree(gfx_common_if);
-   return 0;
-}
-
 static int gfx_v11_0_late_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -4683,12 +4663,6 @@ static int gfx_v11_0_late_init(void *handle)
if (r)
return r;
 
-   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) {
-   r = gfx_v11_0_ras_late_init(handle);
-   if (r)
-   return r;
-   }
-
return 0;
 }
 
-- 
2.17.1

[PATCH 1/3] drm/amdgpu: Apply poison mode check to GFX IP only

2023-07-03 Thread Hawking Zhang

For GFX IP that only supports poison consumption, GFX
RAS won't be marked as enabled. i.e., hardware doesn't
support gfx sram ecc. But driver still needs to issue
firmware to enable poison consumption mode for GFX IP.
In such case, check poison mode and treat GFX IP as
RAS capable IP block.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 041112c7fbbd..8524365761b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3150,6 +3150,7 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 * that the ras block supports ras function.
 */
if (!ret &&
+   block == AMDGPU_RAS_BLOCK__GFX &&
amdgpu_ras_is_poison_mode_supported(adev) &&
amdgpu_ras_get_ras_block(adev, block, 0))
ret = 1;
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: Enable gfx v11_0_3 ras if poison mode is supported

2023-06-11 Thread Hawking Zhang

GFX v11_0_3 ras needs to be enabled if poison mode
is supported. Driver doesn't need to issue a feature
enable call in gfx_v11_0 late init phase. The ras
late init call is already centralized to
amdgpu_ras_late_init.
In addition, move poison_mode check out of common
helpers like amdgpu_ras_is_supported and
amdgpu_ras_is_feature_allowed to ensure only GFX RAS
is enabled when poison mode is supported.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 49 -
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  | 26 -
 2 files changed, 16 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index dd7cdc234d7e..35e70860d628 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -126,6 +126,7 @@ static bool amdgpu_ras_check_bad_page_unlock(struct 
amdgpu_ras *con,
uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr);
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev);
 #ifdef CONFIG_X86_MCE_AMD
 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
 struct mce_notifier_adev_list {
@@ -757,16 +758,6 @@ static int __amdgpu_ras_feature_enable(struct 
amdgpu_device *adev,
return 0;
 }
 
-static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev,
-   struct ras_common_if *head)
-{
-   if (amdgpu_ras_is_feature_allowed(adev, head) ||
-   amdgpu_ras_is_poison_mode_supported(adev))
-   return 1;
-   else
-   return 0;
-}
-
 /* wrapper of psp_ras_enable_features */
 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
struct ras_common_if *head, bool enable)
@@ -797,7 +788,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
}
 
/* Do not enable if it is not allowed. */
-   if (enable && !amdgpu_ras_check_feature_allowed(adev, head))
+   if (enable && !amdgpu_ras_is_feature_allowed(adev, head))
goto out;
 
/* Only enable ras feature operation handle on host side */
@@ -2420,9 +2411,9 @@ static bool amdgpu_ras_asic_supported(struct 
amdgpu_device *adev)
 }
 
 /*
- * this is workaround for vega20 workstation sku,
- * force enable gfx ras, ignore vbios gfx ras flag
- * due to GC EDC can not write
+ * Common helpers for device or IP specific RAS quirks including
+ * a). Enable gfx ras on D16406 or D36002 board
+ * b). Enable gfx ras in gfx_v11_0_3 if poison mode is supported
  */
 static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
 {
@@ -2431,10 +2422,16 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
if (!ctx)
return;
 
+   /* Enable gfx ras on specific board */
if (strnstr(ctx->vbios_version, "D16406",
sizeof(ctx->vbios_version)) ||
-   strnstr(ctx->vbios_version, "D36002",
-   sizeof(ctx->vbios_version)))
+   strnstr(ctx->vbios_version, "D36002",
+   sizeof(ctx->vbios_version)))
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
+
+   /* Enable gfx ras on gfx_v11_0_3 if poison mode is supported */
+   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3) &&
+   amdgpu_ras_is_poison_mode_supported(adev))
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
@@ -2502,6 +2499,8 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
   1 << AMDGPU_RAS_BLOCK__MMHUB);
}
 
+   amdgpu_ras_query_poison_mode(adev);
+
amdgpu_ras_get_quirks(adev);
 
/* hw_supported needs to be aligned with RAS block mask. */
@@ -2659,8 +2658,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
 
-   amdgpu_ras_query_poison_mode(adev);
-
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
@@ -3115,26 +3112,12 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, 
struct amdgpu_ras *ras_co
 int amdgpu_ras_is_supported(struct amdgpu_device *adev,
unsigned int block)
 {
-   int ret = 0;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
if (block >= AMDGPU_RAS_BLOCK_COUNT)
return 0;
 
-   ret = ras && (adev->ras_enabled & (1 << block));
-
-   /* For the special asic with mem ecc enabled but sram ecc
-* not enabled, even if the ras block is not supported on
-* .ras_enabled, if the asic supports poison mode and the
-* ras block has ras configuration, it can be considered
-

[PATCH 1/2] drm/amdgpu: Only create err_count sysfs when hw_op is supported

2023-06-11 Thread Hawking Zhang

Some IP blocks only support partial ras feature and don't
have ras counter and/or ras error status register at all.
Driver should not create err_count sysfs node for those
IP blocks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 31 ++---
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a6c3265cdbc4..dd7cdc234d7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2757,23 +2757,28 @@ int amdgpu_ras_block_late_init(struct amdgpu_device 
*adev,
goto cleanup;
}
 
-   r = amdgpu_ras_sysfs_create(adev, ras_block);
-   if (r)
-   goto interrupt;
+   if (ras_obj->hw_ops &&
+   (ras_obj->hw_ops->query_ras_error_count ||
+ras_obj->hw_ops->query_ras_error_status)) {
+   r = amdgpu_ras_sysfs_create(adev, ras_block);
+   if (r)
+   goto interrupt;
 
-   /* Those are the cached values at init.
-*/
-   query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL);
-   if (!query_info)
-   return -ENOMEM;
-   memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
+   /* Those are the cached values at init.
+*/
+   query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL);
+   if (!query_info)
+   return -ENOMEM;
+   memcpy(&query_info->head, ras_block, sizeof(struct 
ras_common_if));
 
-   if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, 
query_info) == 0) {
-   atomic_set(&con->ras_ce_count, ce_count);
-   atomic_set(&con->ras_ue_count, ue_count);
+   if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, 
query_info) == 0) {
+   atomic_set(&con->ras_ce_count, ce_count);
+   atomic_set(&con->ras_ue_count, ue_count);
+   }
+
+   kfree(query_info);
}
 
-   kfree(query_info);
return 0;
 
 interrupt:
-- 
2.17.1

[PATCH] drm/amd/pm: Fix power context allocation in SMU13

2023-06-01 Thread Hawking Zhang

From: Lijo Lazar 

Use the right data structure for allocation.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index da059b02a153..09ac66ab9c34 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -534,11 +534,11 @@ int smu_v13_0_init_power(struct smu_context *smu)
if (smu_power->power_context || smu_power->power_context_size != 0)
return -EINVAL;
 
-   smu_power->power_context = kzalloc(sizeof(struct smu_13_0_dpm_context),
+   smu_power->power_context = kzalloc(sizeof(struct 
smu_13_0_power_context),
   GFP_KERNEL);
if (!smu_power->power_context)
return -ENOMEM;
-   smu_power->power_context_size = sizeof(struct smu_13_0_dpm_context);
+   smu_power->power_context_size = sizeof(struct smu_13_0_power_context);
 
return 0;
 }
-- 
2.17.1

[PATCH] drm/amdgpu: drop temp programming for pagefault handling

2023-04-12 Thread Hawking Zhang

The temp programming was introduced as a workaround and is not needed anymore.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c | 22 --
 1 file changed, 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c
index be0d0f47415e..13712640fa46 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c
@@ -417,34 +417,12 @@ static void gfxhub_v3_0_set_fault_enable_default(struct 
amdgpu_device *adev,
tmp = REG_SET_FIELD(tmp, CP_DEBUG, CPG_UTCL1_ERROR_HALT_DISABLE, 1);
WREG32_SOC15(GC, 0, regCP_DEBUG, tmp);
 
-   /**
-* Set GRBM_GFX_INDEX in broad cast mode
-* before programming GL1C_UTCL0_CNTL1 and SQG_CONFIG
-*/
-   WREG32_SOC15(GC, 0, regGRBM_GFX_INDEX, regGRBM_GFX_INDEX_DEFAULT);
-
-   /**
-* Retry respond mode: RETRY
-* Error (no retry) respond mode: SUCCESS
-*/
-   tmp = RREG32_SOC15(GC, 0, regGL1C_UTCL0_CNTL1);
-   tmp = REG_SET_FIELD(tmp, GL1C_UTCL0_CNTL1, RESP_MODE, 0);
-   tmp = REG_SET_FIELD(tmp, GL1C_UTCL0_CNTL1, RESP_FAULT_MODE, 0x2);
-   WREG32_SOC15(GC, 0, regGL1C_UTCL0_CNTL1, tmp);
-
/* These registers are not accessible to VF-SRIOV.
 * The PF will program them instead.
 */
if (amdgpu_sriov_vf(adev))
return;
 
-   /* Disable SQ XNACK interrupt for all VMIDs */
-   tmp = RREG32_SOC15(GC, 0, regSQG_CONFIG);
-   tmp = REG_SET_FIELD(tmp, SQG_CONFIG, XNACK_INTR_MASK,
-   SQG_CONFIG__XNACK_INTR_MASK_MASK >>
-   SQG_CONFIG__XNACK_INTR_MASK__SHIFT);
-   WREG32_SOC15(GC, 0, regSQG_CONFIG, tmp);
-
tmp = RREG32_SOC15(GC, 0, regGCVM_L2_PROTECTION_FAULT_CNTL);
tmp = REG_SET_FIELD(tmp, GCVM_L2_PROTECTION_FAULT_CNTL,
RANGE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
-- 
2.34.1

[PATCH] drm/amdgpu: correct xgmi_wafl block name

2023-03-28 Thread Hawking Zhang

Fix a backward compatibility issue by staying with
the old name of the xgmi_wafl node.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 3fe24348d199..439925477fb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1068,7 +1068,7 @@ int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
return err;
}
 
-   strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl_pcs");
+   strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl");
ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm;
-- 
2.17.1

[PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3

2023-03-22 Thread Hawking Zhang

GPU will stop working once a fatal error is detected.
It will inform the driver to do a reset to recover from
the fatal error.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c  | 79 +
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/soc21.c  | 15 -
 4 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c6dc3cd2a9de..5b1779021881 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,6 +34,7 @@
 #include "amdgpu_atomfirmware.h"
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include "nbio_v4_3.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
 
@@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.connected_to_cpu)
adev->nbio.ras = &nbio_v7_4_ras;
break;
+   case IP_VERSION(4, 3, 0):
+   if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF)
+   /* unlike other generation of nbio ras,
+* nbio v4_3 only support fatal error interrupt
+* to inform software that DF is freezed due to
+* system fatal error event. driver should not
+* enable nbio ras in such case. Instead,
+* check DF RAS */
+   adev->nbio.ras = &nbio_v4_3_ras;
+   break;
default:
/* nbio ras is not available */
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
index 09fdcd20cb91..d5ed9e0e1a5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
@@ -26,6 +26,7 @@
 
 #include "nbio/nbio_4_3_0_offset.h"
 #include "nbio/nbio_4_3_0_sh_mask.h"
+#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include 
 
 static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev)
@@ -538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = {
.remap_hdp_registers = nbio_v4_3_remap_hdp_registers,
.get_rom_offset = nbio_v4_3_get_rom_offset,
 };
+
+static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device 
*adev,
+  struct amdgpu_irq_src 
*src,
+  unsigned type,
+  enum 
amdgpu_interrupt_state state)
+{
+   /* The ras_controller_irq enablement should be done in psp bl when it
+* tries to enable ras feature. Driver only need to set the correct 
interrupt
+* vector for bare-metal and sriov use case respectively
+*/
+   uint32_t bif_doorbell_int_cntl;
+
+   bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+ BIF_BX0_BIF_DOORBELL_INT_CNTL,
+ 
RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE,
+ (state == 
AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1);
+   WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, 
bif_doorbell_int_cntl);
+
+   return 0;
+}
+
+static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device *adev,
+struct amdgpu_irq_src *source,
+struct amdgpu_iv_entry *entry)
+{
+   /* By design, the ih cookie for err_event_athub_irq should be written
+* to bif ring. since bif ring is not enabled, just leave process 
callback
+* as a dummy one.
+*/
+   return 0;
+}
+
+static const struct amdgpu_irq_src_funcs 
nbio_v4_3_ras_err_event_athub_irq_funcs = {
+   .set = nbio_v4_3_set_ras_err_event_athub_irq_state,
+   .process = nbio_v4_3_process_err_event_athub_irq,
+};
+
+static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct 
amdgpu_device *adev)
+{
+   uint32_t bif_doorbell_int_cntl;
+
+   bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   if (REG_GET_FIELD(bif_doorbell_int_cntl,
+ BIF_DOORBELL_INT_CNTL,
+ RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
+   /* driver has to clear the interrupt status when bif ring is 
disabled */
+   bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+   BIF_DOORBELL_INT_CNTL,
+   
RAS_ATHUB_ERR_EVENT_INTER

[PATCH] drm/amdgpu: Initialize umc ras callback

2023-03-20 Thread Hawking Zhang

Fix a coding error which results in a null interrupt
handler for umc ras.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index da68ceaa024c..9e2e97207e53 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -232,7 +232,7 @@ int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
if (!ras->ras_block.ras_late_init)
ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;
 
-   if (ras->ras_block.ras_cb)
+   if (!ras->ras_block.ras_cb)
ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;
 
return 0;
-- 
2.17.1

[PATCH 10/10] drm/amdgpu: drop ras check at asic level for new blocks

2023-03-12 Thread Hawking Zhang

amdgpu_ras_register_ras_block should always be invoked
by ras_sw_init, where driver needs to check ras caps
at ip level, instead of asic level.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 834092099bff..c34f51be793c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3076,9 +3076,6 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device 
*adev,
if (!adev || !ras_block_obj)
return -EINVAL;
 
-   if (!amdgpu_ras_asic_supported(adev))
-   return 0;
-
ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
if (!ras_node)
return -ENOMEM;
-- 
2.17.1

[PATCH 08/10] drm/amdgpu: Rework xgmi_wafl_pcs ras sw_init

2023-03-12 Thread Hawking Zhang

To align with other IP blocks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  9 
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c|  7 ++
 4 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index ab85b85496f2..a407357cb153 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -478,11 +478,10 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
if (r)
return r;
 
-   if (!adev->gmc.xgmi.connected_to_cpu) {
-   adev->gmc.xgmi.ras = &xgmi_ras;
-   amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
-   adev->gmc.xgmi.ras_if = &adev->gmc.xgmi.ras->ras_block.ras_comm;
-   }
+   /* xgmi ras block */
+   r = amdgpu_xgmi_ras_sw_init(adev);
+   if (r)
+   return r;
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index fef1575cd0cf..3fe24348d199 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1048,12 +1048,30 @@ struct amdgpu_ras_block_hw_ops  xgmi_ras_hw_ops = {
 
 struct amdgpu_xgmi_ras xgmi_ras = {
.ras_block = {
-   .ras_comm = {
-   .name = "xgmi_wafl",
-   .block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
-   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-   },
.hw_ops = &xgmi_ras_hw_ops,
.ras_late_init = amdgpu_xgmi_ras_late_init,
},
 };
+
+int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_xgmi_ras *ras;
+
+   if (!adev->gmc.xgmi.ras)
+   return 0;
+
+   ras = adev->gmc.xgmi.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras 
block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl_pcs");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm;
+
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 30dcc1681b4e..86fbf56938f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -73,5 +73,6 @@ static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device 
*adev,
adev->gmc.xgmi.hive_id &&
adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
 }
+int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 67c2a5186b8a..2a8dc9b52c2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1381,6 +1381,12 @@ static void gmc_v9_0_set_mca_ras_funcs(struct 
amdgpu_device *adev)
}
 }
 
+static void gmc_v9_0_set_xgmi_ras_funcs(struct amdgpu_device *adev)
+{
+   if (!adev->gmc.xgmi.connected_to_cpu)
+   adev->gmc.xgmi.ras = &xgmi_ras;
+}
+
 static int gmc_v9_0_early_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -1404,6 +1410,7 @@ static int gmc_v9_0_early_init(void *handle)
gmc_v9_0_set_gfxhub_funcs(adev);
gmc_v9_0_set_hdp_ras_funcs(adev);
gmc_v9_0_set_mca_ras_funcs(adev);
+   gmc_v9_0_set_xgmi_ras_funcs(adev);
 
adev->gmc.shared_aperture_start = 0x2000ULL;
adev->gmc.shared_aperture_end =
-- 
2.17.1

[PATCH 07/10] drm/amdgpu: Rework mca ras sw_init

2023-03-12 Thread Hawking Zhang

To align with other IP blocks

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 13 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 72 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h |  9 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 15 +++---
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c   | 44 ++-
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.h   |  4 +-
 6 files changed, 103 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 551884dc5245..ab85b85496f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -465,6 +465,19 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
if (r)
return r;
 
+   /* mca.x ras block */
+   r = amdgpu_mca_mp0_ras_sw_init(adev);
+   if (r)
+   return r;
+
+   r = amdgpu_mca_mp1_ras_sw_init(adev);
+   if (r)
+   return r;
+
+   r = amdgpu_mca_mpio_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (!adev->gmc.xgmi.connected_to_cpu) {
adev->gmc.xgmi.ras = &xgmi_ras;
amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 51c2a82e2fa4..0b545bdcd636 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -70,3 +70,75 @@ void amdgpu_mca_query_ras_error_count(struct amdgpu_device 
*adev,
 
amdgpu_mca_reset_error_count(adev, mc_status_addr);
 }
+
+int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_mca_ras_block *ras;
+
+   if (!adev->mca.mp0.ras)
+   return 0;
+
+   ras = adev->mca.mp0.ras;
+
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register mca.mp0 ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "mca.mp0");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->mca.mp0.ras_if = &ras->ras_block.ras_comm;
+
+   return 0;
+}
+
+int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev)
+{
+int err;
+struct amdgpu_mca_ras_block *ras;
+
+if (!adev->mca.mp1.ras)
+return 0;
+
+ras = adev->mca.mp1.ras;
+
+err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+if (err) {
+dev_err(adev->dev, "Failed to register mca.mp1 ras block!\n");
+return err;
+}
+
+strcpy(ras->ras_block.ras_comm.name, "mca.mp1");
+ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
+ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+adev->mca.mp1.ras_if = &ras->ras_block.ras_comm;
+
+return 0;
+}
+
+int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
+{
+int err;
+struct amdgpu_mca_ras_block *ras;
+
+if (!adev->mca.mpio.ras)
+return 0;
+
+ras = adev->mca.mpio.ras;
+
+err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+if (err) {
+dev_err(adev->dev, "Failed to register mca.mpio ras block!\n");
+return err;
+}
+
+strcpy(ras->ras_block.ras_comm.name, "mca.mpio");
+ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
+ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+adev->mca.mpio.ras_if = &ras->ras_block.ras_comm;
+
+return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index 7ce16d16e34b..997a073e2409 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -30,12 +30,7 @@ struct amdgpu_mca_ras {
struct amdgpu_mca_ras_block *ras;
 };
 
-struct amdgpu_mca_funcs {
-   void (*init)(struct amdgpu_device *adev);
-};
-
 struct amdgpu_mca {
-   const struct amdgpu_mca_funcs *funcs;
struct amdgpu_mca_ras mp0;
struct amdgpu_mca_ras mp1;
struct amdgpu_mca_ras mpio;
@@ -55,5 +50,7 @@ void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
 void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
  uint64_t mc_status_addr,
  void *ras_error_status);
-
+int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_mca_mpio_r

[PATCH 09/10] drm/amdgpu: Rework pcie_bif ras sw_init

2023-03-12 Thread Hawking Zhang

The pcie_bif ras block needs to be initialized as early
as possible to handle fatal errors detected in the hw_init
phase. Also align the pcie_bif ras sw_init with other
ras blocks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 23 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 17 ++---
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index 37d779b8e4a6..a3bc00577a7c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -22,6 +22,29 @@
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
 
+int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_nbio_ras *ras;
+
+   if (!adev->nbio.ras)
+   return 0;
+
+   ras = adev->nbio.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register pcie_bif ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "pcie_bif");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__PCIE_BIF;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->nbio.ras_if = &ras->ras_block.ras_comm;
+
+   return 0;
+}
+
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block)
 {
int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index a240336bbc6b..c686ff4bcc39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -106,5 +106,6 @@ struct amdgpu_nbio {
struct amdgpu_nbio_ras  *ras;
 };
 
+int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 63dfcc98152d..834092099bff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2555,20 +2555,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 * ras functions so hardware fatal error interrupt
 * can be enabled as early as possible */
switch (adev->asic_type) {
-   case CHIP_VEGA20:
-   case CHIP_ARCTURUS:
-   case CHIP_ALDEBARAN:
-   if (!adev->gmc.xgmi.connected_to_cpu) {
+   case IP_VERSION(7, 4, 0):
+   case IP_VERSION(7, 4, 1):
+   case IP_VERSION(7, 4, 4):
+   if (!adev->gmc.xgmi.connected_to_cpu)
adev->nbio.ras = &nbio_v7_4_ras;
-   amdgpu_ras_register_ras_block(adev, 
&adev->nbio.ras->ras_block);
-   adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
-   }
break;
default:
/* nbio ras is not available */
break;
}
 
+   /* nbio ras block needs to be enabled ahead of other ras blocks
+* to handle fatal error */
+   r = amdgpu_nbio_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (adev->nbio.ras &&
adev->nbio.ras->init_ras_controller_interrupt) {
r = adev->nbio.ras->init_ras_controller_interrupt(adev);
-- 
2.17.1

[PATCH 06/10] drm/amdgpu: Move hdp ras block init to ras sw_init

2023-03-12 Thread Hawking Zhang

Initialize hdp ras block only when mmhub ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c | 48 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c   |  5 ---
 6 files changed, 55 insertions(+), 9 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 00c33ce38761..5f9ac1bcb6bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -54,7 +54,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
-   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o \
+   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o amdgpu_hdp.o \
amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index a15bc513dd67..551884dc5245 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -460,6 +460,11 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
if (r)
return r;
 
+   /* hdp ras block */
+   r = amdgpu_hdp_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (!adev->gmc.xgmi.connected_to_cpu) {
adev->gmc.xgmi.ras = &xgmi_ras;
amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
new file mode 100644
index ..b6cf801939aa
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "amdgpu_ras.h"
+
+int amdgpu_hdp_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_hdp_ras *ras;
+
+   if (!adev->hdp.ras)
+   return 0;
+
+   ras = adev->hdp.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register hdp ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "hdp");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__HDP;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->hdp.ras_if = &ras->ras_block.ras_comm;
+
+   /* hdp ras follows amdgpu_ras_block_late_init_default for late init */
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
index ac5c61d3de2b..7b8a6152dc8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
@@ -43,5 +43,5 @@ struct amdgpu_hdp {
struct amdgpu_hdp_ras   *ras;
 };
 
-int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
+int amdgpu_hdp_ras_sw_init(struct amdgpu_device *adev);
 #endif /* __AMDGPU_HDP_H__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdg

[PATCH 05/10] drm/amdgpu: Move mmhub ras block init to ras sw_init

2023-03-12 Thread Hawking Zhang

Initialize mmhub ras block only when mmhub ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c | 46 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  2 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  9 -
 5 files changed, 54 insertions(+), 10 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index d4dfd48451ce..00c33ce38761 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -54,7 +54,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
-   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o \
+   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o \
amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index c764b57957e8..a15bc513dd67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -455,6 +455,11 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
if (r)
return r;
 
+   /* mmhub ras block */
+   r = amdgpu_mmhub_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (!adev->gmc.xgmi.connected_to_cpu) {
adev->gmc.xgmi.ras = &xgmi_ras;
amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
new file mode 100644
index ..0f6b1021fef3
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2023  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "amdgpu.h"
+#include "amdgpu_ras.h"
+
+int amdgpu_mmhub_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_mmhub_ras *ras;
+
+   if (!adev->mmhub.ras)
+   return 0;
+
+   ras = adev->mmhub.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register mmhub ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "mmhub");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MMHUB;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->mmhub.ras_if = &ras->ras_block.ras_comm;
+
+   /* mmhub ras follows amdgpu_ras_block_late_init_default for late init */
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 93430d3823c9..d21bb6dae56e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -48,5 +48,7 @@ struct amdgpu_mmhub {
struct amdgpu_mmhub_ras  *ras;
 };
 
+int amdgpu_mmhub_ras_sw_init(struct amdgpu_device *adev);
+
 #endif
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index e9b6599e790c..b3bb70210122 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -13

[PATCH 03/10] drm/amdgpu: Move umc ras block init to gmc ras sw_init

2023-03-12 Thread Hawking Zhang

Initialize umc ras block only when umc ip block
supports ras. Driver queries ras capabilities after
early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  9 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 30 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  | 26 -
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 21 -
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 26 -
 7 files changed, 52 insertions(+), 63 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 6830f671cde7..c764b57957e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -446,8 +446,15 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device 
*adev, uint64_t addr,
} while (fault->timestamp < tmp);
 }
 
-int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev)
+int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
 {
+   int r;
+
+   /* umc ras block */
+   r = amdgpu_umc_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (!adev->gmc.xgmi.connected_to_cpu) {
adev->gmc.xgmi.ras = &xgmi_ras;
amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 0305b660cd17..f1773abd5e1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -351,7 +351,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
  uint16_t pasid, uint64_t timestamp);
 void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
 uint16_t pasid);
-int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
+int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
 int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1b8574bc4463..da68ceaa024c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -208,6 +208,36 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device 
*adev,
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, 
true);
 }
 
+int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_umc_ras *ras;
+
+   if (!adev->umc.ras)
+   return 0;
+
+   ras = adev->umc.ras;
+
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register umc ras block!\n");
+   return err;
+   }
+
+   strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->umc.ras_if = &ras->ras_block.ras_comm;
+
+   if (!ras->ras_block.ras_late_init)
+   ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;
+
+   if (ras->ras_block.ras_cb)
+   ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;
+
+   return 0;
+}
+
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block)
 {
int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 36e19336f3b3..d7f1229ff11f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -87,6 +87,7 @@ struct amdgpu_umc {
unsigned long active_mask;
 };
 
+int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
 int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index c59c2332d191..924f6f38fae6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -703,25 +703,8 @@ static void gmc_v10_0_set_umc_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-   if (adev->umc.ras) {
-   amdgpu_ras_register_ras_block(adev, &adev->umc.ras->ras_block);
-
-   strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc");
-

[PATCH 02/10] drm/amdgpu: Move vcn ras block init to ras sw_init

2023-03-12 Thread Hawking Zhang

Initialize vcn ras block only when vcn ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 29 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   |  6 +++--
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   |  6 +++--
 4 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 02d428ddf2f8..8664a5301b2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1158,19 +1158,28 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device 
*adev,
return 0;
 }
 
-void amdgpu_vcn_set_ras_funcs(struct amdgpu_device *adev)
+int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev)
 {
+   int err;
+   struct amdgpu_vcn_ras *ras;
+
if (!adev->vcn.ras)
-   return;
+   return 0;
 
-   amdgpu_ras_register_ras_block(adev, &adev->vcn.ras->ras_block);
+   ras = adev->vcn.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register vcn ras block!\n");
+   return err;
+   }
 
-   strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
-   adev->vcn.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
-   adev->vcn.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
-   adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm;
+   strcpy(ras->ras_block.ras_comm.name, "vcn");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
+   adev->vcn.ras_if = &ras->ras_block.ras_comm;
 
-   /* If don't define special ras_late_init function, use default 
ras_late_init */
-   if (!adev->vcn.ras->ras_block.ras_late_init)
-   adev->vcn.ras->ras_block.ras_late_init = 
amdgpu_ras_block_late_init;
+   if (!ras->ras_block.ras_late_init)
+   ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+
+   return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index d3e2af902907..c730949ece7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -400,6 +400,6 @@ void amdgpu_debugfs_vcn_fwlog_init(struct amdgpu_device 
*adev,
 int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
-void amdgpu_vcn_set_ras_funcs(struct amdgpu_device *adev);
+int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index b0b0e69c6a94..223e7dfe4618 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -225,6 +225,10 @@ static int vcn_v2_5_sw_init(void *handle)
if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG)
adev->vcn.pause_dpg_mode = vcn_v2_5_pause_dpg_mode;
 
+   r = amdgpu_vcn_ras_sw_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -2031,6 +2035,4 @@ static void vcn_v2_5_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   amdgpu_vcn_set_ras_funcs(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 43d587404c3e..720ab36f9c92 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -181,6 +181,10 @@ static int vcn_v4_0_sw_init(void *handle)
if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG)
adev->vcn.pause_dpg_mode = vcn_v4_0_pause_dpg_mode;
 
+   r = amdgpu_vcn_ras_sw_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -2123,6 +2127,4 @@ static void vcn_v4_0_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   amdgpu_vcn_set_ras_funcs(adev);
 }
-- 
2.17.1

[PATCH 04/10] drm/amdgpu: Correct gfx ras_late_init callback

2023-03-12 Thread Hawking Zhang

Use default gfx ras_late_init callback for gfx
ras block.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 35ed46b9249c..c50d59855011 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -725,7 +725,7 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
 
/* If not define special ras_late_init function, use gfx default 
ras_late_init */
if (!ras->ras_block.ras_late_init)
-   ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+   ras->ras_block.ras_late_init = amdgpu_gfx_ras_late_init;
 
/* If not defined special ras_cb function, use default ras_cb */
if (!ras->ras_block.ras_cb)
-- 
2.17.1

[PATCH 01/10] drm/amdgpu: Move jpeg ras block init to ras sw_init

2023-03-12 Thread Hawking Zhang

Initialize jpeg ras block only when jpeg ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 29 
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c   |  6 +++--
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c   |  6 +++--
 4 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 8f472517d181..74695161cf1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -235,19 +235,28 @@ int amdgpu_jpeg_process_poison_irq(struct amdgpu_device 
*adev,
return 0;
 }
 
-void jpeg_set_ras_funcs(struct amdgpu_device *adev)
+int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev)
 {
+   int err;
+   struct amdgpu_jpeg_ras *ras;
+
if (!adev->jpeg.ras)
-   return;
+   return 0;
 
-   amdgpu_ras_register_ras_block(adev, &adev->jpeg.ras->ras_block);
+   ras = adev->jpeg.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register jpeg ras block!\n");
+   return err;
+   }
 
-   strcpy(adev->jpeg.ras->ras_block.ras_comm.name, "jpeg");
-   adev->jpeg.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__JPEG;
-   adev->jpeg.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
-   adev->jpeg.ras_if = &adev->jpeg.ras->ras_block.ras_comm;
+   strcpy(ras->ras_block.ras_comm.name, "jpeg");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__JPEG;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
+   adev->jpeg.ras_if = &ras->ras_block.ras_comm;
 
-   /* If don't define special ras_late_init function, use default 
ras_late_init */
-   if (!adev->jpeg.ras->ras_block.ras_late_init)
-   adev->jpeg.ras->ras_block.ras_late_init = 
amdgpu_ras_block_late_init;
+   if (!ras->ras_block.ras_late_init)
+   ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+
+   return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
index e8ca3e32ad52..0ca76f0f23e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
@@ -72,6 +72,6 @@ int amdgpu_jpeg_dec_ring_test_ib(struct amdgpu_ring *ring, 
long timeout);
 int amdgpu_jpeg_process_poison_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
-void jpeg_set_ras_funcs(struct amdgpu_device *adev);
+int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev);
 
 #endif /*__AMDGPU_JPEG_H__*/
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
index f2b743a93915..6b1887808782 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
@@ -138,6 +138,10 @@ static int jpeg_v2_5_sw_init(void *handle)
adev->jpeg.inst[i].external.jpeg_pitch = SOC15_REG_OFFSET(JPEG, 
i, mmUVD_JPEG_PITCH);
}
 
+   r = amdgpu_jpeg_ras_sw_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -806,6 +810,4 @@ static void jpeg_v2_5_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   jpeg_set_ras_funcs(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
index 3beb731b2ce5..3129094baccc 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
@@ -113,6 +113,10 @@ static int jpeg_v4_0_sw_init(void *handle)
adev->jpeg.internal.jpeg_pitch = regUVD_JPEG_PITCH_INTERNAL_OFFSET;
adev->jpeg.inst->external.jpeg_pitch = SOC15_REG_OFFSET(JPEG, 0, 
regUVD_JPEG_PITCH);
 
+   r = amdgpu_jpeg_ras_sw_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -685,6 +689,4 @@ static void jpeg_v4_0_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   jpeg_set_ras_funcs(adev);
 }
-- 
2.17.1

[PATCH 00/10] add ras sw_init (v2)

2023-03-12 Thread Hawking Zhang

We are moving from soc ras to ip ras to address issues as follows
- RAS sw block init is mixed in early_init and sw_init
- RAS cap check is mixed with both soc check and ip check.

RAS cap check is now only available in amdgpu_ras_init,
based on the cap query from bios. RAS sw block init is all
moved to ras sw_init and follows ip based ras cap check
from amdgpu_ras_init, instead of the check in soc level.

v2: simplify the ras check (Stanley/Tao)

Hawking Zhang (10):
  drm/amdgpu: Move jpeg ras block init to ras sw_init
  drm/amdgpu: Move vcn ras block init to ras sw_init
  drm/amdgpu: Move umc ras block init to gmc ras sw_init
  drm/amdgpu: Correct gfx ras_late_init callback
  drm/amdgpu: Move mmhub ras block init to ras sw_init
  drm/amdgpu: Move hdp ras block init to ras sw_init
  drm/amdgpu: Rework mca ras sw_init
  drm/amdgpu: Rework xgmi_wafl_pcs ras sw_init
  drm/amdgpu: Rework pcie_bif ras sw_init
  drm/amdgpu: drop ras check at asic level for new blocks

 drivers/gpu/drm/amd/amdgpu/Makefile   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c   | 41 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c   | 48 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c  | 29 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c   | 72 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h   |  9 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c | 46 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c  | 23 
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 20 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   | 30 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   | 29 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  | 28 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c| 26 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c| 21 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 59 +++
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c |  5 --
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c|  6 +-
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c|  6 +-
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c | 44 +-
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.h |  4 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c |  6 +-
 31 files changed, 389 insertions(+), 186 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c

-- 
2.17.1

[PATCH] drm/amdgpu: Retire pcie_gen3_enable function

2023-03-06 Thread Hawking Zhang

Not needed since vi. Drop the function so
we don't duplicate code when introducing new asics.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/nv.c| 17 -
 drivers/gpu/drm/amd/amdgpu/soc15.c | 20 
 drivers/gpu/drm/amd/amdgpu/soc21.c | 17 -
 drivers/gpu/drm/amd/amdgpu/vi.c| 20 
 4 files changed, 74 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 492a8b148227..d56cba10cd5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -519,21 +519,6 @@ static int nv_set_vce_clocks(struct amdgpu_device *adev, 
u32 evclk, u32 ecclk)
return 0;
 }
 
-static void nv_pcie_gen3_enable(struct amdgpu_device *adev)
-{
-   if (pci_is_root_bus(adev->pdev->bus))
-   return;
-
-   if (amdgpu_pcie_gen2 == 0)
-   return;
-
-   if (!(adev->pm.pcie_gen_mask & (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
-   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)))
-   return;
-
-   /* todo */
-}
-
 static void nv_program_aspm(struct amdgpu_device *adev)
 {
if (!amdgpu_device_should_use_aspm(adev))
@@ -1041,8 +1026,6 @@ static int nv_common_hw_init(void *handle)
if (adev->nbio.funcs->apply_l1_link_width_reconfig_wa)
adev->nbio.funcs->apply_l1_link_width_reconfig_wa(adev);
 
-   /* enable pcie gen2/3 link */
-   nv_pcie_gen3_enable(adev);
/* enable aspm */
nv_program_aspm(adev);
/* setup nbio registers */
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 2c37b49f5c00..1064972dc558 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -609,24 +609,6 @@ static int soc15_set_vce_clocks(struct amdgpu_device 
*adev, u32 evclk, u32 ecclk
return 0;
 }
 
-static void soc15_pcie_gen3_enable(struct amdgpu_device *adev)
-{
-   if (pci_is_root_bus(adev->pdev->bus))
-   return;
-
-   if (amdgpu_pcie_gen2 == 0)
-   return;
-
-   if (adev->flags & AMD_IS_APU)
-   return;
-
-   if (!(adev->pm.pcie_gen_mask & (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
-   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)))
-   return;
-
-   /* todo */
-}
-
 static void soc15_program_aspm(struct amdgpu_device *adev)
 {
if (!amdgpu_device_should_use_aspm(adev))
@@ -1183,8 +1165,6 @@ static int soc15_common_hw_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-   /* enable pcie gen2/3 link */
-   soc15_pcie_gen3_enable(adev);
/* enable aspm */
soc15_program_aspm(adev);
/* setup nbio registers */
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c 
b/drivers/gpu/drm/amd/amdgpu/soc21.c
index 21e271877c4c..e56f2bc73930 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -370,21 +370,6 @@ static int soc21_set_vce_clocks(struct amdgpu_device 
*adev, u32 evclk, u32 ecclk
return 0;
 }
 
-static void soc21_pcie_gen3_enable(struct amdgpu_device *adev)
-{
-   if (pci_is_root_bus(adev->pdev->bus))
-   return;
-
-   if (amdgpu_pcie_gen2 == 0)
-   return;
-
-   if (!(adev->pm.pcie_gen_mask & (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
-   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)))
-   return;
-
-   /* todo */
-}
-
 static void soc21_program_aspm(struct amdgpu_device *adev)
 {
if (!amdgpu_device_should_use_aspm(adev))
@@ -714,8 +699,6 @@ static int soc21_common_hw_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-   /* enable pcie gen2/3 link */
-   soc21_pcie_gen3_enable(adev);
/* enable aspm */
soc21_program_aspm(adev);
/* setup nbio registers */
diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
index 12ef782eb478..2512b70ea992 100644
--- a/drivers/gpu/drm/amd/amdgpu/vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/vi.c
@@ -1105,24 +1105,6 @@ static int vi_set_vce_clocks(struct amdgpu_device *adev, 
u32 evclk, u32 ecclk)
return 0;
 }
 
-static void vi_pcie_gen3_enable(struct amdgpu_device *adev)
-{
-   if (pci_is_root_bus(adev->pdev->bus))
-   return;
-
-   if (amdgpu_pcie_gen2 == 0)
-   return;
-
-   if (adev->flags & AMD_IS_APU)
-   return;
-
-   if (!(adev->pm.pcie_gen_mask & (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
-   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)))
-   return;
-
-   /* todo */
-}
-
 static void vi_enable_aspm(struct amdgpu_device *adev)
 {
u32 data, orig;
@@ -1743,8 +1725,6 @@ static

[PATCH 2/2] drm/amdgpu: Move to common helper to query soc rev_id

2023-03-06 Thread Hawking Zhang

Replace the soc15, nv, and soc21 get_rev_id callbacks with a
common helper so we don't need to duplicate code when
introducing new ASICs.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 
 drivers/gpu/drm/amd/amdgpu/nv.c|  7 +--
 drivers/gpu/drm/amd/amdgpu/soc15.c |  7 +--
 drivers/gpu/drm/amd/amdgpu/soc21.c |  7 +--
 5 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9387731afb8b..527795f921a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1149,7 +1149,7 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device 
*adev,
 u32 reg_addr, u32 reg_data);
 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
   u32 reg_addr, u64 reg_data);
-
+u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev);
 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b1b815dc69b3..13fa8a2709c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -807,6 +807,18 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device 
*adev,
spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 }
 
+/**
+ * amdgpu_device_get_rev_id - query device rev_id
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Return device rev_id
+ */
+u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
+{
+   return adev->nbio.funcs->get_rev_id(adev);
+}
+
 /**
  * amdgpu_invalid_rreg - dummy reg read function
  *
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 60c10132ed32..492a8b148227 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -566,11 +566,6 @@ void nv_set_virt_ops(struct amdgpu_device *adev)
adev->virt.ops = &xgpu_nv_virt_ops;
 }
 
-static uint32_t nv_get_rev_id(struct amdgpu_device *adev)
-{
-   return adev->nbio.funcs->get_rev_id(adev);
-}
-
 static bool nv_need_full_reset(struct amdgpu_device *adev)
 {
return true;
@@ -712,7 +707,7 @@ static int nv_common_early_init(void *handle)
 
adev->asic_funcs = &nv_asic_funcs;
 
-   adev->rev_id = nv_get_rev_id(adev);
+   adev->rev_id = amdgpu_device_get_rev_id(adev);
adev->external_rev_id = 0xff;
/* TODO: split the GC and PG flags based on the relevant IP version for 
which
 * they are relevant.
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 6392d10e6eaf..2c37b49f5c00 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -653,11 +653,6 @@ const struct amdgpu_ip_block_version 
vega10_common_ip_block =
.funcs = &soc15_common_ip_funcs,
 };
 
-static uint32_t soc15_get_rev_id(struct amdgpu_device *adev)
-{
-   return adev->nbio.funcs->get_rev_id(adev);
-}
-
 static void soc15_reg_base_init(struct amdgpu_device *adev)
 {
/* Set IP register base before any HW register access */
@@ -907,7 +902,7 @@ static int soc15_common_early_init(void *handle)
adev->se_cac_rreg = &soc15_se_cac_rreg;
adev->se_cac_wreg = &soc15_se_cac_wreg;
 
-   adev->rev_id = soc15_get_rev_id(adev);
+   adev->rev_id = amdgpu_device_get_rev_id(adev);
adev->external_rev_id = 0xFF;
/* TODO: split the GC and PG flags based on the relevant IP version for 
which
 * they are relevant.
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c 
b/drivers/gpu/drm/amd/amdgpu/soc21.c
index 9d91e20a22bb..21e271877c4c 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -411,11 +411,6 @@ const struct amdgpu_ip_block_version soc21_common_ip_block 
=
.funcs = &soc21_common_ip_funcs,
 };
 
-static uint32_t soc21_get_rev_id(struct amdgpu_device *adev)
-{
-   return adev->nbio.funcs->get_rev_id(adev);
-}
-
 static bool soc21_need_full_reset(struct amdgpu_device *adev)
 {
switch (adev->ip_versions[GC_HWIP][0]) {
@@ -557,7 +552,7 @@ static int soc21_common_early_init(void *handle)
 
adev->asic_funcs = &soc21_asic_funcs;
 
-   adev->rev_id = soc21_get_rev_id(adev);
+   adev->rev_id = amdgpu_device_get_rev_id(adev);
adev->external_rev_id = 0xff;
switch (adev->ip_versions[GC_HWIP][0]) {
case IP_VERSION(11, 0, 0):
-- 
2.17.1

1 2 3 4 >

1 - 100 of 395 matches

Mail list logo