From: Ofir Bitton <obit...@habana.ai>

In order for the user to be aware of undefined opcode events, we must
store all relevant information and notify the user about the failure.
The user can then fetch the stored info via the info ioctl.

Signed-off-by: Ofir Bitton <obit...@habana.ai>
Reviewed-by: Oded Gabbay <ogab...@kernel.org>
Signed-off-by: Oded Gabbay <ogab...@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 142 ++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 5 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 20c4583f12b0..ed3b0b6225d2 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -993,6 +993,111 @@ 
gaudi2_pcie_addr_dec_error_cause[GAUDI2_NUM_OF_PCIE_ADDR_DEC_ERR_CAUSE] = {
        "TLP is blocked by RR"
 };
 
+static const int gaudi2_queue_id_to_engine_id[] = {
+       [GAUDI2_QUEUE_ID_PDMA_0_0...GAUDI2_QUEUE_ID_PDMA_0_3] = 
GAUDI2_ENGINE_ID_PDMA_0,
+       [GAUDI2_QUEUE_ID_PDMA_1_0...GAUDI2_QUEUE_ID_PDMA_1_3] = 
GAUDI2_ENGINE_ID_PDMA_1,
+       [GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE0_EDMA_0_3] =
+                                                       
GAUDI2_DCORE0_ENGINE_ID_EDMA_0,
+       [GAUDI2_QUEUE_ID_DCORE0_EDMA_1_0...GAUDI2_QUEUE_ID_DCORE0_EDMA_1_3] =
+                                                       
GAUDI2_DCORE0_ENGINE_ID_EDMA_1,
+       [GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE1_EDMA_0_3] =
+                                                       
GAUDI2_DCORE1_ENGINE_ID_EDMA_0,
+       [GAUDI2_QUEUE_ID_DCORE1_EDMA_1_0...GAUDI2_QUEUE_ID_DCORE1_EDMA_1_3] =
+                                                       
GAUDI2_DCORE1_ENGINE_ID_EDMA_1,
+       [GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE2_EDMA_0_3] =
+                                                       
GAUDI2_DCORE2_ENGINE_ID_EDMA_0,
+       [GAUDI2_QUEUE_ID_DCORE2_EDMA_1_0...GAUDI2_QUEUE_ID_DCORE2_EDMA_1_3] =
+                                                       
GAUDI2_DCORE2_ENGINE_ID_EDMA_1,
+       [GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE3_EDMA_0_3] =
+                                                       
GAUDI2_DCORE3_ENGINE_ID_EDMA_0,
+       [GAUDI2_QUEUE_ID_DCORE3_EDMA_1_0...GAUDI2_QUEUE_ID_DCORE3_EDMA_1_3] =
+                                                       
GAUDI2_DCORE3_ENGINE_ID_EDMA_1,
+       [GAUDI2_QUEUE_ID_DCORE0_MME_0_0...GAUDI2_QUEUE_ID_DCORE0_MME_0_3] =
+                                                       
GAUDI2_DCORE0_ENGINE_ID_MME,
+       [GAUDI2_QUEUE_ID_DCORE1_MME_0_0...GAUDI2_QUEUE_ID_DCORE1_MME_0_3] =
+                                                       
GAUDI2_DCORE1_ENGINE_ID_MME,
+       [GAUDI2_QUEUE_ID_DCORE2_MME_0_0...GAUDI2_QUEUE_ID_DCORE2_MME_0_3] =
+                                                       
GAUDI2_DCORE2_ENGINE_ID_MME,
+       [GAUDI2_QUEUE_ID_DCORE3_MME_0_0...GAUDI2_QUEUE_ID_DCORE3_MME_0_3] =
+                                                       
GAUDI2_DCORE3_ENGINE_ID_MME,
+       [GAUDI2_QUEUE_ID_DCORE0_TPC_0_0...GAUDI2_QUEUE_ID_DCORE0_TPC_0_3] =
+                                                       
GAUDI2_DCORE0_ENGINE_ID_TPC_0,
+       [GAUDI2_QUEUE_ID_DCORE0_TPC_1_0...GAUDI2_QUEUE_ID_DCORE0_TPC_1_3] =
+                                                       
GAUDI2_DCORE0_ENGINE_ID_TPC_1,
+       [GAUDI2_QUEUE_ID_DCORE0_TPC_2_0...GAUDI2_QUEUE_ID_DCORE0_TPC_2_3] =
+                                                       
GAUDI2_DCORE0_ENGINE_ID_TPC_2,
+       [GAUDI2_QUEUE_ID_DCORE0_TPC_3_0...GAUDI2_QUEUE_ID_DCORE0_TPC_3_3] =
+                                                       
GAUDI2_DCORE0_ENGINE_ID_TPC_3,
+       [GAUDI2_QUEUE_ID_DCORE0_TPC_4_0...GAUDI2_QUEUE_ID_DCORE0_TPC_4_3] =
+                                                       
GAUDI2_DCORE0_ENGINE_ID_TPC_4,
+       [GAUDI2_QUEUE_ID_DCORE0_TPC_5_0...GAUDI2_QUEUE_ID_DCORE0_TPC_5_3] =
+                                                       
GAUDI2_DCORE0_ENGINE_ID_TPC_5,
+       [GAUDI2_QUEUE_ID_DCORE0_TPC_6_0...GAUDI2_QUEUE_ID_DCORE0_TPC_6_3] =
+                                                       
GAUDI2_DCORE0_ENGINE_ID_TPC_6,
+       [GAUDI2_QUEUE_ID_DCORE1_TPC_0_0...GAUDI2_QUEUE_ID_DCORE1_TPC_0_3] =
+                                                       
GAUDI2_DCORE1_ENGINE_ID_TPC_0,
+       [GAUDI2_QUEUE_ID_DCORE1_TPC_1_0...GAUDI2_QUEUE_ID_DCORE1_TPC_1_3] =
+                                                       
GAUDI2_DCORE1_ENGINE_ID_TPC_1,
+       [GAUDI2_QUEUE_ID_DCORE1_TPC_2_0...GAUDI2_QUEUE_ID_DCORE1_TPC_2_3] =
+                                                       
GAUDI2_DCORE1_ENGINE_ID_TPC_2,
+       [GAUDI2_QUEUE_ID_DCORE1_TPC_3_0...GAUDI2_QUEUE_ID_DCORE1_TPC_3_3] =
+                                                       
GAUDI2_DCORE1_ENGINE_ID_TPC_3,
+       [GAUDI2_QUEUE_ID_DCORE1_TPC_4_0...GAUDI2_QUEUE_ID_DCORE1_TPC_4_3] =
+                                                       
GAUDI2_DCORE1_ENGINE_ID_TPC_4,
+       [GAUDI2_QUEUE_ID_DCORE1_TPC_5_0...GAUDI2_QUEUE_ID_DCORE1_TPC_5_3] =
+                                                       
GAUDI2_DCORE1_ENGINE_ID_TPC_5,
+       [GAUDI2_QUEUE_ID_DCORE2_TPC_0_0...GAUDI2_QUEUE_ID_DCORE2_TPC_0_3] =
+                                                       
GAUDI2_DCORE2_ENGINE_ID_TPC_0,
+       [GAUDI2_QUEUE_ID_DCORE2_TPC_1_0...GAUDI2_QUEUE_ID_DCORE2_TPC_1_3] =
+                                                       
GAUDI2_DCORE2_ENGINE_ID_TPC_1,
+       [GAUDI2_QUEUE_ID_DCORE2_TPC_2_0...GAUDI2_QUEUE_ID_DCORE2_TPC_2_3] =
+                                                       
GAUDI2_DCORE2_ENGINE_ID_TPC_2,
+       [GAUDI2_QUEUE_ID_DCORE2_TPC_3_0...GAUDI2_QUEUE_ID_DCORE2_TPC_3_3] =
+                                                       
GAUDI2_DCORE2_ENGINE_ID_TPC_3,
+       [GAUDI2_QUEUE_ID_DCORE2_TPC_4_0...GAUDI2_QUEUE_ID_DCORE2_TPC_4_3] =
+                                                       
GAUDI2_DCORE2_ENGINE_ID_TPC_4,
+       [GAUDI2_QUEUE_ID_DCORE2_TPC_5_0...GAUDI2_QUEUE_ID_DCORE2_TPC_5_3] =
+                                                       
GAUDI2_DCORE2_ENGINE_ID_TPC_5,
+       [GAUDI2_QUEUE_ID_DCORE3_TPC_0_0...GAUDI2_QUEUE_ID_DCORE3_TPC_0_3] =
+                                                       
GAUDI2_DCORE3_ENGINE_ID_TPC_0,
+       [GAUDI2_QUEUE_ID_DCORE3_TPC_1_0...GAUDI2_QUEUE_ID_DCORE3_TPC_1_3] =
+                                                       
GAUDI2_DCORE3_ENGINE_ID_TPC_1,
+       [GAUDI2_QUEUE_ID_DCORE3_TPC_2_0...GAUDI2_QUEUE_ID_DCORE3_TPC_2_3] =
+                                                       
GAUDI2_DCORE3_ENGINE_ID_TPC_2,
+       [GAUDI2_QUEUE_ID_DCORE3_TPC_3_0...GAUDI2_QUEUE_ID_DCORE3_TPC_3_3] =
+                                                       
GAUDI2_DCORE3_ENGINE_ID_TPC_3,
+       [GAUDI2_QUEUE_ID_DCORE3_TPC_4_0...GAUDI2_QUEUE_ID_DCORE3_TPC_4_3] =
+                                                       
GAUDI2_DCORE3_ENGINE_ID_TPC_4,
+       [GAUDI2_QUEUE_ID_DCORE3_TPC_5_0...GAUDI2_QUEUE_ID_DCORE3_TPC_5_3] =
+                                                       
GAUDI2_DCORE3_ENGINE_ID_TPC_5,
+       [GAUDI2_QUEUE_ID_NIC_0_0...GAUDI2_QUEUE_ID_NIC_0_3] = 
GAUDI2_ENGINE_ID_NIC0_0,
+       [GAUDI2_QUEUE_ID_NIC_1_0...GAUDI2_QUEUE_ID_NIC_1_3] = 
GAUDI2_ENGINE_ID_NIC0_1,
+       [GAUDI2_QUEUE_ID_NIC_2_0...GAUDI2_QUEUE_ID_NIC_2_3] = 
GAUDI2_ENGINE_ID_NIC1_0,
+       [GAUDI2_QUEUE_ID_NIC_3_0...GAUDI2_QUEUE_ID_NIC_3_3] = 
GAUDI2_ENGINE_ID_NIC1_1,
+       [GAUDI2_QUEUE_ID_NIC_4_0...GAUDI2_QUEUE_ID_NIC_4_3] = 
GAUDI2_ENGINE_ID_NIC2_0,
+       [GAUDI2_QUEUE_ID_NIC_5_0...GAUDI2_QUEUE_ID_NIC_5_3] = 
GAUDI2_ENGINE_ID_NIC2_1,
+       [GAUDI2_QUEUE_ID_NIC_6_0...GAUDI2_QUEUE_ID_NIC_6_3] = 
GAUDI2_ENGINE_ID_NIC3_0,
+       [GAUDI2_QUEUE_ID_NIC_7_0...GAUDI2_QUEUE_ID_NIC_7_3] = 
GAUDI2_ENGINE_ID_NIC3_1,
+       [GAUDI2_QUEUE_ID_NIC_8_0...GAUDI2_QUEUE_ID_NIC_8_3] = 
GAUDI2_ENGINE_ID_NIC4_0,
+       [GAUDI2_QUEUE_ID_NIC_9_0...GAUDI2_QUEUE_ID_NIC_9_3] = 
GAUDI2_ENGINE_ID_NIC4_1,
+       [GAUDI2_QUEUE_ID_NIC_10_0...GAUDI2_QUEUE_ID_NIC_10_3] = 
GAUDI2_ENGINE_ID_NIC5_0,
+       [GAUDI2_QUEUE_ID_NIC_11_0...GAUDI2_QUEUE_ID_NIC_11_3] = 
GAUDI2_ENGINE_ID_NIC5_1,
+       [GAUDI2_QUEUE_ID_NIC_12_0...GAUDI2_QUEUE_ID_NIC_12_3] = 
GAUDI2_ENGINE_ID_NIC6_0,
+       [GAUDI2_QUEUE_ID_NIC_13_0...GAUDI2_QUEUE_ID_NIC_13_3] = 
GAUDI2_ENGINE_ID_NIC6_1,
+       [GAUDI2_QUEUE_ID_NIC_14_0...GAUDI2_QUEUE_ID_NIC_14_3] = 
GAUDI2_ENGINE_ID_NIC7_0,
+       [GAUDI2_QUEUE_ID_NIC_15_0...GAUDI2_QUEUE_ID_NIC_15_3] = 
GAUDI2_ENGINE_ID_NIC7_1,
+       [GAUDI2_QUEUE_ID_NIC_16_0...GAUDI2_QUEUE_ID_NIC_16_3] = 
GAUDI2_ENGINE_ID_NIC8_0,
+       [GAUDI2_QUEUE_ID_NIC_17_0...GAUDI2_QUEUE_ID_NIC_17_3] = 
GAUDI2_ENGINE_ID_NIC8_1,
+       [GAUDI2_QUEUE_ID_NIC_18_0...GAUDI2_QUEUE_ID_NIC_18_3] = 
GAUDI2_ENGINE_ID_NIC9_0,
+       [GAUDI2_QUEUE_ID_NIC_19_0...GAUDI2_QUEUE_ID_NIC_19_3] = 
GAUDI2_ENGINE_ID_NIC9_1,
+       [GAUDI2_QUEUE_ID_NIC_20_0...GAUDI2_QUEUE_ID_NIC_20_3] = 
GAUDI2_ENGINE_ID_NIC10_0,
+       [GAUDI2_QUEUE_ID_NIC_21_0...GAUDI2_QUEUE_ID_NIC_21_3] = 
GAUDI2_ENGINE_ID_NIC10_1,
+       [GAUDI2_QUEUE_ID_NIC_22_0...GAUDI2_QUEUE_ID_NIC_22_3] = 
GAUDI2_ENGINE_ID_NIC11_0,
+       [GAUDI2_QUEUE_ID_NIC_23_0...GAUDI2_QUEUE_ID_NIC_23_3] = 
GAUDI2_ENGINE_ID_NIC11_1,
+       [GAUDI2_QUEUE_ID_ROT_0_0...GAUDI2_QUEUE_ID_ROT_0_3] = 
GAUDI2_ENGINE_ID_ROT_0,
+       [GAUDI2_QUEUE_ID_ROT_1_0...GAUDI2_QUEUE_ID_ROT_1_3] = 
GAUDI2_ENGINE_ID_ROT_1,
+};
+
 const u32 gaudi2_qm_blocks_bases[GAUDI2_QUEUE_ID_SIZE] = {
        [GAUDI2_QUEUE_ID_PDMA_0_0] = mmPDMA0_QM_BASE,
        [GAUDI2_QUEUE_ID_PDMA_0_1] = mmPDMA0_QM_BASE,
@@ -7753,7 +7858,7 @@ static bool gaudi2_handle_ecc_event(struct hl_device 
*hdev, u16 event_type,
        return !!ecc_data->is_critical;
 }
 
-static void print_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base)
+static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 
qman_base, u64 event_mask)
 {
        u32 lo, hi, cq_ptr_size, arc_cq_ptr_size;
        u64 cq_ptr, arc_cq_ptr, cp_current_inst;
@@ -7775,10 +7880,22 @@ static void print_lower_qman_data_on_err(struct 
hl_device *hdev, u64 qman_base)
        dev_info(hdev->dev,
                "LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size 
%u}, CP: {instruction %#llx}\n",
                cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, 
cp_current_inst);
+
+       if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
+               if (arc_cq_ptr) {
+                       hdev->captured_err_info.undef_opcode.cq_addr = 
arc_cq_ptr;
+                       hdev->captured_err_info.undef_opcode.cq_size = 
arc_cq_ptr_size;
+               } else {
+                       hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
+                       hdev->captured_err_info.undef_opcode.cq_size = 
cq_ptr_size;
+               }
+
+               hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS;
+       }
 }
 
 static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 
event_type,
-                                                       u64 qman_base, u32 
qid_base)
+                                               u64 qman_base, u32 qid_base, 
u64 *event_mask)
 {
        u32 i, j, glbl_sts_val, arb_err_val, num_error_causes, error_count = 0;
        u64 glbl_sts_addr, arb_err_addr;
@@ -7812,8 +7929,22 @@ static int gaudi2_handle_qman_err_generic(struct 
hl_device *hdev, u16 event_type
                                error_count++;
                        }
 
-               if (i == QMAN_STREAMS)
-                       print_lower_qman_data_on_err(hdev, qman_base);
+               if (i == QMAN_STREAMS && error_count) {
+                       /* check for undefined opcode */
+                       if (glbl_sts_val & 
PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK &&
+                                       
hdev->captured_err_info.undef_opcode.write_enable) {
+                               memset(&hdev->captured_err_info.undef_opcode, 0,
+                                               
sizeof(hdev->captured_err_info.undef_opcode));
+
+                               
hdev->captured_err_info.undef_opcode.write_enable = false;
+                               hdev->captured_err_info.undef_opcode.timestamp 
= ktime_get();
+                               hdev->captured_err_info.undef_opcode.engine_id =
+                                                       
gaudi2_queue_id_to_engine_id[qid_base];
+                               *event_mask |= 
HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
+                       }
+
+                       handle_lower_qman_data_on_err(hdev, qman_base, 
*event_mask);
+               }
        }
 
        arb_err_val = RREG32(arb_err_addr);
@@ -8475,7 +8606,8 @@ static int gaudi2_handle_qman_err(struct hl_device *hdev, 
u16 event_type, u64 *e
                return 0;
        }
 
-       error_count = gaudi2_handle_qman_err_generic(hdev, event_type, 
qman_base, qid_base);
+       error_count = gaudi2_handle_qman_err_generic(hdev, event_type, 
qman_base,
+                                                               qid_base, 
event_mask);
 
        /* Handle EDMA QM SEI here because there is no AXI error response event 
for EDMA */
        if (event_type >= GAUDI2_EVENT_HDMA2_QM && event_type <= 
GAUDI2_EVENT_HDMA5_QM) {
-- 
2.40.1

Reply via email to