From: Tomer Tayar <tta...@habana.ai>

Upon a QM error, the address/size from both the CQ and the ARC_CQ are
printed, although the instruction that led to the error was received
from only one of them.

Moreover, in case of a QM undefined opcode, only one of these
address/size sets will be captured based on the value of ARC_CQ_PTR.
However, this value can be non-zero even when the CQ is the one currently
in use, if the CQ and ARC_CQ are used alternately.

Assuming a stop-on-error configuration, use the CP_STS.CUR_CQ field to
determine which CQ is relevant to the QM error.

Signed-off-by: Tomer Tayar <tta...@habana.ai>
Reviewed-by: Oded Gabbay <ogab...@kernel.org>
Signed-off-by: Oded Gabbay <ogab...@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c      | 44 +++++++++----------
 .../include/gaudi2/asic_reg/gaudi2_regs.h     |  1 +
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c 
b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 5075f92d15cc..77c480725a84 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -7860,36 +7860,36 @@ static bool gaudi2_handle_ecc_event(struct hl_device 
*hdev, u16 event_type,
 
 static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 
qman_base, u64 event_mask)
 {
-       u32 lo, hi, cq_ptr_size, arc_cq_ptr_size;
-       u64 cq_ptr, arc_cq_ptr, cp_current_inst;
-
-       lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
-       hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
-       cq_ptr = ((u64) hi) << 32 | lo;
-       cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
-
-       lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
-       hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
-       arc_cq_ptr = ((u64) hi) << 32 | lo;
-       arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
+       u32 lo, hi, cq_ptr_size, cp_sts;
+       u64 cq_ptr, cp_current_inst;
+       bool is_arc_cq;
+
+       cp_sts = RREG32(qman_base + QM_CP_STS_4_OFFSET);
+       is_arc_cq = FIELD_GET(PDMA0_QM_CP_STS_CUR_CQ_MASK, cp_sts); /* 0 - 
legacy CQ, 1 - ARC_CQ */
+
+       if (is_arc_cq) {
+               lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
+               hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
+               cq_ptr = ((u64) hi) << 32 | lo;
+               cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
+       } else {
+               lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
+               hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
+               cq_ptr = ((u64) hi) << 32 | lo;
+               cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
+       }
 
        lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET);
        hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET);
        cp_current_inst = ((u64) hi) << 32 | lo;
 
        dev_info(hdev->dev,
-               "LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size 
%u}, CP: {instruction %#llx}\n",
-               cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, 
cp_current_inst);
+               "LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction 
%#llx}\n",
+               is_arc_cq ? "ARC_" : "", cq_ptr, cq_ptr_size, cp_current_inst);
 
        if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
-               if (arc_cq_ptr) {
-                       hdev->captured_err_info.undef_opcode.cq_addr = 
arc_cq_ptr;
-                       hdev->captured_err_info.undef_opcode.cq_size = 
arc_cq_ptr_size;
-               } else {
-                       hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
-                       hdev->captured_err_info.undef_opcode.cq_size = 
cq_ptr_size;
-               }
-
+               hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
+               hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
                hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS;
        }
 }
diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h 
b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
index a08378d0802b..8018214a7b59 100644
--- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
+++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
@@ -250,6 +250,7 @@
 #define QM_ARC_CQ_PTR_HI_OFFSET                (mmPDMA0_QM_ARC_CQ_PTR_HI - 
mmPDMA0_QM_BASE)
 #define QM_ARC_CQ_TSIZE_OFFSET         (mmPDMA0_QM_ARC_CQ_TSIZE - 
mmPDMA0_QM_BASE)
 
+#define QM_CP_STS_4_OFFSET             (mmPDMA0_QM_CP_STS_4 - mmPDMA0_QM_BASE)
 #define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - 
mmPDMA0_QM_BASE)
 #define QM_CP_CURRENT_INST_HI_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_HI_4 - 
mmPDMA0_QM_BASE)
 
-- 
2.34.1

Reply via email to