From: Dafna Hirschfeld <dhirschf...@habana.ai>

Since the err_cause register is unprivileged, the driver should read it
directly instead of relying on the cause value passed in by the FW.

Signed-off-by: Dafna Hirschfeld <dhirschf...@habana.ai>
Reviewed-by: Oded Gabbay <ogab...@kernel.org>
Signed-off-by: Oded Gabbay <ogab...@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 40 +++++++++++++++++++-----
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 224eaafe953f..94d53cd1b0ff 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -8689,14 +8689,13 @@ static int gaudi2_handle_kdma_core_event(struct hl_device *hdev, u16 event_type,
        return error_count;
 }
 
-static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type,
-                                       u64 intr_cause_data)
+static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type, int sts_addr)
 {
-       u32 error_count = 0;
+       u32 error_count = 0, sts_val = RREG32(sts_addr);
        int i;
 
        for (i = 0 ; i < GAUDI2_NUM_OF_DMA_CORE_INTR_CAUSE ; i++)
-               if (intr_cause_data & BIT(i)) {
+               if (sts_val & BIT(i)) {
                        gaudi2_print_event(hdev, event_type, true,
                                "err cause: %s", 
gaudi2_dma_core_interrupts_cause[i]);
                        error_count++;
@@ -8707,6 +8706,27 @@ static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type,
        return error_count;
 }
 
+static int gaudi2_handle_pdma_core_event(struct hl_device *hdev, u16 event_type, int pdma_idx)
+{
+       u32 sts_addr;
+
+       sts_addr = mmPDMA0_CORE_ERR_CAUSE + pdma_idx * PDMA_OFFSET;
+       return gaudi2_handle_dma_core_event(hdev, event_type, sts_addr);
+}
+
+static int gaudi2_handle_edma_core_event(struct hl_device *hdev, u16 event_type, int edma_idx)
+{
+       static const int edma_event_index_map[] = {2, 3, 0, 1, 6, 7, 4, 5};
+       u32 sts_addr, index;
+
+       index = edma_event_index_map[edma_idx];
+
+       sts_addr = mmDCORE0_EDMA0_CORE_ERR_CAUSE +
+                               DCORE_OFFSET * (index / NUM_OF_EDMA_PER_DCORE) +
+                               DCORE_EDMA_OFFSET * (index % NUM_OF_EDMA_PER_DCORE);
+       return gaudi2_handle_dma_core_event(hdev, event_type, sts_addr);
+}
+
 static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev, u64 *event_mask)
 {
        u32 mstr_if_base_addr = mmPCIE_MSTR_RR_MSTR_IF_RR_SHRD_HBW_BASE, razwi_happened_addr;
@@ -9524,9 +9544,15 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
                event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
                break;
 
-       case GAUDI2_EVENT_HDMA2_CORE ... GAUDI2_EVENT_PDMA1_CORE:
-               error_count = gaudi2_handle_dma_core_event(hdev, event_type,
-                                       le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+       case GAUDI2_EVENT_HDMA2_CORE ... GAUDI2_EVENT_HDMA5_CORE:
+               index = event_type - GAUDI2_EVENT_HDMA2_CORE;
+               error_count = gaudi2_handle_edma_core_event(hdev, event_type, index);
+               event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
+               break;
+
+       case GAUDI2_EVENT_PDMA0_CORE ... GAUDI2_EVENT_PDMA1_CORE:
+               index = event_type - GAUDI2_EVENT_PDMA0_CORE;
+               error_count = gaudi2_handle_pdma_core_event(hdev, event_type, index);
                event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
                break;
 
-- 
2.40.0

Reply via email to