From: Ofir Bitton <obit...@habana.ai>

User gets notification for every engine error report, but he still
lacks the exact engine information. Hence, we allow user to query
for the exact engine reported an error.

Signed-off-by: Ofir Bitton <obit...@habana.ai>
Reviewed-by: Oded Gabbay <ogab...@kernel.org>
Signed-off-by: Oded Gabbay <ogab...@kernel.org>
---
 drivers/accel/habanalabs/common/device.c      |  14 ++
 drivers/accel/habanalabs/common/habanalabs.h  |  17 ++
 .../habanalabs/common/habanalabs_ioctl.c      |  25 +++
 drivers/accel/habanalabs/gaudi2/gaudi2.c      | 168 ++++++++++++++++++
 include/uapi/drm/habanalabs_accel.h           |  16 ++
 5 files changed, 240 insertions(+)

diff --git a/drivers/accel/habanalabs/common/device.c 
b/drivers/accel/habanalabs/common/device.c
index 28be0fc325ea..80cce6b74d05 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -2701,6 +2701,20 @@ void hl_handle_fw_err(struct hl_device *hdev, struct 
hl_info_fw_err_info *info)
                *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
 }
 
+void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 
error_count)
+{
+       struct engine_err_info *info = &hdev->captured_err_info.engine_err;
+
+       /* Capture only the first engine error */
+       if (atomic_cmpxchg(&info->event_detected, 0, 1))
+               return;
+
+       info->event.timestamp = ktime_to_ns(ktime_get());
+       info->event.engine_id = engine_id;
+       info->event.error_count = error_count;
+       info->event_info_available = true;
+}
+
 void hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
 {
        vfree(captured_err_info->page_fault_info.user_mappings);
diff --git a/drivers/accel/habanalabs/common/habanalabs.h 
b/drivers/accel/habanalabs/common/habanalabs.h
index 4fecd300b8dd..201d826b0fb7 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -3062,6 +3062,20 @@ struct fw_err_info {
        bool                            event_info_available;
 };
 
+/**
+ * struct engine_err_info - engine error information.
+ * @event: holds information on the event.
+ * @event_detected: if set as 1, then an engine event was discovered for the
+ *                  first time after the driver has finished booting-up.
+ * @event_info_available: indicates that an engine event info is now available.
+ */
+struct engine_err_info {
+       struct hl_info_engine_err_event event;
+       atomic_t                        event_detected;
+       bool                            event_info_available;
+};
+
+
 /**
  * struct hl_error_info - holds information collected during an error.
  * @cs_timeout: CS timeout error information.
@@ -3070,6 +3084,7 @@ struct fw_err_info {
  * @page_fault_info: page fault information.
  * @hw_err: (fatal) hardware error information.
  * @fw_err: firmware error information.
+ * @engine_err: engine error information.
  */
 struct hl_error_info {
        struct cs_timeout_info          cs_timeout;
@@ -3078,6 +3093,7 @@ struct hl_error_info {
        struct page_fault_info          page_fault_info;
        struct hw_err_info              hw_err;
        struct fw_err_info              fw_err;
+       struct engine_err_info          engine_err;
 };
 
 /**
@@ -3951,6 +3967,7 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 
addr, u16 eng_id, bool is_
                                u64 *event_mask);
 void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 
*event_mask);
 void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info 
*info);
+void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 
error_count);
 void hl_enable_err_info_capture(struct hl_error_info *captured_err_info);
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c 
b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
index 549b2518fae0..097d65e493c8 100644
--- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
@@ -875,6 +875,28 @@ static int fw_err_info(struct hl_fpriv *hpriv, struct 
hl_info_args *args)
        return rc ? -EFAULT : 0;
 }
 
+static int engine_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+       void __user *user_buf = (void __user *) (uintptr_t) 
args->return_pointer;
+       struct hl_device *hdev = hpriv->hdev;
+       u32 user_buf_size = args->return_size;
+       struct engine_err_info *info;
+       int rc;
+
+       if (!user_buf)
+               return -EINVAL;
+
+       info = &hdev->captured_err_info.engine_err;
+       if (!info->event_info_available)
+               return 0;
+
+       if (user_buf_size < sizeof(struct hl_info_engine_err_event))
+               return -ENOMEM;
+
+       rc = copy_to_user(user_buf, &info->event, sizeof(struct 
hl_info_engine_err_event));
+       return rc ? -EFAULT : 0;
+}
+
 static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args 
*info_args)
 {
        void __user *buff = (void __user *) (uintptr_t) 
info_args->return_pointer;
@@ -1001,6 +1023,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void 
*data,
        case HL_INFO_FW_ERR_EVENT:
                return fw_err_info(hpriv, args);
 
+       case HL_INFO_USER_ENGINE_ERR_EVENT:
+               return engine_err_info(hpriv, args);
+
        case HL_INFO_DRAM_USAGE:
                return dram_usage_info(hpriv, args);
        default:
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c 
b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 70b8f744cd73..222310bf1098 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -9588,6 +9588,171 @@ static int hl_arc_event_handle(struct hl_device *hdev, 
u16 event_type,
        }
 }
 
+static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)
+{
+       enum gaudi2_block_types type = GAUDI2_BLOCK_TYPE_MAX;
+       u16 index;
+
+       switch (event_type) {
+       case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP:
+               index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP;
+               type = GAUDI2_BLOCK_TYPE_TPC;
+               break;
+       case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_TPC24_QM:
+               index = event_type - GAUDI2_EVENT_TPC0_QM;
+               type = GAUDI2_BLOCK_TYPE_TPC;
+               break;
+       case GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP ... 
GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_MME0_SPI_BASE ... 
GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID:
+       case GAUDI2_EVENT_MME0_QM:
+               index = 0;
+               type = GAUDI2_BLOCK_TYPE_MME;
+               break;
+       case GAUDI2_EVENT_MME1_SBTE0_AXI_ERR_RSP ... 
GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_MME1_SPI_BASE ... 
GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID:
+       case GAUDI2_EVENT_MME1_QM:
+               index = 1;
+               type = GAUDI2_BLOCK_TYPE_MME;
+               break;
+       case GAUDI2_EVENT_MME2_SBTE0_AXI_ERR_RSP ... 
GAUDI2_EVENT_MME2_CTRL_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_MME2_SPI_BASE ... 
GAUDI2_EVENT_MME2_WAP_SOURCE_RESULT_INVALID:
+       case GAUDI2_EVENT_MME2_QM:
+               index = 2;
+               type = GAUDI2_BLOCK_TYPE_MME;
+               break;
+       case GAUDI2_EVENT_MME3_SBTE0_AXI_ERR_RSP ... 
GAUDI2_EVENT_MME3_CTRL_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_MME3_SPI_BASE ... 
GAUDI2_EVENT_MME3_WAP_SOURCE_RESULT_INVALID:
+       case GAUDI2_EVENT_MME3_QM:
+               index = 3;
+               type = GAUDI2_BLOCK_TYPE_MME;
+               break;
+       case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP:
+       case GAUDI2_EVENT_KDMA_BM_SPMU:
+       case GAUDI2_EVENT_KDMA0_CORE:
+               return GAUDI2_ENGINE_ID_KDMA;
+       case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
+       case GAUDI2_EVENT_PDMA0_CORE:
+       case GAUDI2_EVENT_PDMA0_BM_SPMU:
+       case GAUDI2_EVENT_PDMA0_QM:
+               return GAUDI2_ENGINE_ID_PDMA_0;
+       case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
+       case GAUDI2_EVENT_PDMA1_CORE:
+       case GAUDI2_EVENT_PDMA1_BM_SPMU:
+       case GAUDI2_EVENT_PDMA1_QM:
+               return GAUDI2_ENGINE_ID_PDMA_1;
+       case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... 
GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE:
+               index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE;
+               type = GAUDI2_BLOCK_TYPE_DEC;
+               break;
+       case GAUDI2_EVENT_DEC0_SPI ... GAUDI2_EVENT_DEC9_BMON_SPMU:
+               index = (event_type - GAUDI2_EVENT_DEC0_SPI) >> 1;
+               type = GAUDI2_BLOCK_TYPE_DEC;
+               break;
+       case GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE ... 
GAUDI2_EVENT_NIC11_AXI_ERROR_RESPONSE:
+               index = event_type - GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE;
+               return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2);
+       case GAUDI2_EVENT_NIC0_QM0 ... GAUDI2_EVENT_NIC11_QM1:
+               index = event_type - GAUDI2_EVENT_NIC0_QM0;
+               return GAUDI2_ENGINE_ID_NIC0_0 + index;
+       case GAUDI2_EVENT_NIC0_BMON_SPMU ... GAUDI2_EVENT_NIC11_SW_ERROR:
+               index = event_type - GAUDI2_EVENT_NIC0_BMON_SPMU;
+               return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2);
+       case GAUDI2_EVENT_TPC0_BMON_SPMU ... GAUDI2_EVENT_TPC24_KERNEL_ERR:
+               index = (event_type - GAUDI2_EVENT_TPC0_BMON_SPMU) >> 1;
+               type = GAUDI2_BLOCK_TYPE_TPC;
+               break;
+       case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_ROTATOR0_BMON_SPMU:
+       case GAUDI2_EVENT_ROTATOR0_ROT0_QM:
+               return GAUDI2_ENGINE_ID_ROT_0;
+       case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_ROTATOR1_BMON_SPMU:
+       case GAUDI2_EVENT_ROTATOR1_ROT1_QM:
+               return GAUDI2_ENGINE_ID_ROT_1;
+       case GAUDI2_EVENT_HDMA0_BM_SPMU:
+       case GAUDI2_EVENT_HDMA0_QM:
+       case GAUDI2_EVENT_HDMA0_CORE:
+               return GAUDI2_DCORE0_ENGINE_ID_EDMA_0;
+       case GAUDI2_EVENT_HDMA1_BM_SPMU:
+       case GAUDI2_EVENT_HDMA1_QM:
+       case GAUDI2_EVENT_HDMA1_CORE:
+               return GAUDI2_DCORE0_ENGINE_ID_EDMA_1;
+       case GAUDI2_EVENT_HDMA2_BM_SPMU:
+       case GAUDI2_EVENT_HDMA2_QM:
+       case GAUDI2_EVENT_HDMA2_CORE:
+               return GAUDI2_DCORE1_ENGINE_ID_EDMA_0;
+       case GAUDI2_EVENT_HDMA3_BM_SPMU:
+       case GAUDI2_EVENT_HDMA3_QM:
+       case GAUDI2_EVENT_HDMA3_CORE:
+               return GAUDI2_DCORE1_ENGINE_ID_EDMA_1;
+       case GAUDI2_EVENT_HDMA4_BM_SPMU:
+       case GAUDI2_EVENT_HDMA4_QM:
+       case GAUDI2_EVENT_HDMA4_CORE:
+               return GAUDI2_DCORE2_ENGINE_ID_EDMA_0;
+       case GAUDI2_EVENT_HDMA5_BM_SPMU:
+       case GAUDI2_EVENT_HDMA5_QM:
+       case GAUDI2_EVENT_HDMA5_CORE:
+               return GAUDI2_DCORE2_ENGINE_ID_EDMA_1;
+       case GAUDI2_EVENT_HDMA6_BM_SPMU:
+       case GAUDI2_EVENT_HDMA6_QM:
+       case GAUDI2_EVENT_HDMA6_CORE:
+               return GAUDI2_DCORE3_ENGINE_ID_EDMA_0;
+       case GAUDI2_EVENT_HDMA7_BM_SPMU:
+       case GAUDI2_EVENT_HDMA7_QM:
+       case GAUDI2_EVENT_HDMA7_CORE:
+               return GAUDI2_DCORE3_ENGINE_ID_EDMA_1;
+       default:
+               break;
+       }
+
+       switch (type) {
+       case GAUDI2_BLOCK_TYPE_TPC:
+               switch (index) {
+               case TPC_ID_DCORE0_TPC0 ... TPC_ID_DCORE0_TPC5:
+                       return GAUDI2_DCORE0_ENGINE_ID_TPC_0 + index;
+               case TPC_ID_DCORE1_TPC0 ... TPC_ID_DCORE1_TPC5:
+                       return GAUDI2_DCORE1_ENGINE_ID_TPC_0 + index - 
TPC_ID_DCORE1_TPC0;
+               case TPC_ID_DCORE2_TPC0 ... TPC_ID_DCORE2_TPC5:
+                       return GAUDI2_DCORE2_ENGINE_ID_TPC_0 + index - 
TPC_ID_DCORE2_TPC0;
+               case TPC_ID_DCORE3_TPC0 ... TPC_ID_DCORE3_TPC5:
+                       return GAUDI2_DCORE3_ENGINE_ID_TPC_0 + index - 
TPC_ID_DCORE3_TPC0;
+               default:
+                       break;
+               }
+               break;
+       case GAUDI2_BLOCK_TYPE_MME:
+               switch (index) {
+               case MME_ID_DCORE0: return GAUDI2_DCORE0_ENGINE_ID_MME;
+               case MME_ID_DCORE1: return GAUDI2_DCORE1_ENGINE_ID_MME;
+               case MME_ID_DCORE2: return GAUDI2_DCORE2_ENGINE_ID_MME;
+               case MME_ID_DCORE3: return GAUDI2_DCORE3_ENGINE_ID_MME;
+               default:
+                       break;
+               }
+               break;
+       case GAUDI2_BLOCK_TYPE_DEC:
+               switch (index) {
+               case DEC_ID_DCORE0_DEC0: return GAUDI2_DCORE0_ENGINE_ID_DEC_0;
+               case DEC_ID_DCORE0_DEC1: return GAUDI2_DCORE0_ENGINE_ID_DEC_1;
+               case DEC_ID_DCORE1_DEC0: return GAUDI2_DCORE1_ENGINE_ID_DEC_0;
+               case DEC_ID_DCORE1_DEC1: return GAUDI2_DCORE1_ENGINE_ID_DEC_1;
+               case DEC_ID_DCORE2_DEC0: return GAUDI2_DCORE2_ENGINE_ID_DEC_0;
+               case DEC_ID_DCORE2_DEC1: return GAUDI2_DCORE2_ENGINE_ID_DEC_1;
+               case DEC_ID_DCORE3_DEC0: return GAUDI2_DCORE3_ENGINE_ID_DEC_0;
+               case DEC_ID_DCORE3_DEC1: return GAUDI2_DCORE3_ENGINE_ID_DEC_1;
+               case DEC_ID_PCIE_VDEC0: return GAUDI2_PCIE_ENGINE_ID_DEC_0;
+               case DEC_ID_PCIE_VDEC1: return GAUDI2_PCIE_ENGINE_ID_DEC_1;
+               default:
+                       break;
+               }
+               break;
+       default:
+               break;
+       }
+
+       return U16_MAX;
+}
+
 static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry 
*eq_entry)
 {
        struct gaudi2_device *gaudi2 = hdev->asic_specific;
@@ -10010,6 +10175,9 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, 
struct hl_eq_entry *eq_ent
                }
        }
 
+       if (event_mask & HL_NOTIFIER_EVENT_USER_ENGINE_ERR)
+               hl_capture_engine_err(hdev, event_id_to_engine_id(hdev, 
event_type), error_count);
+
        /* Make sure to dump an error in case no error cause was printed so far.
         * Note that although we have counted the errors, we use this number as
         * a boolean.
diff --git a/include/uapi/drm/habanalabs_accel.h 
b/include/uapi/drm/habanalabs_accel.h
index e6436f3e8ea6..f912869b151e 100644
--- a/include/uapi/drm/habanalabs_accel.h
+++ b/include/uapi/drm/habanalabs_accel.h
@@ -809,6 +809,7 @@ enum hl_server_type {
  * HL_INFO_FW_ERR_EVENT   - Retrieve information on the reported FW error.
  *                          May return 0 even though no new data is available, 
in that case
  *                          timestamp will be 0.
+ * HL_INFO_USER_ENGINE_ERR_EVENT - Retrieve the last engine id that reported 
an error.
  */
 #define HL_INFO_HW_IP_INFO                     0
 #define HL_INFO_HW_EVENTS                      1
@@ -845,6 +846,7 @@ enum hl_server_type {
 #define HL_INFO_FW_GENERIC_REQ                 35
 #define HL_INFO_HW_ERR_EVENT                   36
 #define HL_INFO_FW_ERR_EVENT                   37
+#define HL_INFO_USER_ENGINE_ERR_EVENT          38
 
 #define HL_INFO_VERSION_MAX_LEN                        128
 #define HL_INFO_CARD_NAME_MAX_LEN              16
@@ -1226,6 +1228,20 @@ struct hl_info_fw_err_event {
        __u32 pad;
 };
 
+/**
+ * struct hl_info_engine_err_event - engine error info
+ * @timestamp: time-stamp of error occurrence
+ * @engine_id: engine id who reported the error.
+ * @error_count: Amount of errors reported.
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_engine_err_event {
+       __s64 timestamp;
+       __u16 engine_id;
+       __u16 error_count;
+       __u32 pad;
+};
+
 /**
  * struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem 
alloc information.
  * @page_order_bitmask: bitmap in which a set bit represents the order of the 
supported page size
-- 
2.34.1

Reply via email to