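drm/amd/ras: add check functions for PMFW EEPROM

Add ras_fw_eeprom_check_storage_status() and
ras_fw_eeprom_check_gpu_status() for the PMFW EEPROM. ras_core_hw_init()
now calls the new storage check when ras_fw_eeprom_supported() is true,
and the device-RMA event handler skips amdgpu_dpm_send_rma_reason() in
that case. enum ras_gpu_health_status moves from ras_eeprom.h to ras.h;
the new prototype in ras_eeprom_fw.h returns this type.
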
Signed-off-by: Gangliang Xie <[email protected]>
---
.../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c | 3 +-
drivers/gpu/drm/amd/ras/rascore/ras.h | 7 +++
drivers/gpu/drm/amd/ras/rascore/ras_core.c | 5 +-
drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h | 7 ---
.../gpu/drm/amd/ras/rascore/ras_eeprom_fw.c | 51 +++++++++++++++++++
.../gpu/drm/amd/ras/rascore/ras_eeprom_fw.h | 3 ++
6 files changed, 67 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
index 45ed8c3b5563..7d728e523604 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
@@ -137,7 +137,8 @@ static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
break;
case RAS_EVENT_ID__DEVICE_RMA:
ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL,
NULL);
- ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
+ if (!ras_fw_eeprom_supported(ras_core))
+ ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
break;
case RAS_EVENT_ID__RESET_GPU:
ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t
*)data);
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h b/drivers/gpu/drm/amd/ras/rascore/ras.h
index 05c7923e8f0f..c2a56138b2dd 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
@@ -50,6 +50,13 @@
#define GPU_RESET_CAUSE_FATAL (RAS_CORE_RESET_GPU | 0x0002)
#define GPU_RESET_CAUSE_RMA (RAS_CORE_RESET_GPU | 0x0004)
+enum ras_gpu_health_status {
+ RAS_GPU_HEALTH_NONE = 0, /* FW EEPROM path: record_threshold_config is 0 */
+ RAS_GPU_HEALTH_USABLE = 1, /* FW EEPROM path: threshold configured, is_rma clear */
+ RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2, /* FW EEPROM path: is_rma set */
+ RAS_GPU_IN_BAD_STATUS = 3, /* NOTE(review): not returned by the FW EEPROM check; confirm meaning */
+};
+
enum ras_core_fw_feature_flags {
RAS_CORE_FW_FEATURE_BIT__RAS_EEPROM = BIT_ULL(0),
};
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
index 1eba279a020b..a4e2ad6a159f 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
@@ -401,7 +401,10 @@ int ras_core_hw_init(struct ras_core_context *ras_core)
goto init_err6;
}
- ret = ras_eeprom_check_storage_status(ras_core);
+ if (ras_fw_eeprom_supported(ras_core))
+ ret = ras_fw_eeprom_check_storage_status(ras_core);
+ else
+ ret = ras_eeprom_check_storage_status(ras_core);
if (ret)
goto init_err6;
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
index 2abe566c18b6..f2c001ef64e1 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
@@ -57,13 +57,6 @@ do { \
(RECORD)->retired_row_pfn = tmp; \
} while (0)
-enum ras_gpu_health_status {
- RAS_GPU_HEALTH_NONE = 0,
- RAS_GPU_HEALTH_USABLE = 1,
- RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
- RAS_GPU_IN_BAD_STATUS = 3,
-};
-
enum ras_eeprom_err_type {
RAS_EEPROM_ERR_NA,
RAS_EEPROM_ERR_RECOVERABLE,
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
index 138d223a1256..f7a71ea797df 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
@@ -453,3 +453,54 @@ int ras_fw_eeprom_hw_fini(struct ras_core_context *ras_core)
return 0;
}
+
+/*
+ * Compare the bad page count with the FW EEPROM record threshold and log the
+ * result. Sets ras_core->is_rma when the count is over the threshold and the
+ * threshold config is not one of the NONSTOP modes. Always returns 0.
+ */
+int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core)
+{
+	struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
+	int bad_page_count = ras_umc_get_badpage_count(ras_core);
+
+	if ((control->record_threshold_count < bad_page_count) &&
+	    (control->record_threshold_config != 0)) {
+		RAS_DEV_ERR(ras_core->dev, "RAS records:%d exceed threshold:%d\n",
+			    bad_page_count, control->record_threshold_count);
+		if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) ||
+		    (control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) {
+			RAS_DEV_WARN(ras_core->dev,
+				     "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
+		} else {
+			ras_core->is_rma = true;
+			RAS_DEV_ERR(ras_core->dev,
+				    "User defined threshold is set, runtime service will be halt when threshold is reached\n");
+		}
+		return 0;
+	}
+
+	RAS_DEV_INFO(ras_core->dev, "Found existing EEPROM table with %d records\n",
+		     bad_page_count);
+	/* Warn at >= 90% of the threshold. NOTE(review): this is also reached when
+	 * record_threshold_config == 0; confirm the warning is wanted in that case. */
+	if (10 * bad_page_count >= 9 * control->record_threshold_count)
+		RAS_DEV_WARN(ras_core->dev, "RAS records:%d exceeds 90%% of threshold:%d\n",
+			     bad_page_count, control->record_threshold_count);
+
+	return 0;
+}
+
+/* Report GPU health from the FW EEPROM threshold config and ras_core->is_rma. */
+enum ras_gpu_health_status
+ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core)
+{
+	/* A zero record_threshold_config short-circuits to HEALTH_NONE. */
+	if (ras_core->ras_fw_eeprom.record_threshold_config == 0)
+		return RAS_GPU_HEALTH_NONE;
+
+	/* is_rma is set e.g. by ras_fw_eeprom_check_storage_status() when the bad
+	 * page count is over a threshold that is not in a NONSTOP mode. */
+	return ras_core->is_rma ? RAS_GPU_RETIRED__ECC_REACH_THRESHOLD :
+				  RAS_GPU_HEALTH_USABLE;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
index 5966955cd847..a0301e228863 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
@@ -79,5 +79,8 @@ int ras_fw_eeprom_update_record(struct ras_core_context *ras_core,
struct ras_bank_ecc *ras_ecc);
int ras_fw_eeprom_hw_init(struct ras_core_context *ras_core);
int ras_fw_eeprom_hw_fini(struct ras_core_context *ras_core);
+int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core);
+enum ras_gpu_health_status
+ ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core);
#endif
--
2.34.1