RE: [PATCH] drm/amdgpu: revert "reserve backup pages for bad page retirment"

2021-03-18 Thread Zhang, Hawking
[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Christian König  
Sent: Thursday, March 18, 2021 21:09
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; Li, 
Dennis ; Deucher, Alexander 
Subject: [PATCH] drm/amdgpu: revert "reserve backup pages for bad page 
retirment"

As noted during the review this approach doesn't make sense at all.

We should not apply any limitation on the VRAM applications can use inside the 
kernel.

If an application or end user wants to reserve a certain amount of VRAM for bad 
pages handling we should do this in the upper layer.

This reverts commit 6ff5de4f1cfc09308d060a7085c1664a3b37f118.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 29 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h  |  4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 96 +---
 4 files changed, 15 insertions(+), 118 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index d6a9787e5597..2e7b5d922f31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -180,7 +180,7 @@ struct amdgpu_mgpu_info mgpu_info = {  };  int 
amdgpu_ras_enable = -1;  uint amdgpu_ras_mask = 0x; -int 
amdgpu_bad_page_threshold = 100;
+int amdgpu_bad_page_threshold = -1;
 struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
.timeout_fatal_disable = false,
.period = 0x23, /* default to max. timeout = 1 << 0x23 cycles */ @@ 
-854,7 +854,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 
0444);
  * faulty pages by ECC exceed threshold value and leave it for user's further
  * check.
  */
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = 
disable bad page retirement, 100 = default value");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = 
+auto(default value), 0 = disable bad page retirement)");
 module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
 
 MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup 
(8 if set to greater than 8 or less than 0, only affect gfx 8+)"); diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a90bf33358d3..0e16683876aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1790,14 +1790,13 @@ static bool amdgpu_ras_check_bad_page(struct 
amdgpu_device *adev,
return ret;
 }
 
-static uint32_t
-amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+   uint32_t max_length)
 {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int tmp_threshold = amdgpu_bad_page_threshold;
u64 val;
-   uint32_t max_length = 0;
 
-   max_length = amdgpu_ras_eeprom_get_record_max_length();
/*
 * Justification of value bad_page_cnt_threshold in ras structure
 *
@@ -1823,18 +1822,20 @@ amdgpu_ras_calculate_badpags_threshold(struct 
amdgpu_device *adev)
tmp_threshold = max_length;
 
if (tmp_threshold == -1) {
-   val = adev->gmc.real_vram_size;
+   val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_RATE);
-   tmp_threshold = min(lower_32_bits(val), max_length);
+   con->bad_page_cnt_threshold = min(lower_32_bits(val),
+   max_length);
+   } else {
+   con->bad_page_cnt_threshold = tmp_threshold;
}
-
-   return tmp_threshold;
 }
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
+   uint32_t max_eeprom_records_len = 0;
bool exc_err_limit = false;
int ret;
 
@@ -1854,16 +1855,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(&con->in_recovery, 0);
con->adev = adev;
 
-   if (!con->bad_page_cnt_threshold) {
-   con->bad_page_cnt_threshold =
-   amdgpu_ras_calculate_badpags_threshold(adev);
-
-   ret = amdgpu_vram_mgr_reserve_backup_pages(
-   ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
-   con->bad_page_cnt_threshold);
-   if (ret)
-   goto out;
-   }
+   max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+   amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
/* Todo: During test the SMU might fail to read the eeprom through I2C
  

[PATCH] drm/amdgpu: revert "reserve backup pages for bad page retirment"

2021-03-18 Thread Christian König
As noted during the review this approach doesn't make sense at all.

We should not apply any limitation on the VRAM applications can use inside the 
kernel.

If an application or end user wants to reserve a certain amount of VRAM for bad 
pages handling we should do this in the upper layer.

This reverts commit 6ff5de4f1cfc09308d060a7085c1664a3b37f118.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 29 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h  |  4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 96 +---
 4 files changed, 15 insertions(+), 118 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index d6a9787e5597..2e7b5d922f31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -180,7 +180,7 @@ struct amdgpu_mgpu_info mgpu_info = {
 };
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0x;
-int amdgpu_bad_page_threshold = 100;
+int amdgpu_bad_page_threshold = -1;
 struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
.timeout_fatal_disable = false,
.period = 0x23, /* default to max. timeout = 1 << 0x23 cycles */
@@ -854,7 +854,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 
0444);
  * faulty pages by ECC exceed threshold value and leave it for user's further
  * check.
  */
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = 
disable bad page retirement, 100 = default value");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default 
value), 0 = disable bad page retirement)");
 module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
 
 MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup 
(8 if set to greater than 8 or less than 0, only affect gfx 8+)");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a90bf33358d3..0e16683876aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1790,14 +1790,13 @@ static bool amdgpu_ras_check_bad_page(struct 
amdgpu_device *adev,
return ret;
 }
 
-static uint32_t
-amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+   uint32_t max_length)
 {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int tmp_threshold = amdgpu_bad_page_threshold;
u64 val;
-   uint32_t max_length = 0;
 
-   max_length = amdgpu_ras_eeprom_get_record_max_length();
/*
 * Justification of value bad_page_cnt_threshold in ras structure
 *
@@ -1823,18 +1822,20 @@ amdgpu_ras_calculate_badpags_threshold(struct 
amdgpu_device *adev)
tmp_threshold = max_length;
 
if (tmp_threshold == -1) {
-   val = adev->gmc.real_vram_size;
+   val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_RATE);
-   tmp_threshold = min(lower_32_bits(val), max_length);
+   con->bad_page_cnt_threshold = min(lower_32_bits(val),
+   max_length);
+   } else {
+   con->bad_page_cnt_threshold = tmp_threshold;
}
-
-   return tmp_threshold;
 }
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
+   uint32_t max_eeprom_records_len = 0;
bool exc_err_limit = false;
int ret;
 
@@ -1854,16 +1855,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(&con->in_recovery, 0);
con->adev = adev;
 
-   if (!con->bad_page_cnt_threshold) {
-   con->bad_page_cnt_threshold =
-   amdgpu_ras_calculate_badpags_threshold(adev);
-
-   ret = amdgpu_vram_mgr_reserve_backup_pages(
-   ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
-   con->bad_page_cnt_threshold);
-   if (ret)
-   goto out;
-   }
+   max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+   amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
/* Todo: During test the SMU might fail to read the eeprom through I2C
 * when the GPU is pending on XGMI reset during probe time
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 2df88978ccba..dec0db8b0b13 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -48,8 +48,6 @@ struct amdgpu_vram_mgr {
spinlock_t lock;
struct list_head reservations_pending;
struct list_head reserved_pages;
-   struct