RE: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2

2024-05-28 Thread Zhang, Hawking
[AMD Official Use Only - AMD Internal Distribution Only]

Thanks Tao. Yes, I added the comments to amdgpu_ras.h.

+/* Reserve 8 physical dram row for possible retirement.
+ * In worst cases, it will lose 8 * 2MB memory in vram domain */
+#define AMDGPU_RAS_RESERVED_VRAM_SIZE  (16ULL << 20)

Regards,
Hawking

-Original Message-
From: Zhou1, Tao 
Sent: Tuesday, May 28, 2024 14:02
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Kasiviswanathan, Harish 

Subject: RE: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity 
v2

[AMD Official Use Only - AMD Internal Distribution Only]

I prefer to add comment for AMDGPU_RAS_RESERVED_VRAM_SIZE to explain the value 
of 16MB, anyway the patch is:

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Zhang, Hawking 
> Sent: Tuesday, May 28, 2024 1:57 PM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao 
> Cc: Zhang, Hawking ; Kuehling, Felix
> ; Kasiviswanathan, Harish
> 
> Subject: [PATCH] drm/amdgpu: Estimate RAS reservation when report
> capacity v2
>
> Add estimate of how much vram we need to reserve for RAS when
> caculating the total available vram.
>
> v2: apply the change to MP0 v13_0_2 and v13_0_14
>
> Signed-off-by: Hawking Zhang 
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  9 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 20 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  4 
>  3 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index e98927529f61..ad813772f8a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
> amdgpu_device *adev,  {
>   uint64_t reserved_for_pt =
>   ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes
> + : 0);
>   size_t system_mem_needed, ttm_mem_needed, vram_needed;
>   int ret = 0;
>   uint64_t vram_size = 0;
> @@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
> amdgpu_device *adev,
>   (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
>kfd_mem_limit.max_ttm_mem_limit) ||
>   (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] +
> vram_needed >
> -  vram_size - reserved_for_pt - atomic64_read(>vram_pin_size)
> +
> +  vram_size - reserved_for_pt - reserved_for_ras -
> +atomic64_read(>vram_pin_size) +
>atomic64_read(>kfd.vram_pinned))) {
>   ret = -ENOMEM;
>   goto release;
> @@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct
> amdgpu_device *adev,  {
>   uint64_t reserved_for_pt =
>   ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes
> + : 0);
>   ssize_t available;
>   uint64_t vram_available, system_mem_available,
> ttm_mem_available;
>
> @@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct
> amdgpu_device *adev,
>   - adev->kfd.vram_used_aligned[xcp_id]
>   - atomic64_read(>vram_pin_size)
>   + atomic64_read(>kfd.vram_pinned)
> - - reserved_for_pt;
> + - reserved_for_pt
> + - reserved_for_ras;
>
>   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>   system_mem_available = no_system_mem_limit ?
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index ecce022c657b..f28bf5765380 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3317,6 +3317,24 @@ static void amdgpu_ras_event_mgr_init(struct
> amdgpu_device *adev)
>   amdgpu_put_xgmi_hive(hive);  }
>
> +static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device
> +*adev) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +
> + if (!con || (adev->flags & AMD_IS_APU))
> + return;
> +
> + switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
> + case IP_VERSION(13, 0, 2):
> + case IP_VERSION(13, 0, 6):
> + case IP_VERSION(13, 0, 14):
> + con->reserved_pages_in_bytes =
> AMDGPU_RAS_RESERVED_VRAM_SIZE;
> + break;
> + default:
> + 

RE: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2

2024-05-28 Thread Zhou1, Tao
[AMD Official Use Only - AMD Internal Distribution Only]

I prefer to add comment for AMDGPU_RAS_RESERVED_VRAM_SIZE to explain the value 
of 16MB, anyway the patch is:

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Zhang, Hawking 
> Sent: Tuesday, May 28, 2024 1:57 PM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao 
> Cc: Zhang, Hawking ; Kuehling, Felix
> ; Kasiviswanathan, Harish
> 
> Subject: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2
>
> Add estimate of how much vram we need to reserve for RAS when caculating the
> total available vram.
>
> v2: apply the change to MP0 v13_0_2 and v13_0_14
>
> Signed-off-by: Hawking Zhang 
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  9 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 20 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  4 
>  3 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index e98927529f61..ad813772f8a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
> amdgpu_device *adev,  {
>   uint64_t reserved_for_pt =
>   ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
>   size_t system_mem_needed, ttm_mem_needed, vram_needed;
>   int ret = 0;
>   uint64_t vram_size = 0;
> @@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
> amdgpu_device *adev,
>   (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
>kfd_mem_limit.max_ttm_mem_limit) ||
>   (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] +
> vram_needed >
> -  vram_size - reserved_for_pt - atomic64_read(>vram_pin_size)
> +
> +  vram_size - reserved_for_pt - reserved_for_ras -
> +atomic64_read(>vram_pin_size) +
>atomic64_read(>kfd.vram_pinned))) {
>   ret = -ENOMEM;
>   goto release;
> @@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct
> amdgpu_device *adev,  {
>   uint64_t reserved_for_pt =
>   ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
>   ssize_t available;
>   uint64_t vram_available, system_mem_available, ttm_mem_available;
>
> @@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct
> amdgpu_device *adev,
>   - adev->kfd.vram_used_aligned[xcp_id]
>   - atomic64_read(>vram_pin_size)
>   + atomic64_read(>kfd.vram_pinned)
> - - reserved_for_pt;
> + - reserved_for_pt
> + - reserved_for_ras;
>
>   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>   system_mem_available = no_system_mem_limit ?
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index ecce022c657b..f28bf5765380 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3317,6 +3317,24 @@ static void amdgpu_ras_event_mgr_init(struct
> amdgpu_device *adev)
>   amdgpu_put_xgmi_hive(hive);
>  }
>
> +static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device
> +*adev) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +
> + if (!con || (adev->flags & AMD_IS_APU))
> + return;
> +
> + switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
> + case IP_VERSION(13, 0, 2):
> + case IP_VERSION(13, 0, 6):
> + case IP_VERSION(13, 0, 14):
> + con->reserved_pages_in_bytes =
> AMDGPU_RAS_RESERVED_VRAM_SIZE;
> + break;
> + default:
> + break;
> + }
> +}
> +
>  int amdgpu_ras_init(struct amdgpu_device *adev)  {
>   struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -3422,6
> +3440,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   /* Get RAS schema for particular SOC */
>   con->schema = amdgpu_get_ras_schema(adev);
>
> + amdgpu_ras_init_reserved_vram_size(adev);
> +
>   if (amdgpu_ras_fs_init(adev)) {
>   r = -EINVAL;
>   goto release_con;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd

[PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2

2024-05-27 Thread Hawking Zhang
Add estimate of how much vram we need to reserve for RAS
when caculating the total available vram.

v2: apply the change to MP0 v13_0_2 and v13_0_14

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  9 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 20 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  4 
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index e98927529f61..ad813772f8a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
 {
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
size_t system_mem_needed, ttm_mem_needed, vram_needed;
int ret = 0;
uint64_t vram_size = 0;
@@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
 kfd_mem_limit.max_ttm_mem_limit) ||
(adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
-vram_size - reserved_for_pt - atomic64_read(>vram_pin_size) +
+vram_size - reserved_for_pt - reserved_for_ras - 
atomic64_read(>vram_pin_size) +
 atomic64_read(>kfd.vram_pinned))) {
ret = -ENOMEM;
goto release;
@@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct 
amdgpu_device *adev,
 {
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
ssize_t available;
uint64_t vram_available, system_mem_available, ttm_mem_available;
 
@@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct 
amdgpu_device *adev,
- adev->kfd.vram_used_aligned[xcp_id]
- atomic64_read(>vram_pin_size)
+ atomic64_read(>kfd.vram_pinned)
-   - reserved_for_pt;
+   - reserved_for_pt
+   - reserved_for_ras;
 
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
system_mem_available = no_system_mem_limit ?
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ecce022c657b..f28bf5765380 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3317,6 +3317,24 @@ static void amdgpu_ras_event_mgr_init(struct 
amdgpu_device *adev)
amdgpu_put_xgmi_hive(hive);
 }
 
+static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   if (!con || (adev->flags & AMD_IS_APU))
+   return;
+
+   switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
+   case IP_VERSION(13, 0, 2):
+   case IP_VERSION(13, 0, 6):
+   case IP_VERSION(13, 0, 14):
+   con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+   break;
+   default:
+   break;
+   }
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -3422,6 +3440,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
/* Get RAS schema for particular SOC */
con->schema = amdgpu_get_ras_schema(adev);
 
+   amdgpu_ras_init_reserved_vram_size(adev);
+
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6a8c7b1609df..bd61f77a7134 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -64,6 +64,9 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29
 #define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe000
 
+/* Reserve 8 physical dram row for possible retirement.
+ * In worst cases, it will lose 8 * 2MB memory in vram domain */
+#define AMDGPU_RAS_RESERVED_VRAM_SIZE  (16ULL << 20)
 /* The high three bits indicates socketid */
 #define AMDGPU_RAS_GET_FEATURES(val)  ((val) & 
~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
 
@@ -541,6 +544,7 @@ struct amdgpu_ras {
struct ras_event_manager __event_mgr;
struct ras_event_manager *event_mgr;
 
+   uint64_t reserved_pages_in_bytes;
 };
 
 struct ras_fs_data {
-- 
2.17.1