RE: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2
[AMD Official Use Only - AMD Internal Distribution Only] Thanks Tao. Yes, I added the comments to amdgpu_ras.h. +/* Reserve 8 physical dram row for possible retirement. + * In worst cases, it will lose 8 * 2MB memory in vram domain */ +#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20) Regards, Hawking -Original Message- From: Zhou1, Tao Sent: Tuesday, May 28, 2024 14:02 To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix ; Kasiviswanathan, Harish Subject: RE: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2 [AMD Official Use Only - AMD Internal Distribution Only] I prefer to add comment for AMDGPU_RAS_RESERVED_VRAM_SIZE to explain the value of 16MB, anyway the patch is: Reviewed-by: Tao Zhou > -Original Message- > From: Zhang, Hawking > Sent: Tuesday, May 28, 2024 1:57 PM > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao > Cc: Zhang, Hawking ; Kuehling, Felix > ; Kasiviswanathan, Harish > > Subject: [PATCH] drm/amdgpu: Estimate RAS reservation when report > capacity v2 > > Add estimate of how much vram we need to reserve for RAS when > caculating the total available vram. > > v2: apply the change to MP0 v13_0_2 and v13_0_14 > > Signed-off-by: Hawking Zhang > --- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 +++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 > 3 files changed, 31 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > index e98927529f61..ad813772f8a1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > @@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct > amdgpu_device *adev, { > uint64_t reserved_for_pt = > ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes > + : 0); > size_t system_mem_needed, ttm_mem_needed, vram_needed; > int ret = 0; > uint64_t vram_size = 0; > @@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct > amdgpu_device *adev, > (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > >kfd_mem_limit.max_ttm_mem_limit) || > (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + > vram_needed > > - vram_size - reserved_for_pt - atomic64_read(>vram_pin_size) > + > + vram_size - reserved_for_pt - reserved_for_ras - > +atomic64_read(>vram_pin_size) + >atomic64_read(>kfd.vram_pinned))) { > ret = -ENOMEM; > goto release; > @@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct > amdgpu_device *adev, { > uint64_t reserved_for_pt = > ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes > + : 0); > ssize_t available; > uint64_t vram_available, system_mem_available, > ttm_mem_available; > > @@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct > amdgpu_device *adev, > - adev->kfd.vram_used_aligned[xcp_id] > - atomic64_read(>vram_pin_size) > + atomic64_read(>kfd.vram_pinned) > - - reserved_for_pt; > + - reserved_for_pt > + - reserved_for_ras; > > if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { > system_mem_available = no_system_mem_limit ? > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index ecce022c657b..f28bf5765380 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3317,6 +3317,24 @@ static void amdgpu_ras_event_mgr_init(struct > amdgpu_device *adev) > amdgpu_put_xgmi_hive(hive); } > > +static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device > +*adev) { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + > + if (!con || (adev->flags & AMD_IS_APU)) > + return; > + > + switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { > + case IP_VERSION(13, 0, 2): > + case IP_VERSION(13, 0, 6): > + case IP_VERSION(13, 0, 14): > + con->reserved_pages_in_bytes = > AMDGPU_RAS_RESERVED_VRAM_SIZE; > + break; > + default: > +
RE: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2
[AMD Official Use Only - AMD Internal Distribution Only] I prefer to add comment for AMDGPU_RAS_RESERVED_VRAM_SIZE to explain the value of 16MB, anyway the patch is: Reviewed-by: Tao Zhou > -Original Message- > From: Zhang, Hawking > Sent: Tuesday, May 28, 2024 1:57 PM > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao > Cc: Zhang, Hawking ; Kuehling, Felix > ; Kasiviswanathan, Harish > > Subject: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2 > > Add estimate of how much vram we need to reserve for RAS when caculating the > total available vram. > > v2: apply the change to MP0 v13_0_2 and v13_0_14 > > Signed-off-by: Hawking Zhang > --- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 +++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 > 3 files changed, 31 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > index e98927529f61..ad813772f8a1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > @@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct > amdgpu_device *adev, { > uint64_t reserved_for_pt = > ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0); > size_t system_mem_needed, ttm_mem_needed, vram_needed; > int ret = 0; > uint64_t vram_size = 0; > @@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct > amdgpu_device *adev, > (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > >kfd_mem_limit.max_ttm_mem_limit) || > (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + > vram_needed > > - vram_size - reserved_for_pt - atomic64_read(>vram_pin_size) > + > + vram_size - reserved_for_pt - reserved_for_ras - > +atomic64_read(>vram_pin_size) + >atomic64_read(>kfd.vram_pinned))) { > ret = -ENOMEM; > goto release; > @@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct > amdgpu_device *adev, { > uint64_t reserved_for_pt = > ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0); > ssize_t available; > uint64_t vram_available, system_mem_available, ttm_mem_available; > > @@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct > amdgpu_device *adev, > - adev->kfd.vram_used_aligned[xcp_id] > - atomic64_read(>vram_pin_size) > + atomic64_read(>kfd.vram_pinned) > - - reserved_for_pt; > + - reserved_for_pt > + - reserved_for_ras; > > if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { > system_mem_available = no_system_mem_limit ? > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index ecce022c657b..f28bf5765380 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3317,6 +3317,24 @@ static void amdgpu_ras_event_mgr_init(struct > amdgpu_device *adev) > amdgpu_put_xgmi_hive(hive); > } > > +static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device > +*adev) { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + > + if (!con || (adev->flags & AMD_IS_APU)) > + return; > + > + switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { > + case IP_VERSION(13, 0, 2): > + case IP_VERSION(13, 0, 6): > + case IP_VERSION(13, 0, 14): > + con->reserved_pages_in_bytes = > AMDGPU_RAS_RESERVED_VRAM_SIZE; > + break; > + default: > + break; > + } > +} > + > int amdgpu_ras_init(struct amdgpu_device *adev) { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -3422,6 > +3440,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > /* Get RAS schema for particular SOC */ > con->schema = amdgpu_get_ras_schema(adev); > > + amdgpu_ras_init_reserved_vram_size(adev); > + > if (amdgpu_ras_fs_init(adev)) { > r = -EINVAL; > goto release_con; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd
[PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2
Add estimate of how much vram we need to reserve for RAS when caculating the total available vram. v2: apply the change to MP0 v13_0_2 and v13_0_14 Signed-off-by: Hawking Zhang --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index e98927529f61..ad813772f8a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, { uint64_t reserved_for_pt = ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0); size_t system_mem_needed, ttm_mem_needed, vram_needed; int ret = 0; uint64_t vram_size = 0; @@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > kfd_mem_limit.max_ttm_mem_limit) || (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed > -vram_size - reserved_for_pt - atomic64_read(>vram_pin_size) + +vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(>vram_pin_size) + atomic64_read(>kfd.vram_pinned))) { ret = -ENOMEM; goto release; @@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, { uint64_t reserved_for_pt = ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0); ssize_t available; uint64_t vram_available, system_mem_available, ttm_mem_available; @@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, - adev->kfd.vram_used_aligned[xcp_id] - atomic64_read(>vram_pin_size) + atomic64_read(>kfd.vram_pinned) - - reserved_for_pt; + - reserved_for_pt + - reserved_for_ras; if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { system_mem_available = no_system_mem_limit ? diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ecce022c657b..f28bf5765380 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3317,6 +3317,24 @@ static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) amdgpu_put_xgmi_hive(hive); } +static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!con || (adev->flags & AMD_IS_APU)) + return; + + switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { + case IP_VERSION(13, 0, 2): + case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 14): + con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE; + break; + default: + break; + } +} + int amdgpu_ras_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -3422,6 +3440,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev) /* Get RAS schema for particular SOC */ con->schema = amdgpu_get_ras_schema(adev); + amdgpu_ras_init_reserved_vram_size(adev); + if (amdgpu_ras_fs_init(adev)) { r = -EINVAL; goto release_con; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6a8c7b1609df..bd61f77a7134 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -64,6 +64,9 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29 #define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe000 +/* Reserve 8 physical dram row for possible retirement. + * In worst cases, it will lose 8 * 2MB memory in vram domain */ +#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20) /* The high three bits indicates socketid */ #define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) @@ -541,6 +544,7 @@ struct amdgpu_ras { struct ras_event_manager __event_mgr; struct ras_event_manager *event_mgr; + uint64_t reserved_pages_in_bytes; }; struct ras_fs_data { -- 2.17.1