Re: [PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x

2023-02-06 Thread Felix Kuehling

On 2023-02-06 07:58, Ji, Ruili wrote:

From: Ruili Ji 

For the MQD memory, the KMD would always allocate 4K of memory,
and the MES scheduler would write to the end of the MQD for the unmap flag.

Signed-off-by: Ruili Ji 
---
  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 +++
  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  | 12 +--
  2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c06ada0844ba..d682e6921438 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2244,10 +2244,22 @@ static int allocate_hiq_sdma_mqd(struct 
device_queue_manager *dqm)
int retval;
struct kfd_dev *dev = dqm->dev;
struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd;
-   uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
-   get_num_all_sdma_engines(dqm) *
-   dev->device_info.num_sdma_queues_per_engine +
-   dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+   uint32_t size;
+   /*
+* MES writes to areas beyond the MQD size. So allocate
+* 1 PAGE_SIZE of memory for each MQD if MES is enabled.
+*/
+   if (dev->shared_resources.enable_mes) {
+   size = PAGE_SIZE *
+   get_num_all_sdma_engines(dqm) *
+   dev->device_info.num_sdma_queues_per_engine +
+   dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+   } else {
+   size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
+   get_num_all_sdma_engines(dqm) *
+   dev->device_info.num_sdma_queues_per_engine +
+   dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+   }


This function is needed mostly as a workaround for a limitation of the 
Arcturus firmware, which doesn't have enough SRAM to store 64-bit pointers 
to all SDMA MQDs. When using MES, you can probably just use the generic 
allocate_mqd/kfd_free_mqd_cp functions for SDMA MQDs. And you don't need 
an HIQ MQD at all, as far as I know, so you could skip 
allocate_hiq_sdma_mqd completely if MES is enabled.


Regards,
  Felix


  
  	retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size,

&(mem_obj->gtt_mem), &(mem_obj->gpu_addr),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index 623ccd227b7d..ea176a515898 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -66,15 +66,23 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev,
  {
struct kfd_mem_obj *mqd_mem_obj = NULL;
uint64_t offset;
+   uint32_t size;
  
  	mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);

if (!mqd_mem_obj)
return NULL;
+   /*
+* MES writes to areas beyond the MQD size. So allocate
+* 1 PAGE_SIZE of memory for each MQD if MES is enabled.
+*/
+   if (dev->shared_resources.enable_mes)
+   size = PAGE_SIZE;
+   else
+   size = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
  
  	offset = (q->sdma_engine_id *

dev->device_info.num_sdma_queues_per_engine +
-   q->sdma_queue_id) *
-   dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
+   q->sdma_queue_id) * size;
  
  	offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
  


RE: [PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x

2023-02-06 Thread Liu, Aaron
[AMD Official Use Only - General]

Reviewed-by: Aaron Liu 

> -----Original Message-----
> From: Ji, Ruili 
> Sent: Monday, February 6, 2023 8:58 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Kuehling, Felix
> ; Liu, Aaron ; Zhang, Yifan
> ; Ji, Ruili 
> Subject: [PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x
>
> From: Ruili Ji 
>
> For the MQD memory, the KMD would always allocate 4K of memory, and the MES
> scheduler would write to the end of the MQD for the unmap flag.
>
> Signed-off-by: Ruili Ji 
> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20
> +++  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> | 12 +--
>  2 files changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c06ada0844ba..d682e6921438 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -2244,10 +2244,22 @@ static int allocate_hiq_sdma_mqd(struct
> device_queue_manager *dqm)
>   int retval;
>   struct kfd_dev *dev = dqm->dev;
>   struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd;
> - uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]-
> >mqd_size *
> - get_num_all_sdma_engines(dqm) *
> - dev->device_info.num_sdma_queues_per_engine +
> - dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
> + uint32_t size;
> + /*
> +  * MES writes to areas beyond the MQD size. So allocate
> +  * 1 PAGE_SIZE of memory for each MQD if MES is enabled.
> +  */
> + if (dev->shared_resources.enable_mes) {
> + size = PAGE_SIZE *
> + get_num_all_sdma_engines(dqm) *
> + dev->device_info.num_sdma_queues_per_engine +
> + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
> + } else {
> + size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size
> *
> + get_num_all_sdma_engines(dqm) *
> + dev->device_info.num_sdma_queues_per_engine +
> + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
> + }
>
>   retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size,
>   &(mem_obj->gtt_mem), &(mem_obj->gpu_addr), diff --git
> a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> index 623ccd227b7d..ea176a515898 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> @@ -66,15 +66,23 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct
> kfd_dev *dev,  {
>   struct kfd_mem_obj *mqd_mem_obj = NULL;
>   uint64_t offset;
> + uint32_t size;
>
>   mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
>   if (!mqd_mem_obj)
>   return NULL;
> + /*
> +  * MES writes to areas beyond the MQD size. So allocate
> +  * 1 PAGE_SIZE of memory for each MQD if MES is enabled.
> +  */
> + if (dev->shared_resources.enable_mes)
> + size = PAGE_SIZE;
> + else
> + size = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]-
> >mqd_size;
>
>   offset = (q->sdma_engine_id *
>   dev->device_info.num_sdma_queues_per_engine +
> - q->sdma_queue_id) *
> - dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
> + q->sdma_queue_id) * size;
>
>   offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
>
> --
> 2.25.1



[PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x

2023-02-06 Thread Ji, Ruili
From: Ruili Ji 

For the MQD memory, the KMD would always allocate 4K of memory,
and the MES scheduler would write to the end of the MQD for the unmap flag.

Signed-off-by: Ruili Ji 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 +++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  | 12 +--
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c06ada0844ba..d682e6921438 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2244,10 +2244,22 @@ static int allocate_hiq_sdma_mqd(struct 
device_queue_manager *dqm)
int retval;
struct kfd_dev *dev = dqm->dev;
struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd;
-   uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
-   get_num_all_sdma_engines(dqm) *
-   dev->device_info.num_sdma_queues_per_engine +
-   dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+   uint32_t size;
+   /*
+* MES writes to areas beyond the MQD size. So allocate
+* 1 PAGE_SIZE of memory for each MQD if MES is enabled.
+*/
+   if (dev->shared_resources.enable_mes) {
+   size = PAGE_SIZE *
+   get_num_all_sdma_engines(dqm) *
+   dev->device_info.num_sdma_queues_per_engine +
+   dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+   } else {
+   size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
+   get_num_all_sdma_engines(dqm) *
+   dev->device_info.num_sdma_queues_per_engine +
+   dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+   }
 
retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size,
&(mem_obj->gtt_mem), &(mem_obj->gpu_addr),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index 623ccd227b7d..ea176a515898 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -66,15 +66,23 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev,
 {
struct kfd_mem_obj *mqd_mem_obj = NULL;
uint64_t offset;
+   uint32_t size;
 
mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
if (!mqd_mem_obj)
return NULL;
+   /*
+* MES writes to areas beyond the MQD size. So allocate
+* 1 PAGE_SIZE of memory for each MQD if MES is enabled.
+*/
+   if (dev->shared_resources.enable_mes)
+   size = PAGE_SIZE;
+   else
+   size = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
 
offset = (q->sdma_engine_id *
dev->device_info.num_sdma_queues_per_engine +
-   q->sdma_queue_id) *
-   dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
+   q->sdma_queue_id) * size;
 
offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
 
-- 
2.25.1