Re: [PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x
On 2023-02-06 07:58, Ji, Ruili wrote: From: Ruili Ji For the MQD memory, KMD would always allocate 4K memory, and mes scheduler would write to the end of MQD for unmap flag. Signed-off-by: Ruili Ji --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 +++ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 12 +-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index c06ada0844ba..d682e6921438 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2244,10 +2244,22 @@ static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm) int retval; struct kfd_dev *dev = dqm->dev; struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd; - uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size * - get_num_all_sdma_engines(dqm) * - dev->device_info.num_sdma_queues_per_engine + - dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + uint32_t size; + /* +* MES write to areas beyond MQD size. So allocate +* 1 PAGE_SIZE memory for MQD is MES is enabled. +*/ + if (dev->shared_resources.enable_mes) { + size = PAGE_SIZE * + get_num_all_sdma_engines(dqm) * + dev->device_info.num_sdma_queues_per_engine + + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + } else { + size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size * + get_num_all_sdma_engines(dqm) * + dev->device_info.num_sdma_queues_per_engine + + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + } This function is needed mostly as a workaround for a limitation of the Arcturus firmware, which doesn't have enough SRAM to store 64-bit pointers to all SDMA MQDs. When using MES, you can probably just use the generic allocate_mqd/kfd_free_mqd_cp functions for SDMA MQDs. And you don't need an HIQ MQD at all, as far as I know, so you could skip allocate_hiq_sdma_mqd completely if MES is enabled.
Regards, Felix retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size, &(mem_obj->gtt_mem), &(mem_obj->gpu_addr), diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 623ccd227b7d..ea176a515898 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -66,15 +66,23 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev, { struct kfd_mem_obj *mqd_mem_obj = NULL; uint64_t offset; + uint32_t size; mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); if (!mqd_mem_obj) return NULL; + /* +* MES write to areas beyond MQD size. So allocate +* 1 PAGE_SIZE memory for MQD is MES is enabled. +*/ + if (dev->shared_resources.enable_mes) + size = PAGE_SIZE; + else + size = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size; offset = (q->sdma_engine_id * dev->device_info.num_sdma_queues_per_engine + - q->sdma_queue_id) * - dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size; + q->sdma_queue_id) * size; offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
RE: [PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x
[AMD Official Use Only - General] Reviewed-by: Aaron Liu > -Original Message- > From: Ji, Ruili > Sent: Monday, February 6, 2023 8:58 PM > To: amd-gfx@lists.freedesktop.org > Cc: Deucher, Alexander ; Kuehling, Felix > ; Liu, Aaron ; Zhang, Yifan > ; Ji, Ruili > Subject: [PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x > > From: Ruili Ji > > For the MQD memory, KMD would always allocate 4K memory, and mes > scheduler would write to the end of MQD for unmap flag. > > Signed-off-by: Ruili Ji > --- > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 > +++ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c > | 12 +-- > 2 files changed, 26 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index c06ada0844ba..d682e6921438 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -2244,10 +2244,22 @@ static int allocate_hiq_sdma_mqd(struct > device_queue_manager *dqm) > int retval; > struct kfd_dev *dev = dqm->dev; > struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd; > - uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]- > >mqd_size * > - get_num_all_sdma_engines(dqm) * > - dev->device_info.num_sdma_queues_per_engine + > - dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; > + uint32_t size; > + /* > + * MES write to areas beyond MQD size. So allocate > + * 1 PAGE_SIZE memory for MQD is MES is enabled. 
> + */ > + if (dev->shared_resources.enable_mes) { > + size = PAGE_SIZE * > + get_num_all_sdma_engines(dqm) * > + dev->device_info.num_sdma_queues_per_engine + > + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; > + } else { > + size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size > * > + get_num_all_sdma_engines(dqm) * > + dev->device_info.num_sdma_queues_per_engine + > + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; > + } > > retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size, > &(mem_obj->gtt_mem), &(mem_obj->gpu_addr), diff --git > a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c > index 623ccd227b7d..ea176a515898 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c > @@ -66,15 +66,23 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct > kfd_dev *dev, { > struct kfd_mem_obj *mqd_mem_obj = NULL; > uint64_t offset; > + uint32_t size; > > mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); > if (!mqd_mem_obj) > return NULL; > + /* > + * MES write to areas beyond MQD size. So allocate > + * 1 PAGE_SIZE memory for MQD is MES is enabled. > + */ > + if (dev->shared_resources.enable_mes) > + size = PAGE_SIZE; > + else > + size = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]- > >mqd_size; > > offset = (q->sdma_engine_id * > dev->device_info.num_sdma_queues_per_engine + > - q->sdma_queue_id) * > - dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size; > + q->sdma_queue_id) * size; > > offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; > > -- > 2.25.1
[PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x
From: Ruili Ji For the MQD memory, KMD would always allocate 4K memory, and mes scheduler would write to the end of MQD for unmap flag. Signed-off-by: Ruili Ji --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 +++ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 12 +-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index c06ada0844ba..d682e6921438 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2244,10 +2244,22 @@ static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm) int retval; struct kfd_dev *dev = dqm->dev; struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd; - uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size * - get_num_all_sdma_engines(dqm) * - dev->device_info.num_sdma_queues_per_engine + - dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + uint32_t size; + /* +* MES write to areas beyond MQD size. So allocate +* 1 PAGE_SIZE memory for MQD is MES is enabled. 
+*/ + if (dev->shared_resources.enable_mes) { + size = PAGE_SIZE * + get_num_all_sdma_engines(dqm) * + dev->device_info.num_sdma_queues_per_engine + + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + } else { + size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size * + get_num_all_sdma_engines(dqm) * + dev->device_info.num_sdma_queues_per_engine + + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + } retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size, &(mem_obj->gtt_mem), &(mem_obj->gpu_addr), diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 623ccd227b7d..ea176a515898 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -66,15 +66,23 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev, { struct kfd_mem_obj *mqd_mem_obj = NULL; uint64_t offset; + uint32_t size; mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); if (!mqd_mem_obj) return NULL; + /* +* MES write to areas beyond MQD size. So allocate +* 1 PAGE_SIZE memory for MQD is MES is enabled. +*/ + if (dev->shared_resources.enable_mes) + size = PAGE_SIZE; + else + size = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size; offset = (q->sdma_engine_id * dev->device_info.num_sdma_queues_per_engine + - q->sdma_queue_id) * - dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size; + q->sdma_queue_id) * size; offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; -- 2.25.1