Re: [PATCH] drm/amdkfd: fix the hang caused by the write reorder to fence_addr
On 2024-10-21 04:12, Christian König wrote: Am 18.10.24 um 23:59 schrieb Philip Yang: On 2024-10-18 14:28, Felix Kuehling wrote: On 2024-10-17 04:34, Victor Zhao wrote: make sure KFD_FENCE_INIT write to fence_addr before pm_send_query_status called, to avoid qcm fence timeout caused by incorrect ordering. Signed-off-by: Victor Zhao --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index b2b16a812e73..d9264a353775 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2254,6 +2254,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, goto out; *dqm->fence_addr = KFD_FENCE_INIT; + mb(); pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr, KFD_FENCE_COMPLETED); /* should be timed out */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 09ab36f8e8c6..bddb169bb301 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -260,7 +260,7 @@ struct device_queue_manager { uint16_t vmid_pasid[VMID_NUM]; uint64_t pipelines_addr; uint64_t fence_gpu_addr; - uint64_t *fence_addr; + volatile uint64_t *fence_addr; [+Christian] Is the volatile keyword really needed here? I just saw other patches removing volatile in some places because it's not sufficient, and not needed if you use memory barriers correctly. After reading kernel memory barrier document and below link, I think we need both volatile type and memory barrier, to guarantee F/W get the updated fence value. This fixes an CP hang issue on SRIOV. https://stackoverflow.com/questions/75750110/volatile-vs-memory-barriers#:~:text=volatile%20will%20make%20sure%20that,not%20reorder%20writes%20or%20reads. No, that isn't correct. Using volatile is considered harmful and almost never correct, see here https://docs.kernel.org/process/volatile-considered-harmful.html Placing appropriate memory barriers must be sufficient or otherwise there is a rather bad platform or compiler bug lurking around. Yes, Victor confirmed that memory barrier fixes the issue, will send new patch to remove the volatile type. Regards, Philip Regards, Christian. Regards, Philip Regards, Felix struct kfd_mem_obj *fence_mem; bool active_runlist; int sched_policy;
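The outcome of the thread above is to keep the mb() and drop the volatile qualifier. A minimal sketch of that pattern, reusing the KFD_FENCE_INIT/KFD_FENCE_COMPLETED values and pm_send_query_status() from the patch; the polling loop is illustrative only and is not the driver's actual fence-wait helper:

/* Writer side: make the init value in the fence slot globally visible
 * before the query-status packet that asks the firmware to overwrite it.
 * No volatile needed; the barrier provides the ordering.
 */
*dqm->fence_addr = KFD_FENCE_INIT;
mb();	/* order the store above against the packet submission below */
pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr,
		     KFD_FENCE_COMPLETED);

/* Reader side (illustrative): force a fresh load on every iteration with
 * READ_ONCE() instead of marking the pointer volatile.
 */
unsigned long timeout = jiffies + msecs_to_jiffies(100);

while (READ_ONCE(*dqm->fence_addr) != KFD_FENCE_COMPLETED) {
	if (time_after(jiffies, timeout))
		return -ETIME;
	cpu_relax();
}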
Re: [PATCH] Revert "drm/amdkfd: SMI report dropped event count"
On 2024-10-21 13:46, Alex Deucher wrote: This reverts commit a3ab2d45b9887ee609cd3bea39f668236935774c. The userspace side for this code is not ready yet so revert for now. Signed-off-by: Alex Deucher Cc: Philip Yang Reviewed-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 27 +++-- include/uapi/linux/kfd_ioctl.h | 6 - 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index c8d67d62ca3f..9b8169761ec5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -42,7 +42,6 @@ struct kfd_smi_client { struct rcu_head rcu; pid_t pid; bool suser; - u32 drop_count; }; #define KFD_MAX_KFIFO_SIZE 8192 @@ -104,28 +103,12 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, } to_copy = min(size, to_copy); ret = kfifo_out(&client->fifo, buf, to_copy); + spin_unlock(&client->lock); if (ret <= 0) { - spin_unlock(&client->lock); ret = -EAGAIN; goto ret_err; } - if (client->drop_count) { - char msg[KFD_SMI_EVENT_MSG_SIZE]; - int len; - - len = snprintf(msg, sizeof(msg), "%x ", KFD_SMI_EVENT_DROPPED_EVENT); - len += snprintf(msg + len, sizeof(msg) - len, -KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(), -client->pid, client->drop_count)); - if (kfifo_avail(&client->fifo) >= len) { - kfifo_in(&client->fifo, msg, len); - client->drop_count = 0; - } - } - - spin_unlock(&client->lock); - ret = copy_to_user(user, buf, to_copy); if (ret) { ret = -EFAULT; @@ -199,15 +182,13 @@ static void add_event_to_kfifo(pid_t pid, struct kfd_node *dev, list_for_each_entry_rcu(client, &dev->smi_clients, list) { if (!kfd_smi_ev_enabled(pid, client, smi_event)) continue; - spin_lock(&client->lock); - if (!client->drop_count && kfifo_avail(&client->fifo) >= len) { + if (kfifo_avail(&client->fifo) >= len) { kfifo_in(&client->fifo, event_msg, len); wake_up_all(&client->wait_queue); } else { - client->drop_count++; - pr_debug("smi_event(EventID: %u): no space left drop_count %d\n", - smi_event, client->drop_count); + pr_debug("smi_event(EventID: %u): no space left\n", + smi_event); } spin_unlock(&client->lock); } diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 7afd66d45313..fa9f9846b88e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -530,7 +530,6 @@ enum kfd_smi_event { KFD_SMI_EVENT_QUEUE_EVICTION = 9, KFD_SMI_EVENT_QUEUE_RESTORE = 10, KFD_SMI_EVENT_UNMAP_FROM_GPU = 11, - KFD_SMI_EVENT_DROPPED_EVENT = 12, /* * max event number, as a flag bit to get events from all processes, @@ -611,7 +610,6 @@ struct kfd_ioctl_smi_events_args { *rw: 'W' for write page fault, 'R' for read page fault *rescheduled: 'R' if the queue restore failed and rescheduled to try again *error_code: migrate failure error code, 0 if no error - *drop_count: how many events dropped when fifo is full */ #define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\ "%x %s\n", (reset_seq_num), (reset_cause) @@ -647,10 +645,6 @@ struct kfd_ioctl_smi_events_args { "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\ (node), (unmap_trigger) -#define KFD_EVENT_FMT_DROPPED_EVENT(ns, pid, drop_count)\ - "%lld -%d %d\n", (ns), (pid), (drop_count) - - /** * CRIU IOCTLs (Checkpoint Restore In Userspace) *
Re: [PATCH] drm/amdkfd: fix the hang caused by the write reorder to fence_addr
On 2024-10-18 14:28, Felix Kuehling wrote: On 2024-10-17 04:34, Victor Zhao wrote: make sure KFD_FENCE_INIT write to fence_addr before pm_send_query_status called, to avoid qcm fence timeout caused by incorrect ordering. Signed-off-by: Victor Zhao --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index b2b16a812e73..d9264a353775 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2254,6 +2254,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, goto out; *dqm->fence_addr = KFD_FENCE_INIT; + mb(); pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr, KFD_FENCE_COMPLETED); /* should be timed out */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 09ab36f8e8c6..bddb169bb301 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -260,7 +260,7 @@ struct device_queue_manager { uint16_t vmid_pasid[VMID_NUM]; uint64_t pipelines_addr; uint64_t fence_gpu_addr; - uint64_t *fence_addr; + volatile uint64_t *fence_addr; [+Christian] Is the volatile keyword really needed here? I just saw other patches removing volatile in some places because it's not sufficient, and not needed if you use memory barriers correctly. After reading kernel memory barrier document and below link, I think we need both volatile type and memory barrier, to guarantee F/W get the updated fence value. This fixes an CP hang issue on SRIOV. https://stackoverflow.com/questions/75750110/volatile-vs-memory-barriers#:~:text=volatile%20will%20make%20sure%20that,not%20reorder%20writes%20or%20reads. Regards, Philip Regards, Felix struct kfd_mem_obj *fence_mem; bool active_runlist; int sched_policy;
Re: [PATCH] drm/amd/amdkfd: add/remove kfd queues on start/stop KFD scheduling
It is safe to access dqm->sched status inside dqm_lock, no race with gpu reset. Reviewed-by: Philip Yang On 2024-10-18 11:10, Shaoyun Liu wrote: From: shaoyunl Add back kfd queues in start scheduling that originally been removed on stop scheduling. Signed-off-by: Shaoyun Liu Reviewed-by: Felix Kuehling --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 40 +-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index b2b16a812e73..edfb9f98555f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -202,6 +202,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, int r, queue_type; uint64_t wptr_addr_off; + if (!dqm->sched_running || dqm->sched_halt) + return 0; if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; @@ -270,6 +272,8 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, int r; struct mes_remove_queue_input queue_input; + if (!dqm->sched_running || dqm->sched_halt) + return 0; if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; @@ -292,7 +296,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, return r; } -static int remove_all_queues_mes(struct device_queue_manager *dqm) +static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm) { struct device_process_node *cur; struct device *dev = dqm->dev->adev->dev; @@ -319,6 +323,33 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm) return retval; } +static int add_all_kfd_queues_mes(struct device_queue_manager *dqm) +{ + struct device_process_node *cur; + struct device *dev = dqm->dev->adev->dev; + struct qcm_process_device *qpd; + struct queue *q; + int retval = 0; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) +continue; + retval = add_queue_mes(dqm, q, qpd); + if (retval) { +dev_err(dev, "%s: Failed to add queue %d for dev %d", + __func__, + q->properties.queue_id, + dqm->dev->id); +return retval; + } + } + } + + return retval; +} + static int suspend_all_queues_mes(struct device_queue_manager *dqm) { struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; @@ -1742,7 +1773,7 @@ static int halt_cpsch(struct device_queue_manager *dqm) KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); else - ret = remove_all_queues_mes(dqm); + ret = remove_all_kfd_queues_mes(dqm); } dqm->sched_halt = true; dqm_unlock(dqm); @@ -1768,6 +1799,9 @@ static int unhalt_cpsch(struct device_queue_manager *dqm) ret = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD); + else + ret = add_all_kfd_queues_mes(dqm); + dqm_unlock(dqm); return ret; @@ -1867,7 +1901,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) if (!dqm->dev->kfd->shared_resources.enable_mes) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); else - remove_all_queues_mes(dqm); + remove_all_kfd_queues_mes(dqm); dqm->sched_running = false;
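Philip's note that dqm->sched state is safe to read under dqm_lock can be shown with a short sketch (not the exact driver code): the MES add/remove helpers and halt/unhalt all run with dqm_lock held, so the early-out below cannot race with the flags changing underneath it.

/* Sketch of the guard added to add_queue_mes()/remove_queue_mes().
 * The caller holds dqm_lock, so sched_running/sched_halt are stable
 * for the duration of the check and the MES call.
 */
static int add_queue_mes_sketch(struct device_queue_manager *dqm,
				struct queue *q,
				struct qcm_process_device *qpd)
{
	/* Queues skipped here are re-added by add_all_kfd_queues_mes()
	 * when unhalt_cpsch()/start_cpsch() runs.
	 */
	if (!dqm->sched_running || dqm->sched_halt)
		return 0;

	/* ... fill mes_add_queue_input and submit it to the MES ... */
	return 0;
}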
Re: [PATCH] drm/amdkfd: fix the hang caused by the write reorder to fence_addr
On 2024-10-18 01:31, Zhao, Victor wrote: [AMD Official Use Only - AMD Internal Distribution Only] [AMD Official Use Only - AMD Internal Distribution Only] Ping. Please help review. Thanks, Victor -Original Message- From: Victor Zhao Sent: Thursday, October 17, 2024 4:35 PM To: amd-gfx@lists.freedesktop.org Cc: Zhao, Victor Subject: [PATCH] drm/amdkfd: fix the hang caused by the write reorder to fence_addr make sure KFD_FENCE_INIT write to fence_addr before pm_send_query_status called, to avoid qcm fence timeout caused by incorrect ordering. Signed-off-by: Victor Zhao Reviewed-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index b2b16a812e73..d9264a353775 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2254,6 +2254,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, goto out; *dqm->fence_addr = KFD_FENCE_INIT; + mb(); pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr, KFD_FENCE_COMPLETED); /* should be timed out */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 09ab36f8e8c6..bddb169bb301 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -260,7 +260,7 @@ struct device_queue_manager { uint16_tvmid_pasid[VMID_NUM]; uint64_tpipelines_addr; uint64_tfence_gpu_addr; - uint64_t*fence_addr; + volatile uint64_t *fence_addr; struct kfd_mem_obj *fence_mem; boolactive_runlist; int sched_policy; -- 2.34.1
Re: [PATCH] drm/amd/amdkfd: add/remove kfd queues on start/stop KFD scheduling
On 2024-10-17 12:12, Shaoyun Liu wrote: From: shaoyunl Add back kfd queues in start scheduling that originally been removed on stop scheduling. Signed-off-by: Shaoyun Liu --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 40 +-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index b2b16a812e73..542363b4712e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -204,6 +204,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; + if (!dqm->sched_running || dqm->sched_halt) { up_read(&adev->reset_domain->sem); + return 0; } memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input)); queue_input.process_id = qpd->pqm->process->pasid; @@ -272,6 +274,8 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; + if (!dqm->sched_running || dqm->sched_halt) { up_read(&adev->reset_domain->sem); + return 0; } It is simpler to move sched_halt/running check outside reset sem lock, but not sure if it is safe. Regards, Philip memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); queue_input.doorbell_offset = q->properties.doorbell_off; @@ -292,7 +296,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, return r; } -static int remove_all_queues_mes(struct device_queue_manager *dqm) +static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm) { struct device_process_node *cur; struct device *dev = dqm->dev->adev->dev; @@ -319,6 +323,33 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm) return retval; } +static int add_all_kfd_queues_mes(struct device_queue_manager *dqm) +{ + struct device_process_node *cur; + struct device *dev = dqm->dev->adev->dev; + struct qcm_process_device *qpd; + struct queue *q; + int retval = 0; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) +continue; + retval = add_queue_mes(dqm, q, qpd); + if (retval) { +dev_err(dev, "%s: Failed to add queue %d for dev %d", + __func__, + q->properties.queue_id, + dqm->dev->id); +return retval; + } + } + } + + return retval; +} + static int suspend_all_queues_mes(struct device_queue_manager *dqm) { struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; @@ -1742,7 +1773,7 @@ static int halt_cpsch(struct device_queue_manager *dqm) KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); else - ret = remove_all_queues_mes(dqm); + ret = remove_all_kfd_queues_mes(dqm); } dqm->sched_halt = true; dqm_unlock(dqm); @@ -1768,6 +1799,9 @@ static int unhalt_cpsch(struct device_queue_manager *dqm) ret = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD); + else + ret = add_all_kfd_queues_mes(dqm); + dqm_unlock(dqm); return ret; @@ -1867,7 +1901,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) if (!dqm->dev->kfd->shared_resources.enable_mes) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); else - remove_all_queues_mes(dqm); + remove_all_kfd_queues_mes(dqm); dqm->sched_running = false;
[PATCH v3] drm/amdkfd: Accounting pdd vram_usage for svm
Process device data pdd->vram_usage is read by rocm-smi via sysfs, this is currently missing the svm_bo usage accounting, so "rocm-smi --showpids" per process VRAM usage report is incorrect. Add pdd->vram_usage accounting when svm_bo allocation and release, change to atomic64_t type because it is updated outside process mutex now. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 6 +++--- drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 26 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index a1f191a5984b..065d87841459 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1148,7 +1148,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, if (flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) size >>= 1; - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + PAGE_ALIGN(size)); + atomic64_add(PAGE_ALIGN(size), &pdd->vram_usage); } mutex_unlock(&p->mutex); @@ -1219,7 +1219,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, kfd_process_device_remove_obj_handle( pdd, GET_IDR_HANDLE(args->handle)); - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size); + atomic64_sub(size, &pdd->vram_usage); err_unlock: err_pdd: @@ -2347,7 +2347,7 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { bo_bucket->restored_offset = offset; /* Update the VRAM usage count */ - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size); + atomic64_add(bo_bucket->size, &pdd->vram_usage); } return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 6a5bf88cc232..9e5ca0b93b2a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -775,7 +775,7 @@ struct kfd_process_device { enum kfd_pdd_bound bound; /* VRAM usage */ - uint64_t vram_usage; + atomic64_t vram_usage; struct attribute attr_vram; char vram_filename[MAX_SYSFS_FILENAME_LEN]; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 7909dfd158be..4810521736a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -332,7 +332,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, } else if (strncmp(attr->name, "vram_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_vram); - return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage)); + return snprintf(buffer, PAGE_SIZE, "%llu\n", atomic64_read(&pdd->vram_usage)); } else if (strncmp(attr->name, "sdma_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_sdma); @@ -1625,7 +1625,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, pdd->bound = PDD_UNBOUND; pdd->already_dequeued = false; pdd->runtime_inuse = false; - pdd->vram_usage = 0; + atomic64_set(&pdd->vram_usage, 0); pdd->sdma_past_activity_counter = 0; pdd->user_gpu_id = dev->id; atomic64_set(&pdd->evict_duration_counter, 0); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 857ec6f23bba..3e2911895c74 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -405,6 +405,27 @@ 
static void svm_range_bo_release(struct kref *kref) spin_lock(&svm_bo->list_lock); } spin_unlock(&svm_bo->list_lock); + + if (mmget_not_zero(svm_bo->eviction_fence->mm)) { + struct kfd_process_device *pdd; + struct kfd_process *p; + struct mm_struct *mm; + + mm = svm_bo->eviction_fence->mm; + /* +* The forked child process takes svm_bo device pages ref, svm_bo could be +* released after parent process is gone. +*/ + p = kfd_lookup_process_by_mm(mm); +
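The hunk above is cut off; below is a hedged sketch of the release-side accounting it is building up to. kfd_lookup_process_by_mm() is quoted from the patch, while kfd_get_process_device_data() is an assumed helper and may not match the exact v3 code.

/* Sketch: on svm_bo release, credit the VRAM back to the owning process
 * device.  The lookup goes through the eviction fence's mm because a
 * forked child can hold the last svm_bo reference after the parent
 * process (and its pdds) are gone, in which case accounting is skipped.
 */
if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
	struct mm_struct *mm = svm_bo->eviction_fence->mm;
	struct kfd_process *p = kfd_lookup_process_by_mm(mm);

	if (p) {
		struct kfd_process_device *pdd =
			kfd_get_process_device_data(svm_bo->node, p);	/* assumed helper */

		if (pdd)
			atomic64_sub(amdgpu_bo_size(svm_bo->bo), &pdd->vram_usage);
		kfd_unref_process(p);
	}
	mmput(mm);
}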
Re: [PATCH 1/2] drm/amdkfd: Save pdd to svm_bo to replace node
Drop this patch series, as Felix pointed out, the forked process takes svm_bo device pages ref, svm_bo->pdd could refer to the process that doesn't exist any more. Regards, Philip On 2024-10-11 11:00, Philip Yang wrote: KFD process device data pdd will be used for VRAM usage accounting, save pdd to svm_bo to avoid searching pdd for every accounting, and get KFD node from pdd->dev. svm_bo->pdd will always be valid because KFD process release free all svm_bo first, then destroy process pdds. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 27 +-- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 857ec6f23bba..d40f6fb803df 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -180,7 +180,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, page = hmm_pfn_to_page(hmm_pfns[i]); if (is_zone_device_page(page)) { - struct amdgpu_device *bo_adev = prange->svm_bo->node->adev; + struct amdgpu_device *bo_adev = prange->svm_bo->pdd->dev->adev; addr[i] = (hmm_pfns[i] << PAGE_SHIFT) + bo_adev->vm_manager.vram_base_offset - @@ -457,11 +457,11 @@ svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange) } if (svm_bo_ref_unless_zero(prange->svm_bo)) { /* - * Migrate from GPU to GPU, remove range from source svm_bo->node + * Migrate from GPU to GPU, remove range from source svm_bo node * range list, and return false to allocate svm_bo from destination * node. */ - if (prange->svm_bo->node != node) { + if (prange->svm_bo->pdd->dev != node) { mutex_unlock(&prange->lock); spin_lock(&prange->svm_bo->list_lock); @@ -532,6 +532,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, bool clear) { + struct kfd_process_device *pdd; struct amdgpu_bo_param bp; struct svm_range_bo *svm_bo; struct amdgpu_bo_user *ubo; @@ -548,17 +549,22 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, return 0; svm_bo = svm_range_bo_new(); - if (!svm_bo) { - pr_debug("failed to alloc svm bo\n"); + if (!svm_bo) return -ENOMEM; + + pdd = svm_range_get_pdd_by_node(prange, node); + if (!pdd) { + r = -ESRCH; + goto out_free; } + svm_bo->pdd = pdd; + mm = get_task_mm(p->lead_thread); if (!mm) { pr_debug("failed to get mm\n"); - kfree(svm_bo); - return -ESRCH; + r = -ESRCH; + goto out_free; } - svm_bo->node = node; svm_bo->eviction_fence = amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), mm, @@ -629,6 +635,7 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, amdgpu_bo_unref(&bo); create_bo_failed: dma_fence_put(&svm_bo->eviction_fence->base); +out_free: kfree(svm_bo); prange->ttm_res = NULL; @@ -1176,7 +1183,7 @@ svm_range_get_pte_flags(struct kfd_node *node, unsigned int mtype_local; if (domain == SVM_RANGE_VRAM_DOMAIN) - bo_node = prange->svm_bo->node; + bo_node = prange->svm_bo->pdd->dev; switch (amdgpu_ip_version(node->adev, GC_HWIP, 0)) { case IP_VERSION(9, 4, 1): @@ -1440,7 +1447,7 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, int r = 0; if (prange->svm_bo && prange->ttm_res) - bo_adev = prange->svm_bo->node->adev; + bo_adev = prange->svm_bo->pdd->dev->adev; p = container_of(prange->svms, struct kfd_process, svms); for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index 
bddd24f04669..fad2d6d2223a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -48,7 +48,7 @@ struct svm_range_bo { struct work_struct eviction_work; uint32_t evicting; struct work_struct release_work; - struct kfd_node *node; + struct kfd_process_device *pdd; }; enum svm_work_list_ops {
[PATCH 2/2] drm/amdkfd: Accounting pdd vram_usage for svm
Process device data pdd->vram_usage is read by rocm-smi via sysfs, this is currently missing the svm_bo usage accounting, so "rocm-smi --showpids" per process VRAM usage report is incorrect. Add pdd->vram_usage accounting when svm_bo allocation and free, and change type to atomic64_t because it is updated outside process mutex now. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 6 +++--- drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 ++ 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index a1f191a5984b..065d87841459 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1148,7 +1148,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, if (flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) size >>= 1; - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + PAGE_ALIGN(size)); + atomic64_add(PAGE_ALIGN(size), &pdd->vram_usage); } mutex_unlock(&p->mutex); @@ -1219,7 +1219,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, kfd_process_device_remove_obj_handle( pdd, GET_IDR_HANDLE(args->handle)); - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size); + atomic64_sub(size, &pdd->vram_usage); err_unlock: err_pdd: @@ -2347,7 +2347,7 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { bo_bucket->restored_offset = offset; /* Update the VRAM usage count */ - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size); + atomic64_add(bo_bucket->size, &pdd->vram_usage); } return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 6a5bf88cc232..9e5ca0b93b2a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -775,7 +775,7 @@ struct kfd_process_device { enum kfd_pdd_bound bound; /* VRAM usage */ - uint64_t vram_usage; + atomic64_t vram_usage; struct attribute attr_vram; char vram_filename[MAX_SYSFS_FILENAME_LEN]; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 7909dfd158be..4810521736a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -332,7 +332,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, } else if (strncmp(attr->name, "vram_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_vram); - return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage)); + return snprintf(buffer, PAGE_SIZE, "%llu\n", atomic64_read(&pdd->vram_usage)); } else if (strncmp(attr->name, "sdma_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_sdma); @@ -1625,7 +1625,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, pdd->bound = PDD_UNBOUND; pdd->already_dequeued = false; pdd->runtime_inuse = false; - pdd->vram_usage = 0; + atomic64_set(&pdd->vram_usage, 0); pdd->sdma_past_activity_counter = 0; pdd->user_gpu_id = dev->id; atomic64_set(&pdd->evict_duration_counter, 0); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index d40f6fb803df..ba501fffa556 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -409,6 +409,7 @@ 
static void svm_range_bo_release(struct kref *kref) /* We're not in the eviction worker. Signal the fence. */ dma_fence_signal(&svm_bo->eviction_fence->base); dma_fence_put(&svm_bo->eviction_fence->base); + atomic64_sub(amdgpu_bo_size(svm_bo->bo), &svm_bo->pdd->vram_usage); amdgpu_bo_unref(&svm_bo->bo); kfree(svm_bo); } @@ -628,6 +629,7 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, spin_lock(&svm_bo->list_lock); list_add(&prange->svm_bo_list, &svm_bo->range_list); spin_unlock(&svm_bo->list_lock); + atomic64_add(amdgpu_bo_size(bo), &svm_bo->pdd->vram_usage); return 0; -- 2.43.2
[PATCH 1/2] drm/amdkfd: Save pdd to svm_bo to replace node
KFD process device data pdd will be used for VRAM usage accounting, save pdd to svm_bo to avoid searching pdd for every accounting, and get KFD node from pdd->dev. svm_bo->pdd will always be valid because KFD process release free all svm_bo first, then destroy process pdds. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 27 +-- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 857ec6f23bba..d40f6fb803df 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -180,7 +180,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, page = hmm_pfn_to_page(hmm_pfns[i]); if (is_zone_device_page(page)) { - struct amdgpu_device *bo_adev = prange->svm_bo->node->adev; + struct amdgpu_device *bo_adev = prange->svm_bo->pdd->dev->adev; addr[i] = (hmm_pfns[i] << PAGE_SHIFT) + bo_adev->vm_manager.vram_base_offset - @@ -457,11 +457,11 @@ svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange) } if (svm_bo_ref_unless_zero(prange->svm_bo)) { /* -* Migrate from GPU to GPU, remove range from source svm_bo->node +* Migrate from GPU to GPU, remove range from source svm_bo node * range list, and return false to allocate svm_bo from destination * node. */ - if (prange->svm_bo->node != node) { + if (prange->svm_bo->pdd->dev != node) { mutex_unlock(&prange->lock); spin_lock(&prange->svm_bo->list_lock); @@ -532,6 +532,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, bool clear) { + struct kfd_process_device *pdd; struct amdgpu_bo_param bp; struct svm_range_bo *svm_bo; struct amdgpu_bo_user *ubo; @@ -548,17 +549,22 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, return 0; svm_bo = svm_range_bo_new(); - if (!svm_bo) { - pr_debug("failed to alloc svm bo\n"); + if (!svm_bo) return -ENOMEM; + + pdd = svm_range_get_pdd_by_node(prange, node); + if (!pdd) { + r = -ESRCH; + goto out_free; } + svm_bo->pdd = pdd; + mm = get_task_mm(p->lead_thread); if (!mm) { pr_debug("failed to get mm\n"); - kfree(svm_bo); - return -ESRCH; + r = -ESRCH; + goto out_free; } - svm_bo->node = node; svm_bo->eviction_fence = amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), mm, @@ -629,6 +635,7 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, amdgpu_bo_unref(&bo); create_bo_failed: dma_fence_put(&svm_bo->eviction_fence->base); +out_free: kfree(svm_bo); prange->ttm_res = NULL; @@ -1176,7 +1183,7 @@ svm_range_get_pte_flags(struct kfd_node *node, unsigned int mtype_local; if (domain == SVM_RANGE_VRAM_DOMAIN) - bo_node = prange->svm_bo->node; + bo_node = prange->svm_bo->pdd->dev; switch (amdgpu_ip_version(node->adev, GC_HWIP, 0)) { case IP_VERSION(9, 4, 1): @@ -1440,7 +1447,7 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, int r = 0; if (prange->svm_bo && prange->ttm_res) - bo_adev = prange->svm_bo->node->adev; + bo_adev = prange->svm_bo->pdd->dev->adev; p = container_of(prange->svms, struct kfd_process, svms); for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index bddd24f04669..fad2d6d2223a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -48,7 +48,7 @@ struct svm_range_bo { struct work_struct eviction_work; uint32_tevicting; struct work_struct release_work; - 
struct kfd_node *node; + struct kfd_process_device *pdd; }; enum svm_work_list_ops { -- 2.43.2
Re: [PATCH] drm/amdkfd: Accounting pdd vram_usage for svm
On 2024-10-09 17:20, Felix Kuehling wrote: On 2024-10-04 16:28, Philip Yang wrote: Per process device data pdd->vram_usage is used by rocm-smi to report VRAM usage, this is currently missing the svm_bo usage accounting, so "rocm-smi --showpids" per process report is incorrect. Add pdd->vram_usage accounting for svm_bo and change type to atomic64_t because it is updated outside process mutex now. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 6 +++--- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 22 ++ 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index a1f191a5984b..065d87841459 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1148,7 +1148,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, if (flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) size >>= 1; - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + PAGE_ALIGN(size)); + atomic64_add(PAGE_ALIGN(size), &pdd->vram_usage); } mutex_unlock(&p->mutex); @@ -1219,7 +1219,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, kfd_process_device_remove_obj_handle( pdd, GET_IDR_HANDLE(args->handle)); - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size); + atomic64_sub(size, &pdd->vram_usage); err_unlock: err_pdd: @@ -2347,7 +2347,7 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { bo_bucket->restored_offset = offset; /* Update the VRAM usage count */ - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size); + atomic64_add(bo_bucket->size, &pdd->vram_usage); } return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 6a5bf88cc232..9e5ca0b93b2a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -775,7 +775,7 @@ struct kfd_process_device { enum kfd_pdd_bound bound; /* VRAM usage */ - uint64_t vram_usage; + atomic64_t vram_usage; struct attribute attr_vram; char vram_filename[MAX_SYSFS_FILENAME_LEN]; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 7909dfd158be..4810521736a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -332,7 +332,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, } else if (strncmp(attr->name, "vram_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_vram); - return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage)); + return snprintf(buffer, PAGE_SIZE, "%llu\n", atomic64_read(&pdd->vram_usage)); } else if (strncmp(attr->name, "sdma_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_sdma); @@ -1625,7 +1625,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, pdd->b
[PATCH] drm/amdkfd: Accounting pdd vram_usage for svm
Per process device data pdd->vram_usage is used by rocm-smi to report VRAM usage, this is currently missing the svm_bo usage accounting, so "rocm-smi --showpids" per process report is incorrect. Add pdd->vram_usage accounting for svm_bo and change type to atomic64_t because it is updated outside process mutex now. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 6 +++--- drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 22 ++ 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index a1f191a5984b..065d87841459 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1148,7 +1148,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, if (flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) size >>= 1; - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + PAGE_ALIGN(size)); + atomic64_add(PAGE_ALIGN(size), &pdd->vram_usage); } mutex_unlock(&p->mutex); @@ -1219,7 +1219,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, kfd_process_device_remove_obj_handle( pdd, GET_IDR_HANDLE(args->handle)); - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size); + atomic64_sub(size, &pdd->vram_usage); err_unlock: err_pdd: @@ -2347,7 +2347,7 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { bo_bucket->restored_offset = offset; /* Update the VRAM usage count */ - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size); + atomic64_add(bo_bucket->size, &pdd->vram_usage); } return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 6a5bf88cc232..9e5ca0b93b2a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -775,7 +775,7 @@ struct kfd_process_device { enum kfd_pdd_bound bound; /* VRAM usage */ - uint64_t vram_usage; + atomic64_t vram_usage; struct attribute attr_vram; char vram_filename[MAX_SYSFS_FILENAME_LEN]; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 7909dfd158be..4810521736a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -332,7 +332,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, } else if (strncmp(attr->name, "vram_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_vram); - return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage)); + return snprintf(buffer, PAGE_SIZE, "%llu\n", atomic64_read(&pdd->vram_usage)); } else if (strncmp(attr->name, "sdma_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_sdma); @@ -1625,7 +1625,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, pdd->bound = PDD_UNBOUND; pdd->already_dequeued = false; pdd->runtime_inuse = false; - pdd->vram_usage = 0; + atomic64_set(&pdd->vram_usage, 0); pdd->sdma_past_activity_counter = 0; pdd->user_gpu_id = dev->id; atomic64_set(&pdd->evict_duration_counter, 0); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 857ec6f23bba..61891ea6b1ac 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -379,6 +379,7 @@ static bool 
svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo) static void svm_range_bo_release(struct kref *kref) { struct svm_range_bo *svm_bo; + struct mm_struct *mm = NULL; svm_bo = container_of(kref, struct svm_range_bo, kref); pr_debug("svm_bo 0x%p\n", svm_bo); @@ -405,6 +406,22 @@ static void svm_range_bo_release(struct kref *kref) spin_lock(&svm_bo->list_lock); } spin_unlock(&svm_bo->list_lock); + + if (mmget_not_zero(svm_bo->eviction_fence->mm)) { + struct kfd_process_device *pdd; + struct kfd_process *p; + + mm
[PATCH] drm/amdkfd: Copy wave state only for compute queue
get_wave_state is not defined for SDMA queues, so copy_context_work_handler will crash if it calls it for an SDMA queue. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 648f40091aa3..b2b16a812e73 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -3173,7 +3173,7 @@ struct copy_context_work_handler_workarea { struct kfd_process *p; }; -static void copy_context_work_handler (struct work_struct *work) +static void copy_context_work_handler(struct work_struct *work) { struct copy_context_work_handler_workarea *workarea; struct mqd_manager *mqd_mgr; @@ -3200,6 +3200,9 @@ static void copy_context_work_handler (struct work_struct *work) struct qcm_process_device *qpd = &pdd->qpd; list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE) + continue; + mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_CP]; /* We ignore the return value from get_wave_state -- 2.43.2
Re: [PATCH] drm/amdkfd: fix vm-pasid lookup for multiple partitions
On 2024-09-11 02:54, Christian König wrote: Yeah, I completely agree with Xiaogang. The PASID is an identifier of an address space. And the idea of the KFD was that we can just use the same address space and with it the page tables for multiple execution devices, e.g. CPUs, GPUs etc... That idea turned out to be a bad one because it clashes with some use cases (e.g. native context virtualization). The better approach is to see the CPU and GPU processes as separate things which just share the same underlying data. Opening the KFD node multiple times currently results in the same KFD process being used. We should probably consider changing that. It is one KFD process binding to one app process, with count to support multiple open/close of for the same process. The IOMMU most likely uses Linux process pid, not from kfd process->pasid. The KFD process->pasid is passed to F/W to map queues, flush TLB. The reason to replace vm->pasid with KFD process->pasid is to find vm from fault pasid, then for compute vm, find the kfd process from pasid. I can see a bug in amdgpu_vm_handle_fault, only for compute vm, to force update PTE no-retry-fault to the incorrect VM for multiple partitions. This patch will fix this bug but we can have a simple fix. Regards, Philip Regards, Christian. Am 11.09.24 um 01:59 schrieb Chen, Xiaogang: You want have 1:1 mapping between vm and pasid so can query vm from pasid. I think there is a basic existing issue that we cannot have vm and pasid 1:1 correspondence. PASIDs are global address space identifiers that can be shared between the GPU, an IOMMU and the driver. One app should have one pasid that iommu uses to decide which page table to use when device access system resource. But one app can open render/kfd node multiple times even for one gpu. That said one app could have multiple GPU vms . I think we did not have this issue because app usually open a rent node or kfd node only once. With one adev has multiple partitions there are multiple vms on one adev, so have this issue. But the root cause is not from multiple partitions and solution is not to introduce multiple pasids. I think we should have one pasid for one app and use different way to get vm from pasid. Regards Xiaogang On 9/10/2024 3:47 PM, Kim, Jonathan wrote: [Public] Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. [Public] KMS open still set per pasid-vm bindings per adev (socket) so I don’t see how the per-partition overwrite PASID issue is primarily a KFD concern. Are you saying the KFD process devices holds a shadow copy of the correct VM during page restore during fault? Doesn’t it acquire the wrong VM object on process init in the first place? Even if it were the case the KFD had a separate VM reference, the underlying IRQ fault handling is still broken. We probably don’t want to bandage over something to fix one symptom. Jon From: Yang, Philip Sent: Tuesday, September 10, 2024 11:24 AM To: Koenig, Christian ; Kim, Jonathan ; amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix ; Deucher, Alexander ; Joshi, Mukul Subject: Re: [PATCH] drm/amdkfd: fix
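A hedged sketch of the lookup path Philip points to in svm_range_restore_pages: resolve the process from the faulting PASID and then pick the per-partition device data by the fault's node_id, instead of relying on the adev-wide pasid-to-vm table that gets overwritten per partition. The node-resolution helper below is purely illustrative.

/* Sketch: per-partition VM lookup for a retry fault. */
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

if (p) {
	struct kfd_node *node = kfd_node_from_fault(adev, node_id);	/* illustrative helper */
	struct kfd_process_device *pdd =
		node ? kfd_get_process_device_data(node, p) : NULL;
	struct amdgpu_vm *vm = pdd ? drm_priv_to_vm(pdd->drm_priv) : NULL;

	if (vm) {
		/* operate on the VM that actually belongs to this partition */
	}
	kfd_unref_process(p);
}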
Re: [PATCH] drm/amdkfd: fix vm-pasid lookup for multiple partitions
On 2024-09-09 14:46, Christian König wrote: Am 09.09.24 um 18:02 schrieb Kim, Jonathan: [Public] -Original Message- From: Christian König Sent: Thursday, September 5, 2024 10:24 AM To: Kim, Jonathan ; amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix ; Deucher, Alexander ; Joshi, Mukul Subject: Re: [PATCH] drm/amdkfd: fix vm-pasid lookup for multiple partitions Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. Am 19.08.24 um 19:59 schrieb Jonathan Kim: Currently multiple partitions will incorrectly overwrite the VM lookup table since the table is indexed by PASID and multiple partitions can register different VM objects on the same PASID. That's a rather bad idea. Why do we have the same PASID for different VM objects in the first place? Alex can probably elaborate on the KGD side, but from what I can see, the KMS driver open call has always assigned a new VM object per PASID on an open call. The KFD acquires and replaces the KGD PASID-VMID registration on its own compute process open/creation call. If this is the bad_idea you're referring to, then someone else will have to chime in. I don't have much history on this unfortunately. Yeah, Felix and I designed that. app opens drm node to create vm for each partition, with different vm->pasid for each vm, issue is from kfd_ioctl_acquire_vm -> kfd_process_device_init_vm -> amdgpu_amdkfd_gpuvm_set_vm_pasid, to replace all vm->pasid with kfd process->pasid, which is from open kfd node. This ends up to store only one vm to adev->vm_manager.pasids with KFD process pasid, so we cannot retrieve correct vm from adev->vm_manager.pasids on mGPUs or multiple partitions. That aside, the current problem is, is that all KFD device structures are logical partitions and register their PASID-VM binding using this concept of a device. As far as I can see that is the fundamental problem. This needs to be fixed instead. On the KGD side however, the registration table is maintained in the adev struct, which is a physical socket. So there's a mismatch in understanding of what a device is between the KFD & KGD with regard to the look up table that results in bad bindings. Adding a per-partition dimension to the existing lookup table resolves issues where seeing, for example, with memory violation interception and XNACK i.e bad bindings result in wrong vm object found to set no-retry flags on memory violations. svm_range_restore_pages retry fault recover uses fault pasid to get kfd process, and use the fault node_id to get pdd->vm, maybe you can use this way to fix the debugger issue. Regards, Philip Yeah that is pretty much a no-go. The PASID and how it is used is defined by the PCIe specifications. If we now start to assign multiple VMs to the same PASID then we are violating the PCIe specification. The problems you see are most likely just the tip of the iceberg here. Regards, Christian. Jon Regards, Christian. This results in loading the wrong VM object on PASID query. To correct this, setup the lookup table to be per-partition-per-PASID instead. Signed-off-by: Jonathan Kim --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 12 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 7 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 55 +++-- -- driver
Re: [PATCH V5] drm/amdgpu: Surface svm_default_granularity, a RW module parameter
On 2024-09-03 19:24, Ramesh Errabolu wrote: Enables users to update SVM's default granularity, used in buffer migration and handling of recoverable page faults. Param value is set in terms of log(numPages(buffer)), e.g. 9 for a 2 MIB buffer Signed-off-by: Ramesh Errabolu With 2 below nitpicks fixed, this patch is Reviewed-by: Philip Yang change subject to "drm/amdkfd: Add svm_default_granularity module parameter" --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 17 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 ++ drivers/gpu/drm/amd/amdkfd/kfd_svm.c| 23 +++ 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e8c284aea1f2..8eb934af02f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -237,6 +237,7 @@ extern int sched_policy; extern bool debug_evictions; extern bool no_system_mem_limit; extern int halt_if_hws_hang; +extern uint amdgpu_svm_default_granularity; #else static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS; static const bool __maybe_unused debug_evictions; /* = false */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b9529948f2b2..442039436cb3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -169,6 +169,16 @@ uint amdgpu_sdma_phase_quantum = 32; char *amdgpu_disable_cu; char *amdgpu_virtual_display; bool enforce_isolation; + +/* Specifies the default granularity for SVM, used in buffer + * migration and restoration of backing memory when handling + * recoverable page faults. + * + * The value is given as log(numPages(buffer)); for a 2 MiB + * buffer it computes to be 9 + */ +uint amdgpu_svm_default_granularity = 9; + /* * OverDrive(bit 14) disabled by default * GFX DCS(bit 19) disabled by default @@ -320,6 +330,13 @@ module_param_named(pcie_gen2, amdgpu_pcie_gen2, int, 0444); MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)"); module_param_named(msi, amdgpu_msi, int, 0444); +/** + * DOC: svm_default_granularity (uint) + * Used in buffer migration and handling of recoverable page faults + */ +MODULE_PARM_DESC(svm_default_granularity, "SVM's default granularity in log(2^Pages), default 9 = 2^9 = 2 MiB"); +module_param_named(svm_default_granularity, amdgpu_svm_default_granularity, uint, 0644); + /** * DOC: lockup_timeout (string) * Set GPU scheduler timeout value in ms. 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 9ae9abc6eb43..d6530febabad 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -868,6 +868,12 @@ struct svm_range_list { struct task_struct *faulting_task; /* check point ts decides if page fault recovery need be dropped */ uint64_t checkpoint_ts[MAX_GPU_INSTANCE]; + + /* Default granularity to use in buffer migration + * and restoration of backing memory while handling + * recoverable page faults + */ + uint8_t default_granularity; }; /* Process data */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index b44dec90969f..2bc2389cc7f0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -309,12 +309,13 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) } static void -svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, - uint8_t *granularity, uint32_t *flags) +svm_range_set_default_attributes(struct svm_range_list *svms, int32_t *location, + int32_t *prefetch_loc, uint8_t *granularity, + uint32_t *flags) { *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; - *granularity = 9; + *granularity = svms->default_granularity; *flags = KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; } @@ -358,9 +359,10 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, bitmap_copy(prange->bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); - svm_range_set_default_attributes(&prange->preferred_loc, + svm_range_set_default_attributes(svms, &prange->preferred_loc, &prange->prefetch_loc, - &prange->granularity, &prange->flags); + &prange->granularity, + &prange->flags); unnecessary extra change. pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last); @@ -2694,9 +2696,10 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, *is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma); start_limi
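For reference, a hedged sketch of how the new parameter would typically be pulled into a process's SVM list at init time; this hunk is not visible in the diff above and the clamp bounds are illustrative, not taken from the patch.

/* Sketch: seed the per-process default granularity from the module
 * parameter when the SVM range list is initialised, clamping so a bad
 * value cannot yield a zero-page or absurdly large migration unit.
 */
svms->default_granularity =
	clamp_t(uint8_t, amdgpu_svm_default_granularity, 1, 18);

In use, the value can be set at module load time (e.g. amdgpu.svm_default_granularity=10 on the kernel command line) or, since the parameter is registered with mode 0644, written at runtime to /sys/module/amdgpu/parameters/svm_default_granularity.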
Re: [PATCH v2] drm/amdgpu: fix a call trace when unload amdgpu driver
On 2024-09-04 04:04, Asher Song wrote: In some APUs, the BO type of the GART page table is ttm_bo_type_sg. BOs of that type are released by bo->delayed_delete, which is queued on ttm_device->wq rather than released immediately. To make sure every ttm_resource is released before the ttm_resource_manager is finalized, drain the workqueue in ttm_device. v2: move drain_workqueue to amdgpu_ttm.c Fixes: d99fbd9aab62 ("drm/ttm: Always take the bo delayed cleanup path for imported bos") Suggested-by: Christian König Signed-off-by: Asher Song Acked-by: Philip Yang Most likely this will also fix another bug caused by a race condition between GPU mode 1 reset and the delayed BO cleanup worker. Thank you. Philip --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 5c938ff0bf48..cbac21df5c47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -2461,6 +2461,7 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev) drm_dev_exit(idx); } + drain_workqueue(adev->mman.bdev.wq); amdgpu_direct_gma_fini(adev); amdgpu_vram_mgr_fini(adev); amdgpu_gtt_mgr_fini(adev);
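A hedged restatement of the teardown ordering the one-line fix above establishes; the calls mirror the patch, the comments explain why drain (rather than a plain flush) is the right primitive.

/* Delayed BO destruction (bo->delayed_delete) runs from work items on
 * the ttm_device workqueue.  drain_workqueue() keeps flushing until the
 * queue is genuinely empty, including work queued while draining, so
 * every ttm_resource is gone before the resource managers are torn down.
 */
drain_workqueue(adev->mman.bdev.wq);
amdgpu_vram_mgr_fini(adev);	/* safe: no delayed free can still hold a VRAM resource */
amdgpu_gtt_mgr_fini(adev);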
Re: [PATCH] drm/amdgpu: fix invalid fence handling in amdgpu_vm_tlb_flush
On 2024-09-02 05:06, Christian König wrote: On 2024-09-02 05:03, Lang Yu wrote: Fixes: 5a1c27951966 ("drm/amdgpu: implement TLB flush fence") Signed-off-by: Lang Yu Ah yes, that explains why CPU-based updates don't work reliably any more. My understanding is that amdgpu_vm_cpu_commit increases the vm->tlb_seq if needs_flush, so this patch only fixes the tlb_cb memory leak. Regards, Philip You need to add some explanation to the commit message, e.g. something like "CPU-based updates don't produce a fence." With that done, Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 1d46a5c81ec4..f93804902fd3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -908,10 +908,12 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params, { struct amdgpu_vm *vm = params->vm; - if (!fence || !*fence) + tlb_cb->vm = vm; + if (!fence || !*fence) { + amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb); return; + } - tlb_cb->vm = vm; if (!dma_fence_add_callback(*fence, &tlb_cb->cb, amdgpu_vm_tlb_seq_cb)) { dma_fence_put(vm->last_tlb_flush);
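The fix follows the standard dma_fence callback fallback pattern, sketched below: when there is no fence at all (CPU page-table updates don't produce one), or dma_fence_add_callback() reports the fence has already signalled, the callback is run directly so tlb_cb is never leaked. The success-branch bookkeeping is a hedged reading of the existing driver code, not a quote of it.

tlb_cb->vm = vm;

if (!fence || !*fence) {
	/* CPU-based update: no fence exists, run the callback now. */
	amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
	return;
}

if (dma_fence_add_callback(*fence, &tlb_cb->cb, amdgpu_vm_tlb_seq_cb)) {
	/* Fence already signalled (-ENOENT): callback was not installed. */
	amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
} else {
	/* Callback installed; remember this fence as the last TLB flush. */
	dma_fence_put(vm->last_tlb_flush);
	vm->last_tlb_flush = dma_fence_get(*fence);
}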
Re: [PATCH V3] drm/amdgpu: Surface svm_default_granularity, a RW module parameter
On 2024-08-29 18:31, Chen, Xiaogang wrote: On 8/29/2024 5:13 PM, Ramesh Errabolu wrote: Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. Enables users to update SVM's default granularity, used in buffer migration and handling of recoverable page faults. Param value is set in terms of log(numPages(buffer)), e.g. 9 for a 2 MIB buffer Forgot asking if this parameter is request from customer or used for debug/experiment purpose? If it is later, how about put it at debug fs? There are already many driver parameters. debugfs is not always available, depending on kernel configuration, and debugfs seems for debugging purpose, ex. /sys/kernel/debug/kfd/mqds, hqds, not for functional purpose. one comment embedded below. Regards Xiaogang Signed-off-by: Ramesh Errabolu --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 17 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 ++ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 25 +++-- 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e8c284aea1f2..8eb934af02f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -237,6 +237,7 @@ extern int sched_policy; extern bool debug_evictions; extern bool no_system_mem_limit; extern int halt_if_hws_hang; +extern uint amdgpu_svm_default_granularity; #else static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS; static const bool __maybe_unused debug_evictions; /* = false */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b9529948f2b2..442039436cb3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -169,6 +169,16 @@ uint amdgpu_sdma_phase_quantum = 32; char *amdgpu_disable_cu; char *amdgpu_virtual_display; bool enforce_isolation; + +/* Specifies the default granularity for SVM, used in buffer + * migration and restoration of backing memory when handling + * recoverable page faults. + * + * The value is given as log(numPages(buffer)); for a 2 MiB + * buffer it computes to be 9 + */ +uint amdgpu_svm_default_granularity = 9; + /* * OverDrive(bit 14) disabled by default * GFX DCS(bit 19) disabled by default @@ -320,6 +330,13 @@ module_param_named(pcie_gen2, amdgpu_pcie_gen2, int, 0444); MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)"); module_param_named(msi, amdgpu_msi, int, 0444); +/** + * DOC: svm_default_granularity (uint) + * Used in buffer migration and handling of recoverable page faults + */ +MODULE_PARM_DESC(svm_default_granularity, "SVM's default granularity in log(2^Pages), default 9 = 2^9 = 2 MiB"); +module_param_named(svm_default_granularity, amdgpu_svm_default_granularity, uint, 0644); + /** * DOC: lockup_timeout (string) * Set GPU scheduler timeout value in ms. diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 9ae9abc6eb43..d6530febabad 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -868,6 +868,12 @@ struct svm_range_list { struct task_struct *faulting_task; /* check point ts decides if page fault recovery need be dropped */ uint64_t checkpoint_ts[MAX_GPU_INSTANCE];
Re: [PATCH] drm/amdkfd: restore_process_worker race with GPU reset
On 2024-08-29 17:15, Felix Kuehling wrote: On 2024-08-23 15:49, Philip Yang wrote: If GPU reset kick in while KFD restore_process_worker running, this may causes different issues, for example below rcu stall warning, because restore work may move BOs and evict queues under VRAM pressure. Fix this race by taking adev reset_domain read semaphore to prevent GPU reset in restore_process_worker, the reset read semaphore can be taken recursively if adev have multiple partitions. Then there is live locking issue if CP hangs while restore_process_worker runs, then GPU reset wait for semaphore to start and restore_process_worker cannot finish to release semaphore. We need signal eviction fence to solve the live locking if evict queue return -ETIMEOUT (for MES path) or -ETIME (for HWS path) because CP hangs, amdgpu :af:00.0: amdgpu: GPU reset(21) succeeded! rcu: INFO: rcu_sched self-detected stall on CPU Workqueue: kfd_restore_wq restore_process_worker [amdgpu] Call Trace: update_process_times+0x94/0xd0 RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu] amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu] restore_process_helper+0x27/0x80 [amdgpu] Signed-off-by: Philip Yang See comments inline. I'd also like Christian to take a look at this patch since he's the expert on the reset locking stuff. --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index a902950cc060..53a814347522 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -35,6 +35,7 @@ #include #include "amdgpu_amdkfd.h" #include "amdgpu.h" +#include "amdgpu_reset.h" struct mm_struct; @@ -1972,8 +1973,14 @@ static void evict_process_worker(struct work_struct *work) kfd_process_restore_queues(p); pr_debug("Finished evicting pasid 0x%x\n", p->pasid); - } else + } else if (ret == -ETIMEDOUT || ret == -ETIME) { + /* If CP hangs, signal the eviction fence, then restore_bo_worker + * can finish to up_read GPU reset semaphore to start GPU reset. + */ + signal_eviction_fence(p); + } else { pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid); + } } static int restore_process_helper(struct kfd_process *p) @@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process *p) return ret; } +/* + * kfd_hold_devices_reset_semaphore + * + * return: + * true : hold reset domain semaphore to prevent device reset + * false: one of the device is resetting or already reset + * + */ +static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p) I find the function naming of these functions (hold/unhold) a bit weird. I'd suggest kfd_process_trylock_reset_sems/kfd_process_unlock_reset_sems. ok +{ + struct amdgpu_device *adev; + int i; + + for (i = 0; i < p->n_pdds; i++) { + adev = p->pdds[i]->dev->adev; + if (!down_read_trylock(&adev->reset_domain->sem)) + goto out_upread; + } + return true; + +out_upread: + while (i--) { + adev = p->pdds[i]->dev->adev;
Re: [PATCH] drm/amdkfd: restore_process_worker race with GPU reset
On 2024-08-28 18:01, Felix Kuehling wrote: On 2024-08-23 15:49, Philip Yang wrote: If GPU reset kick in while KFD restore_process_worker running, this may causes different issues, for example below rcu stall warning, because restore work may move BOs and evict queues under VRAM pressure. Fix this race by taking adev reset_domain read semaphore to prevent GPU reset in restore_process_worker, the reset read semaphore can be taken recursively if adev have multiple partitions. Are you sure that an rw_sem can be read-locked recursively in the same thread. I can't find any evidence that this is true. yes, down_read_trylock(&adev->reset_domain->sem) still return 1 for successful inside down_read_trylock(&adev->reset_domain->sem). This works fine in many path now, for example execute_queues_cpsch down read lock reset_domain->sem -> unmap_queues_cpsch down read lock reset_domain->sem again. Regards, Philip Regards, Felix Then there is live locking issue if CP hangs while restore_process_worker runs, then GPU reset wait for semaphore to start and restore_process_worker cannot finish to release semaphore. We need signal eviction fence to solve the live locking if evict queue return -ETIMEOUT (for MES path) or -ETIME (for HWS path) because CP hangs, amdgpu :af:00.0: amdgpu: GPU reset(21) succeeded! rcu: INFO: rcu_sched self-detected stall on CPU Workqueue: kfd_restore_wq restore_process_worker [amdgpu] Call Trace: update_process_times+0x94/0xd0 RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu] amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu] restore_process_helper+0x27/0x80 [amdgpu] Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index a902950cc060..53a814347522 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -35,6 +35,7 @@ #include #include "amdgpu_amdkfd.h" #include "amdgpu.h" +#include "amdgpu_reset.h" struct mm_struct; @@ -1972,8 +1973,14 @@ static void evict_process_worker(struct work_struct *work) kfd_process_restore_queues(p); pr_debug("Finished evicting pasid 0x%x\n", p->pasid); - } else + } else if (ret == -ETIMEDOUT || ret == -ETIME) { + /* If CP hangs, signal the eviction fence, then restore_bo_worker + * can finish to up_read GPU reset semaphore to start GPU reset. + */ + signal_eviction_fence(p); + } else { pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid); + } } static int restore_process_helper(struct kfd_process *p) @@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process *p) return ret; } +/* + * kfd_hold_devices_reset_semaphore + * + * return: + * true : hold reset domain semaphore to prevent device reset + * false: one of the device is resetting or already reset + * + */ +static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p) +{ + struct amdgpu_device *adev; + int i; + + for (i = 0; i < p->n_pdds; i++) { + adev = p->pdds[i]->dev->adev; + if (!down_read_trylock(&adev->reset_domain->sem)) + goto out
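A userspace analogue of the recursion question above, using pthread rwlocks in place of the kernel rwsem: a second try-read-lock by the same thread succeeds as long as no writer holds the lock. Kernel rwsems and lockdep have their own rules, so this only illustrates the trylock behaviour being discussed.

/* build: cc -pthread sketch.c */
#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER;

	/* first read lock, e.g. taken by execute_queues_cpsch */
	int r1 = pthread_rwlock_tryrdlock(&sem);
	/* second read lock from the same thread, e.g. unmap_queues_cpsch */
	int r2 = pthread_rwlock_tryrdlock(&sem);
	/* a writer (the "GPU reset") cannot get in while readers hold it */
	int w  = pthread_rwlock_trywrlock(&sem);

	printf("read #1 %s, read #2 %s, write %s\n",
	       r1 == 0 ? "ok" : "failed",
	       r2 == 0 ? "ok" : "failed",
	       w  == 0 ? "ok" : "failed (expected)");

	pthread_rwlock_unlock(&sem);
	pthread_rwlock_unlock(&sem);
	pthread_rwlock_destroy(&sem);
	return 0;
}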
Re: [PATCH v2] drm/amdgpu: Surface svm_attr_gobm, a RW module parameter
On 2024-08-26 15:34, Ramesh Errabolu wrote: Enables users to update the default size of buffer used in migration either from Sysmem to VRAM or vice versa. The param GOBM refers to granularity of buffer migration, and is specified in terms of log(numPages(buffer)). It facilitates users of unregistered memory to control GOBM, it is used for both registered and unregistered range cases. albeit at a coarse level Signed-off-by: Ramesh Errabolu --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 18 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 12 drivers/gpu/drm/amd/amdkfd/kfd_svm.c| 26 - 4 files changed, 51 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e8c284aea1f2..73dd816b01f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -237,6 +237,7 @@ extern int sched_policy; extern bool debug_evictions; extern bool no_system_mem_limit; extern int halt_if_hws_hang; +extern uint amdgpu_svm_attr_gobm; #else static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS; static const bool __maybe_unused debug_evictions; /* = false */ @@ -313,6 +314,9 @@ extern int amdgpu_wbrf; /* Extra time delay(in ms) to eliminate the influence of temperature momentary fluctuation */ #define AMDGPU_SWCTF_EXTRA_DELAY 50 +/* Default size of buffer to use in migrating buffer */ +#define AMDGPU_SVM_ATTR_GOBM 9 + struct amdgpu_xcp_mgr; struct amdgpu_device; struct amdgpu_irq_src; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b9529948f2b2..09c501753a3b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -169,6 +169,17 @@ uint amdgpu_sdma_phase_quantum = 32; char *amdgpu_disable_cu; char *amdgpu_virtual_display; bool enforce_isolation; + +/* Specifies the default size of buffer to use in + * migrating buffer from Sysmem to VRAM and vice + * versa + * + * GOBM - Granularity of Buffer Migration + * + * Defined as log2(sizeof(buffer)/PAGE_SIZE) + */ +uint amdgpu_svm_attr_gobm = AMDGPU_SVM_ATTR_GOBM; + /* * OverDrive(bit 14) disabled by default * GFX DCS(bit 19) disabled by default @@ -320,6 +331,13 @@ module_param_named(pcie_gen2, amdgpu_pcie_gen2, int, 0444); MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)"); module_param_named(msi, amdgpu_msi, int, 0444); +/** + * DOC: svm_attr_gobm (uint) + * Size of buffer to use in migrating buffer from Sysmem to VRAM and vice versa + */ +MODULE_PARM_DESC(svm_attr_gobm, "Defined as log2(sizeof(buffer)/PAGE_SIZE), e.g. 9 for 2 MiB"); +module_param_named(svm_attr_gobm, amdgpu_svm_attr_gobm, uint, 0644); + /** * DOC: lockup_timeout (string) * Set GPU scheduler timeout value in ms. diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 9ae9abc6eb43..c2e54b18c167 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -868,6 +868,18 @@ struct svm_range_list { struct task_struct *faulting_task; /* check point ts decides if page fault recovery need be dropped */ uint64_t checkpoint_ts[MAX_GPU_INSTANCE]; + + /* Indicates the default size to use in migrating + * buffers of a process from Sysmem to VRAM and vice + * versa. 
The max legal value cannot be greater than + * 0x3F + * + * @note: A side effect of this symbol being part of + * struct svm_range_list is that it forces all buffers + * of the process of unregistered kind to use the same + * size in buffer migration + */ The comment is good enough, note maybe misleading, not needed. + uint8_t attr_gobm; }; /* Process data */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index b44dec90969f..78c78baddb1f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -309,12 +309,11 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) } static void -svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, - uint8_t *granularity, uint32_t *flags) +svm_range_set_default_attributes(int32_t *location, + int32_t *prefetch_loc, uint32_t *flags) { *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; - *granularity = 9; as svm_range_set_default_attributes is called in multiple places, add new parameter struct svm_range_list *svms, to remove the duplicate code. *granularity = svms->attr_gobm; *flags = KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; } @@ -358,9 +357,9 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, bitmap_copy(prange->bitmap_a
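As a rough illustration of what the GOBM/granularity value controls, the sketch below computes the 2^granularity-page aligned window around a faulting page. The clamping against the actual registered range done in kfd_svm is left out, and the numbers are made up.

#include <stdio.h>

int main(void)
{
	unsigned long fault_pfn = 0x12345;  /* faulting page number (example) */
	unsigned int granularity = 9;       /* 2^9 pages = 2 MiB window */
	unsigned long npages = 1UL << granularity;

	unsigned long start = fault_pfn & ~(npages - 1);
	unsigned long last = start + npages - 1;

	printf("fault 0x%lx -> migrate pages [0x%lx, 0x%lx] (%lu pages)\n",
	       fault_pfn, start, last, npages);
	return 0;
}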
[PATCH v3 0/4] Improve SVM migrate event report
1. Document how to use the SMI system management interface to receive SVM
   events, and define string format macros for user mode.
2. Increase the event kfifo size, so there is less chance to drop events.
3. Output a migrate end event with an error code if migration failed.
4. Report the dropped event count if the fifo is full.

v3: Simplify event drop count handling (James Zhu)

Philip Yang (4):
  drm/amdkfd: Document and define SVM events message macro
  drm/amdkfd: Output migrate end event if migrate failed
  drm/amdkfd: Increase SMI event fifo size
  drm/amdkfd: SMI report dropped event count

 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c    |  14 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  79 +--
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |   3 +-
 include/uapi/linux/kfd_ioctl.h              | 107 +---
 4 files changed, 150 insertions(+), 53 deletions(-)

--
2.43.2
[PATCH v3 4/4] drm/amdkfd: SMI report dropped event count
Add new SMI event to report the dropped event count. When the event kfifo is full, drop count is not zero, or no enough space left to store the event message, increase drop count. After reading event out from kfifo, if event was dropped, drop_count is not zero, generate a dropped event record and reset drop count to zero. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 25 + include/uapi/linux/kfd_ioctl.h | 6 + 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 9b8169761ec5..db321b8ea127 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -42,6 +42,7 @@ struct kfd_smi_client { struct rcu_head rcu; pid_t pid; bool suser; + u32 drop_count; }; #define KFD_MAX_KFIFO_SIZE 8192 @@ -103,12 +104,26 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, } to_copy = min(size, to_copy); ret = kfifo_out(&client->fifo, buf, to_copy); - spin_unlock(&client->lock); if (ret <= 0) { + spin_unlock(&client->lock); ret = -EAGAIN; goto ret_err; } + if (client->drop_count) { + char msg[KFD_SMI_EVENT_MSG_SIZE]; + int len; + + len = snprintf(msg, sizeof(msg), "%x ", KFD_SMI_EVENT_DROPPED_EVENT); + len += snprintf(msg + len, sizeof(msg) - len, + KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(), + client->pid, client->drop_count)); + kfifo_in(&client->fifo, msg, len); + client->drop_count = 0; + } + + spin_unlock(&client->lock); + ret = copy_to_user(user, buf, to_copy); if (ret) { ret = -EFAULT; @@ -182,13 +197,15 @@ static void add_event_to_kfifo(pid_t pid, struct kfd_node *dev, list_for_each_entry_rcu(client, &dev->smi_clients, list) { if (!kfd_smi_ev_enabled(pid, client, smi_event)) continue; + spin_lock(&client->lock); - if (kfifo_avail(&client->fifo) >= len) { + if (!client->drop_count && kfifo_avail(&client->fifo) >= len) { kfifo_in(&client->fifo, event_msg, len); wake_up_all(&client->wait_queue); } else { - pr_debug("smi_event(EventID: %u): no space left\n", - smi_event); + client->drop_count++; + pr_debug("smi_event(EventID: %u): no space left drop_count %d\n", +smi_event, client->drop_count); } spin_unlock(&client->lock); } diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index fa9f9846b88e..7afd66d45313 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -530,6 +530,7 @@ enum kfd_smi_event { KFD_SMI_EVENT_QUEUE_EVICTION = 9, KFD_SMI_EVENT_QUEUE_RESTORE = 10, KFD_SMI_EVENT_UNMAP_FROM_GPU = 11, + KFD_SMI_EVENT_DROPPED_EVENT = 12, /* * max event number, as a flag bit to get events from all processes, @@ -610,6 +611,7 @@ struct kfd_ioctl_smi_events_args { *rw: 'W' for write page fault, 'R' for read page fault *rescheduled: 'R' if the queue restore failed and rescheduled to try again *error_code: migrate failure error code, 0 if no error + *drop_count: how many events dropped when fifo is full */ #define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\ "%x %s\n", (reset_seq_num), (reset_cause) @@ -645,6 +647,10 @@ struct kfd_ioctl_smi_events_args { "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\ (node), (unmap_trigger) +#define KFD_EVENT_FMT_DROPPED_EVENT(ns, pid, drop_count)\ + "%lld -%d %d\n", (ns), (pid), (drop_count) + + /** * CRIU IOCTLs (Checkpoint Restore In Userspace) * -- 2.43.2
[PATCH] drm/amdkfd: SMI report dropped event count
Add new SMI event to report the dropped event count. When the event kfifo is full, drop count is not zero, or no enough space left to store the event message, increase drop count. After reading event out from kfifo, if event was dropped, drop_count is not zero, generate a dropped event record and reset drop count to zero. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 25 + include/uapi/linux/kfd_ioctl.h | 6 + 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 9b8169761ec5..db321b8ea127 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -42,6 +42,7 @@ struct kfd_smi_client { struct rcu_head rcu; pid_t pid; bool suser; + u32 drop_count; }; #define KFD_MAX_KFIFO_SIZE 8192 @@ -103,12 +104,26 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, } to_copy = min(size, to_copy); ret = kfifo_out(&client->fifo, buf, to_copy); - spin_unlock(&client->lock); if (ret <= 0) { + spin_unlock(&client->lock); ret = -EAGAIN; goto ret_err; } + if (client->drop_count) { + char msg[KFD_SMI_EVENT_MSG_SIZE]; + int len; + + len = snprintf(msg, sizeof(msg), "%x ", KFD_SMI_EVENT_DROPPED_EVENT); + len += snprintf(msg + len, sizeof(msg) - len, + KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(), + client->pid, client->drop_count)); + kfifo_in(&client->fifo, msg, len); + client->drop_count = 0; + } + + spin_unlock(&client->lock); + ret = copy_to_user(user, buf, to_copy); if (ret) { ret = -EFAULT; @@ -182,13 +197,15 @@ static void add_event_to_kfifo(pid_t pid, struct kfd_node *dev, list_for_each_entry_rcu(client, &dev->smi_clients, list) { if (!kfd_smi_ev_enabled(pid, client, smi_event)) continue; + spin_lock(&client->lock); - if (kfifo_avail(&client->fifo) >= len) { + if (!client->drop_count && kfifo_avail(&client->fifo) >= len) { kfifo_in(&client->fifo, event_msg, len); wake_up_all(&client->wait_queue); } else { - pr_debug("smi_event(EventID: %u): no space left\n", - smi_event); + client->drop_count++; + pr_debug("smi_event(EventID: %u): no space left drop_count %d\n", +smi_event, client->drop_count); } spin_unlock(&client->lock); } diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index fa9f9846b88e..7afd66d45313 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -530,6 +530,7 @@ enum kfd_smi_event { KFD_SMI_EVENT_QUEUE_EVICTION = 9, KFD_SMI_EVENT_QUEUE_RESTORE = 10, KFD_SMI_EVENT_UNMAP_FROM_GPU = 11, + KFD_SMI_EVENT_DROPPED_EVENT = 12, /* * max event number, as a flag bit to get events from all processes, @@ -610,6 +611,7 @@ struct kfd_ioctl_smi_events_args { *rw: 'W' for write page fault, 'R' for read page fault *rescheduled: 'R' if the queue restore failed and rescheduled to try again *error_code: migrate failure error code, 0 if no error + *drop_count: how many events dropped when fifo is full */ #define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\ "%x %s\n", (reset_seq_num), (reset_cause) @@ -645,6 +647,10 @@ struct kfd_ioctl_smi_events_args { "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\ (node), (unmap_trigger) +#define KFD_EVENT_FMT_DROPPED_EVENT(ns, pid, drop_count)\ + "%lld -%d %d\n", (ns), (pid), (drop_count) + + /** * CRIU IOCTLs (Checkpoint Restore In Userspace) * -- 2.43.2
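On the consumer side, a user-mode reader would recognize the new record by its hex event id and parse the payload with the KFD_EVENT_FMT_DROPPED_EVENT format ("%lld -%d %d\n"). A minimal sketch follows, with a hand-made sample line and the SMI event fd setup omitted.

#include <stdio.h>

int main(void)
{
	const char *line = "c 123456789 -4242 17\n"; /* 0xc = event id 12 */
	unsigned int event_id;
	long long timestamp_ns;
	int pid, drop_count;

	if (sscanf(line, "%x %lld -%d %d", &event_id, &timestamp_ns,
		   &pid, &drop_count) == 4)
		printf("event %u: pid %d dropped %d events at %lld ns\n",
		       event_id, pid, drop_count, timestamp_ns);
	return 0;
}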
[PATCH v3 1/4] drm/amdkfd: Document and define SVM events message macro
Document how to use SMI system management interface to enable and receive SVM events. Document SVM event triggers. Define SVM events message string format macro that could be used by user mode for sscanf to parse the event. Add it to uAPI header file to make it obvious that is changing uAPI in future. No functional changes. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 45 + include/uapi/linux/kfd_ioctl.h | 100 +--- 2 files changed, 109 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index ea6a8e43bd5b..de8b9abf7afc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -235,17 +235,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, amdgpu_reset_get_desc(reset_context, reset_cause, sizeof(reset_cause)); - kfd_smi_event_add(0, dev, event, "%x %s\n", - dev->reset_seq_num, - reset_cause); + kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET( + dev->reset_seq_num, reset_cause)); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask) { - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n", + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING( throttle_bitmask, - amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); + amdgpu_dpm_get_thermal_throttling_counter(dev->adev))); } void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) @@ -256,8 +255,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) if (task_info) { /* Report VM faults from user applications, not retry from kernel */ if (task_info->pid) - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n", -task_info->pid, task_info->task_name); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT( + task_info->pid, task_info->task_name)); amdgpu_vm_put_task_info(task_info); } } @@ -267,16 +266,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, ktime_t ts) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START, - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid, - address, node->id, write_fault ? 'W' : 'R'); + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid, + address, node->id, write_fault ? 'W' : 'R')); } void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, unsigned long address, bool migration) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END, - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(), - pid, address, node->id, migration ? 'M' : 'U'); + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(), + pid, address, node->id, migration ? 
'M' : 'U')); } void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, @@ -286,9 +285,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START, - "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", + KFD_EVENT_FMT_MIGRATE_START( ktime_get_boottime_ns(), pid, start, end - start, - from, to, prefetch_loc, preferred_loc, trigger); + from, to, prefetch_loc, preferred_loc, trigger)); } void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, @@ -296,24 +295,24 @@ void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, uint32_t from, uint32_t to, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, - "%lld -%d @%lx(%lx) %x->%x %d\n", + KFD_EVENT_FMT_MIGRATE_END( ktime_get_boottime_ns(), pid, start, end - start, - from, to, trigger); +
[PATCH v3 3/4] drm/amdkfd: Increase SMI event fifo size
SMI event fifo size 1KB was enough to report GPU vm fault or reset event, but could drop the more frequent SVM migration events. Increase kfifo size to 8KB to store about 100 migrate events, less chance to drop the migrate events if lots of migration happened in the short period of time. Add KFD prefix to the macro name. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 1d94b445a060..9b8169761ec5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -44,7 +44,7 @@ struct kfd_smi_client { bool suser; }; -#define MAX_KFIFO_SIZE 1024 +#define KFD_MAX_KFIFO_SIZE 8192 static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *); static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *); @@ -86,7 +86,7 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, struct kfd_smi_client *client = filep->private_data; unsigned char *buf; - size = min_t(size_t, size, MAX_KFIFO_SIZE); + size = min_t(size_t, size, KFD_MAX_KFIFO_SIZE); buf = kmalloc(size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -355,7 +355,7 @@ int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd) return -ENOMEM; INIT_LIST_HEAD(&client->list); - ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL); + ret = kfifo_alloc(&client->fifo, KFD_MAX_KFIFO_SIZE, GFP_KERNEL); if (ret) { kfree(client); return ret; -- 2.43.2
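A quick back-of-the-envelope check of the "about 100 migrate events" figure, assuming an average formatted event line of roughly 80 bytes (the real length varies per event type):

#include <stdio.h>

int main(void)
{
	const int fifo_size = 8192;    /* KFD_MAX_KFIFO_SIZE after the patch */
	const int avg_event_len = 80;  /* assumed average line length */

	printf("~%d events fit in the fifo\n", fifo_size / avg_event_len);
	return 0; /* ~102 */
}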
[PATCH v3 2/4] drm/amdkfd: Output migrate end event if migrate failed
If page migration failed, also output migrate end event to match with migrate start event, with failure error_code added to the end of the migrate message macro. This will not break uAPI because application uses old message macro sscanf drop and ignore the error_code. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c| 14 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 5 +++-- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 3 ++- include/uapi/linux/kfd_ioctl.h | 7 --- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 8ee3d07ffbdf..eacfeb32f35d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -445,14 +445,13 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange, pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n", mpages, cpages, migrate.npages); - kfd_smi_event_migration_end(node, p->lead_thread->pid, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - 0, node->id, trigger); - svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); out_free: kvfree(buf); + kfd_smi_event_migration_end(node, p->lead_thread->pid, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + 0, node->id, trigger, r); out: if (!r && mpages) { pdd = svm_range_get_pdd_by_node(prange, node); @@ -751,14 +750,13 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange, svm_migrate_copy_done(adev, mfence); migrate_vma_finalize(&migrate); - kfd_smi_event_migration_end(node, p->lead_thread->pid, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - node->id, 0, trigger); - svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); out_free: kvfree(buf); + kfd_smi_event_migration_end(node, p->lead_thread->pid, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + node->id, 0, trigger, r); out: if (!r && cpages) { mpages = cpages - upages; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index de8b9abf7afc..1d94b445a060 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -292,12 +292,13 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, unsigned long start, unsigned long end, -uint32_t from, uint32_t to, uint32_t trigger) +uint32_t from, uint32_t to, uint32_t trigger, +int error_code) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, KFD_EVENT_FMT_MIGRATE_END( ktime_get_boottime_ns(), pid, start, end - start, - from, to, trigger)); + from, to, trigger, error_code)); } void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index 85010b8307f8..503bff13d815 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -44,7 +44,8 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger); void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, unsigned long start, unsigned long end, -uint32_t from, uint32_t to, uint32_t trigger); +uint32_t from, uint32_t to, uint32_t trigger, +int error_code); void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, uint32_t trigger); void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 
717307d6b5b7..fa9f9846b88e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -609,6 +609,7 @@ struct kfd_ioctl_smi_events_args { *migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update *rw: 'W' for write page fault, 'R' for read page fault *rescheduled: 'R' if the queue restore failed and rescheduled to
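The "will not break uAPI" argument can be seen directly with sscanf: an old consumer using the previous MIGRATE_END format simply stops before the appended error_code. A small sketch with a hand-made payload line (the field values are invented):

#include <stdio.h>

int main(void)
{
	/* new-style payload: ns pid @start(size) from->to trigger error_code */
	const char *payload = "123456789 -4242 @7f00(200) 1->0 2 -16\n";
	long long ns;
	int pid, trigger;
	unsigned long start, size;
	unsigned int from, to;

	/* old consumer: unaware of the trailing error_code */
	int n = sscanf(payload, "%lld -%d @%lx(%lx) %x->%x %d",
		       &ns, &pid, &start, &size, &from, &to, &trigger);

	printf("parsed %d fields, pid=%d trigger=%d (error_code ignored)\n",
	       n, pid, trigger);
	return 0;
}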
[PATCH] drm/amdkfd: restore_process_worker race with GPU reset
If GPU reset kick in while KFD restore_process_worker running, this may causes different issues, for example below rcu stall warning, because restore work may move BOs and evict queues under VRAM pressure. Fix this race by taking adev reset_domain read semaphore to prevent GPU reset in restore_process_worker, the reset read semaphore can be taken recursively if adev have multiple partitions. Then there is live locking issue if CP hangs while restore_process_worker runs, then GPU reset wait for semaphore to start and restore_process_worker cannot finish to release semaphore. We need signal eviction fence to solve the live locking if evict queue return -ETIMEOUT (for MES path) or -ETIME (for HWS path) because CP hangs, amdgpu :af:00.0: amdgpu: GPU reset(21) succeeded! rcu: INFO: rcu_sched self-detected stall on CPU Workqueue: kfd_restore_wq restore_process_worker [amdgpu] Call Trace: update_process_times+0x94/0xd0 RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu] amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu] restore_process_helper+0x27/0x80 [amdgpu] Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index a902950cc060..53a814347522 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -35,6 +35,7 @@ #include #include "amdgpu_amdkfd.h" #include "amdgpu.h" +#include "amdgpu_reset.h" struct mm_struct; @@ -1972,8 +1973,14 @@ static void evict_process_worker(struct work_struct *work) kfd_process_restore_queues(p); pr_debug("Finished evicting pasid 0x%x\n", p->pasid); - } else + } else if (ret == -ETIMEDOUT || ret == -ETIME) { + /* If CP hangs, signal the eviction fence, then restore_bo_worker +* can finish to up_read GPU reset semaphore to start GPU reset. +*/ + signal_eviction_fence(p); + } else { pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid); + } } static int restore_process_helper(struct kfd_process *p) @@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process *p) return ret; } +/* + * kfd_hold_devices_reset_semaphore + * + * return: + * true : hold reset domain semaphore to prevent device reset + * false: one of the device is resetting or already reset + * + */ +static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p) +{ + struct amdgpu_device *adev; + int i; + + for (i = 0; i < p->n_pdds; i++) { + adev = p->pdds[i]->dev->adev; + if (!down_read_trylock(&adev->reset_domain->sem)) + goto out_upread; + } + return true; + +out_upread: + while (i--) { + adev = p->pdds[i]->dev->adev; + up_read(&adev->reset_domain->sem); + } + return false; +} + +static void kfd_unhold_devices_reset_semaphore(struct kfd_process *p) +{ + struct amdgpu_device *adev; + int i; + + for (i = 0; i < p->n_pdds; i++) { + adev = p->pdds[i]->dev->adev; + up_read(&adev->reset_domain->sem); + } +} + static void restore_process_worker(struct work_struct *work) { struct delayed_work *dwork; @@ -2009,6 +2055,12 @@ static void restore_process_worker(struct work_struct *work) * lifetime of this thread, kfd_process p will be valid */ p = container_of(dwork, struct kfd_process, restore_work); + + if (!kfd_hold_devices_reset_semaphore(p)) { + pr_debug("GPU resetting, restore bo and queue skipped\n"); + return; + } + pr_debug("Started restoring pasid 0x%x\n", p->pasid); /* Setting last_restore_timestamp before successful restoration. 
@@ -2031,6 +2083,8 @@ static void restore_process_worker(struct work_struct *work) msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) kfd_process_restore_queues(p); } + + kfd_unhold_devices_reset_semaphore(p); } void kfd_suspend_all_processes(void) -- 2.43.2
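The all-or-nothing trylock loop in kfd_hold_devices_reset_semaphore() is the interesting part of this patch. Below is a userspace sketch of the same pattern, with pthread rwlocks standing in for the per-device reset_domain->sem; the device count and names are illustrative only.

/* build: cc -pthread sketch.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NUM_DEVS 3

static pthread_rwlock_t reset_sem[NUM_DEVS];

static bool hold_all_reset_sems(void)
{
	int i;

	for (i = 0; i < NUM_DEVS; i++) {
		if (pthread_rwlock_tryrdlock(&reset_sem[i]) != 0)
			goto out_unwind;
	}
	return true;

out_unwind:
	/* unwind the semaphores already taken */
	while (i--)
		pthread_rwlock_unlock(&reset_sem[i]);
	return false;
}

static void unhold_all_reset_sems(void)
{
	for (int i = 0; i < NUM_DEVS; i++)
		pthread_rwlock_unlock(&reset_sem[i]);
}

int main(void)
{
	for (int i = 0; i < NUM_DEVS; i++)
		pthread_rwlock_init(&reset_sem[i], NULL);

	if (hold_all_reset_sems()) {
		printf("restore work may run, reset is blocked\n");
		unhold_all_reset_sems();
	} else {
		printf("a reset is in flight, skip the restore work\n");
	}
	return 0;
}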
Re: [PATCH v2 3/4] drm/amdkfd: Increase SMI event fifo size
On 2024-08-22 10:34, James Zhu wrote: On 2024-07-30 16:15, Philip Yang wrote: SMI event fifo size 1KB was enough to report GPU vm fault or reset [JZ] There is a typo here. it should be NOT enough. It is not typo, but it is not clear, will change to SMI event fifo size 1KB is enough to report GPU vm fault or reset event, but could drop the more frequent SVM migration events, Regards, Philip event, increase it to 8KB to store about 100 migrate events, less chance to drop the migrate events if lots of migration happened in the short period of time. Add KFD prefix to the macro name. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 1d94b445a060..9b8169761ec5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -44,7 +44,7 @@ struct kfd_smi_client { bool suser; }; -#define MAX_KFIFO_SIZE 1024 +#define KFD_MAX_KFIFO_SIZE 8192 static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *); static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *); @@ -86,7 +86,7 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, struct kfd_smi_client *client = filep->private_data; unsigned char *buf; - size = min_t(size_t, size, MAX_KFIFO_SIZE); + size = min_t(size_t, size, KFD_MAX_KFIFO_SIZE); buf = kmalloc(size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -355,7 +355,7 @@ int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd) return -ENOMEM; INIT_LIST_HEAD(&client->list); - ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL); + ret = kfifo_alloc(&client->fifo, KFD_MAX_KFIFO_SIZE, GFP_KERNEL); if (ret) { kfree(client); return ret;
Re: [PATCH v2 1/4] drm/amdkfd: Document and define SVM events message macro
On 2024-08-22 10:32, James Zhu wrote: On 2024-07-30 16:15, Philip Yang wrote: Document how to use SMI system management interface to enable and receive SVM events. Document SVM event triggers. Define SVM events message string format macro that could be used by user mode for sscanf to parse the event. Add it to uAPI header file to make it obvious that is changing uAPI in future. No functional changes. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 45 + include/uapi/linux/kfd_ioctl.h | 100 +--- 2 files changed, 109 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index ea6a8e43bd5b..de8b9abf7afc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -235,17 +235,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, amdgpu_reset_get_desc(reset_context, reset_cause, sizeof(reset_cause)); - kfd_smi_event_add(0, dev, event, "%x %s\n", - dev->reset_seq_num, - reset_cause); + kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET( + dev->reset_seq_num, reset_cause)); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask) { - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n", + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING( throttle_bitmask, - amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); + amdgpu_dpm_get_thermal_throttling_counter(dev->adev))); } void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) @@ -256,8 +255,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) if (task_info) { /* Report VM faults from user applications, not retry from kernel */ if (task_info->pid) - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n", - task_info->pid, task_info->task_name); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT( + task_info->pid, task_info->task_name)); amdgpu_vm_put_task_info(task_info); } } @@ -267,16 +266,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, ktime_t ts) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START, - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid, - address, node->id, write_fault ? 'W' : 'R'); + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid, + address, node->id, write_fault ? 'W' : 'R')); } void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, unsigned long address, bool migration) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END, - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(), - pid, address, node->id, migration ? 'M' : 'U'); + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(), + pid, address, node->id, migration ? 
'M' : 'U')); } void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, @@ -286,9 +285,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START, - "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", + KFD_EVENT_FMT_MIGRATE_START( ktime_get_boottime_ns(), pid, start, end - start, - from, to, prefetch_loc, preferred_loc, trigger); + from, to, prefetch_loc, preferred_loc, trigger)); } void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, @@ -296,24 +295,24 @@ void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, uint32_t from, uint32_t to, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, - "%lld -%d @%lx(%lx) %x->%x %d\n", + KFD_EVENT_FMT_MIGRATE_END( ktime_get_boottime_ns(), pid, start, end - start, - from, to, trigger); + from, to, trigger)); } void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION, - "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid, - node->id, trigger); + KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(), pid, + node->id, trigger)); } void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE, - "%lld -%d %x\n", ktime_get_boottime_ns(), pid, - node->id); + KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid, + node->id, 0)); } void kfd_smi_event_queue_restore_reschedu
Re: [PATCH v6] drm/amdkfd: Change kfd/svm page fault drain handling
On 2024-08-21 18:01, Xiaogang.Chen wrote: From: Xiaogang Chen When app unmap vm ranges(munmap) kfd/svm starts drain pending page fault and not handle any incoming pages fault of this process until a deferred work item got executed by default system wq. The time period of "not handle page fault" can be long and is unpredicable. That is advese to kfd performance on page faults recovery. This patch uses time stamp of incoming page fault to decide to drop or recover page fault. When app unmap vm ranges kfd records each gpu device's ih ring current time stamp. These time stamps are used at kfd page fault recovery routine. Any page fault happened on unmapped ranges after unmap events is application bug that accesses vm range after unmap. It is not driver work to cover that. By using time stamp of page fault do not need drain page faults at deferred work. So, the time period that kfd does not handle page faults is reduced and can be controlled. Signed-off-by: Xiaogang.Chen Some nitpicks below. This patch is Reviewed-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 + drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 98 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +- 7 files changed, 75 insertions(+), 40 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index a060c28f0877..d89a4c14bbb8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2764,7 +2764,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) * shouldn't be reported any more. */ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, - u32 vmid, u32 node_id, uint64_t addr, + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, bool write_fault) { bool is_compute_context = false; @@ -2790,7 +2790,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, addr /= AMDGPU_GPU_PAGE_SIZE; if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid, - node_id, addr, write_fault)) { + node_id, addr, ts, write_fault)) { amdgpu_bo_unref(&root); return true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 046949c4b695..d12d66dca8e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -558,7 +558,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm); void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info); bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, - u32 vmid, u32 node_id, uint64_t addr, + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, bool write_fault); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index f0ceab3ce5bf..9784a2892185 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, /* Try to handle the recoverable page faults by filling page * tables */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault)) + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, + entry->timestamp, write_fault)) return 1; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 
b73136d390cc..c76ac0dfe572 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, cam_index = entry->src_data[2] & 0x3ff; ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, write_fault); + addr, entry->timestamp, write_fault); WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); if (ret) return 1; @@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, * tables */ if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, write_fault)) + addr, entry->timestamp, write_fault)) return 1; } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 4190fa339913..288ebf02fa1c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -863,6 +863,8 @@ struct svm_range_list { struct delayed_work restore_work; DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE); struct task_struct *faul
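The core of the change is the checkpoint-timestamp comparison: record the IH ring timestamp per GPU at unmap time, then drop any fault whose timestamp is not newer than that checkpoint instead of draining the interrupt ring. A minimal sketch of that decision, ignoring the wrap-around handling that real IH timestamps need:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_GPU_INSTANCE 8

static uint64_t checkpoint_ts[MAX_GPU_INSTANCE];

/* called from the munmap/unmap path */
static void svm_record_checkpoint(int gpuidx, uint64_t ih_now)
{
	checkpoint_ts[gpuidx] = ih_now;
}

/* called from the page fault recovery path */
static bool svm_fault_is_stale(int gpuidx, uint64_t fault_ts)
{
	return checkpoint_ts[gpuidx] && fault_ts <= checkpoint_ts[gpuidx];
}

int main(void)
{
	svm_record_checkpoint(0, 1000);

	printf("fault ts 990:  %s\n", svm_fault_is_stale(0, 990) ?
	       "drop (queued before unmap)" : "recover");
	printf("fault ts 1010: %s\n", svm_fault_is_stale(0, 1010) ?
	       "drop (queued before unmap)" : "recover");
	return 0;
}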
Re: [PATCH] drm/amdgpu: Surface svm_attr_gobm, a RW module parameter
On 2024-08-21 19:22, Ramesh Errabolu wrote: KFD's design of unified memory (UM) does not allow users to configure the size of buffer used in migrating buffer either from Sysmem to VRAM or vice versa. This is not true, app can change range granularity attribute, to configure the buffer migration size. This module parameter is used to config the default range granularity. This patch remedies this gap, albeit at a coarse level, for workloads that deal with unregistered memory Signed-off-by: Ramesh Errabolu --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 16 ++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 12 ++ drivers/gpu/drm/amd/amdkfd/kfd_svm.c| 29 + 4 files changed, 52 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e8c284aea1f2..73dd816b01f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -237,6 +237,7 @@ extern int sched_policy; extern bool debug_evictions; extern bool no_system_mem_limit; extern int halt_if_hws_hang; +extern uint amdgpu_svm_attr_gobm; #else static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS; static const bool __maybe_unused debug_evictions; /* = false */ @@ -313,6 +314,9 @@ extern int amdgpu_wbrf; /* Extra time delay(in ms) to eliminate the influence of temperature momentary fluctuation */ #define AMDGPU_SWCTF_EXTRA_DELAY 50 +/* Default size of buffer to use in migrating buffer */ +#define AMDGPU_SVM_ATTR_GOBM 9 + struct amdgpu_xcp_mgr; struct amdgpu_device; struct amdgpu_irq_src; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b9529948f2b2..e195e1cf0f28 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -169,6 +169,15 @@ uint amdgpu_sdma_phase_quantum = 32; char *amdgpu_disable_cu; char *amdgpu_virtual_display; bool enforce_isolation; + +/* Specifies the default size of buffer to use in + * migrating buffer from Sysmem to VRAM and vice + * versa + * + * Defined as log2(sizeof(buffer)/PAGE_SIZE) + */ +uint amdgpu_svm_attr_gobm = AMDGPU_SVM_ATTR_GOBM; /* add explanation, GOBM : granularity of buffer migration Use u8 type, the same type used in prange->granularity u8 amdgpu_svm_default_gobm = AMDGPU_SVM_DEFAULT_GOBM + /* * OverDrive(bit 14) disabled by default * GFX DCS(bit 19) disabled by default @@ -320,6 +329,13 @@ module_param_named(pcie_gen2, amdgpu_pcie_gen2, int, 0444); MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)"); module_param_named(msi, amdgpu_msi, int, 0444); +/** + * DOC: svm_attr_gobm (uint) + * Size of buffer to use in migrating buffer from Sysmem to VRAM and vice versa + */ +MODULE_PARM_DESC(svm_attr_gobm, "Defined as log2(sizeof(buffer)/PAGE_SIZE), e.g. 9 for 2 MiB"); +module_param_named(svm_attr_gobm, amdgpu_svm_attr_gobm, uint, 0644); + /** * DOC: lockup_timeout (string) * Set GPU scheduler timeout value in ms. diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 7bba6bed2f48..07b202ab008a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -866,6 +866,18 @@ struct svm_range_list { struct delayed_work restore_work; DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE); struct task_struct *faulting_task; + + /* Indicates the default size to use in migrating + * buffers of a process from Sysmem to VRAM and vice + * versa. 
The max legal value cannot be greater than + * 0x3F + * + * @note: A side effect of this symbol being part of + * struct svm_range_list is that it forces all buffers + * of the process of unregistered kind to use the same + * size in buffer migration + */ + uint8_t attr_gobm; }; /* Process data */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 10b1a1f64198..fcfe5543a3c0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -309,12 +309,11 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) } static void -svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, - uint8_t *granularity, uint32_t *flags) +svm_range_set_default_attributes(int32_t *location, + int32_t *prefetch_loc, uint32_t *flags) { *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; - *granularity = 9; *flags = KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; } @@ -358,9 +357,9 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, bitmap_copy(prange->bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); + prange->granula
[PATCH] drm/amdkfd: Handle queue destroy buffer access race
Add helper function kfd_queue_unreference_buffers to reduce queue buffer refcount, separate it from release queue buffers. Because it is circular locking to hold dqm_lock to take vm lock, kfd_ioctl_destroy_queue should take vm lock, unreference queue buffers first, but not release queue buffers, to handle error in case failed to hold vm lock. Then hold dqm_lock to remove queue from queue list and then release queue buffers. Restore process worker restore queue hold dqm_lock, will always find the queue with valid queue buffers. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +- .../amd/amdkfd/kfd_process_queue_manager.c| 8 ++- drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 62 --- 4 files changed, 49 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 0622ebd7e8ef..10d6e29b23cb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -400,6 +400,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, return 0; err_create_queue: + kfd_queue_unreference_buffers(pdd, &q_properties); kfd_queue_release_buffers(pdd, &q_properties); err_acquire_queue_buf: err_sdma_engine_id: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 057d20446c31..e38484b40467 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1298,9 +1298,12 @@ void print_queue_properties(struct queue_properties *q); void print_queue(struct queue *q); int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, u64 expected_size); -void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo); +void kfd_queue_buffer_put(struct amdgpu_bo **bo); int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); +void kfd_queue_unreference_buffer(struct amdgpu_vm *vm, struct amdgpu_bo **bo); +int kfd_queue_unreference_buffers(struct kfd_process_device *pdd, + struct queue_properties *properties); void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev); struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index f732ee35b531..ef76a9cbc7e2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -217,6 +217,7 @@ void pqm_uninit(struct process_queue_manager *pqm) list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { if (pqn->q) { pdd = kfd_get_process_device_data(pqn->q->device, pqm->process); + kfd_queue_unreference_buffers(pdd, &pqn->q->properties); kfd_queue_release_buffers(pdd, &pqn->q->properties); pqm_clean_queue_resource(pqm, pqn); } @@ -512,7 +513,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) } if (pqn->q) { - retval = kfd_queue_release_buffers(pdd, &pqn->q->properties); + retval = kfd_queue_unreference_buffers(pdd, &pqn->q->properties); if (retval) goto err_destroy_queue; @@ -526,7 +527,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (retval != -ETIME) goto err_destroy_queue; } - + kfd_queue_release_buffers(pdd, &pqn->q->properties); 
pqm_clean_queue_resource(pqm, pqn); uninit_queue(pqn->q); } @@ -579,7 +580,8 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm, return -EFAULT; } - kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo); + kfd_queue_unreference_buffer(vm, &pqn->q->properties.ring_bo); + kfd_queue_buffer_put(&pqn->q->properties.ring_bo); amdgpu_bo_unreserve(vm->root.bo); pqn->q->properties.ring_bo = p->ring_bo; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index e0a073ae4a49..9ac15dff527f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -224,16 +224,8 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __us
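The lock-ordering idea behind splitting unreference from release can be sketched with plain mutexes: the vm reservation is never taken while the dqm lock is held, so the buffer refcounts are dropped before, and the buffers freed after, the dqm-locked removal of the queue. This is only a model of the ordering under those assumptions, not the driver code.

/* build: cc -pthread sketch.c */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t vm_lock  = PTHREAD_MUTEX_INITIALIZER; /* "reserve BOs" */
static pthread_mutex_t dqm_lock = PTHREAD_MUTEX_INITIALIZER;

struct queue { int buf_refs; int on_list; };

static void unreference_buffers(struct queue *q)
{
	pthread_mutex_lock(&vm_lock);
	q->buf_refs--;                 /* drop the refcounts, BOs stay mapped */
	pthread_mutex_unlock(&vm_lock);
}

static void release_buffers(struct queue *q)
{
	pthread_mutex_lock(&vm_lock);
	if (q->buf_refs == 0)
		printf("queue buffers freed\n");
	pthread_mutex_unlock(&vm_lock);
}

int main(void)
{
	struct queue q = { .buf_refs = 1, .on_list = 1 };

	unreference_buffers(&q);       /* vm lock only, may bail out on error */

	pthread_mutex_lock(&dqm_lock); /* vm lock is never taken in here */
	q.on_list = 0;                 /* restore worker can no longer find it */
	pthread_mutex_unlock(&dqm_lock);

	release_buffers(&q);           /* safe to free once off the list */
	return 0;
}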
Re: [PATCH] drm/amdgpu: change kgd2kfd_init_zone sequence during device_init
On 2024-07-31 04:10, Shikang Fan wrote: Move kgd2kfd_init _zone_device() after release_full_gpu() as it takes long time for asics with huge bar size and it could potentially hit full access timeout for SRIOV during init. Signed-off-by: Shikang Fan --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3a43754e7f10..4494fa7ae70f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2930,10 +2930,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) amdgpu_ttm_set_buffer_funcs_status(adev, true); /* Don't init kfd if whole hive need to be reset during init */ - if (!adev->gmc.xgmi.pending_reset) { - kgd2kfd_init_zone_device(adev); + if (!adev->gmc.xgmi.pending_reset) amdgpu_amdkfd_device_init(adev); - } amdgpu_fru_get_product_info(adev); @@ -4362,6 +4360,13 @@ int amdgpu_device_init(struct amdgpu_device *adev, flush_delayed_work(&adev->delayed_init_work); } + /* On asics with huge bar size, memremap_pages can take long time + * and potentially leading to full access timeout for SRIOV. Move + * init_zone_device() after exit full gpu + */ + if (!adev->gmc.xgmi.pending_reset) + kgd2kfd_init_zone_device(adev); + This change will not work because KFD amdgpu_amdkfd_device_init check KFD_IS_SVM_API_SUPPORTED, it always return false, as a result, SVM API is not enabled for user space. Maybe you can move two function calls together here, if there is no other init dependency issue. /* Don't init kfd if whole hive need to be reset during init */ if (!adev->gmc.xgmi.pending_reset) { kgd2kfd_init_zone_device(adev); amdgpu_amdkfd_device_init(adev); } Regards, Philip /* * Place those sysfs registering after `late_init`. As some of those * operations performed in `late_init` might affect the sysfs
[PATCH v2 1/4] drm/amdkfd: Document and define SVM events message macro
Document how to use SMI system management interface to enable and receive SVM events. Document SVM event triggers. Define SVM events message string format macro that could be used by user mode for sscanf to parse the event. Add it to uAPI header file to make it obvious that is changing uAPI in future. No functional changes. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 45 + include/uapi/linux/kfd_ioctl.h | 100 +--- 2 files changed, 109 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index ea6a8e43bd5b..de8b9abf7afc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -235,17 +235,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, amdgpu_reset_get_desc(reset_context, reset_cause, sizeof(reset_cause)); - kfd_smi_event_add(0, dev, event, "%x %s\n", - dev->reset_seq_num, - reset_cause); + kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET( + dev->reset_seq_num, reset_cause)); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask) { - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n", + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING( throttle_bitmask, - amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); + amdgpu_dpm_get_thermal_throttling_counter(dev->adev))); } void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) @@ -256,8 +255,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) if (task_info) { /* Report VM faults from user applications, not retry from kernel */ if (task_info->pid) - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n", -task_info->pid, task_info->task_name); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT( + task_info->pid, task_info->task_name)); amdgpu_vm_put_task_info(task_info); } } @@ -267,16 +266,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, ktime_t ts) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START, - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid, - address, node->id, write_fault ? 'W' : 'R'); + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid, + address, node->id, write_fault ? 'W' : 'R')); } void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, unsigned long address, bool migration) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END, - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(), - pid, address, node->id, migration ? 'M' : 'U'); + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(), + pid, address, node->id, migration ? 
'M' : 'U')); } void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, @@ -286,9 +285,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START, - "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", + KFD_EVENT_FMT_MIGRATE_START( ktime_get_boottime_ns(), pid, start, end - start, - from, to, prefetch_loc, preferred_loc, trigger); + from, to, prefetch_loc, preferred_loc, trigger)); } void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, @@ -296,24 +295,24 @@ void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, uint32_t from, uint32_t to, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, - "%lld -%d @%lx(%lx) %x->%x %d\n", + KFD_EVENT_FMT_MIGRATE_END( ktime_get_boottime_ns(), pid, start, end - start, - from, to, trigger); +
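For readers who want to consume these events, a minimal user-space sketch of parsing one SMI event line with sscanf() follows. It assumes a line shaped as the hex event id followed by the KFD_EVENT_FMT_PAGEFAULT_START payload documented in this patch; the helper name and the sample values are made up for illustration only.

#include <stdio.h>

static int parse_pagefault_start(const char *line)
{
        unsigned int event_id, node;
        unsigned long addr;
        long long ns;
        int pid;
        char rw;

        /* "<event_id_hex> <ns> -<pid> @<addr>(<node>) <W|R>" (assumed layout) */
        if (sscanf(line, "%x %lld -%d @%lx(%x) %c",
                   &event_id, &ns, &pid, &addr, &node, &rw) != 6)
                return -1;

        printf("event 0x%x: pid %d faulted at 0x%lx on node %u (%c) at %lld ns\n",
               event_id, pid, addr, node, rw, ns);
        return 0;
}

int main(void)
{
        /* Sample line shaped like the format string above (made-up values). */
        return parse_pagefault_start("6 123456789 -4242 @7f0000(1) W\n");
}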
[PATCH v2 4/4] drm/amdkfd: SMI report dropped event count
Add new SMI event to report the dropped event count when the event kfifo is full. When the kfifo has space for two events, generate a dropped event record to report how many events were dropped, together with the next event to add to kfifo. After reading event out from kfifo, if there were events dropped, generate a dropped event record and add to kfifo. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 41 ++--- include/uapi/linux/kfd_ioctl.h | 6 +++ 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 9b8169761ec5..9b47657d5160 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -42,6 +42,7 @@ struct kfd_smi_client { struct rcu_head rcu; pid_t pid; bool suser; + u32 drop_count; }; #define KFD_MAX_KFIFO_SIZE 8192 @@ -103,12 +104,26 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, } to_copy = min(size, to_copy); ret = kfifo_out(&client->fifo, buf, to_copy); - spin_unlock(&client->lock); if (ret <= 0) { + spin_unlock(&client->lock); ret = -EAGAIN; goto ret_err; } + if (client->drop_count) { + char msg[KFD_SMI_EVENT_MSG_SIZE]; + int len; + + len = snprintf(msg, sizeof(msg), "%x ", KFD_SMI_EVENT_DROPPED_EVENT); + len += snprintf(msg + len, sizeof(msg) - len, + KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(), + client->pid, client->drop_count)); + kfifo_in(&client->fifo, msg, len); + client->drop_count = 0; + } + + spin_unlock(&client->lock); + ret = copy_to_user(user, buf, to_copy); if (ret) { ret = -EFAULT; @@ -173,22 +188,36 @@ static bool kfd_smi_ev_enabled(pid_t pid, struct kfd_smi_client *client, } static void add_event_to_kfifo(pid_t pid, struct kfd_node *dev, - unsigned int smi_event, char *event_msg, int len) + unsigned int smi_event, char *event_msg, int event_len) { struct kfd_smi_client *client; + char msg[KFD_SMI_EVENT_MSG_SIZE]; + int len = 0; rcu_read_lock(); list_for_each_entry_rcu(client, &dev->smi_clients, list) { if (!kfd_smi_ev_enabled(pid, client, smi_event)) continue; + spin_lock(&client->lock); - if (kfifo_avail(&client->fifo) >= len) { - kfifo_in(&client->fifo, event_msg, len); + if (client->drop_count) { + len = snprintf(msg, sizeof(msg), "%x ", KFD_SMI_EVENT_DROPPED_EVENT); + len += snprintf(msg + len, sizeof(msg) - len, + KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(), pid, + client->drop_count)); + } + + if (kfifo_avail(&client->fifo) >= event_len + len) { + if (len) + kfifo_in(&client->fifo, msg, len); + kfifo_in(&client->fifo, event_msg, event_len); wake_up_all(&client->wait_queue); + client->drop_count = 0; } else { - pr_debug("smi_event(EventID: %u): no space left\n", - smi_event); + client->drop_count++; + pr_debug("smi_event(EventID: %u): no space left drop_count %d\n", +smi_event, client->drop_count); } spin_unlock(&client->lock); } diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index e4ed8fec3294..915d1e7c67fe 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -530,6 +530,7 @@ enum kfd_smi_event { KFD_SMI_EVENT_QUEUE_EVICTION = 9, KFD_SMI_EVENT_QUEUE_RESTORE = 10, KFD_SMI_EVENT_UNMAP_FROM_GPU = 11, + KFD_SMI_EVENT_DROPPED_EVENT = 12, /* * max event number, as a flag bit to get events from all processes, @@ -610,6 +611,7 @@ struct kfd_ioctl_smi_events_args { *rw: 'W' for write page fault, 'R' for read page fault *rescheduled: 'R' if the queue restore failed and rescheduled to 
try again *error_code: migrate failure error code, 0 if no error + *drop_count: how many events dropped when fifo is full */
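A compact user-space model of the drop-count bookkeeping described above; a byte counter stands in for the kfifo, and the sizes and helper names are invented, so this is a sketch of the logic rather than the kernel code.

#include <stdio.h>

#define FIFO_SIZE       64      /* stand-in for KFD_MAX_KFIFO_SIZE */
#define DROP_RECORD_LEN 16      /* stand-in for a formatted dropped-event record */

static unsigned int fifo_used;
static unsigned int drop_count;

static void try_add_event(unsigned int event_len)
{
        unsigned int extra = drop_count ? DROP_RECORD_LEN : 0;

        if (FIFO_SIZE - fifo_used >= event_len + extra) {
                /* Room for the pending dropped-event record (if any) plus the
                 * new event: queue both and clear the counter. */
                fifo_used += event_len + extra;
                drop_count = 0;
                printf("queued %u bytes (used %u/%u)\n",
                       event_len, fifo_used, FIFO_SIZE);
        } else {
                /* No room: only remember that another event was dropped. */
                drop_count++;
                printf("dropped, drop_count=%u\n", drop_count);
        }
}

int main(void)
{
        for (int i = 0; i < 8; i++)
                try_add_event(20);      /* overflows a 64-byte fifo quickly */
        fifo_used = 0;                  /* reader drained the fifo */
        try_add_event(20);              /* next event carries the dropped record too */
        return 0;
}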
[PATCH v2 3/4] drm/amdkfd: Increase SMI event fifo size
SMI event fifo size 1KB was enough to report GPU vm fault or reset event, increase it to 8KB to store about 100 migrate events, less chance to drop the migrate events if lots of migration happened in the short period of time. Add KFD prefix to the macro name. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 1d94b445a060..9b8169761ec5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -44,7 +44,7 @@ struct kfd_smi_client { bool suser; }; -#define MAX_KFIFO_SIZE 1024 +#define KFD_MAX_KFIFO_SIZE 8192 static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *); static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *); @@ -86,7 +86,7 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, struct kfd_smi_client *client = filep->private_data; unsigned char *buf; - size = min_t(size_t, size, MAX_KFIFO_SIZE); + size = min_t(size_t, size, KFD_MAX_KFIFO_SIZE); buf = kmalloc(size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -355,7 +355,7 @@ int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd) return -ENOMEM; INIT_LIST_HEAD(&client->list); - ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL); + ret = kfifo_alloc(&client->fifo, KFD_MAX_KFIFO_SIZE, GFP_KERNEL); if (ret) { kfree(client); return ret; -- 2.43.2
[PATCH v2 2/4] drm/amdkfd: Output migrate end event if migrate failed
If page migration failed, also output migrate end event to match with migrate start event, with failure error_code added to the end of the migrate message macro. This will not break uAPI because application uses old message macro sscanf drop and ignore the error_code. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c| 14 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 5 +++-- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 3 ++- include/uapi/linux/kfd_ioctl.h | 7 --- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 8ee3d07ffbdf..eacfeb32f35d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -445,14 +445,13 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange, pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n", mpages, cpages, migrate.npages); - kfd_smi_event_migration_end(node, p->lead_thread->pid, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - 0, node->id, trigger); - svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); out_free: kvfree(buf); + kfd_smi_event_migration_end(node, p->lead_thread->pid, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + 0, node->id, trigger, r); out: if (!r && mpages) { pdd = svm_range_get_pdd_by_node(prange, node); @@ -751,14 +750,13 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange, svm_migrate_copy_done(adev, mfence); migrate_vma_finalize(&migrate); - kfd_smi_event_migration_end(node, p->lead_thread->pid, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - node->id, 0, trigger); - svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); out_free: kvfree(buf); + kfd_smi_event_migration_end(node, p->lead_thread->pid, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + node->id, 0, trigger, r); out: if (!r && cpages) { mpages = cpages - upages; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index de8b9abf7afc..1d94b445a060 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -292,12 +292,13 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, unsigned long start, unsigned long end, -uint32_t from, uint32_t to, uint32_t trigger) +uint32_t from, uint32_t to, uint32_t trigger, +int error_code) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, KFD_EVENT_FMT_MIGRATE_END( ktime_get_boottime_ns(), pid, start, end - start, - from, to, trigger)); + from, to, trigger, error_code)); } void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index 85010b8307f8..503bff13d815 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -44,7 +44,8 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger); void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, unsigned long start, unsigned long end, -uint32_t from, uint32_t to, uint32_t trigger); +uint32_t from, uint32_t to, uint32_t trigger, +int error_code); void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, uint32_t trigger); void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 
c94182ad8fb8..e4ed8fec3294 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -609,6 +609,7 @@ struct kfd_ioctl_smi_events_args { *migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update *rw: 'W' for write page fault, 'R' for read page fault *rescheduled: 'R' if the queue restore failed and rescheduled to
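A small, self-contained sketch of why the extra field should not break existing parsers: an sscanf() call built for the old MIGRATE_END layout simply stops before the trailing error_code. The payload string and field values below are made up.

#include <stdio.h>

int main(void)
{
        /* New-style payload: ns -pid @start(len) from->to trigger error_code */
        const char *payload = "123456789 -4242 @7f0000(200) 1->0 2 -12";
        unsigned long start, len;
        unsigned int from, to;
        long long ns;
        int pid, trigger;

        /* Old format string: no conversion for error_code at the end. */
        int n = sscanf(payload, "%lld -%d @%lx(%lx) %x->%x %d",
                       &ns, &pid, &start, &len, &from, &to, &trigger);

        printf("parsed %d fields, trigger=%d (trailing error_code ignored)\n",
               n, trigger);
        return 0;
}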
[PATCH v2 0/4] Improve SVM migrate event report
1. Document how to use SMI system management interface to receive SVM events, define string format macro for user mode. 2. Increase the event kfifo size, so less chance to drop event. 3. Output migrate end event with error code if migration failed. 4. Report dropped event count if fifo is full. Philip Yang (4): drm/amdkfd: Document and define SVM events message macro drm/amdkfd: Output migrate end event if migrate failed drm/amdkfd: Increase SMI event fifo size drm/amdkfd: SMI report dropped event count drivers/gpu/drm/amd/amdkfd/kfd_migrate.c| 14 ++- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 95 +++-- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 3 +- include/uapi/linux/kfd_ioctl.h | 107 +--- 4 files changed, 164 insertions(+), 55 deletions(-) -- 2.43.2
Re: [PATCH] drm/amdkfd: Fix missing error code in kfd_queue_acquire_buffers
The kfdtest user queue validation cases don't cover those error condition path, thanks for catching it. This patch is Reviewed-by: Philip Yang On 2024-07-26 02:47, Srinivasan Shanmugam wrote: The fix involves setting 'err' to '-EINVAL' before each 'goto out_err_unreserve'. Fixes the below: drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:265 kfd_queue_acquire_buffers() warn: missing error code 'err' drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c 226 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) 227 { 228 struct kfd_topology_device *topo_dev; 229 struct amdgpu_vm *vm; 230 u32 total_cwsr_size; 231 int err; 232 233 topo_dev = kfd_topology_device_by_id(pdd->dev->id); 234 if (!topo_dev) 235 return -EINVAL; 236 237 vm = drm_priv_to_vm(pdd->drm_priv); 238 err = amdgpu_bo_reserve(vm->root.bo, false); 239 if (err) 240 return err; 241 242 err = kfd_queue_buffer_get(vm, properties->write_ptr, &properties->wptr_bo, PAGE_SIZE); 243 if (err) 244 goto out_err_unreserve; 245 246 err = kfd_queue_buffer_get(vm, properties->read_ptr, &properties->rptr_bo, PAGE_SIZE); 247 if (err) 248 goto out_err_unreserve; 249 250 err = kfd_queue_buffer_get(vm, (void *)properties->queue_address, 251&properties->ring_bo, properties->queue_size); 252 if (err) 253 goto out_err_unreserve; 254 255 /* only compute queue requires EOP buffer and CWSR area */ 256 if (properties->type != KFD_QUEUE_TYPE_COMPUTE) 257 goto out_unreserve; This is clearly a success path. 258 259 /* EOP buffer is not required for all ASICs */ 260 if (properties->eop_ring_buffer_address) { 261 if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) { 262 pr_debug("queue eop bo size 0x%lx not equal to node eop buf size 0x%x\n", 263 properties->eop_buf_bo->tbo.base.size, 264 topo_dev->node_props.eop_buffer_size); --> 265 goto out_err_unreserve; This has err in the label name. err = -EINVAL? 266 } 267 err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address, 268&properties->eop_buf_bo, 269properties->eop_ring_buffer_size); 270 if (err) 271 goto out_err_unreserve; 272 } 273 274 if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) { 275 pr_debug("queue ctl stack size 0x%x not equal to node ctl stack size 0x%x\n", 276 properties->ctl_stack_size, 277 topo_dev->node_props.ctl_stack_size); 278 goto out_err_unreserve; err? 279 } 280 281 if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) { 282 pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n", 283 properties->ctx_save_restore_area_size, 284 topo_dev->node_props.cwsr_size); 285 goto out_err_unreserve; err? Not sure. 286 } 287 288 total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) 289 * NUM_XCC(pdd->dev->xcc_mask); 290 total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); 291 292 err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, 293&properties->cwsr_bo, total_cwsr_size); 294 if (!err) 295 goto out_unreserve; 296 297 amdgpu_bo_unreserve(vm->root.bo); 298 299 err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address, 300total_cwsr_size); 301 if (err) 302 goto out_err_release; 303 304 return 0; 305 306 out_unreserve: 307 amdgpu_bo_unreserve(vm->root.bo); 308 return 0; 309 310 out_err_unreserve: 311 amdgpu_b
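A stand-alone illustration of the bug class being fixed: reaching the error label with err still zero makes the failure path return success. The function and label names below are illustrative, not the kernel code.

#include <stdio.h>
#include <errno.h>

static int acquire_buffers(int stack_size, int expected)
{
        int err = 0;

        if (stack_size != expected) {
                err = -EINVAL;          /* the missing assignment the patch adds */
                goto out_err;
        }
        return 0;

out_err:
        /* ... undo any partial work here ... */
        return err;                     /* would be 0 without the assignment above */
}

int main(void)
{
        printf("mismatch returns %d\n", acquire_buffers(4096, 8192));
        return 0;
}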
Re: [PATCH v3] drm/amdkfd: Change kfd/svm page fault drain handling
On 2024-07-25 14:19, Xiaogang.Chen wrote: From: Xiaogang Chen When app unmap vm ranges(munmap) kfd/svm starts drain pending page fault and not handle any incoming pages fault of this process until a deferred work item got executed by default system wq. The time period of "not handle page fault" can be long and is unpredicable. That is advese to kfd performance on page faults recovery. This patch uses time stamp of incoming page page to decide to drop or handle page fault. When app unmap vm ranges kfd records each gpu device's ih ring current time stamp. These time stamps are used at kfd page fault recovery routine. Any page fault happens on unmapped ranges after unmap events is app bug that accesses vm range after unmap. It is not driver work to cover that. By using time stamp of page fault do not need drain page faults at deferred work. So, the time period that kfd does not handle page faults is reduced and can be controlled. Signed-off-by: Xiaogang.Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 102 - drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +- 7 files changed, 79 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 3abfa66d72a2..d90b7ea3f020 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2763,7 +2763,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) * shouldn't be reported any more. */ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, - u32 vmid, u32 node_id, uint64_t addr, + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, bool write_fault) { bool is_compute_context = false; @@ -2789,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, addr /= AMDGPU_GPU_PAGE_SIZE; if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid, - node_id, addr, write_fault)) { + node_id, addr, ts, write_fault)) { amdgpu_bo_unref(&root); return true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 312a408b80d3..1d6a1381ede9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -548,7 +548,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm); void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info); bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, - u32 vmid, u32 node_id, uint64_t addr, + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, bool write_fault); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index d933e19e0cf5..3596cc2ee7e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, /* Try to handle the recoverable page faults by filling page * tables */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault)) + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, + entry->timestamp, write_fault)) return 1; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 350f6b6676f1..ac08d9424feb 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, cam_index = entry->src_data[2] & 0x3ff; ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, write_fault); + addr, entry->timestamp, write_fault); WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); if (ret) return 1; @@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, * tables */ if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, write_fault)) + addr, entry->timestamp, write_fault)) return 1; } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index c51e908f6f19..771c98e104ee 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -850,10 +850,13 @@ struct svm_range_list { struct list_headcriu_svm_metadata_list; spinlock_t deferred_list_lock; atomic_t evicted_ranges; - atomic_t drain_pagefaults; + /* stop page fault recovery for this process */ + atomic_t stop_pf_recovery; struct delayed_work restore_work; DECLAR
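A minimal user-space model of the timestamp-based decision described in the commit message, assuming one checkpoint per GPU device; the names and values are illustrative, not the driver API.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MAX_GPUS 4

static uint64_t last_unmap_ts[MAX_GPUS];        /* per-device checkpoint */

/* Called while VM ranges are being unmapped: remember the IH ring time. */
static void record_unmap(int gpu, uint64_t ih_ring_ts)
{
        last_unmap_ts[gpu] = ih_ring_ts;
}

/* Called when a retry fault lands on a range that is no longer mapped:
 * faults queued before the unmap checkpoint are stale and can be dropped
 * without draining the interrupt ring. */
static bool fault_is_stale(int gpu, uint64_t fault_ts)
{
        return fault_ts <= last_unmap_ts[gpu];
}

int main(void)
{
        record_unmap(0, 1000);
        printf("fault@900  stale: %d\n", fault_is_stale(0, 900));   /* 1: drop */
        printf("fault@1100 stale: %d\n", fault_is_stale(0, 1100));  /* 0: handle */
        return 0;
}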
[PATCH] drm/amdkfd: Fix compile error if HMM support not enabled
Fixes the below if kernel config not enable HMM support >> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:107:26: error: implicit declaration of function 'svm_range_from_addr' >> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:107:24: error: assignment to 'struct svm_range *' from 'int' makes pointer from integer without a cast [-Wint-conversion] >> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:111:28: error: invalid use of undefined type 'struct svm_range' Fixes: de165b53c93f ("drm/amdkfd: Validate user queue svm memory residency") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407252127.zvnxakra-...@intel.com/ Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 9807e8adf77d..64c292f0ba1b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -85,6 +85,8 @@ void uninit_queue(struct queue *q) kfree(q); } +#if IS_ENABLED(CONFIG_HSA_AMD_SVM) + static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size) { struct kfd_process *p = pdd->process; @@ -178,6 +180,18 @@ static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u mutex_unlock(&p->svms.lock); } +#else + +static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + return -EINVAL; +} + +static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size) +{ +} + +#endif int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, u64 expected_size) -- 2.43.2
Re: [PATCH v2] drm/amdkfd: Change kfd/svm page fault drain handling
On 2024-07-24 09:58, Chen, Xiaogang wrote: On 7/23/2024 4:02 PM, Philip Yang wrote: On 2024-07-19 18:17, Xiaogang.Chen wrote: From: Xiaogang Chen When app unmap vm ranges(munmap) kfd/svm starts drain pending page fault and not handle any incoming pages fault of this process until a deferred work item got executed by default system wq. The time period of "not handle page fault" can be long and is unpredicable. That is advese to kfd performance on page faults recovery. This patch uses time stamp of incoming page page to decide to drop or handle page fault. When app unmap vm ranges kfd records each gpu device's ih ring current time stamp. These time stamps are used at kfd page fault recovery routine. Any page fault happens on unmapped ranges after unmap events is app bug that accesses vm range after unmap. It is not driver work to cover that. By using time stamp of page fault do not need drain page faults at deferred work. So, the time period that kfd does not handle page faults is reduced and can be controlled. This simplify the retry fault draining and support the multiple processes correctly now, some nitpick below. Signed-off-by: Xiaogang.Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 111 + drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +- 7 files changed, 88 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 3abfa66d72a2..d90b7ea3f020 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2763,7 +2763,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) * shouldn't be reported any more. 
*/ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, - u32 vmid, u32 node_id, uint64_t addr, + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, bool write_fault) { bool is_compute_context = false; @@ -2789,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, addr /= AMDGPU_GPU_PAGE_SIZE; if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid, - node_id, addr, write_fault)) { + node_id, addr, ts, write_fault)) { amdgpu_bo_unref(&root); return true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 312a408b80d3..1d6a1381ede9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -548,7 +548,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm); void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info); bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, - u32 vmid, u32 node_id, uint64_t addr, + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, bool write_fault); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index d933e19e0cf5..5cceaba6e5c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, /* Try to handle the recoverable page faults by filling page * tables */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault)) + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, + entry->timestamp, write_fault)) indent should align to the start bracket. ok. return 1; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 350f6b6676f1..ac08d9424feb 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, cam_index = entry->src_data[2] & 0x3ff; ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, write_fault); + addr, entry->timestamp, write_fault); WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); if (ret) return 1; @@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, * tables */ if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, write_fault)) + addr, entry->timestamp, write_fault)) return 1; } } diff --git a/drivers/
Re: [PATCH v2] drm/amdkfd: Change kfd/svm page fault drain handling
On 2024-07-19 18:17, Xiaogang.Chen wrote: From: Xiaogang Chen When app unmap vm ranges(munmap) kfd/svm starts drain pending page fault and not handle any incoming pages fault of this process until a deferred work item got executed by default system wq. The time period of "not handle page fault" can be long and is unpredicable. That is advese to kfd performance on page faults recovery. This patch uses time stamp of incoming page page to decide to drop or handle page fault. When app unmap vm ranges kfd records each gpu device's ih ring current time stamp. These time stamps are used at kfd page fault recovery routine. Any page fault happens on unmapped ranges after unmap events is app bug that accesses vm range after unmap. It is not driver work to cover that. By using time stamp of page fault do not need drain page faults at deferred work. So, the time period that kfd does not handle page faults is reduced and can be controlled. This simplify the retry fault draining and support the multiple processes correctly now, some nitpick below. Signed-off-by: Xiaogang.Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 111 + drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +- 7 files changed, 88 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 3abfa66d72a2..d90b7ea3f020 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2763,7 +2763,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) * shouldn't be reported any more. */ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, - u32 vmid, u32 node_id, uint64_t addr, + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, bool write_fault) { bool is_compute_context = false; @@ -2789,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, addr /= AMDGPU_GPU_PAGE_SIZE; if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid, - node_id, addr, write_fault)) { + node_id, addr, ts, write_fault)) { amdgpu_bo_unref(&root); return true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 312a408b80d3..1d6a1381ede9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -548,7 +548,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm); void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info); bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, - u32 vmid, u32 node_id, uint64_t addr, + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, bool write_fault); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index d933e19e0cf5..5cceaba6e5c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, /* Try to handle the recoverable page faults by filling page * tables */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault)) + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, + entry->timestamp, write_fault)) indent should align to the start bracket. 
return 1; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 350f6b6676f1..ac08d9424feb 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, cam_index = entry->src_data[2] & 0x3ff; ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, write_fault); + addr, entry->timestamp, write_fault); WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); if (ret) return 1; @@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, * tables */ if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, write_fault)) + addr, entry->timestamp, write_fault)) return 1; } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index c51e908f6f19..8b8d5ab9da76 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -850,10 +850,13 @@ struct svm_range_list { struct list_headcriu_svm_metadata_list; spinlock_t deferr
[PATCH v2 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping
Add helper function kfd_queue_acquire_buffers to get queue wptr_bo reference from queue write_ptr if it is mapped to the KFD node with expected size. Add wptr_bo to structure queue_properties because structure queue is allocated after queue buffers are validated, then we can remove wptr_bo parameter from pqm_create_queue. Rename structure queue wptr_bo_gart to hold wptr_bo reference for GART mapping and umapping. Move MES wptr_bo_gart mapping to init_user_queue, the same location with queue ctx_bo GART mapping. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 +- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 56 +++--- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 6 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 13 +++-- .../amd/amdkfd/kfd_process_queue_manager.c| 45 +++ drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 57 +++ 7 files changed, 116 insertions(+), 68 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 6e591280774b..4ed49265c764 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -322,7 +322,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, void **kptr, uint64_t *size); void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); -int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo); +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart); int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, struct dma_fence __rcu **ef); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 199e387d35f4..0ab37e7aec26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2226,11 +2226,12 @@ int amdgpu_amdkfd_gpuvm_sync_memory( /** * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count * @bo: Buffer object to be mapped + * @bo_gart: Return bo reference * * Before return, bo reference count is incremented. To release the reference and unpin/ * unmap the BO, call amdgpu_amdkfd_free_gtt_mem. 
*/ -int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo) +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart) { int ret; @@ -2257,7 +2258,7 @@ int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo) amdgpu_bo_unreserve(bo); - bo = amdgpu_bo_ref(bo); + *bo_gart = amdgpu_bo_ref(bo); return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 823f245dc7d0..202f24ee4bd7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -247,8 +247,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->priority = args->queue_priority; q_properties->queue_address = args->ring_base_address; q_properties->queue_size = args->ring_size; - q_properties->read_ptr = (uint32_t *) args->read_pointer_address; - q_properties->write_ptr = (uint32_t *) args->write_pointer_address; + q_properties->read_ptr = (void __user *)args->read_pointer_address; + q_properties->write_ptr = (void __user *)args->write_pointer_address; q_properties->eop_ring_buffer_address = args->eop_buffer_address; q_properties->eop_ring_buffer_size = args->eop_buffer_size; q_properties->ctx_save_restore_area_address = @@ -306,7 +306,6 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, struct kfd_process_device *pdd; struct queue_properties q_properties; uint32_t doorbell_offset_in_process = 0; - struct amdgpu_bo *wptr_bo = NULL; memset(&q_properties, 0, sizeof(struct queue_properties)); @@ -342,53 +341,17 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, } } - /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work -* on unmapped queues for usermode queue oversubscription (no aggregated doorbell) -*/ - if (dev->kfd->shared_resources.enable_mes && - ((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) - >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) { - struct amdgpu_bo_va_mapping *wptr_mapping; - struct amdgpu_vm *wptr_vm; - - wptr_vm = drm_priv_to_vm(pdd->drm_priv); - err =
[PATCH v2 8/9] drm/amdkfd: Store queue cwsr area size to node properties
Use the queue eop buffer size, cwsr area size, ctl stack size calculation from Thunk, store the value to KFD node properties. Those will be used to validate queue eop buffer size, cwsr area size, ctl stack size when creating KFD user compute queue. Those will be exposed to user space via sysfs KFD node properties, to remove the duplicate calculation code from Thunk. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 75 +++ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 4 ++ 4 files changed, 82 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index c31589043d5b..b5cae48dff66 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1295,6 +1295,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_ void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo); int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); +void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev); struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_node *dev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 67242ce051b5..adcda9730c9f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -24,6 +24,7 @@ #include #include "kfd_priv.h" +#include "kfd_topology.h" #include "kfd_svm.h" void print_queue_properties(struct queue_properties *q) @@ -305,3 +306,77 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope properties->ctx_save_restore_area_size); return 0; } + +#define SGPR_SIZE_PER_CU 0x4000 +#define LDS_SIZE_PER_CU0x1 +#define HWREG_SIZE_PER_CU 0x1000 +#define DEBUGGER_BYTES_ALIGN 64 +#define DEBUGGER_BYTES_PER_WAVE32 + +static u32 kfd_get_vgpr_size_per_cu(u32 gfxv) +{ + u32 vgpr_size = 0x4; + + if ((gfxv / 100 * 100) == 90400 || /* GFX_VERSION_AQUA_VANJARAM */ + gfxv == 90010 ||/* GFX_VERSION_ALDEBARAN */ + gfxv == 90008) /* GFX_VERSION_ARCTURUS */ + vgpr_size = 0x8; + else if (gfxv == 11 || /* GFX_VERSION_PLUM_BONITO */ +gfxv == 110001 || /* GFX_VERSION_WHEAT_NAS */ +gfxv == 12 || /* GFX_VERSION_GFX1200 */ +gfxv == 120001)/* GFX_VERSION_GFX1201 */ + vgpr_size = 0x6; + + return vgpr_size; +} + +#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv) \ + (kfd_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU +\ +LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU) + +#define CNTL_STACK_BYTES_PER_WAVE(gfxv)\ + ((gfxv) >= 100100 ? 
12 : 8) /* GFX_VERSION_NAVI10*/ + +#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40 + +void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev) +{ + struct kfd_node_properties *props = &dev->node_props; + u32 gfxv = props->gfx_target_version; + u32 ctl_stack_size; + u32 wg_data_size; + u32 wave_num; + u32 cu_num; + + if (gfxv < 80001) /* GFX_VERSION_CARRIZO */ + return; + + cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask); + wave_num = (gfxv < 100100) ?/* GFX_VERSION_NAVI10 */ + min(cu_num * 40, props->array_count / props->simd_arrays_per_engine * 512) + : cu_num * 32; + + wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv), PAGE_SIZE); + ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8; + ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size, + PAGE_SIZE); + + if ((gfxv / 1 * 1) == 10) { + /* HW design limits control stack size to 0x7000. +* This is insufficient for theoretical PM4 cases +* but sufficient for AQL, limited by SPI events. +*/ + ctl_stack_size = min(ctl_stack_size, 0x7000); + } + + props->ctl_stack_size = ctl_stack_size; + props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN); + props->cwsr_size = ctl_stack_size + wg_data_size; + + if (gfxv == 80002) /* GFX_VERSION_TONGA */ + props->eop_buffer_size = 0x
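To make the sizing formula easier to follow, here is a hedged user-space restatement of the pre-GFX10 branch of the calculation, filled in with a made-up CU count and the patch's default VGPR size per CU; it ignores the array-count clamp on wave_num and the GFX10 control-stack limit, so the numbers do not describe any real ASIC.

#include <stdio.h>

#define PAGE_SIZE 4096u
#define ALIGN(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
        unsigned int cu_num = 64;               /* assumption */
        unsigned int wave_num = cu_num * 40;    /* pre-GFX10: 40 waves per CU */
        unsigned int vgpr_per_cu = 0x40000;     /* default from the patch */
        unsigned int wg_per_cu = vgpr_per_cu + 0x4000 /* SGPR */
                               + 0x10000 /* LDS */ + 0x1000 /* HWREG */;

        unsigned int wg_data_size = ALIGN(cu_num * wg_per_cu, PAGE_SIZE);
        unsigned int ctl_stack = ALIGN(40 /* save-area header */ +
                                       wave_num * 8 /* bytes per wave */ + 8,
                                       PAGE_SIZE);

        printf("ctl_stack_size = 0x%x\n", ctl_stack);
        printf("cwsr_size      = 0x%x\n", ctl_stack + wg_data_size);
        return 0;
}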
[PATCH v2 2/9] drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer
Pass pointer reference to amdgpu_bo_unref to clear the correct pointer, otherwise amdgpu_bo_unref clear the local variable, the original pointer not set to NULL, this could cause use-after-free bug. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 14 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_device.c| 4 ++-- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +- .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 03205e3c3746..c272461d70a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -364,15 +364,15 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size, return r; } -void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj) +void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj) { - struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj; + struct amdgpu_bo **bo = (struct amdgpu_bo **) mem_obj; - amdgpu_bo_reserve(bo, true); - amdgpu_bo_kunmap(bo); - amdgpu_bo_unpin(bo); - amdgpu_bo_unreserve(bo); - amdgpu_bo_unref(&(bo)); + amdgpu_bo_reserve(*bo, true); + amdgpu_bo_kunmap(*bo); + amdgpu_bo_unpin(*bo); + amdgpu_bo_unreserve(*bo); + amdgpu_bo_unref(bo); } int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 66b1c72c81e5..6e591280774b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -235,7 +235,7 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size, void **mem_obj, uint64_t *gpu_addr, void **cpu_ptr, bool mqd_gfx9); -void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj); +void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj); int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size, void **mem_obj); void amdgpu_amdkfd_free_gws(struct amdgpu_device *adev, void *mem_obj); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 1d9b21628be7..823f245dc7d0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -423,7 +423,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, err_create_queue: if (wptr_bo) - amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo); + amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&wptr_bo); err_wptr_map_gart: err_bind_process: err_pdd: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index f4d20adaa068..6619028dd58b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -907,7 +907,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, kfd_doorbell_error: kfd_gtt_sa_fini(kfd); kfd_gtt_sa_init_error: - amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); alloc_gtt_mem_failure: dev_err(kfd_device, "device %x:%x NOT added due to errors\n", @@ -925,7 +925,7 @@ void 
kgd2kfd_device_exit(struct kfd_dev *kfd) kfd_doorbell_fini(kfd); ida_destroy(&kfd->doorbell_ida); kfd_gtt_sa_fini(kfd); - amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); } kfree(kfd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 4f48507418d2..420444eb8e98 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2621,7 +2621,7 @@ static void deallocate_hiq_sdma_mqd(struct kfd_node *dev, { WARN(!mqd, "No hiq sdma mqd trunk to free"); - amdgpu_amdkfd_free_gtt_mem(dev->adev, mqd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(dev->adev, &mqd->gtt_mem); } void device_queue_manager_uninit(struct device_queue_manager *dqm) diff --git a/drivers/gpu/drm/amd/am
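A tiny stand-alone demonstration of the use-after-free hazard this patch closes: a free helper that takes the pointer by value can only clear its local copy, while a pointer-to-pointer lets it clear the caller's variable too. Plain user-space C with illustrative names.

#include <stdio.h>
#include <stdlib.h>

static void free_by_value(void *obj)
{
        free(obj);
        obj = NULL;             /* only clears the local parameter */
}

static void free_by_reference(void **obj)
{
        free(*obj);
        *obj = NULL;            /* clears the caller's pointer as well */
}

int main(void)
{
        void *a = malloc(16);
        void *b = malloc(16);

        free_by_value(a);
        printf("after free_by_value:     a is %s\n", a ? "dangling" : "NULL");

        free_by_reference(&b);
        printf("after free_by_reference: b is %s\n", b ? "dangling" : "NULL");
        return 0;
}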
[PATCH v2 4/9] drm/amdkfd: Validate user queue buffers
Find user queue rptr, ring buf, eop buffer and cwsr area BOs, and check BOs are mapped on the GPU with correct size and take the BO reference. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 +++ drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 38 -- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index aba9bcd91f65..80d8080c5764 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -524,6 +524,10 @@ struct queue_properties { uint64_t exception_status; struct amdgpu_bo *wptr_bo; + struct amdgpu_bo *rptr_bo; + struct amdgpu_bo *ring_bo; + struct amdgpu_bo *eop_buf_bo; + struct amdgpu_bo *cwsr_bo; }; #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 && \ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index b4529ec298a9..0e661160c295 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -97,7 +97,8 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_ if (!mapping) goto out_err; - if (user_addr != mapping->start || user_addr + size - 1 != mapping->last) { + if (user_addr != mapping->start || + (size != 0 && user_addr + size - 1 != mapping->last)) { pr_debug("expected size 0x%llx not equal to mapping addr 0x%llx size 0x%llx\n", expected_size, mapping->start << AMDGPU_GPU_PAGE_SHIFT, (mapping->last - mapping->start + 1) << AMDGPU_GPU_PAGE_SHIFT); @@ -124,18 +125,51 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope err = kfd_queue_buffer_get(vm, properties->write_ptr, &properties->wptr_bo, PAGE_SIZE); if (err) + goto out_err_unreserve; + + err = kfd_queue_buffer_get(vm, properties->read_ptr, &properties->rptr_bo, PAGE_SIZE); + if (err) + goto out_err_unreserve; + + err = kfd_queue_buffer_get(vm, (void *)properties->queue_address, + &properties->ring_bo, properties->queue_size); + if (err) + goto out_err_unreserve; + + /* only compute queue requires EOP buffer and CWSR area */ + if (properties->type != KFD_QUEUE_TYPE_COMPUTE) goto out_unreserve; + /* EOP buffer is not required for all ASICs */ + if (properties->eop_ring_buffer_address) { + err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address, + &properties->eop_buf_bo, + properties->eop_ring_buffer_size); + if (err) + goto out_err_unreserve; + } + + err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, + &properties->cwsr_bo, 0); + if (err) + goto out_err_unreserve; + +out_unreserve: amdgpu_bo_unreserve(vm->root.bo); return 0; -out_unreserve: +out_err_unreserve: amdgpu_bo_unreserve(vm->root.bo); + kfd_queue_release_buffers(pdd, properties); return err; } int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) { amdgpu_bo_unref(&properties->wptr_bo); + amdgpu_bo_unref(&properties->rptr_bo); + amdgpu_bo_unref(&properties->ring_bo); + amdgpu_bo_unref(&properties->eop_buf_bo); + amdgpu_bo_unref(&properties->cwsr_bo); return 0; } -- 2.43.2
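A user-space sketch of the per-buffer check, assuming a static mapping table in place of the GPU VM: the buffer must start exactly at the mapped range and, when a size is given, span exactly that range (size 0 skips the size check, as this patch allows for the CWSR area). The names and addresses below are made up.

#include <stdio.h>
#include <stdbool.h>

struct mapping { unsigned long start, last; };  /* inclusive range */

static const struct mapping maps[] = {
        { 0x1000, 0x1fff },     /* 4 KiB rptr/wptr style buffer */
        { 0x8000, 0xffff },     /* 32 KiB ring buffer */
};

static bool buffer_mapped(unsigned long addr, unsigned long size)
{
        for (unsigned int i = 0; i < sizeof(maps) / sizeof(maps[0]); i++) {
                if (addr < maps[i].start || addr > maps[i].last)
                        continue;
                return addr == maps[i].start &&
                       (size == 0 || addr + size - 1 == maps[i].last);
        }
        return false;
}

int main(void)
{
        printf("ring ok:   %d\n", buffer_mapped(0x8000, 0x8000));   /* 1 */
        printf("too small: %d\n", buffer_mapped(0x8000, 0x4000));   /* 0 */
        return 0;
}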
[PATCH v2 5/9] drm/amdkfd: Ensure user queue buffers residency
Add atomic queue_refcount to struct bo_va, return -EBUSY to fail unmap BO from the GPU if the bo_va queue_refcount is not zero. Create queue to increase the bo_va queue_refcount, destroy queue to decrease the bo_va queue_refcount, to ensure the queue buffers mapped on the GPU when queue is active. Signed-off-by: Philip Yang --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h| 6 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 3 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 34 --- 5 files changed, 49 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ab37e7aec26..6d5fd371d5ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1252,7 +1252,7 @@ static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, return ret; } -static void unmap_bo_from_gpuvm(struct kgd_mem *mem, +static int unmap_bo_from_gpuvm(struct kgd_mem *mem, struct kfd_mem_attachment *entry, struct amdgpu_sync *sync) { @@ -1260,11 +1260,18 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem, struct amdgpu_device *adev = entry->adev; struct amdgpu_vm *vm = bo_va->base.vm; + if (bo_va->queue_refcount) { + pr_debug("bo_va->queue_refcount %d\n", bo_va->queue_refcount); + return -EBUSY; + } + amdgpu_vm_bo_unmap(adev, bo_va, entry->va); amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); amdgpu_sync_fence(sync, bo_va->last_pt_update); + + return 0; } static int update_gpuvm_pte(struct kgd_mem *mem, @@ -2191,7 +2198,10 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n", entry->va, entry->va + bo_size, entry); - unmap_bo_from_gpuvm(mem, entry, ctx.sync); + ret = unmap_bo_from_gpuvm(mem, entry, ctx.sync); + if (ret) + goto unreserve_out; + entry->is_mapped = false; mem->mapped_to_gpu_memory--; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index bc42ccbde659..d7e27957013f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -90,6 +90,12 @@ struct amdgpu_bo_va { boolcleared; boolis_xgmi; + + /* +* protected by vm reservation lock +* if non-zero, cannot unmap from GPU because user queues may still access it +*/ + unsigned intqueue_refcount; }; struct amdgpu_bo { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 202f24ee4bd7..65a37ac5a0f0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1384,8 +1384,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv); if (err) { - pr_err("Failed to unmap from gpu %d/%d\n", - i, args->n_devices); + pr_debug("Failed to unmap from gpu %d/%d\n", i, args->n_devices); goto unmap_memory_from_gpu_failed; } args->n_success = i+1; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 80d8080c5764..c31589043d5b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1292,6 +1292,7 @@ void print_queue_properties(struct queue_properties *q); void print_queue(struct queue *q); int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, u64 
expected_size); +void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo); int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 0e661160c295..3fd386dcb011 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -106,6 +106,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct
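A minimal model of the queue_refcount handshake: queue creation takes a count on the mapping, and an unmap attempt fails with -EBUSY while the count is non-zero. Illustrative user-space C, not the kernel API.

#include <stdio.h>
#include <errno.h>

static unsigned int queue_refcount;     /* per-mapping counter in the patch */

static int unmap_from_gpu(void)
{
        if (queue_refcount)
                return -EBUSY;          /* an active queue still uses this buffer */
        /* ... actually tear the mapping down here ... */
        return 0;
}

int main(void)
{
        queue_refcount++;               /* queue created on this buffer */
        printf("unmap while queued: %d\n", unmap_from_gpu());  /* -EBUSY */
        queue_refcount--;               /* queue destroyed */
        printf("unmap after destroy: %d\n", unmap_from_gpu()); /* 0 */
        return 0;
}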
[PATCH v2 9/9] drm/amdkfd: Validate queue cwsr area and eop buffer size
When creating KFD user compute queue, check if queue eop buffer size, cwsr area size, ctl stack size equal to the size of KFD node properities. Check the entire cwsr area which may split into multiple svm ranges aligned to gramularity boundary. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 46 +++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index adcda9730c9f..9807e8adf77d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -225,9 +225,15 @@ void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo) int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) { + struct kfd_topology_device *topo_dev; struct amdgpu_vm *vm; + u32 total_cwsr_size; int err; + topo_dev = kfd_topology_device_by_id(pdd->dev->id); + if (!topo_dev) + return -EINVAL; + vm = drm_priv_to_vm(pdd->drm_priv); err = amdgpu_bo_reserve(vm->root.bo, false); if (err) @@ -252,6 +258,12 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope /* EOP buffer is not required for all ASICs */ if (properties->eop_ring_buffer_address) { + if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) { + pr_debug("queue eop bo size 0x%lx not equal to node eop buf size 0x%x\n", + properties->eop_buf_bo->tbo.base.size, + topo_dev->node_props.eop_buffer_size); + goto out_err_unreserve; + } err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address, &properties->eop_buf_bo, properties->eop_ring_buffer_size); @@ -259,15 +271,33 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope goto out_err_unreserve; } + if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) { + pr_debug("queue ctl stack size 0x%x not equal to node ctl stack size 0x%x\n", + properties->ctl_stack_size, + topo_dev->node_props.ctl_stack_size); + goto out_err_unreserve; + } + + if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) { + pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n", + properties->ctx_save_restore_area_size, + topo_dev->node_props.cwsr_size); + goto out_err_unreserve; + } + + total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) + * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); + err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, - &properties->cwsr_bo, 0); + &properties->cwsr_bo, total_cwsr_size); if (!err) goto out_unreserve; amdgpu_bo_unreserve(vm->root.bo); err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address, - properties->ctx_save_restore_area_size); + total_cwsr_size); if (err) goto out_err_release; @@ -286,7 +316,9 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) { + struct kfd_topology_device *topo_dev; struct amdgpu_vm *vm; + u32 total_cwsr_size; int err; vm = drm_priv_to_vm(pdd->drm_priv); @@ -302,8 +334,14 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope amdgpu_bo_unreserve(vm->root.bo); - kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, -properties->ctx_save_restore_area_size); + topo_dev = kfd_topology_device_by_id(pdd->dev->id); + 
if (!topo_dev) + return -EINVAL; + total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) + * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); + + kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size); return 0; } -- 2.43.2
[PATCH v2 6/9] drm/amdkfd: Validate user queue svm memory residency
Queue CWSR area maybe registered to GPU as svm memory, create queue to ensure svm mapped to GPU with KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED flag. Add queue_refcount to struct svm_range, to track queue CWSR area usage. Because unmap mmu notifier callback return value is ignored, if application unmap the CWSR area while queue is active, pr_warn message in dmesg log. To be safe, evict user queue. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 110 - drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 12 +++ drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 + 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 3fd386dcb011..67242ce051b5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -24,6 +24,7 @@ #include #include "kfd_priv.h" +#include "kfd_svm.h" void print_queue_properties(struct queue_properties *q) { @@ -83,6 +84,100 @@ void uninit_queue(struct queue *q) kfree(q); } +static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + struct kfd_process *p = pdd->process; + struct list_head update_list; + struct svm_range *prange; + int ret = -EINVAL; + + INIT_LIST_HEAD(&update_list); + addr >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + + mutex_lock(&p->svms.lock); + + /* +* range may split to multiple svm pranges aligned to granularity boundaery. +*/ + while (size) { + uint32_t gpuid, gpuidx; + int r; + + prange = svm_range_from_addr(&p->svms, addr, NULL); + if (!prange) + break; + + if (!prange->mapped_to_gpu) + break; + + r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx); + if (r < 0) + break; + if (!test_bit(gpuidx, prange->bitmap_access) && + !test_bit(gpuidx, prange->bitmap_aip)) + break; + + if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) + break; + + list_add(&prange->update_list, &update_list); + + if (prange->last - prange->start + 1 >= size) { + size = 0; + break; + } + + size -= prange->last - prange->start + 1; + addr += prange->last - prange->start + 1; + } + if (size) { + pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size - 1); + goto out_unlock; + } + + list_for_each_entry(prange, &update_list, update_list) + atomic_inc(&prange->queue_refcount); + ret = 0; + +out_unlock: + mutex_unlock(&p->svms.lock); + return ret; +} + +static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + struct kfd_process *p = pdd->process; + struct svm_range *prange, *pchild; + struct interval_tree_node *node; + unsigned long last; + + addr >>= PAGE_SHIFT; + last = addr + (size >> PAGE_SHIFT) - 1; + + mutex_lock(&p->svms.lock); + + node = interval_tree_iter_first(&p->svms.objects, addr, last); + while (node) { + struct interval_tree_node *next_node; + unsigned long next_start; + + prange = container_of(node, struct svm_range, it_node); + next_node = interval_tree_iter_next(node, addr, last); + next_start = min(node->last, last) + 1; + + if (atomic_add_unless(&prange->queue_refcount, -1, 0)) { + list_for_each_entry(pchild, &prange->child_list, child_list) + atomic_add_unless(&pchild->queue_refcount, -1, 0); + } + + node = next_node; + addr = next_start; + } + + mutex_unlock(&p->svms.lock); +} + int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, u64 expected_size) { @@ -165,8 +260,17 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope err = 
kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, &properties->cwsr_bo, 0); + if (!err) + goto out_unreserve; + + amdgpu_bo_unreserve(vm->root.bo); + + err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address, +
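For readers unfamiliar with the svms interval handling, the walk in kfd_queue_buffer_svm_get() can be pictured with a stand-alone model: the CWSR area may be covered by several ranges, every covering range must be GPU-mapped and flagged always-mapped, and each covering range gets pinned with a queue reference. The structures and the single-pass pinning below are simplifications for illustration (the kernel code collects the ranges first and only bumps the refcounts once the whole area has been verified); none of this is the driver's code.

#include <stdio.h>
#include <stdbool.h>

struct range {
	unsigned long start, last;	/* inclusive, in pages */
	bool mapped_to_gpu;
	bool always_mapped;
	int queue_refcount;
};

static struct range *find_range(struct range *r, int n, unsigned long addr)
{
	for (int i = 0; i < n; i++)
		if (addr >= r[i].start && addr <= r[i].last)
			return &r[i];
	return NULL;
}

/* Returns 0 if [addr, addr + size) is fully covered by valid ranges. */
static int pin_cwsr_area(struct range *r, int n, unsigned long addr, unsigned long size)
{
	unsigned long left = size;

	while (left) {
		struct range *pr = find_range(r, n, addr);

		if (!pr || !pr->mapped_to_gpu || !pr->always_mapped)
			return -1;		/* reject queue creation */

		pr->queue_refcount++;		/* pin this piece of the area */

		unsigned long covered = pr->last - addr + 1;
		if (covered >= left)
			return 0;
		left -= covered;
		addr += covered;		/* continue with the next range */
	}
	return 0;
}

int main(void)
{
	/* one 4-page CWSR area split into two granularity-aligned ranges */
	struct range r[] = {
		{ .start = 0x100, .last = 0x101, .mapped_to_gpu = true, .always_mapped = true },
		{ .start = 0x102, .last = 0x103, .mapped_to_gpu = true, .always_mapped = true },
	};

	printf("pin: %d\n", pin_cwsr_area(r, 2, 0x100, 4));	/* 0: fully covered */
	printf("pin: %d\n", pin_cwsr_area(r, 2, 0x100, 8));	/* -1: tail not registered */
	return 0;
}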
[PATCH v2 7/9] drm/amdkfd: Validate user queue update
Ensure update queue new ring buffer is mapped on GPU with correct size. Decrease queue old ring_bo queue_refcount and increase new ring_bo queue_refcount. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- .../amd/amdkfd/kfd_process_queue_manager.c| 32 ++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 4947f28b3afb..9995dbb43359 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -549,11 +549,41 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm, struct process_queue_node *pqn; pqn = get_queue_by_qid(pqm, qid); - if (!pqn) { + if (!pqn || !pqn->q) { pr_debug("No queue %d exists for update operation\n", qid); return -EFAULT; } + /* +* Update with NULL ring address is used to disable the queue +*/ + if (p->queue_address && p->queue_size) { + struct kfd_process_device *pdd; + struct amdgpu_vm *vm; + struct queue *q = pqn->q; + int err; + + pdd = kfd_get_process_device_data(q->device, q->process); + if (!pdd) + return -ENODEV; + vm = drm_priv_to_vm(pdd->drm_priv); + err = amdgpu_bo_reserve(vm->root.bo, false); + if (err) + return err; + + if (kfd_queue_buffer_get(vm, (void *)p->queue_address, &p->ring_bo, +p->queue_size)) { + pr_debug("ring buf 0x%llx size 0x%llx not mapped on GPU\n", +p->queue_address, p->queue_size); + return -EFAULT; + } + + kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo); + amdgpu_bo_unreserve(vm->root.bo); + + pqn->q->properties.ring_bo = p->ring_bo; + } + pqn->q->properties.queue_address = p->queue_address; pqn->q->properties.queue_size = p->queue_size; pqn->q->properties.queue_percent = p->queue_percent; -- 2.43.2
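The essential ordering in the hunk above is "reference the new ring BO first, drop the old reference only after the new one is known to be valid". A small stand-alone refcount model of that handoff follows; the types are toys, not struct amdgpu_bo or the KFD helpers.

#include <stdio.h>
#include <stdlib.h>

struct bo { int refcount; };

static struct bo *bo_get(struct bo *b)
{
	if (b)
		b->refcount++;
	return b;
}

static void bo_put(struct bo **b)
{
	if (*b && --(*b)->refcount == 0)
		free(*b);
	*b = NULL;			/* always clear the caller's pointer */
}

struct queue_props { struct bo *ring_bo; };

/* take a reference on the new ring BO first, only then drop the old one */
static int update_ring(struct queue_props *q, struct bo *new_ring)
{
	struct bo *old;

	if (!new_ring)
		return -1;		/* validation failed: keep the old BO */

	old = q->ring_bo;
	q->ring_bo = bo_get(new_ring);
	bo_put(&old);
	return 0;
}

int main(void)
{
	struct bo *old_ring = bo_get(calloc(1, sizeof(struct bo)));
	struct bo *new_ring = bo_get(calloc(1, sizeof(struct bo)));
	struct queue_props q = { .ring_bo = old_ring };	/* queue owns this ref */

	update_ring(&q, new_ring);	/* old_ring freed, queue now pins new_ring */
	printf("new ring refcount: %d\n", q.ring_bo->refcount);	/* 2 */

	bo_put(&new_ring);		/* drop main's reference */
	bo_put(&q.ring_bo);		/* queue destroyed */
	return 0;
}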
[PATCH v2 1/9] drm/amdkfd: kfd_bo_mapped_dev support partition
Change amdgpu_amdkfd_bo_mapped_to_dev to use drm_priv as parameter instead of adev, to support spatial partition. This is only used by CRIU checkpoint restore now. No functional change. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 +++-- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index e7bb1ca35801..66b1c72c81e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -345,7 +345,7 @@ void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *ad pasid_notify pasid_fn, void *data, uint32_t reset); bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); -bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); +bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p); bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 11672bfe4fad..199e387d35f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -3200,12 +3200,13 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, return 0; } -bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem) +bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem) { + struct amdgpu_vm *vm = drm_priv_to_vm(drm_priv); struct kfd_mem_attachment *entry; list_for_each_entry(entry, &mem->attachments, list) { - if (entry->is_mapped && entry->adev == adev) + if (entry->is_mapped && entry->bo_va->base.vm == vm) return true; } return false; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 32e5db509560..1d9b21628be7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1963,7 +1963,7 @@ static int criu_checkpoint_bos(struct kfd_process *p, bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo); for (i = 0; i < p->n_pdds; i++) { - if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem)) + if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->drm_priv, kgd_mem)) bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id; } -- 2.43.2
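The reason the adev comparison is no longer sufficient: with spatial partitioning, several KFD nodes share one amdgpu_device, while each process/partition pairing has its own VM behind drm_priv. A toy model of the difference (hypothetical structures, only for illustration):

#include <stdio.h>
#include <stdbool.h>

struct adev { int id; };	/* physical device */
struct vm   { int id; };	/* per-process, per-partition VM */

struct attachment {
	bool is_mapped;
	struct adev *adev;
	struct vm   *vm;
};

static bool mapped_to_dev_old(struct attachment *a, int n, struct adev *dev)
{
	for (int i = 0; i < n; i++)
		if (a[i].is_mapped && a[i].adev == dev)
			return true;
	return false;
}

static bool mapped_to_dev_new(struct attachment *a, int n, struct vm *vm)
{
	for (int i = 0; i < n; i++)
		if (a[i].is_mapped && a[i].vm == vm)
			return true;
	return false;
}

int main(void)
{
	struct adev gpu = { 0 };
	struct vm vm_part0 = { 0 }, vm_part1 = { 1 };
	/* the BO is mapped only on partition 0 */
	struct attachment att[] = { { true, &gpu, &vm_part0 } };

	/* the old check cannot tell the two partitions apart */
	printf("old: part0=%d part1=%d\n",
	       mapped_to_dev_old(att, 1, &gpu), mapped_to_dev_old(att, 1, &gpu));
	/* the new check is per-partition */
	printf("new: part0=%d part1=%d\n",
	       mapped_to_dev_new(att, 1, &vm_part0), mapped_to_dev_new(att, 1, &vm_part1));
	return 0;
}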
[PATCH v2 0/9] KFD user queue validation
This patch series does additional queue buffer validation in the queue creation IOCTLs and fails queue creation if the buffers are not mapped on the GPU with the expected size. It ensures queue buffer residency by tracking the GPUVM virtual addresses of queue buffers, returning an error if the user tries to free and unmap them while the queue is active, or evicting the queue if SVM memory is unmapped and freed from the CPU. Patches 1-2 are preparation work and general fixes. v2: - patch 3/9, keep wptr_bo_gart in struct queue Philip Yang (9): drm/amdkfd: kfd_bo_mapped_dev support partition drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer drm/amdkfd: Refactor queue wptr_bo GART mapping drm/amdkfd: Validate user queue buffers drm/amdkfd: Ensure user queue buffers residency drm/amdkfd: Validate user queue svm memory residency drm/amdkfd: Validate user queue update drm/amdkfd: Store queue cwsr area size to node properties drm/amdkfd: Validate queue cwsr area and eop buffer size drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 14 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 6 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 24 +- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h| 6 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 61 +--- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 +- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 8 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 19 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +- .../amd/amdkfd/kfd_process_queue_manager.c| 79 +++- drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 336 ++ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 12 + drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 4 + 16 files changed, 489 insertions(+), 91 deletions(-) -- 2.43.2
Re: [PATCH 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping
On 2024-07-17 16:16, Felix Kuehling wrote: Sorry, I see that this patch still doesn't propagate errors returned from kfd_queue_releasre_buffers correctly. And the later patches in the series don't seem to fix it either. See inline. kfd_queue_release_buffers return value is handled in queue destroy path, to return -ERESTARTSYS if fail to hold vm lock to release buffers because signal is received. See inline. On 2024-07-15 08:34, Philip Yang wrote: Add helper function kfd_queue_acquire_buffers to get queue wptr_bo reference from queue write_ptr if it is mapped to the KFD node with expected size. Move wptr_bo to structure queue_properties from struct queue as queue is allocated after queue buffers are validated, then we can remove wptr_bo parameter from pqm_create_queue. Because amdgpu_bo_unref clear the pointer, queue_properties wptr_bo is used to acquire and release wptr_bo for validation, add wptr_bo_gart to queue_propertes, to hold wptr_bo reference for GART mapping and umapping. Move MES wptr_bo GART mapping to init_user_queue, the same location with queue ctx_bo GART mapping. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 +- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 56 +++--- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 6 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++-- .../amd/amdkfd/kfd_process_queue_manager.c | 45 +++ drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 57 +++ 7 files changed, 116 insertions(+), 69 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 6e591280774b..4ed49265c764 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -322,7 +322,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, void **kptr, uint64_t *size); void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); -int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo); +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart); int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, struct dma_fence __rcu **ef); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 199e387d35f4..0ab37e7aec26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2226,11 +2226,12 @@ int amdgpu_amdkfd_gpuvm_sync_memory( /** * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count * @bo: Buffer object to be mapped + * @bo_gart: Return bo reference * * Before return, bo reference count is incremented. To release the reference and unpin/ * unmap the BO, call amdgpu_amdkfd_free_gtt_mem. */ -int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo) +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart) { int ret; @@ -2257,7 +2258,7 @@ int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo) amdgpu_bo_unreserve(bo); - bo = amdgpu_bo_ref(bo); + *bo_gart = amdgpu_bo_ref(bo); return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 823f245dc7d0..202f24ee4bd7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -247,8 +247,8 @@ static int set_queue_properties_from_u
Re: [PATCH 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping
On 2024-07-17 16:10, Felix Kuehling wrote: @@ -603,8 +606,6 @@ struct queue { void *gang_ctx_bo; uint64_t gang_ctx_gpu_addr; void *gang_ctx_cpu_ptr; - - struct amdgpu_bo *wptr_bo; If the wptr_bo_gart is GART-mapped and freed in the same place as the gang_ctx_bo, then maybe it makes sense to keep the two together in this structure. It also avoids having two different references to the same BO in the same queue_properties structure above. Yes, I agree it makes sense to keep it inside struct queue and rename it to wptr_bo_gart for GART mapping and unmapping. Add wptr_bo to struct queue_properties for queue wptr acquire and release. Regards, Philip Other than that, this patch looks good to me. Regards, Felix }; enum KFD_MQD_TYPE {
[PATCH 9/9] drm/amdkfd: Validate queue cwsr area and eop buffer size
When creating a KFD user compute queue, check that the queue EOP buffer size, CWSR area size, and ctl stack size are equal to the sizes in the KFD node properties. Check the entire CWSR area, which may be split into multiple svm ranges aligned to the granularity boundary. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 46 +++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index adcda9730c9f..9807e8adf77d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -225,9 +225,15 @@ void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo) int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) { + struct kfd_topology_device *topo_dev; struct amdgpu_vm *vm; + u32 total_cwsr_size; int err; + topo_dev = kfd_topology_device_by_id(pdd->dev->id); + if (!topo_dev) + return -EINVAL; + vm = drm_priv_to_vm(pdd->drm_priv); err = amdgpu_bo_reserve(vm->root.bo, false); if (err) @@ -252,6 +258,12 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope /* EOP buffer is not required for all ASICs */ if (properties->eop_ring_buffer_address) { + if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) { + pr_debug("queue eop bo size 0x%lx not equal to node eop buf size 0x%x\n", + properties->eop_buf_bo->tbo.base.size, + topo_dev->node_props.eop_buffer_size); + goto out_err_unreserve; + } err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address, &properties->eop_buf_bo, properties->eop_ring_buffer_size); @@ -259,15 +271,33 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope goto out_err_unreserve; } + if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) { + pr_debug("queue ctl stack size 0x%x not equal to node ctl stack size 0x%x\n", + properties->ctl_stack_size, + topo_dev->node_props.ctl_stack_size); + goto out_err_unreserve; + } + + if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) { + pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n", + properties->ctx_save_restore_area_size, + topo_dev->node_props.cwsr_size); + goto out_err_unreserve; + } + + total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) + * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); + err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, - &properties->cwsr_bo, 0); + &properties->cwsr_bo, total_cwsr_size); if (!err) goto out_unreserve; amdgpu_bo_unreserve(vm->root.bo); err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address, - properties->ctx_save_restore_area_size); + total_cwsr_size); if (err) goto out_err_release; @@ -286,7 +316,9 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) { + struct kfd_topology_device *topo_dev; struct amdgpu_vm *vm; + u32 total_cwsr_size; int err; vm = drm_priv_to_vm(pdd->drm_priv); @@ -302,8 +334,14 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope amdgpu_bo_unreserve(vm->root.bo); - kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, -properties->ctx_save_restore_area_size); + topo_dev = kfd_topology_device_by_id(pdd->dev->id); +
if (!topo_dev) + return -EINVAL; + total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) + * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); + + kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size); return 0; } -- 2.43.2
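The size that the CWSR area must cover is derived from the per-XCC node properties as total_cwsr_size = ALIGN((cwsr_size + debug_memory_size) * NUM_XCC, PAGE_SIZE). A worked example with made-up numbers (the cwsr_size, debug_memory_size and XCC count below are purely illustrative, not values from any real node):

#include <stdio.h>

#define PAGE_SIZE 0x1000UL
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long cwsr_size = 0x1e000;		/* per XCC, hypothetical */
	unsigned long debug_memory_size = 0x1000;	/* per XCC, hypothetical */
	unsigned long num_xcc = 4;			/* hypothetical partition */

	/* (0x1e000 + 0x1000) * 4 = 0x7c000, already page aligned */
	unsigned long total = ALIGN((cwsr_size + debug_memory_size) * num_xcc, PAGE_SIZE);

	printf("total_cwsr_size = 0x%lx\n", total);	/* 0x7c000 */
	return 0;
}

This is the size the GPUVM mapping (or the SVM registration) of the user's ctx_save_restore_area must cover for queue creation to succeed, and the same value is used again on release.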
[PATCH 4/9] drm/amdkfd: Validate user queue buffers
Find user queue rptr, ring buf, eop buffer and cwsr area BOs, and check BOs are mapped on the GPU with correct size and take the BO reference. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 +++ drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 38 -- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index c98ff548313c..d0dca20849d9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -525,6 +525,10 @@ struct queue_properties { struct amdgpu_bo *wptr_bo_gart; struct amdgpu_bo *wptr_bo; + struct amdgpu_bo *rptr_bo; + struct amdgpu_bo *ring_bo; + struct amdgpu_bo *eop_buf_bo; + struct amdgpu_bo *cwsr_bo; }; #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 && \ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index b4529ec298a9..0e661160c295 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -97,7 +97,8 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_ if (!mapping) goto out_err; - if (user_addr != mapping->start || user_addr + size - 1 != mapping->last) { + if (user_addr != mapping->start || + (size != 0 && user_addr + size - 1 != mapping->last)) { pr_debug("expected size 0x%llx not equal to mapping addr 0x%llx size 0x%llx\n", expected_size, mapping->start << AMDGPU_GPU_PAGE_SHIFT, (mapping->last - mapping->start + 1) << AMDGPU_GPU_PAGE_SHIFT); @@ -124,18 +125,51 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope err = kfd_queue_buffer_get(vm, properties->write_ptr, &properties->wptr_bo, PAGE_SIZE); if (err) + goto out_err_unreserve; + + err = kfd_queue_buffer_get(vm, properties->read_ptr, &properties->rptr_bo, PAGE_SIZE); + if (err) + goto out_err_unreserve; + + err = kfd_queue_buffer_get(vm, (void *)properties->queue_address, + &properties->ring_bo, properties->queue_size); + if (err) + goto out_err_unreserve; + + /* only compute queue requires EOP buffer and CWSR area */ + if (properties->type != KFD_QUEUE_TYPE_COMPUTE) goto out_unreserve; + /* EOP buffer is not required for all ASICs */ + if (properties->eop_ring_buffer_address) { + err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address, + &properties->eop_buf_bo, + properties->eop_ring_buffer_size); + if (err) + goto out_err_unreserve; + } + + err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, + &properties->cwsr_bo, 0); + if (err) + goto out_err_unreserve; + +out_unreserve: amdgpu_bo_unreserve(vm->root.bo); return 0; -out_unreserve: +out_err_unreserve: amdgpu_bo_unreserve(vm->root.bo); + kfd_queue_release_buffers(pdd, properties); return err; } int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) { amdgpu_bo_unref(&properties->wptr_bo); + amdgpu_bo_unref(&properties->rptr_bo); + amdgpu_bo_unref(&properties->ring_bo); + amdgpu_bo_unref(&properties->eop_buf_bo); + amdgpu_bo_unref(&properties->cwsr_bo); return 0; } -- 2.43.2
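The validation in kfd_queue_buffer_get() boils down to "the user VA must be exactly the start of a GPUVM mapping, and when a size is expected the mapping must end exactly there" (size 0 means only the start is checked). A simplified stand-alone model of that check; the page handling and the numbers are illustrative only, not the driver's code:

#include <stdio.h>
#include <stdbool.h>

#define GPU_PAGE_SHIFT 12

struct mapping { unsigned long start, last; };	/* inclusive, in GPU pages */

static bool buffer_ok(struct mapping *m, unsigned long user_va, unsigned long size)
{
	unsigned long first = user_va >> GPU_PAGE_SHIFT;
	unsigned long last  = (user_va + size - 1) >> GPU_PAGE_SHIFT;

	if (first != m->start)
		return false;			/* must start at the mapping */
	if (size && last != m->last)
		return false;			/* must cover it exactly */
	return true;				/* size == 0: only the start matters */
}

int main(void)
{
	struct mapping ring = { .start = 0x1000, .last = 0x100f };	/* 16 pages */

	/* exact match passes, a half-sized ring buffer is rejected */
	printf("%d\n", buffer_ok(&ring, 0x1000UL << GPU_PAGE_SHIFT, 16UL << GPU_PAGE_SHIFT)); /* 1 */
	printf("%d\n", buffer_ok(&ring, 0x1000UL << GPU_PAGE_SHIFT,  8UL << GPU_PAGE_SHIFT)); /* 0 */
	return 0;
}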
[PATCH 5/9] drm/amdkfd: Ensure user queue buffers residency
Add atomic queue_refcount to struct bo_va, return -EBUSY to fail unmap BO from the GPU if the bo_va queue_refcount is not zero. Create queue to increase the bo_va queue_refcount, destroy queue to decrease the bo_va queue_refcount, to ensure the queue buffers mapped on the GPU when queue is active. Signed-off-by: Philip Yang --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h| 6 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 3 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 34 --- 5 files changed, 49 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ab37e7aec26..6d5fd371d5ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1252,7 +1252,7 @@ static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, return ret; } -static void unmap_bo_from_gpuvm(struct kgd_mem *mem, +static int unmap_bo_from_gpuvm(struct kgd_mem *mem, struct kfd_mem_attachment *entry, struct amdgpu_sync *sync) { @@ -1260,11 +1260,18 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem, struct amdgpu_device *adev = entry->adev; struct amdgpu_vm *vm = bo_va->base.vm; + if (bo_va->queue_refcount) { + pr_debug("bo_va->queue_refcount %d\n", bo_va->queue_refcount); + return -EBUSY; + } + amdgpu_vm_bo_unmap(adev, bo_va, entry->va); amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); amdgpu_sync_fence(sync, bo_va->last_pt_update); + + return 0; } static int update_gpuvm_pte(struct kgd_mem *mem, @@ -2191,7 +2198,10 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n", entry->va, entry->va + bo_size, entry); - unmap_bo_from_gpuvm(mem, entry, ctx.sync); + ret = unmap_bo_from_gpuvm(mem, entry, ctx.sync); + if (ret) + goto unreserve_out; + entry->is_mapped = false; mem->mapped_to_gpu_memory--; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index bc42ccbde659..d7e27957013f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -90,6 +90,12 @@ struct amdgpu_bo_va { boolcleared; boolis_xgmi; + + /* +* protected by vm reservation lock +* if non-zero, cannot unmap from GPU because user queues may still access it +*/ + unsigned intqueue_refcount; }; struct amdgpu_bo { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 202f24ee4bd7..65a37ac5a0f0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1384,8 +1384,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv); if (err) { - pr_err("Failed to unmap from gpu %d/%d\n", - i, args->n_devices); + pr_debug("Failed to unmap from gpu %d/%d\n", i, args->n_devices); goto unmap_memory_from_gpu_failed; } args->n_success = i+1; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index d0dca20849d9..95fbdb12beb1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1291,6 +1291,7 @@ void print_queue_properties(struct queue_properties *q); void print_queue(struct queue *q); int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, u64 
expected_size); +void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo); int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 0e661160c295..3fd386dcb011 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -106,6 +106,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct
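The new guard can be reduced to a few lines: as long as a queue holds a reference on the bo_va, unmapping is refused with -EBUSY, and the reference is only dropped when the queue is destroyed. A stand-alone sketch with simplified types (not the amdgpu structures):

#include <stdio.h>
#include <errno.h>

struct bo_va { unsigned int queue_refcount; };

static int unmap_bo(struct bo_va *bo_va)
{
	if (bo_va->queue_refcount)
		return -EBUSY;		/* queue buffers must stay mapped on the GPU */
	/* ... remove the GPUVM mapping here ... */
	return 0;
}

int main(void)
{
	struct bo_va ring = { 0 };

	ring.queue_refcount++;			/* queue created against this BO */
	printf("unmap while active: %d\n", unmap_bo(&ring));	/* -EBUSY */

	ring.queue_refcount--;			/* queue destroyed */
	printf("unmap after destroy: %d\n", unmap_bo(&ring));	/* 0 */
	return 0;
}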
[PATCH 8/9] drm/amdkfd: Store queue cwsr area size to node properties
Use the queue eop buffer size, cwsr area size, ctl stack size calculation from Thunk, store the value to KFD node properties. Those will be used to validate queue eop buffer size, cwsr area size, ctl stack size when creating KFD user compute queue. Those will be exposed to user space via sysfs KFD node properties, to remove the duplicate calculation code from Thunk. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 75 +++ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 4 ++ 4 files changed, 82 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 95fbdb12beb1..58f5bc021ea9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1294,6 +1294,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_ void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo); int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); +void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev); struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_node *dev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 67242ce051b5..adcda9730c9f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -24,6 +24,7 @@ #include #include "kfd_priv.h" +#include "kfd_topology.h" #include "kfd_svm.h" void print_queue_properties(struct queue_properties *q) @@ -305,3 +306,77 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope properties->ctx_save_restore_area_size); return 0; } + +#define SGPR_SIZE_PER_CU 0x4000 +#define LDS_SIZE_PER_CU0x1 +#define HWREG_SIZE_PER_CU 0x1000 +#define DEBUGGER_BYTES_ALIGN 64 +#define DEBUGGER_BYTES_PER_WAVE32 + +static u32 kfd_get_vgpr_size_per_cu(u32 gfxv) +{ + u32 vgpr_size = 0x4; + + if ((gfxv / 100 * 100) == 90400 || /* GFX_VERSION_AQUA_VANJARAM */ + gfxv == 90010 ||/* GFX_VERSION_ALDEBARAN */ + gfxv == 90008) /* GFX_VERSION_ARCTURUS */ + vgpr_size = 0x8; + else if (gfxv == 11 || /* GFX_VERSION_PLUM_BONITO */ +gfxv == 110001 || /* GFX_VERSION_WHEAT_NAS */ +gfxv == 12 || /* GFX_VERSION_GFX1200 */ +gfxv == 120001)/* GFX_VERSION_GFX1201 */ + vgpr_size = 0x6; + + return vgpr_size; +} + +#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv) \ + (kfd_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU +\ +LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU) + +#define CNTL_STACK_BYTES_PER_WAVE(gfxv)\ + ((gfxv) >= 100100 ? 
12 : 8) /* GFX_VERSION_NAVI10*/ + +#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40 + +void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev) +{ + struct kfd_node_properties *props = &dev->node_props; + u32 gfxv = props->gfx_target_version; + u32 ctl_stack_size; + u32 wg_data_size; + u32 wave_num; + u32 cu_num; + + if (gfxv < 80001) /* GFX_VERSION_CARRIZO */ + return; + + cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask); + wave_num = (gfxv < 100100) ?/* GFX_VERSION_NAVI10 */ + min(cu_num * 40, props->array_count / props->simd_arrays_per_engine * 512) + : cu_num * 32; + + wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv), PAGE_SIZE); + ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8; + ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size, + PAGE_SIZE); + + if ((gfxv / 1 * 1) == 10) { + /* HW design limits control stack size to 0x7000. +* This is insufficient for theoretical PM4 cases +* but sufficient for AQL, limited by SPI events. +*/ + ctl_stack_size = min(ctl_stack_size, 0x7000); + } + + props->ctl_stack_size = ctl_stack_size; + props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN); + props->cwsr_size = ctl_stack_size + wg_data_size; + + if (gfxv == 80002) /* GFX_VERSION_TONGA */ + props->eop_buffer_size = 0x
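To see how the pieces combine, here is a rough stand-alone rerun of the calculation with made-up numbers; the per-CU context data size, the waves-per-CU rule and the control-stack bytes per wave all depend on the GFX version, so the values below are placeholders only, not properties of any real device:

#include <stdio.h>

#define PAGE_SIZE 0x1000UL
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long cu_num = 64;		/* CUs per XCC, hypothetical */
	unsigned long wave_num = cu_num * 32;	/* post-gfx10 style rule, hypothetical */
	unsigned long wg_ctx_per_cu = 0x55000;	/* VGPR+SGPR+LDS+HWREG, hypothetical */
	unsigned long stack_per_wave = 12;	/* control stack bytes per wave, hypothetical */
	unsigned long header = 40;		/* user context save area header */

	unsigned long wg_data_size   = ALIGN(cu_num * wg_ctx_per_cu, PAGE_SIZE);
	unsigned long ctl_stack_size = ALIGN(header + wave_num * stack_per_wave + 8, PAGE_SIZE);
	unsigned long debug_size     = ALIGN(wave_num * 32, 64);

	printf("ctl_stack_size    = 0x%lx\n", ctl_stack_size);			/* 0x7000 */
	printf("cwsr_size         = 0x%lx\n", ctl_stack_size + wg_data_size);	/* 0x1547000 */
	printf("debug_memory_size = 0x%lx\n", debug_size);			/* 0x10000 */
	return 0;
}

These are the per-XCC values stored in the node properties; patch 9/9 then multiplies cwsr_size + debug_memory_size by the XCC count to size the user's CWSR area.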
[PATCH 7/9] drm/amdkfd: Validate user queue update
Ensure update queue new ring buffer is mapped on GPU with correct size. Decrease queue old ring_bo queue_refcount and increase new ring_bo queue_refcount. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- .../amd/amdkfd/kfd_process_queue_manager.c| 32 ++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 8552400d6d47..dda26a7e3c37 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -549,11 +549,41 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm, struct process_queue_node *pqn; pqn = get_queue_by_qid(pqm, qid); - if (!pqn) { + if (!pqn || !pqn->q) { pr_debug("No queue %d exists for update operation\n", qid); return -EFAULT; } + /* +* Update with NULL ring address is used to disable the queue +*/ + if (p->queue_address && p->queue_size) { + struct kfd_process_device *pdd; + struct amdgpu_vm *vm; + struct queue *q = pqn->q; + int err; + + pdd = kfd_get_process_device_data(q->device, q->process); + if (!pdd) + return -ENODEV; + vm = drm_priv_to_vm(pdd->drm_priv); + err = amdgpu_bo_reserve(vm->root.bo, false); + if (err) + return err; + + if (kfd_queue_buffer_get(vm, (void *)p->queue_address, &p->ring_bo, +p->queue_size)) { + pr_debug("ring buf 0x%llx size 0x%llx not mapped on GPU\n", +p->queue_address, p->queue_size); + return -EFAULT; + } + + kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo); + amdgpu_bo_unreserve(vm->root.bo); + + pqn->q->properties.ring_bo = p->ring_bo; + } + pqn->q->properties.queue_address = p->queue_address; pqn->q->properties.queue_size = p->queue_size; pqn->q->properties.queue_percent = p->queue_percent; -- 2.43.2
[PATCH 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping
Add helper function kfd_queue_acquire_buffers to get queue wptr_bo reference from queue write_ptr if it is mapped to the KFD node with expected size. Move wptr_bo to structure queue_properties from struct queue as queue is allocated after queue buffers are validated, then we can remove wptr_bo parameter from pqm_create_queue. Because amdgpu_bo_unref clear the pointer, queue_properties wptr_bo is used to acquire and release wptr_bo for validation, add wptr_bo_gart to queue_propertes, to hold wptr_bo reference for GART mapping and umapping. Move MES wptr_bo GART mapping to init_user_queue, the same location with queue ctx_bo GART mapping. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 +- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 56 +++--- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 6 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++-- .../amd/amdkfd/kfd_process_queue_manager.c| 45 +++ drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 57 +++ 7 files changed, 116 insertions(+), 69 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 6e591280774b..4ed49265c764 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -322,7 +322,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, void **kptr, uint64_t *size); void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); -int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo); +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart); int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, struct dma_fence __rcu **ef); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 199e387d35f4..0ab37e7aec26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2226,11 +2226,12 @@ int amdgpu_amdkfd_gpuvm_sync_memory( /** * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count * @bo: Buffer object to be mapped + * @bo_gart: Return bo reference * * Before return, bo reference count is incremented. To release the reference and unpin/ * unmap the BO, call amdgpu_amdkfd_free_gtt_mem. 
*/ -int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo) +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart) { int ret; @@ -2257,7 +2258,7 @@ int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo) amdgpu_bo_unreserve(bo); - bo = amdgpu_bo_ref(bo); + *bo_gart = amdgpu_bo_ref(bo); return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 823f245dc7d0..202f24ee4bd7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -247,8 +247,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->priority = args->queue_priority; q_properties->queue_address = args->ring_base_address; q_properties->queue_size = args->ring_size; - q_properties->read_ptr = (uint32_t *) args->read_pointer_address; - q_properties->write_ptr = (uint32_t *) args->write_pointer_address; + q_properties->read_ptr = (void __user *)args->read_pointer_address; + q_properties->write_ptr = (void __user *)args->write_pointer_address; q_properties->eop_ring_buffer_address = args->eop_buffer_address; q_properties->eop_ring_buffer_size = args->eop_buffer_size; q_properties->ctx_save_restore_area_address = @@ -306,7 +306,6 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, struct kfd_process_device *pdd; struct queue_properties q_properties; uint32_t doorbell_offset_in_process = 0; - struct amdgpu_bo *wptr_bo = NULL; memset(&q_properties, 0, sizeof(struct queue_properties)); @@ -342,53 +341,17 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, } } - /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work -* on unmapped queues for usermode queue oversubscription (no aggregated doorbell) -*/ - if (dev->kfd->shared_resources.enable_mes && - ((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) - >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) { - struct amdgpu_bo_va_mapping *wptr_mapping; -
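The subtle point behind the new bo_gart output parameter: in the old code the extra reference was assigned to the function's own parameter, so only a local copy of the pointer was updated and the caller had no pointer naming that reference. A toy illustration of the two styles (not the driver's refcounting):

#include <stdio.h>

struct bo { int refcount; };

static struct bo *bo_ref(struct bo *b)
{
	b->refcount++;
	return b;
}

/* old style: the taken reference is stranded in the local 'b' */
static void map_to_gart_old(struct bo *b)
{
	b = bo_ref(b);			/* caller cannot name this reference */
}

/* new style: hand the referenced pointer back to the caller explicitly */
static void map_to_gart_new(struct bo *b, struct bo **bo_gart)
{
	*bo_gart = bo_ref(b);		/* caller stores it and unrefs it later */
}

int main(void)
{
	struct bo wptr = { .refcount = 1 };
	struct bo *wptr_bo_gart = NULL;

	map_to_gart_old(&wptr);
	printf("old: refcount=%d, holder unknown\n", wptr.refcount);

	map_to_gart_new(&wptr, &wptr_bo_gart);
	printf("new: refcount=%d, holder=%p\n", wptr.refcount, (void *)wptr_bo_gart);
	return 0;
}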
[PATCH 1/9] drm/amdkfd: kfd_bo_mapped_dev support partition
Change amdgpu_amdkfd_bo_mapped_to_dev to use drm_priv as parameter instead of adev, to support spatial partition. This is only used by CRIU checkpoint restore now. No functional change. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 +++-- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index e7bb1ca35801..66b1c72c81e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -345,7 +345,7 @@ void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *ad pasid_notify pasid_fn, void *data, uint32_t reset); bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); -bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); +bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p); bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 11672bfe4fad..199e387d35f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -3200,12 +3200,13 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, return 0; } -bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem) +bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem) { + struct amdgpu_vm *vm = drm_priv_to_vm(drm_priv); struct kfd_mem_attachment *entry; list_for_each_entry(entry, &mem->attachments, list) { - if (entry->is_mapped && entry->adev == adev) + if (entry->is_mapped && entry->bo_va->base.vm == vm) return true; } return false; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 32e5db509560..1d9b21628be7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1963,7 +1963,7 @@ static int criu_checkpoint_bos(struct kfd_process *p, bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo); for (i = 0; i < p->n_pdds; i++) { - if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem)) + if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->drm_priv, kgd_mem)) bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id; } -- 2.43.2
[PATCH 6/9] drm/amdkfd: Validate user queue svm memory residency
Queue CWSR area maybe registered to GPU as svm memory, create queue to ensure svm mapped to GPU with KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED flag. Add queue_refcount to struct svm_range, to track queue CWSR area usage. Because unmap mmu notifier callback return value is ignored, if application unmap the CWSR area while queue is active, pr_warn message in dmesg log. To be safe, evict user queue. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 110 - drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 12 +++ drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 + 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 3fd386dcb011..67242ce051b5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -24,6 +24,7 @@ #include #include "kfd_priv.h" +#include "kfd_svm.h" void print_queue_properties(struct queue_properties *q) { @@ -83,6 +84,100 @@ void uninit_queue(struct queue *q) kfree(q); } +static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + struct kfd_process *p = pdd->process; + struct list_head update_list; + struct svm_range *prange; + int ret = -EINVAL; + + INIT_LIST_HEAD(&update_list); + addr >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + + mutex_lock(&p->svms.lock); + + /* +* range may split to multiple svm pranges aligned to granularity boundaery. +*/ + while (size) { + uint32_t gpuid, gpuidx; + int r; + + prange = svm_range_from_addr(&p->svms, addr, NULL); + if (!prange) + break; + + if (!prange->mapped_to_gpu) + break; + + r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx); + if (r < 0) + break; + if (!test_bit(gpuidx, prange->bitmap_access) && + !test_bit(gpuidx, prange->bitmap_aip)) + break; + + if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) + break; + + list_add(&prange->update_list, &update_list); + + if (prange->last - prange->start + 1 >= size) { + size = 0; + break; + } + + size -= prange->last - prange->start + 1; + addr += prange->last - prange->start + 1; + } + if (size) { + pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size - 1); + goto out_unlock; + } + + list_for_each_entry(prange, &update_list, update_list) + atomic_inc(&prange->queue_refcount); + ret = 0; + +out_unlock: + mutex_unlock(&p->svms.lock); + return ret; +} + +static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + struct kfd_process *p = pdd->process; + struct svm_range *prange, *pchild; + struct interval_tree_node *node; + unsigned long last; + + addr >>= PAGE_SHIFT; + last = addr + (size >> PAGE_SHIFT) - 1; + + mutex_lock(&p->svms.lock); + + node = interval_tree_iter_first(&p->svms.objects, addr, last); + while (node) { + struct interval_tree_node *next_node; + unsigned long next_start; + + prange = container_of(node, struct svm_range, it_node); + next_node = interval_tree_iter_next(node, addr, last); + next_start = min(node->last, last) + 1; + + if (atomic_add_unless(&prange->queue_refcount, -1, 0)) { + list_for_each_entry(pchild, &prange->child_list, child_list) + atomic_add_unless(&pchild->queue_refcount, -1, 0); + } + + node = next_node; + addr = next_start; + } + + mutex_unlock(&p->svms.lock); +} + int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, u64 expected_size) { @@ -165,8 +260,17 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope err = kfd_queue_buffer_get(vm, (void 
*)properties->ctx_save_restore_area_address, &properties->cwsr_bo, 0); + if (!err) + goto out_unreserve; + + amdgpu_bo_unreserve(vm->root.bo); + + err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address, + properties-&
[PATCH 0/9] KFD user queue validation
This patch series does additional queue buffer validation in the queue creation IOCTLs and fails queue creation if the buffers are not mapped on the GPU with the expected size. It ensures queue buffer residency by tracking the GPUVM virtual addresses of queue buffers, returning an error if the user tries to free and unmap them while the queue is active, or evicting the queue if SVM memory is unmapped and freed from the CPU. Patches 1-2 are preparation work and general fixes. Philip Yang (9): drm/amdkfd: kfd_bo_mapped_dev support partition drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer drm/amdkfd: Refactor queue wptr_bo GART mapping drm/amdkfd: Validate user queue buffers drm/amdkfd: Ensure user queue buffers residency drm/amdkfd: Validate user queue svm memory residency drm/amdkfd: Validate user queue update drm/amdkfd: Store queue cwsr area size to node properties drm/amdkfd: Validate queue cwsr area and eop buffer size drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 14 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 6 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 24 +- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h| 6 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 61 +--- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 +- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 8 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 20 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +- .../amd/amdkfd/kfd_process_queue_manager.c| 79 +++- drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 336 ++ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 12 + drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 4 + 16 files changed, 489 insertions(+), 92 deletions(-) -- 2.43.2
[PATCH 2/9] drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer
Pass pointer reference to amdgpu_bo_unref to clear the correct pointer, otherwise amdgpu_bo_unref clear the local variable, the original pointer not set to NULL, this could cause use-after-free bug. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 14 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_device.c| 4 ++-- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +- .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 03205e3c3746..c272461d70a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -364,15 +364,15 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size, return r; } -void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj) +void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj) { - struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj; + struct amdgpu_bo **bo = (struct amdgpu_bo **) mem_obj; - amdgpu_bo_reserve(bo, true); - amdgpu_bo_kunmap(bo); - amdgpu_bo_unpin(bo); - amdgpu_bo_unreserve(bo); - amdgpu_bo_unref(&(bo)); + amdgpu_bo_reserve(*bo, true); + amdgpu_bo_kunmap(*bo); + amdgpu_bo_unpin(*bo); + amdgpu_bo_unreserve(*bo); + amdgpu_bo_unref(bo); } int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 66b1c72c81e5..6e591280774b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -235,7 +235,7 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size, void **mem_obj, uint64_t *gpu_addr, void **cpu_ptr, bool mqd_gfx9); -void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj); +void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj); int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size, void **mem_obj); void amdgpu_amdkfd_free_gws(struct amdgpu_device *adev, void *mem_obj); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 1d9b21628be7..823f245dc7d0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -423,7 +423,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, err_create_queue: if (wptr_bo) - amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo); + amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&wptr_bo); err_wptr_map_gart: err_bind_process: err_pdd: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index f4d20adaa068..6619028dd58b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -907,7 +907,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, kfd_doorbell_error: kfd_gtt_sa_fini(kfd); kfd_gtt_sa_init_error: - amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); alloc_gtt_mem_failure: dev_err(kfd_device, "device %x:%x NOT added due to errors\n", @@ -925,7 +925,7 @@ void kgd2kfd_device_exit(struct kfd_dev 
*kfd) kfd_doorbell_fini(kfd); ida_destroy(&kfd->doorbell_ida); kfd_gtt_sa_fini(kfd); - amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); } kfree(kfd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 4f48507418d2..420444eb8e98 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2621,7 +2621,7 @@ static void deallocate_hiq_sdma_mqd(struct kfd_node *dev, { WARN(!mqd, "No hiq sdma mqd trunk to free"); - amdgpu_amdkfd_free_gtt_mem(dev->adev, mqd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(dev->adev, &mqd->gtt_mem); } void device_queue_manager_uninit(struct device_queue_manager *dqm) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/g
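The pointer-clearing problem this patch fixes is the classic "free() sees only a copy of the pointer" pattern: when the callee receives the pointer by value it can NULL just its copy, and the caller keeps a dangling pointer; passing the pointer's address lets the callee clear the real thing. A minimal stand-alone illustration (plain malloc/free, not the amdgpu helpers):

#include <stdio.h>
#include <stdlib.h>

static void free_old(void *obj)		/* clears only the local copy */
{
	free(obj);
	obj = NULL;
}

static void free_new(void **obj)	/* clears the caller's pointer */
{
	free(*obj);
	*obj = NULL;
}

int main(void)
{
	void *a = malloc(16), *b = malloc(16);

	free_old(a);
	printf("old: caller still sees %p (dangling)\n", a);

	free_new(&b);
	printf("new: caller sees %p\n", b);	/* (nil) */
	return 0;
}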
Re: [PATCH] drm/amdkfd: Correct svm prange overlapping handling at svm_range_set_attr ioctl
On 2024-06-21 13:28, Xiaogang.Chen wrote: From: Xiaogang Chen When user adds new vm range that has overlapping with existing svm pranges current kfd clones new prange and remove existing pranges including all data associate with it. It is not necessary. We can handle the overlapping on existing pranges directly that would simplify kfd code. And, when remove a existing prange the locks from it will get destroyed. This may cause issue if code still use these locks. And locks from cloned prange do not inherit context of locks that got removed. This patch does not remove existing pranges or clone new pranges, keeps locks of pranges alive. Signed-off-by: Xiaogang Chen --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 89 1 file changed, 12 insertions(+), 77 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 407636a68814..a8fcace6f9a2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -904,23 +904,6 @@ svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements, return (void *)dst; } -static int -svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src) -{ - int i; - - for (i = 0; i < MAX_GPU_INSTANCE; i++) { - if (!src->dma_addr[i]) - continue; - dst->dma_addr[i] = svm_range_copy_array(src->dma_addr[i], - sizeof(*src->dma_addr[i]), src->npages, 0, NULL); - if (!dst->dma_addr[i]) - return -ENOMEM; - } - - return 0; -} - static int svm_range_split_array(void *ppnew, void *ppold, size_t size, uint64_t old_start, uint64_t old_n, @@ -1967,38 +1950,6 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm, return r; } -static struct svm_range *svm_range_clone(struct svm_range *old) -{ - struct svm_range *new; - - new = svm_range_new(old->svms, old->start, old->last, false); - if (!new) - return NULL; - if (svm_range_copy_dma_addrs(new, old)) { - svm_range_free(new, false); - return NULL; - } - if (old->svm_bo) { - new->ttm_res = old->ttm_res; - new->offset = old->offset; - new->svm_bo = svm_range_bo_ref(old->svm_bo); - spin_lock(&new->svm_bo->list_lock); - list_add(&new->svm_bo_list, &new->svm_bo->range_list); - spin_unlock(&new->svm_bo->list_lock); - } - new->flags = old->flags; - new->preferred_loc = old->preferred_loc; - new->prefetch_loc = old->prefetch_loc; - new->actual_loc = old->actual_loc; - new->granularity = old->granularity; - new->mapped_to_gpu = old->mapped_to_gpu; - new->vram_pages = old->vram_pages; - bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); - bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); - - return new; -} - void svm_range_set_max_pages(struct amdgpu_device *adev) { uint64_t max_pages; @@ -2057,7 +2008,6 @@ svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last, * @attrs: array of attributes * @update_list: output, the ranges need validate and update GPU mapping * @insert_list: output, the ranges need insert to svms - * @remove_list: output, the ranges are replaced and need remove from svms * @remap_list: output, remap unaligned svm ranges * * Check if the virtual address range has overlap with any existing ranges, @@ -2082,7 +2032,7 @@ static int svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, struct list_head *update_list, struct list_head *insert_list, - struct list_head *remove_list, struct list_head *remap_list) + struct list_head *remap_list) { unsigned long last = start + size - 1UL; struct svm_range_list 
*svms = &p->svms; @@ -2096,7 +2046,6 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, INIT_LIST_HEAD(update_list); INIT_LIST_HEAD(insert_list); - INIT_LIST_HEAD(remove_list); INIT_LIST_HEAD(&new_list); INIT_LIST_HEAD(remap_list); @@ -2117,20 +2066,11 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, /* nothing to do */ } else if (node->start < start || node->last > last) { /* node intersects the update range and its attributes - * will change. Clone and split it, apply updates only + * will change. Split it, apply updates only * to the overlapping part */ - struct svm_range *old = prange; - - prange = svm_range_clone(old); - if (!prange) { -r = -ENOMEM; -goto out; - } - - list_add(&old->update_list, remove_list); - list_add(&prange->list, insert_list); - list_add(&prange->update_list, update_list); + list_move_tail(&prange->list, insert_list); + list_move_tail(&prange->update_list, update_list); The main purpose to clone prange is for error handling rollback. If removing original prange from svms and update it on insert_list, how do you rollback to put prange back to svms after splitting prange error? We hold svms lock to ac
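The rollback concern raised above is the crux of the discussion: cloning lets the attribute update be prepared on copies and only committed once every step has succeeded, so a failure leaves the original ranges in the svms tree untouched. A minimal stand-alone sketch of that prepare/commit/abort pattern (generic code, not the svm_range structures or locking):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct range { unsigned long start, last; int attr; };

/* prepare on a copy so the original stays valid if anything fails */
static struct range *clone_range(const struct range *old)
{
	struct range *new = malloc(sizeof(*new));

	if (new)
		memcpy(new, old, sizeof(*new));
	return new;
}

static int update_attr(struct range *live, int new_attr, int simulate_failure)
{
	struct range *tmp = clone_range(live);

	if (!tmp)
		return -1;

	tmp->attr = new_attr;		/* apply changes to the clone */
	if (simulate_failure) {		/* e.g. a split or mapping error */
		free(tmp);		/* abort: the original range is untouched */
		return -1;
	}

	*live = *tmp;			/* commit: replace the original */
	free(tmp);
	return 0;
}

int main(void)
{
	struct range r = { 0x1000, 0x1fff, 0 };

	update_attr(&r, 7, 1);
	printf("after failed update: attr=%d\n", r.attr);	/* still 0 */
	update_attr(&r, 7, 0);
	printf("after good update:   attr=%d\n", r.attr);	/* 7 */
	return 0;
}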
[PATCH] drm/amdgpu: Show retry fault message if process xnack on
If vm_context_cntl set xnack on, then GPU vm fault has retry_fault bit set, but the driver select xnack on or off path depending on per process xnack setting which is also used to set qpd mem_config xnack on or off if KFD_SUPPORT_XNACK_PER_PROCESS. If process is xnack on, then GPU page fault show retry page fault message, otherwise show no-retry page fault message, to avoid misleading when debugging application page fault issue. The process lookup from pasid is done inside retry fault handler svm_range_restore_pages, add xnack_on parameter to pass process xnack setting back to amdgpu_vm_handle_fault and then to gmc interrupt handler to show vm fault message. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 7 --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +- 6 files changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 64ddc87f7fb6..58f7ab193027 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2757,13 +2757,14 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) * GFX 9.4.3. * @addr: Address of the fault * @write_fault: true is write fault, false is read fault + * @xnack_on: return value, true if the process sets xnack on * * Try to gracefully handle a VM fault. Return true if the fault was handled and * shouldn't be reported any more. */ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, u32 vmid, u32 node_id, uint64_t addr, - bool write_fault) + bool write_fault, bool *xnack_on) { bool is_compute_context = false; struct amdgpu_bo *root; @@ -2788,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, addr /= AMDGPU_GPU_PAGE_SIZE; if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid, - node_id, addr, write_fault)) { + node_id, addr, write_fault, xnack_on)) { amdgpu_bo_unref(&root); return true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index bc71b44387b2..7f364f0b9a60 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -549,7 +549,7 @@ void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info); bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, u32 vmid, u32 node_id, uint64_t addr, - bool write_fault); + bool write_fault, bool *xnack_on); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index d933e19e0cf5..2f0752376236 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -132,7 +132,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, /* Try to handle the recoverable page faults by filling page * tables */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault)) + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault, NULL)) return 1; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 671a6766df5b..3db0f2304b6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -558,6 +558,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, uint32_t 
cam_index = 0; int ret, xcc_id = 0; uint32_t node_id; + bool xnack_on = false; node_id = entry->node_id; @@ -595,7 +596,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, cam_index = entry->src_data[2] & 0x3ff; ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, -addr, write_fault); +addr, write_fault, &xnack_on); WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); if (ret) return 1; @@ -618,7 +619,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, * tables */ if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, -
Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault
On 2024-05-02 08:42, James Zhu wrote: On 2024-05-01 18:56, Philip Yang wrote: On system with khugepaged enabled and user cases with THP buffer, the hmm_range_fault may takes > 15 seconds to return -EBUSY, the arbitrary timeout value is not accurate, cause memory allocation failure. Remove the arbitrary timeout value, return EAGAIN to application if hmm_range_fault return EBUSY, then userspace libdrm and Thunk will call ioctl again. Change EAGAIN to debug message as this is not error. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 - drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 12 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 + 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 54198c3928c7..02696c2102f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr, ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range); if (ret) { - pr_err("%s: Failed to get user pages: %d\n", __func__, ret); + if (ret == -EAGAIN) + pr_debug("Failed to get user pages, try again\n"); + else + pr_err("%s: Failed to get user pages: %d\n", __func__, ret); goto unregister_out; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 431ec72655ec..e36fede7f74c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, pr_debug("hmm range: start = 0x%lx, end = 0x%lx", hmm_range->start, hmm_range->end); - /* Assuming 64MB takes maximum 1 second to fault page address */ - timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL); - timeout *= HMM_RANGE_DEFAULT_TIMEOUT; - timeout = jiffies + msecs_to_jiffies(timeout); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); [JZ] should we reduce MAX_WALK_BYTE to 64M in the meantime? From debug log, the range size is not related, 64MB range may takes same long time to return EBUSY too. retry: hmm_range->notifier_seq = mmu_interval_read_begin(notifier); r = hmm_range_fault(hmm_range); if (unlikely(r)) { - schedule(); [JZ] the above is for CPU stall WA, we may still need keep it. The timeout 1 second should be long enough for normal case, if hmm_range_fault returns EBUSY, we release mmap_read lock and return to user space, so don't need explicit schedule to fix the CPU stale warning. Will run overnight KFDTest LargestSysBufferTest on larger memory system to confirm if there is CPU stale message. Regards, Philip - /* - * FIXME: This timeout should encompass the retry from - * mmu_interval_read_retry() as well. - */ if (r == -EBUSY && !time_after(jiffies, timeout)) goto retry; goto out_free_pfns; @@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, out_free_range: kfree(hmm_range); + if (r == -EBUSY) + r = -EAGAIN; return r; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kf
Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault
On 2024-05-02 00:09, Chen, Xiaogang wrote: On 5/1/2024 5:56 PM, Philip Yang wrote: Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. On system with khugepaged enabled and user cases with THP buffer, the hmm_range_fault may takes > 15 seconds to return -EBUSY, the arbitrary timeout value is not accurate, cause memory allocation failure. Remove the arbitrary timeout value, return EAGAIN to application if hmm_range_fault return EBUSY, then userspace libdrm and Thunk will call ioctl again. Wonder why letting user space do retry is better? Seems this issue is caused by hugepage merging, so how user space can avoid it? The issue is caused by khugepaged + 4 processes + sdma stalls test (to slow down sdma) + small_BAR + QPX mode, during overnight test, hmm_range_fault 180MB buffer may takes >15 seconds returns EBUSY, then alloc memory ioctl failed. Return EAGAIN, Thunk will call the alloc memory ioctl again, and we don't see the alloc memory failure. And applications may not use Thunk or libdrm, instead, use ioctl directly. If app calls ioctl directly, it should do the same thing, to call ioctl again if errno is EINTR or EAGAIN. Regards, Philip Regards Xiaogang Change EAGAIN to debug message as this is not error. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 - drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 12 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 + 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 54198c3928c7..02696c2102f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr, ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range); if (ret) { - pr_err("%s: Failed to get user pages: %d\n", __func__, ret); + if (ret == -EAGAIN) + pr_debug("Failed to get user pages, try again\n"); + else + pr_err("%s: Failed to get user pages: %d\n", __func__, ret); goto unregister_out; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 431ec72655ec..e36fede7f74c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, pr_debug("hmm range: start = 0x%lx, end = 0x%lx", hmm_range->start, hmm_range->end); - /* Assuming 64MB takes maximum 1 second to fault page address */ - timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL); - timeout *= HMM_RANGE_DEFAULT_TIMEOUT; - timeout = jiffies + msecs_to_jiffies(timeout); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); retry: hmm_range->notifier_seq = mmu_interval_read_begin(notifier); r = hmm_range_fault(hmm_range); if (unlikely(r)) { - schedule(); - /* - * FIXME: This timeout should encompass the retry from - * mmu_interval_read_retry() as well. - */
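Since the thread turns on userspace retrying the ioctl, here is a minimal sketch of the retry loop an application calling the KFD ioctls directly would need; the wrapper name and the absence of a retry cap are illustrative only, not taken from Thunk or libdrm:

#include <errno.h>
#include <sys/ioctl.h>

/* Retry the ioctl while the kernel reports a transient condition:
 * EINTR, or (after this patch) EAGAIN when hmm_range_fault was busy. */
static int kfd_ioctl_retry(int fd, unsigned long request, void *args)
{
	int ret;

	do {
		ret = ioctl(fd, request, args);
	} while (ret == -1 && (errno == EINTR || errno == EAGAIN));

	return ret;
}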
[PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault
On system with khugepaged enabled and user cases with THP buffer, the hmm_range_fault may takes > 15 seconds to return -EBUSY, the arbitrary timeout value is not accurate, cause memory allocation failure. Remove the arbitrary timeout value, return EAGAIN to application if hmm_range_fault return EBUSY, then userspace libdrm and Thunk will call ioctl again. Change EAGAIN to debug message as this is not error. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 - drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 12 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 + 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 54198c3928c7..02696c2102f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr, ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range); if (ret) { - pr_err("%s: Failed to get user pages: %d\n", __func__, ret); + if (ret == -EAGAIN) + pr_debug("Failed to get user pages, try again\n"); + else + pr_err("%s: Failed to get user pages: %d\n", __func__, ret); goto unregister_out; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 431ec72655ec..e36fede7f74c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, pr_debug("hmm range: start = 0x%lx, end = 0x%lx", hmm_range->start, hmm_range->end); - /* Assuming 64MB takes maximum 1 second to fault page address */ - timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL); - timeout *= HMM_RANGE_DEFAULT_TIMEOUT; - timeout = jiffies + msecs_to_jiffies(timeout); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); retry: hmm_range->notifier_seq = mmu_interval_read_begin(notifier); r = hmm_range_fault(hmm_range); if (unlikely(r)) { - schedule(); - /* -* FIXME: This timeout should encompass the retry from -* mmu_interval_read_retry() as well. -*/ if (r == -EBUSY && !time_after(jiffies, timeout)) goto retry; goto out_free_pfns; @@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, out_free_range: kfree(hmm_range); + if (r == -EBUSY) + r = -EAGAIN; return r; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 94f83be2232d..e7040f809f33 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1670,11 +1670,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm, readonly, owner, NULL, &hmm_range); WRITE_ONCE(p->svms.faulting_task, NULL); - if (r) { + if (r) pr_debug("failed %d to get svm range pages\n", r); - if (r == -EBUSY) - r = -EAGAIN; - } } else { r = -EFAULT; } -- 2.43.2
Re: [PATCH] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()
On 2024-04-30 19:29, Ramesh Errabolu wrote: Analysis of code by Coverity, a static code analyser, has identified a resource leak in the symbol hmm_range. This leak occurs when one of the prior steps before it is released encounters an error. Signed-off-by: Ramesh Errabolu Reviewed-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 386875e6eb96..dcb1d5d3f860 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, start = map_start << PAGE_SHIFT; end = (map_last + 1) << PAGE_SHIFT; for (addr = start; !r && addr < end; ) { - struct hmm_range *hmm_range; + struct hmm_range *hmm_range = NULL; unsigned long map_start_vma; unsigned long map_last_vma; struct vm_area_struct *vma; @@ -1696,7 +1696,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm, } svm_range_lock(prange); - if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { + + // Free backing memory of hmm_range if it was initialized + if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range)) { pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; }
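The fix is the usual NULL-initialize-then-check pattern for error paths; a minimal standalone illustration with hypothetical names (not the kfd_svm code itself) looks like this:

#include <stdlib.h>

struct res { int dummy; };

static int acquire(struct res **out, int fail)
{
	if (fail)
		return -1;		/* fails before *out is ever written */
	*out = calloc(1, sizeof(**out));
	return *out ? 0 : -1;
}

static int demo(int fail)
{
	struct res *r = NULL;		/* NULL-init so cleanup is safe on early error */
	int ret = acquire(&r, fail);

	/* ... later steps may also fail and fall through to cleanup ... */

	if (r)				/* release only what was actually acquired */
		free(r);
	return ret;
}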
[PATCH v6 1/5] drm/amdgpu: Support contiguous VRAM allocation
RDMA devices with limited scatter-gather ability require contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store it as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When this bo is pinned for export for RDMA peer direct access, this sets the TTM_PL_FLAG_CONTIGUOUS flag and asks the VRAM buddy allocator for contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index f672205243e0..02d66faaade5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..d09c4a18e571 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS (1 << 23) /* Allocate memory for later SVM (shared virtual memory) mapping. * -- 2.43.2
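For context, a hedged userspace sketch of how a caller could request best-effort contiguous VRAM with the new flag; the struct and ioctl names follow include/uapi/linux/kfd_ioctl.h as I understand them, and the surrounding setup (opening /dev/kfd, acquiring the VM, choosing gpu_id and va_addr) is omitted:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

static int alloc_contig_vram(int kfd_fd, uint32_t gpu_id, uint64_t va, uint64_t size)
{
	struct kfd_ioctl_alloc_memory_of_gpu_args args = {0};

	args.va_addr = va;
	args.size = size;
	args.gpu_id = gpu_id;
	args.flags = KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
		     KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
		     KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS;	/* new flag, best effort */

	return ioctl(kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args);
}

On success, args.handle would identify the allocation for the later map/unmap and export steps, as with any other KFD VRAM allocation.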
[PATCH v6 5/5] drm/amdkfd: Bump kfd version for contiguous VRAM allocation
Bump the kfd ioctl minor version to declare support for the contiguous VRAM allocation flag. Signed-off-by: Philip Yang --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index d09c4a18e571..f8e9d3c1d117 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -41,9 +41,10 @@ * - 1.13 - Add debugger API * - 1.14 - Update kfd_event_data * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl + * - 1.16 - Add contiguous VRAM allocation flag */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 15 +#define KFD_IOCTL_MINOR_VERSION 16 struct kfd_ioctl_get_version_args { __u32 major_version;/* from KFD */ -- 2.43.2
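A userspace consumer would typically gate use of the new flag on this bump; a small sketch, assuming the usual AMDKFD_IOC_GET_VERSION ioctl and deliberately minimal error handling:

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

static bool kfd_has_contiguous_flag(int kfd_fd)
{
	struct kfd_ioctl_get_version_args ver = {0};

	if (ioctl(kfd_fd, AMDKFD_IOC_GET_VERSION, &ver))
		return false;

	return ver.major_version == 1 && ver.minor_version >= 16;
}

If this returns false, the application would simply omit KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS and fall back to a plain VRAM allocation.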
[PATCH v6 4/5] drm/amdkfd: Evict BO itself for contiguous allocation
If the BO pinned for RDMA is not contiguous in VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and move it from system memory back to VRAM. v6: user context should use interruptible call (Felix) Signed-off-by: Philip Yang --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 02d66faaade5..acc825b84113 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,30 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* +* If bo is not contiguous on VRAM, move to system memory first to ensure +* we can get contiguous VRAM space after evicting other BOs. +*/ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + struct ttm_operation_ctx ctx = { true, false }; + + amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT); + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); +out: amdgpu_bo_unreserve(bo); - return ret; } -- 2.43.2
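On the v6 note about the interruptible call: the two booleans in the initializer map to struct ttm_operation_ctx's interruptible and no_wait_gpu fields, so { true, false } means an interruptible wait that is still allowed to wait on GPU work. A slightly more explicit spelling of the same move, as a fragment sketch only (bo and ret come from the surrounding pin path, not shown here):

	struct ttm_operation_ctx ctx = {
		.interruptible = true,	/* user-context ioctl path, allow signals */
		.no_wait_gpu = false,	/* waiting on GPU work is fine */
	};

	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);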
[PATCH v6 0/5] Best effort contiguous VRAM allocation
This patch series implement new KFD memory alloc flag for best effort contiguous VRAM allocation, to support peer direct access RDMA device with limited scatter-gather dma capability. v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour") to avoid adding the new GEM flag v3: add patch 2 to handle sg segment size limit (Christian) v4: remove the buddy block size limit from vram mgr because sg table creation already remove the limit, and resource uses u64 to handle block start, size (Christian) v5: remove patch 7 which is not for upstream, add AMDGPU prefix to the macro name. v6: use shorter flag name, use interruptible wait ctx, drop patch 5/6 (Felix) Philip Yang (5): drm/amdgpu: Support contiguous VRAM allocation drm/amdgpu: Handle sg size limit for contiguous allocation drm/amdgpu: Evict BOs from same process for contiguous allocation drm/amdkfd: Evict BO itself for contiguous allocation drm/amdkfd: Bump kfd version for contiguous VRAM allocation .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 23 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 +- include/uapi/linux/kfd_ioctl.h| 4 +++- 4 files changed, 33 insertions(+), 9 deletions(-) -- 2.43.2
[PATCH v6 3/5] drm/amdgpu: Evict BOs from same process for contiguous allocation
When TTM fails to allocate VRAM, it tries to evict BOs from VRAM to system memory and then retries the allocation. This normally skips KFD BOs from the same process, because KFD requires all BOs to be resident for user queues. If TTM is allocating contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS flag, allow TTM to evict KFD BOs from the same process; this evicts the user queues first and restores the queues later, after the contiguous VRAM allocation. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 64f5001a7dc5..c21ea808f931 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1403,7 +1403,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, */ dma_resv_for_each_fence(&resv_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { - if (amdkfd_fence_check_mm(f, current->mm)) + if (amdkfd_fence_check_mm(f, current->mm) && + !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) return false; } -- 2.43.2
[PATCH v6 2/5] drm/amdgpu: Handle sg size limit for contiguous allocation
Define macro AMDGPU_MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length is unsigned int, and some users of it cast to a signed int, so every segment of sg table is limited to size 2GB maximum. For contiguous VRAM allocation, don't limit the max buddy block size in order to get contiguous VRAM memory. To workaround the sg table segment size limit, allocate multiple segments if contiguous size is bigger than AMDGPU_MAX_SG_SEGMENT_SIZE. Signed-off-by: Philip Yang Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..ebffb58ea53a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -31,6 +31,8 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +#define AMDGPU_MAX_SG_SEGMENT_SIZE (2UL << 30) + struct amdgpu_vram_reservation { u64 start; u64 size; @@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); - + size = remaining_size; if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; @@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, &cursor); while (cursor.remaining) { num_entries++; - amdgpu_res_next(&cursor, cursor.size); + amdgpu_res_next(&cursor, min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE)); } r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL); @@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, &cursor); for_each_sgtable_sg((*sgt), sg, i) { phys_addr_t phys = cursor.start + adev->gmc.aper_base; - size_t size = cursor.size; + unsigned long size = min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE); dma_addr_t addr; addr = dma_map_resource(dev, phys, size, dir, @@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg_dma_address(sg) = addr; sg_dma_len(sg) = size; - amdgpu_res_next(&cursor, cursor.size); + amdgpu_res_next(&cursor, size); } return 0; -- 2.43.2
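To make the splitting concrete, a standalone arithmetic sketch (not the vram_mgr code) of how a contiguous block larger than AMDGPU_MAX_SG_SEGMENT_SIZE ends up described by several sg entries, each at most 2 GiB:

#include <stdint.h>
#include <stdio.h>

#define AMDGPU_MAX_SG_SEGMENT_SIZE (2ULL << 30)

int main(void)
{
	uint64_t size = 5ULL << 30;	/* e.g. a 5 GiB contiguous allocation */
	unsigned int entries = 0;

	for (uint64_t left = size; left; ) {
		uint64_t seg = left < AMDGPU_MAX_SG_SEGMENT_SIZE ?
			       left : AMDGPU_MAX_SG_SEGMENT_SIZE;
		entries++;
		left -= seg;
	}

	printf("%u sg entries\n", entries);	/* prints 3: 2 GiB + 2 GiB + 1 GiB */
	return 0;
}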
Re: [PATCH v5 1/6] drm/amdgpu: Support contiguous VRAM allocation
On 2024-04-23 18:17, Felix Kuehling wrote: On 2024-04-23 11:28, Philip Yang wrote: RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pin this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIFUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..ef9154043757 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) If I understand it correctly, AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS was redefined to mean "best effort". Maybe we can drop the explicit "BEST_EFFORT" from this flag as well to keep the name to a reasonable length. yes, AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is redefined, to implement "best effort" without adding new upstream GEM flag, so we may get scattered allocation if contiguous allocation failed. If we drop the "BEST_EFFORT" from flag name, this may mislead the users. Regards, Philip Regards, Felix /* Allocate memory for later SVM (shared virtual memory) mapping. *
Re: [PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation
On 2024-04-23 18:15, Felix Kuehling wrote: On 2024-04-23 11:28, Philip Yang wrote: If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and then move it from system memory back to VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index ef9154043757..5d118e5580ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* + * If bo is not contiguous on VRAM, move to system memory first to ensure + * we can get contiguous VRAM space after evicting other BOs. + */ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false); amdgpu_amdkfd_bo_validate is meant for use in kernel threads. It always runs uninterruptible. I believe pin_bo runs in the context of ioctls from user mode. So it should be interruptible. yes, pin_bo is in the context of user mode, from KFD alloc memory or from rdma driver get pages, should use interruptible wait. amdgpu_amdkfd_bo_validate is currently used by kernel threads and ioctl amdgpu_amdkfd_add_gws_to_process (this seems bug), does it make sense to add parameter interruptible, then we can remove many duplicate code amdgpu_bo_placement_from_domain + ttm_bo_validate or I can fix it here and leave the cleanup and bug fix in the future? Regards, Philip Regards, Felix + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); +out: amdgpu_bo_unreserve(bo); - return ret; }
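A hedged sketch of the helper floated at the end of this reply, i.e. the duplicated amdgpu_bo_placement_from_domain() + ttm_bo_validate() pair with interruptibility chosen by the caller; the function name and placement are hypothetical, not an actual patch:

static int amdgpu_amdkfd_bo_validate_ctx(struct amdgpu_bo *bo, uint32_t domain,
					 bool interruptible)
{
	struct ttm_operation_ctx ctx = { interruptible, false };

	amdgpu_bo_placement_from_domain(bo, domain);
	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
}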
[PATCH v5 6/6] drm/amdkfd: Bump kfd version for contiguous VRAM allocation
Bump the kfd ioctl minor version to declare support for the contiguous VRAM allocation flag. Signed-off-by: Philip Yang --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index c1394c162d4e..a5ebbe98ff7f 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -41,9 +41,10 @@ * - 1.13 - Add debugger API * - 1.14 - Update kfd_event_data * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl + * - 1.16 - Add contiguous VRAM allocation flag */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 15 +#define KFD_IOCTL_MINOR_VERSION 16 struct kfd_ioctl_get_version_args { __u32 major_version;/* from KFD */ -- 2.43.2
[PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation
If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and then move it from system memory back to VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index ef9154043757..5d118e5580ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* +* If bo is not contiguous on VRAM, move to system memory first to ensure +* we can get contiguous VRAM space after evicting other BOs. +*/ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false); + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); +out: amdgpu_bo_unreserve(bo); - return ret; } -- 2.43.2
[PATCH v5 2/6] drm/amdgpu: Handle sg size limit for contiguous allocation
Define macro MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length is unsigned int, and some users of it cast to a signed int, so every segment of sg table is limited to size 2GB maximum. For contiguous VRAM allocation, don't limit the max buddy block size in order to get contiguous VRAM memory. To workaround the sg table segment size limit, allocate multiple segments if contiguous size is bigger than MAX_SG_SEGMENT_SIZE. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..ebffb58ea53a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -31,6 +31,8 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +#define AMDGPU_MAX_SG_SEGMENT_SIZE (2UL << 30) + struct amdgpu_vram_reservation { u64 start; u64 size; @@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); - + size = remaining_size; if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; @@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, &cursor); while (cursor.remaining) { num_entries++; - amdgpu_res_next(&cursor, cursor.size); + amdgpu_res_next(&cursor, min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE)); } r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL); @@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, &cursor); for_each_sgtable_sg((*sgt), sg, i) { phys_addr_t phys = cursor.start + adev->gmc.aper_base; - size_t size = cursor.size; + unsigned long size = min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE); dma_addr_t addr; addr = dma_map_resource(dev, phys, size, dir, @@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg_dma_address(sg) = addr; sg_dma_len(sg) = size; - amdgpu_res_next(&cursor, cursor.size); + amdgpu_res_next(&cursor, size); } return 0; -- 2.43.2
[PATCH v5 3/6] drm/amdgpu: Evict BOs from same process for contiguous allocation
When TTM failed to alloc VRAM, TTM try evict BOs from VRAM to system memory then retry the allocation, this skips the KFD BOs from the same process because KFD require all BOs are resident for user queues. If TTM with TTM_PL_FLAG_CONTIGUOUS flag to alloc contiguous VRAM, allow TTM evict KFD BOs from the same process, this will evict the user queues first, and restore the queues later after contiguous VRAM allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 851509c6e90e..c907d6005641 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, */ dma_resv_for_each_fence(&resv_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { - if (amdkfd_fence_check_mm(f, current->mm)) + if (amdkfd_fence_check_mm(f, current->mm) && + !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) return false; } -- 2.43.2
[PATCH v5 5/6] drm/amdkfd: Increase KFD bo restore wait time
TTM contiguous VRAM allocation may take more than 1 second to evict BOs for a larger RDMA buffer. Because the KFD restore bo worker reserves all KFD BOs, TTM cannot take the remaining KFD BO locks to evict them, which causes the contiguous VRAM allocation to fail. Increase the KFD restore BO wait time to 2 seconds, long enough for the RDMA pin BO path to allocate the contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a81ef232fdef..c205e2d3acf9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -698,7 +698,7 @@ struct qcm_process_device { /* KFD Memory Eviction */ /* Approx. wait time before attempting to restore evicted BOs */ -#define PROCESS_RESTORE_TIME_MS 100 +#define PROCESS_RESTORE_TIME_MS 2000 /* Approx. back off time if restore fails due to lack of memory */ #define PROCESS_BACK_OFF_TIME_MS 100 /* Approx. time before evicting the process again */ -- 2.43.2
[PATCH v5 1/6] drm/amdgpu: Support contiguous VRAM allocation
RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pin this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIFUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..ef9154043757 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) /* Allocate memory for later SVM (shared virtual memory) mapping. * -- 2.43.2
[PATCH v5 0/6] Best effort contiguous VRAM allocation
This patch series implement new KFD memory alloc flag for best effort contiguous VRAM allocation, to support peer direct access RDMA device with limited scatter-gather dma capability. v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour") to avoid adding the new GEM flag v3: add patch 2 to handle sg segment size limit (Christian) v4: remove the buddy block size limit from vram mgr because sg table creation already remove the limit, and resource uses u64 to handle block start, size (Christian) v5: remove patch 7 which is not for upstream, add AMDGPU prefix to the macro name. Philip Yang (6): drm/amdgpu: Support contiguous VRAM allocation drm/amdgpu: Handle sg size limit for contiguous allocation drm/amdgpu: Evict BOs from same process for contiguous allocation drm/amdkfd: Evict BO itself for contiguous allocation drm/amdkfd: Increase KFD bo restore wait time drm/amdkfd: Bump kfd version for contiguous VRAM allocation .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 20 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 +-- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- include/uapi/linux/kfd_ioctl.h| 4 +++- 5 files changed, 31 insertions(+), 10 deletions(-) -- 2.43.2
Re: [PATCH v4 6/7] drm/amdgpu: Skip dma map resource for null RDMA device
On 2024-04-23 09:32, Christian König wrote: Am 23.04.24 um 15:04 schrieb Philip Yang: To test RDMA using dummy driver on the system without NIC/RDMA device, the get/put dma pages pass in null device pointer, skip the dma map/unmap resource and sg table to avoid null pointer access. Well just to make it clear this patch is really a no-go for upstreaming. The RDMA code isn't upstream as far as I know and doing this here is really not a good idea even internally. Right, this change is not needed and not related to upstream, just to minimize the difference with upstream. I will not upstream this patch. Regards, Philip Regards, Christian. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 6c7133bf51d8..101a85263b53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -698,12 +698,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); dma_addr_t addr; - addr = dma_map_resource(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); - r = dma_mapping_error(dev, addr); - if (r) - goto error_unmap; - + if (dev) { + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + } else { + addr = phys; + } sg_set_page(sg, NULL, size, 0); sg_dma_address(sg) = addr; sg_dma_len(sg) = size; @@ -717,10 +720,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, for_each_sgtable_sg((*sgt), sg, i) { if (!sg->length) continue; - - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); } sg_free_table(*sgt); @@ -745,10 +748,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev, struct scatterlist *sg; int i; - for_each_sgtable_sg(sgt, sg, i) - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) { + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } sg_free_table(sgt); kfree(sgt); }
[PATCH v4 1/7] drm/amdgpu: Support contiguous VRAM allocation
RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pin this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIFUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..ef9154043757 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) /* Allocate memory for later SVM (shared virtual memory) mapping. * -- 2.43.2
[PATCH v4 3/7] drm/amdgpu: Evict BOs from same process for contiguous allocation
When TTM failed to alloc VRAM, TTM try evict BOs from VRAM to system memory then retry the allocation, this skips the KFD BOs from the same process because KFD require all BOs are resident for user queues. If TTM with TTM_PL_FLAG_CONTIGUOUS flag to alloc contiguous VRAM, allow TTM evict KFD BOs from the same process, this will evict the user queues first, and restore the queues later after contiguous VRAM allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 851509c6e90e..c907d6005641 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, */ dma_resv_for_each_fence(&resv_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { - if (amdkfd_fence_check_mm(f, current->mm)) + if (amdkfd_fence_check_mm(f, current->mm) && + !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) return false; } -- 2.43.2
[PATCH v4 7/7] drm/amdkfd: Bump kfd version for contiguous VRAM allocation
Bump the kfd ioctl minor version to declare support for the contiguous VRAM allocation flag. Signed-off-by: Philip Yang --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index c1394c162d4e..a5ebbe98ff7f 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -41,9 +41,10 @@ * - 1.13 - Add debugger API * - 1.14 - Update kfd_event_data * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl + * - 1.16 - Add contiguous VRAM allocation flag */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 15 +#define KFD_IOCTL_MINOR_VERSION 16 struct kfd_ioctl_get_version_args { __u32 major_version;/* from KFD */ -- 2.43.2
[PATCH v4 4/7] drm/amdkfd: Evict BO itself for contiguous allocation
If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and then move it from system memory back to VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index ef9154043757..5d118e5580ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* +* If bo is not contiguous on VRAM, move to system memory first to ensure +* we can get contiguous VRAM space after evicting other BOs. +*/ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false); + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); +out: amdgpu_bo_unreserve(bo); - return ret; } -- 2.43.2
[PATCH v4 6/7] drm/amdgpu: Skip dma map resource for null RDMA device
To test RDMA using dummy driver on the system without NIC/RDMA device, the get/put dma pages pass in null device pointer, skip the dma map/unmap resource and sg table to avoid null pointer access. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 6c7133bf51d8..101a85263b53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -698,12 +698,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); dma_addr_t addr; - addr = dma_map_resource(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); - r = dma_mapping_error(dev, addr); - if (r) - goto error_unmap; - + if (dev) { + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + } else { + addr = phys; + } sg_set_page(sg, NULL, size, 0); sg_dma_address(sg) = addr; sg_dma_len(sg) = size; @@ -717,10 +720,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, for_each_sgtable_sg((*sgt), sg, i) { if (!sg->length) continue; - - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); } sg_free_table(*sgt); @@ -745,10 +748,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev, struct scatterlist *sg; int i; - for_each_sgtable_sg(sgt, sg, i) - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) { + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } sg_free_table(sgt); kfree(sgt); } -- 2.43.2
[PATCH v4 5/7] drm/amdkfd: Increase KFD bo restore wait time
TTM contiguous VRAM allocation may take more than 1 second to evict BOs for a larger RDMA buffer. Because the KFD restore bo worker reserves all KFD BOs, TTM cannot take the remaining KFD BO locks to evict them, which causes the contiguous VRAM allocation to fail. Increase the KFD restore BO wait time to 2 seconds, long enough for the RDMA pin BO path to allocate the contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a81ef232fdef..c205e2d3acf9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -698,7 +698,7 @@ struct qcm_process_device { /* KFD Memory Eviction */ /* Approx. wait time before attempting to restore evicted BOs */ -#define PROCESS_RESTORE_TIME_MS 100 +#define PROCESS_RESTORE_TIME_MS 2000 /* Approx. back off time if restore fails due to lack of memory */ #define PROCESS_BACK_OFF_TIME_MS 100 /* Approx. time before evicting the process again */ -- 2.43.2
[PATCH v4 2/7] drm/amdgpu: Handle sg size limit for contiguous allocation
Define macro MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length is unsigned int, and some users of it cast to a signed int, so every segment of sg table is limited to size 2GB maximum. For contiguous VRAM allocation, don't limit the max buddy block size in order to get contiguous VRAM memory. To workaround the sg table segment size limit, allocate multiple segments if contiguous size is bigger than MAX_SG_SEGMENT_SIZE. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..6c7133bf51d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -31,6 +31,8 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +#define MAX_SG_SEGMENT_SIZE(2UL << 30) + struct amdgpu_vram_reservation { u64 start; u64 size; @@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); - + size = remaining_size; if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; @@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, &cursor); while (cursor.remaining) { num_entries++; - amdgpu_res_next(&cursor, cursor.size); + amdgpu_res_next(&cursor, min(cursor.size, MAX_SG_SEGMENT_SIZE)); } r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL); @@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, &cursor); for_each_sgtable_sg((*sgt), sg, i) { phys_addr_t phys = cursor.start + adev->gmc.aper_base; - size_t size = cursor.size; + unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); dma_addr_t addr; addr = dma_map_resource(dev, phys, size, dir, @@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg_dma_address(sg) = addr; sg_dma_len(sg) = size; - amdgpu_res_next(&cursor, cursor.size); + amdgpu_res_next(&cursor, size); } return 0; -- 2.43.2
[PATCH v4 0/7] Best effort contiguous VRAM allocation
This patch series implement new KFD memory alloc flag for best effort contiguous VRAM allocation, to support peer direct access RDMA device with limited scatter-gather dma capability. v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour") to avoid adding the new GEM flag v3: add patch 2 to handle sg segment size limit (Christian) v4: remove the buddy block size limit from vram mgr because sg table creation already remove the limit, and resource uses u64 to handle block start, size (Christian) Philip Yang (7): drm/amdgpu: Support contiguous VRAM allocation drm/amdgpu: Handle sg size limit for contiguous allocation drm/amdgpu: Evict BOs from same process for contiguous allocation drm/amdkfd: Evict BO itself for contiguous allocation drm/amdkfd: Increase KFD bo restore wait time drm/amdgpu: Skip dma map resource for null RDMA device drm/amdkfd: Bump kfd version for contiguous VRAM allocation .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 20 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 45 ++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- include/uapi/linux/kfd_ioctl.h| 4 +- 5 files changed, 50 insertions(+), 24 deletions(-) -- 2.43.2
Re: [PATCH v3 6/7] drm/amdgpu: Skip dma map resource for null RDMA device
On 2024-04-22 10:56, Christian König wrote: Am 22.04.24 um 15:57 schrieb Philip Yang: To test RDMA using dummy driver on the system without NIC/RDMA device, the get/put dma pages pass in null device pointer, skip the dma map/unmap resource and sg table to avoid null pointer access. Well that is completely illegal and would break IOMMU. Why does the RDMA driver does that in the first place? That is the amdp2ptest driver, part of KFDTest rdma test. The simple rdma test app and driver is used to test the driver path, without actually transferring data b/w machines. Regards, Philip Regards, Christian. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 9fe56a21ef88..0caf2c89ef1d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -705,12 +705,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); dma_addr_t addr; - addr = dma_map_resource(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); - r = dma_mapping_error(dev, addr); - if (r) - goto error_unmap; - + if (dev) { + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + } else { + addr = phys; + } sg_set_page(sg, NULL, size, 0); sg_dma_address(sg) = addr; sg_dma_len(sg) = size; @@ -724,10 +727,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, for_each_sgtable_sg((*sgt), sg, i) { if (!sg->length) continue; - - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); } sg_free_table(*sgt); @@ -752,10 +755,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev, struct scatterlist *sg; int i; - for_each_sgtable_sg(sgt, sg, i) - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) { + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } sg_free_table(sgt); kfree(sgt); }
Re: [PATCH v3 2/7] drm/amdgpu: Handle sg size limit for contiguous allocation
On 2024-04-22 10:40, Christian König wrote: Am 22.04.24 um 15:57 schrieb Philip Yang: Define macro MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length is unsigned int, and some users of it cast to a signed int, so every segment of sg table is limited to size 2GB maximum. For contiguous VRAM allocation, don't limit the max buddy block size in order to get contiguous VRAM memory. To workaround the sg table segment size limit, allocate multiple segments if contiguous size is bigger than MAX_SG_SEGMENT_SIZE. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 - 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..9fe56a21ef88 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -31,6 +31,8 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +#define MAX_SG_SEGMENT_SIZE (2UL << 30) + struct amdgpu_vram_reservation { u64 start; u64 size; @@ -532,8 +534,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); + if (place->flags & TTM_PL_FLAG_CONTIGUOUS) + size = remaining_size; + else + /* Limit maximum size to 2GiB due to SG table limitations + * for no contiguous allocation. + */ + size = min(remaining_size, MAX_SG_SEGMENT_SIZE); Well that doesn't make sense, either fix the creation of the sg tables or limit the segment size. Not both. yes, right. we don't need limit the segment size for non-contiguous allocation either as this is handled by min_block_size. I will send v4 patch to fix this. Then we could have another patch to remove the while loop, size and remaining size to simply the code in future. Regards, Philip if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) @@ -675,7 +682,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, &cursor); while (cursor.remaining) { num_entries++; - amdgpu_res_next(&cursor, cursor.size); + amdgpu_res_next(&cursor, min(cursor.size, MAX_SG_SEGMENT_SIZE)); } r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL); @@ -695,7 +702,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, &cursor); for_each_sgtable_sg((*sgt), sg, i) { phys_addr_t phys = cursor.start + adev->gmc.aper_base; - size_t size = cursor.size; + unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); Please keep size_t here or use unsigned int, using unsigned long just looks like trying to hide the problem. And I wouldn't use a separate define but rather just INT_MAX instead. Regards, Christian. dma_addr_t addr; addr = dma_map_resource(dev, phys, size, dir, @@ -708,7 +715,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg_dma_address(sg) = addr; sg_dma_len(sg) = size; - amdgpu_res_next(&cursor, cursor.size); + amdgpu_res_next(&cursor, size); } return 0;
[PATCH v3 6/7] drm/amdgpu: Skip dma map resource for null RDMA device
To test RDMA using dummy driver on the system without NIC/RDMA device, the get/put dma pages pass in null device pointer, skip the dma map/unmap resource and sg table to avoid null pointer access. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 9fe56a21ef88..0caf2c89ef1d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -705,12 +705,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); dma_addr_t addr; - addr = dma_map_resource(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); - r = dma_mapping_error(dev, addr); - if (r) - goto error_unmap; - + if (dev) { + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + } else { + addr = phys; + } sg_set_page(sg, NULL, size, 0); sg_dma_address(sg) = addr; sg_dma_len(sg) = size; @@ -724,10 +727,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, for_each_sgtable_sg((*sgt), sg, i) { if (!sg->length) continue; - - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); } sg_free_table(*sgt); @@ -752,10 +755,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev, struct scatterlist *sg; int i; - for_each_sgtable_sg(sgt, sg, i) - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) { + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } sg_free_table(sgt); kfree(sgt); } -- 2.43.2