Hi Liu, Thanks for the explanation. I added the patch.
On Mon, Nov 20, 2017 at 6:30 PM, Liu, Shaoyun <shaoyun....@amd.com> wrote: > The save/restore memory is allocated per queue in user mode and the address > /size passed in the create_ioctl. The memory you noticed that allocated at > kfd_process_init_cwsr is the memory for CWSR shader code itself. This shader > code is executed by the shader in the user's address space(GPU point of > view), they are similar as signal handler code in CPU side . The CWSR > shader code has a fix size , one page plus another page for extra usage for > the CWSR ( ex . parameter for second level handler ). So we can easily > managed it in kernel . The save/restore data is quite big ( around 22M for > vega10) , so we don't want to allocate them in kernel . > Other comments in line . > > Regards > Shaoyun.liu > > -----Original Message----- > From: Oded Gabbay [mailto:oded.gab...@gmail.com] > Sent: Sunday, November 19, 2017 8:38 AM > To: Kuehling, Felix > Cc: amd-gfx list; Liu, Shaoyun; Zhao, Yong > Subject: Re: [PATCH 3/4] drm/amdkfd: Add CWSR support > > On Tue, Nov 14, 2017 at 11:41 PM, Felix Kuehling <felix.kuehl...@amd.com> > wrote: >> This hardware feature allows the GPU to preempt shader execution in >> the middle of a compute wave, save the state and restore it later to >> resume execution. >> >> Memory for saving the state is allocated per queue in user mode and >> the address and size passed to the create_queue ioctl. The size > Is this a correct description? > It seems to me the memory is allocated at kfd_process_init_cwsr() and the > address is saved internally and not passed in the create_ioctl. > Which begs the question, why indeed it is not allocated by the user and then > passed through the create_ioctl function ? > > >> depends on the number of waves that can be in flight simultaneously on >> a given ASIC. >> >> Signed-off-by: Shaoyun.liu <shaoyun....@amd.com> >> Signed-off-by: Yong Zhao <yong.z...@amd.com> >> Signed-off-by: Felix Kuehling <felix.kuehl...@amd.com> >> --- >> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 7 +- >> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 20 ++++- >> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 6 ++ >> drivers/gpu/drm/amd/amdkfd/kfd_module.c | 4 + >> drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 27 +++++++ >> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 31 +++++++- >> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 87 >> +++++++++++++++++++++- >> include/uapi/linux/kfd_ioctl.h | 3 +- >> 8 files changed, 179 insertions(+), 6 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> index 505d391..2a4612d 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> @@ -117,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file >> *filep) >> return -EPERM; >> } >> >> - process = kfd_create_process(current); >> + process = kfd_create_process(filep); >> if (IS_ERR(process)) >> return PTR_ERR(process); >> >> @@ -206,6 +206,7 @@ static int set_queue_properties_from_user(struct >> queue_properties *q_properties, >> q_properties->ctx_save_restore_area_address = >> args->ctx_save_restore_address; >> q_properties->ctx_save_restore_area_size = >> args->ctx_save_restore_size; >> + q_properties->ctl_stack_size = args->ctl_stack_size; >> if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || >> args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) >> q_properties->type = KFD_QUEUE_TYPE_COMPUTE; @@ >> -1088,6 +1089,10 @@ static int kfd_mmap(struct file *filp, struct >> vm_area_struct *vma) >> KFD_MMAP_EVENTS_MASK) { >> vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; >> return kfd_event_mmap(process, vma); >> + } else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) == >> + KFD_MMAP_RESERVED_MEM_MASK) { >> + vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK; >> + return kfd_reserved_mem_mmap(process, vma); >> } >> >> return -EFAULT; >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> index 621a3b5..4f05eac 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> @@ -27,6 +27,7 @@ >> #include "kfd_priv.h" >> #include "kfd_device_queue_manager.h" >> #include "kfd_pm4_headers_vi.h" >> +#include "cwsr_trap_handler_gfx8.asm" >> >> #define MQD_SIZE_ALIGNED 768 >> >> @@ -38,7 +39,8 @@ static const struct kfd_device_info kaveri_device_info = { >> .ih_ring_entry_size = 4 * sizeof(uint32_t), >> .event_interrupt_class = &event_interrupt_class_cik, >> .num_of_watch_points = 4, >> - .mqd_size_aligned = MQD_SIZE_ALIGNED >> + .mqd_size_aligned = MQD_SIZE_ALIGNED, >> + .supports_cwsr = false, >> }; >> >> static const struct kfd_device_info carrizo_device_info = { @@ -49,7 >> +51,8 @@ static const struct kfd_device_info carrizo_device_info = { >> .ih_ring_entry_size = 4 * sizeof(uint32_t), >> .event_interrupt_class = &event_interrupt_class_cik, >> .num_of_watch_points = 4, >> - .mqd_size_aligned = MQD_SIZE_ALIGNED >> + .mqd_size_aligned = MQD_SIZE_ALIGNED, >> + .supports_cwsr = true, >> }; >> >> struct kfd_deviceid { >> @@ -212,6 +215,17 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, >> int pasid, >> return AMD_IOMMU_INV_PRI_RSP_INVALID; } >> >> +static void kfd_cwsr_init(struct kfd_dev *kfd) { >> + if (cwsr_enable && kfd->device_info->supports_cwsr) { >> + BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); >> + >> + kfd->cwsr_isa = cwsr_trap_gfx8_hex; >> + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); >> + kfd->cwsr_enabled = true; >> + } >> +} >> + >> bool kgd2kfd_device_init(struct kfd_dev *kfd, >> const struct kgd2kfd_shared_resources >> *gpu_resources) { @@ -286,6 +300,8 @@ bool kgd2kfd_device_init(struct >> kfd_dev *kfd, >> goto device_iommu_pasid_error; >> } >> >> + kfd_cwsr_init(kfd); >> + >> if (kfd_resume(kfd)) >> goto kfd_resume_error; >> >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c >> index e202921..5c06502 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c >> @@ -173,6 +173,9 @@ static int create_queue_nocpsch(struct >> device_queue_manager *dqm, >> *allocated_vmid = qpd->vmid; >> q->properties.vmid = qpd->vmid; >> >> + q->properties.tba_addr = qpd->tba_addr; >> + q->properties.tma_addr = qpd->tma_addr; >> + >> if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) >> retval = create_compute_queue_nocpsch(dqm, q, qpd); >> else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) @@ -846,6 >> +849,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, >> struct queue *q, >> } >> >> dqm->asic_ops.init_sdma_vm(dqm, q, qpd); >> + >> + q->properties.tba_addr = qpd->tba_addr; >> + q->properties.tma_addr = qpd->tma_addr; >> retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, >> &q->gart_mqd_addr, &q->properties); >> if (retval) >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_module.c >> index 6c5a9ca..4b2423b 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c >> @@ -49,6 +49,10 @@ module_param(sched_policy, int, 0444); >> MODULE_PARM_DESC(sched_policy, >> "Scheduling policy (0 = HWS (Default), 1 = HWS without >> over-subscription, 2 = Non-HWS (Used for debugging only)"); >> >> +int cwsr_enable = 1; >> +module_param(cwsr_enable, int, 0444); MODULE_PARM_DESC(cwsr_enable, >> +"CWSR enable (0 = Off, 1 = On (Default))"); >> + >> int max_num_of_queues_per_device = >> KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; >> module_param(max_num_of_queues_per_device, int, 0444); >> MODULE_PARM_DESC(max_num_of_queues_per_device, >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c >> index 2ba7cea..00e1f1a 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c >> @@ -89,6 +89,28 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, >> if (q->format == KFD_QUEUE_FORMAT_AQL) >> m->cp_hqd_iq_rptr = 1; >> >> + if (q->tba_addr) { >> + m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); >> + m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); >> + m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); >> + m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); > Why the >> 8 on the addresses ? > > [shaoyunl] The hardware register requirements , they need to be shift 8 bits. > >> + m->compute_pgm_rsrc2 |= >> + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); >> + } >> + >> + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { >> + m->cp_hqd_persistent_state |= >> + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); >> + m->cp_hqd_ctx_save_base_addr_lo = >> + lower_32_bits(q->ctx_save_restore_area_address); >> + m->cp_hqd_ctx_save_base_addr_hi = >> + upper_32_bits(q->ctx_save_restore_area_address); >> + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; >> + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; >> + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; >> + m->cp_hqd_wg_state_offset = q->ctl_stack_size; > Just wanted to make sure the last two lines are not copy-paste error from the > third line > > [Shaoyunl] ye, that's requirement form hardware spec . > >> + } >> + >> *mqd = m; >> if (gart_addr) >> *gart_addr = addr; >> @@ -167,6 +189,11 @@ static int __update_mqd(struct mqd_manager *mm, void >> *mqd, >> 2 << >> CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; >> } >> >> + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) >> + m->cp_hqd_ctx_save_control = >> + atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | >> + mtype << >> + CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; >> + >> q->is_active = (q->queue_size > 0 && >> q->queue_address != 0 && >> q->queue_percent > 0); diff --git >> a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> index 4750473..a668764 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> @@ -41,6 +41,7 @@ >> >> #define KFD_MMAP_DOORBELL_MASK 0x8000000000000 #define >> KFD_MMAP_EVENTS_MASK 0x4000000000000 >> +#define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000 >> >> /* >> * When working with cp scheduler we should assign the HIQ manually >> or via @@ -63,6 +64,15 @@ #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS >> 1024 >> >> /* >> + * Size of the per-process TBA+TMA buffer: 2 pages >> + * >> + * The first page is the TBA used for the CWSR ISA code. The second >> + * page is used as TMA for daisy changing a user-mode trap handler. >> + */ >> +#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2) #define >> +KFD_CWSR_TMA_OFFSET PAGE_SIZE >> + >> +/* >> * Kernel module parameter to specify maximum number of supported queues per >> * device >> */ >> @@ -78,6 +88,8 @@ extern int max_num_of_queues_per_device; >> /* Kernel module parameter to specify the scheduling policy */ >> extern int sched_policy; >> >> +extern int cwsr_enable; >> + >> /* >> * Kernel module parameter to specify whether to send sigterm to HSA >> process on >> * unhandled exception >> @@ -131,6 +143,7 @@ struct kfd_device_info { >> size_t ih_ring_entry_size; >> uint8_t num_of_watch_points; >> uint16_t mqd_size_aligned; >> + bool supports_cwsr; >> }; >> >> struct kfd_mem_obj { >> @@ -200,6 +213,11 @@ struct kfd_dev { >> >> /* Debug manager */ >> struct kfd_dbgmgr *dbgmgr; >> + >> + /* CWSR */ >> + bool cwsr_enabled; >> + const void *cwsr_isa; >> + unsigned int cwsr_isa_size; >> }; >> >> /* KGD2KFD callbacks */ >> @@ -332,6 +350,9 @@ struct queue_properties { >> uint32_t eop_ring_buffer_size; >> uint64_t ctx_save_restore_area_address; >> uint32_t ctx_save_restore_area_size; >> + uint32_t ctl_stack_size; >> + uint64_t tba_addr; >> + uint64_t tma_addr; >> }; >> >> /** >> @@ -439,6 +460,11 @@ struct qcm_process_device { >> uint32_t num_gws; >> uint32_t num_oac; >> uint32_t sh_hidden_private_base; >> + >> + /* CWSR memory */ >> + void *cwsr_kaddr; >> + uint64_t tba_addr; >> + uint64_t tma_addr; >> }; >> >> >> @@ -563,7 +589,7 @@ struct amdkfd_ioctl_desc { >> >> void kfd_process_create_wq(void); >> void kfd_process_destroy_wq(void); >> -struct kfd_process *kfd_create_process(const struct task_struct *); >> +struct kfd_process *kfd_create_process(struct file *filep); >> struct kfd_process *kfd_get_process(const struct task_struct *); >> struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); >> >> @@ -577,6 +603,9 @@ struct kfd_process_device >> *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process_device >> *kfd_create_process_device_data(struct kfd_dev *dev, >> struct >> kfd_process *p); >> >> +int kfd_reserved_mem_mmap(struct kfd_process *process, >> + struct vm_area_struct *vma); >> + >> /* Process device data iterator */ >> struct kfd_process_device *kfd_get_first_process_device_data( >> struct >> kfd_process *p); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> index 1bb9b26..39f4c19 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> @@ -28,6 +28,7 @@ >> #include <linux/amd-iommu.h> >> #include <linux/notifier.h> >> #include <linux/compat.h> >> +#include <linux/mman.h> >> >> struct mm_struct; >> >> @@ -53,6 +54,8 @@ struct kfd_process_release_work { >> >> static struct kfd_process *find_process(const struct task_struct >> *thread); static struct kfd_process *create_process(const struct >> task_struct *thread); >> +static int kfd_process_init_cwsr(struct kfd_process *p, struct file >> +*filep); >> + >> >> void kfd_process_create_wq(void) >> { >> @@ -68,9 +71,10 @@ void kfd_process_destroy_wq(void) >> } >> } >> >> -struct kfd_process *kfd_create_process(const struct task_struct >> *thread) >> +struct kfd_process *kfd_create_process(struct file *filep) >> { >> struct kfd_process *process; >> + struct task_struct *thread = current; >> >> if (!thread->mm) >> return ERR_PTR(-EINVAL); @@ -101,6 +105,8 @@ struct >> kfd_process *kfd_create_process(const struct task_struct *thread) >> >> up_write(&thread->mm->mmap_sem); >> >> + kfd_process_init_cwsr(process, filep); >> + >> return process; >> } >> >> @@ -168,6 +174,11 @@ static void kfd_process_wq_release(struct work_struct >> *work) >> amd_iommu_unbind_pasid(pdd->dev->pdev, >> p->pasid); >> >> list_del(&pdd->per_device_list); >> + >> + if (pdd->qpd.cwsr_kaddr) >> + free_pages((unsigned long)pdd->qpd.cwsr_kaddr, >> + get_order(KFD_CWSR_TBA_TMA_SIZE)); >> + >> kfree(pdd); >> } >> >> @@ -260,6 +271,46 @@ static const struct mmu_notifier_ops >> kfd_process_mmu_notifier_ops = { >> .release = kfd_process_notifier_release, }; >> >> +static int kfd_process_init_cwsr(struct kfd_process *p, struct file >> +*filep) { >> + int err = 0; >> + unsigned long offset; >> + struct kfd_process_device *temp, *pdd = NULL; >> + struct kfd_dev *dev = NULL; >> + struct qcm_process_device *qpd = NULL; >> + >> + mutex_lock(&p->mutex); >> + list_for_each_entry_safe(pdd, temp, &p->per_device_data, >> + per_device_list) { >> + dev = pdd->dev; >> + qpd = &pdd->qpd; >> + if (!dev->cwsr_enabled || qpd->cwsr_kaddr) >> + continue; >> + offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << >> PAGE_SHIFT; >> + qpd->tba_addr = (int64_t)vm_mmap(filep, 0, >> + KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, >> + MAP_SHARED, offset); >> + >> + if (IS_ERR_VALUE(qpd->tba_addr)) { >> + pr_err("Failure to set tba address. error -%d.\n", >> + (int)qpd->tba_addr); >> + err = qpd->tba_addr; >> + qpd->tba_addr = 0; >> + qpd->cwsr_kaddr = NULL; >> + goto out; >> + } >> + >> + memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, >> + dev->cwsr_isa_size); >> + >> + qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET; >> + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for >> pqm.\n", >> + qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); >> + } >> +out: >> + mutex_unlock(&p->mutex); >> + return err; >> +} >> + >> static struct kfd_process *create_process(const struct task_struct >> *thread) { >> struct kfd_process *process; >> @@ -535,3 +586,37 @@ struct kfd_process >> *kfd_lookup_process_by_pasid(unsigned int pasid) >> >> return p; >> } >> + >> +int kfd_reserved_mem_mmap(struct kfd_process *process, >> + struct vm_area_struct *vma) { >> + struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); >> + struct kfd_process_device *pdd; >> + struct qcm_process_device *qpd; >> + >> + if (!dev) >> + return -EINVAL; >> + if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { >> + pr_err("Incorrect CWSR mapping size.\n"); >> + return -EINVAL; >> + } >> + >> + pdd = kfd_get_process_device_data(dev, process); >> + if (!pdd) >> + return -EINVAL; >> + qpd = &pdd->qpd; >> + >> + qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, >> + get_order(KFD_CWSR_TBA_TMA_SIZE)); >> + if (!qpd->cwsr_kaddr) { >> + pr_err("Error allocating per process CWSR buffer.\n"); >> + return -ENOMEM; >> + } >> + >> + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND >> + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; >> + /* Mapping pages to user process */ >> + return remap_pfn_range(vma, vma->vm_start, >> + PFN_DOWN(__pa(qpd->cwsr_kaddr)), >> + KFD_CWSR_TBA_TMA_SIZE, >> +vma->vm_page_prot); } >> diff --git a/include/uapi/linux/kfd_ioctl.h >> b/include/uapi/linux/kfd_ioctl.h index 731d0df..7039f16 100644 >> --- a/include/uapi/linux/kfd_ioctl.h >> +++ b/include/uapi/linux/kfd_ioctl.h >> @@ -58,7 +58,8 @@ struct kfd_ioctl_create_queue_args { >> __u64 eop_buffer_address; /* to KFD */ >> __u64 eop_buffer_size; /* to KFD */ >> __u64 ctx_save_restore_address; /* to KFD */ >> - __u64 ctx_save_restore_size; /* to KFD */ >> + __u32 ctx_save_restore_size; /* to KFD */ >> + __u32 ctl_stack_size; /* to KFD */ >> }; >> >> struct kfd_ioctl_destroy_queue_args { >> -- >> 2.7.4 >> _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx