On Mon, Feb 04, 2019 at 10:32:51PM +0200, Oded Gabbay wrote:
> From: Omer Shpigelman <oshpigel...@habana.ai>
>
> This patch adds the Virtual Memory and MMU modules.
>
> Goya has an internal MMU which provides process isolation on the internal
> DDR. The internal MMU also performs translations for transactions that go
> from Goya to the Host.
>
> The driver is responsible for allocating and freeing memory on the DDR
> upon user request. It also provides an interface to map and unmap DDR and
> Host memory to the device address space.
>
> Signed-off-by: Omer Shpigelman <oshpigel...@habana.ai>
> Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
> ---
> Changes in v3:
>  - Remove use of three exclamation marks
>  - Add optimization - support mapping for huge page host memory
>  - Minor bug fixes
>
>  drivers/misc/habanalabs/Makefile            |    2 +-
>  drivers/misc/habanalabs/context.c           |   19 +-
>  drivers/misc/habanalabs/device.c            |   20 +-
>  drivers/misc/habanalabs/goya/goya.c         |  393 +++++
>  drivers/misc/habanalabs/habanalabs.h        |  188 +-
>  drivers/misc/habanalabs/habanalabs_drv.c    |    2 +-
>  drivers/misc/habanalabs/habanalabs_ioctl.c  |    3 +-
>  .../include/hw_ip/mmu/mmu_general.h         |   45 +
>  .../habanalabs/include/hw_ip/mmu/mmu_v1_0.h |   15 +
>  drivers/misc/habanalabs/memory.c            | 1514 +++++++++++++++++
>  drivers/misc/habanalabs/mmu.c               |  604 +++++++
>  include/uapi/misc/habanalabs.h              |  122 +-
>  12 files changed, 2919 insertions(+), 8 deletions(-)
>  create mode 100644 drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
>  create mode 100644 drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h
>  create mode 100644 drivers/misc/habanalabs/mmu.c
>
> diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
> index 5167699701b9..2698bb6a2355 100644
> --- a/drivers/misc/habanalabs/Makefile
> +++ b/drivers/misc/habanalabs/Makefile
> @@ -6,7 +6,7 @@ obj-m := habanalabs.o
>
>  habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
> 		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
> -		command_submission.o
> +		command_submission.o mmu.o
>
>  include $(src)/goya/Makefile
>  habanalabs-y += $(HL_GOYA_FILES)
[ ... ]

> diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
> index 6536b40c63e0..cf0be4f254e0 100644
> --- a/drivers/misc/habanalabs/memory.c
> +++ b/drivers/misc/habanalabs/memory.c
> @@ -5,12 +5,1198 @@
>   * All Rights Reserved.
>   */
>
> +#include <uapi/misc/habanalabs.h>
>  #include "habanalabs.h"
> +#include "include/hw_ip/mmu/mmu_general.h"
>
>  #include <linux/sched.h>
>  #include <linux/uaccess.h>
>  #include <linux/genalloc.h>
>
> +#define PGS_IN_HPAGE (HPAGE_SIZE >> PAGE_SHIFT)
> +#define HL_MMU_DEBUG 0

[ ... ]

> +/*
> + * init_phys_pg_pack_from_userptr - initialize physical page pack from host
> + *                                  memory
> + *
> + * @ctx : current context
> + * @userptr : userptr to initialize from
> + * @pphys_pg_pack : res pointer
> + *
> + * This function does the following:
> + * - Pin the physical pages related to the given virtual block
> + * - Create a physical page pack from the physical pages related to the given
> + *   virtual block
> + */
> +static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
> +					struct hl_userptr *userptr,
> +					struct hl_vm_phys_pg_pack **pphys_pg_pack)
> +{
> +	struct hl_vm_phys_pg_pack *phys_pg_pack;
> +	struct scatterlist *sg;
> +	dma_addr_t dma_addr;
> +	u64 page_mask;
> +	u32 npages, total_npages, page_size = PAGE_SIZE;
> +	bool first = true, is_2mb_opt = true;

Now that you've mentioned 2M pages, I've started to wonder what would
happen if PAGE_SIZE != 4k and HPAGE_SIZE != 2M? There are architectures
that may have different sizes for both...
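If the device MMU always uses 4K/2M pages regardless of the host, maybe
define device-side constants and use them instead of PAGE_SIZE and
HPAGE_SIZE. Just a sketch, completely untested and the names are made
up:

	/* page sizes of the device MMU, independent of the host page size */
	#define DEVICE_PAGE_SIZE	SZ_4K
	#define DEVICE_HPAGE_SIZE	SZ_2M

	/* number of host pages in one device huge page */
	#define PGS_IN_DEVICE_HPAGE	(DEVICE_HPAGE_SIZE / PAGE_SIZE)

and then bail out of the huge page optimization whenever the host page
size does not divide the device huge page size:

	if (DEVICE_HPAGE_SIZE % PAGE_SIZE)
		is_2mb_opt = false;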
> +	int rc, i, j;
> +
> +	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
> +	if (!phys_pg_pack)
> +		return -ENOMEM;
> +
> +	phys_pg_pack->vm_type = userptr->vm_type;
> +	phys_pg_pack->created_from_userptr = true;
> +	phys_pg_pack->asid = ctx->asid;
> +	atomic_set(&phys_pg_pack->mapping_cnt, 1);
> +
> +	/* Only if all dma_addrs are aligned to 2MB and their
> +	 * sizes is at least 2MB, we can use huge page mapping.
> +	 * We limit the 2MB optimization to this condition,
> +	 * since later on we acquire the related VA range as one
> +	 * consecutive block.
> +	 */
> +	total_npages = 0;
> +	for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
> +		npages = get_sg_info(sg, &dma_addr);
> +
> +		total_npages += npages;
> +
> +		if (first) {
> +			first = false;
> +			dma_addr &= HPAGE_MASK;
> +		}
> +
> +		if ((npages % PGS_IN_HPAGE) || (dma_addr & (HPAGE_SIZE - 1)))
> +			is_2mb_opt = false;
> +	}
> +
> +	if (is_2mb_opt) {
> +		page_size = HPAGE_SIZE;
> +		total_npages /= PGS_IN_HPAGE;
> +	}
> +
> +	page_mask = ~(((u64) page_size) - 1);
> +
> +	phys_pg_pack->pages = kcalloc(total_npages, sizeof(u64), GFP_KERNEL);
> +	if (!phys_pg_pack->pages) {
> +		rc = -ENOMEM;
> +		goto page_pack_arr_mem_err;
> +	}
> +
> +	phys_pg_pack->npages = total_npages;
> +	phys_pg_pack->page_size = page_size;
> +	phys_pg_pack->total_size = total_npages * page_size;
> +
> +	j = 0;
> +	first = true;
> +	for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
> +		npages = get_sg_info(sg, &dma_addr);
> +
> +		/* align down to physical page size and save the offset */
> +		if (first) {
> +			first = false;
> +			phys_pg_pack->offset = dma_addr & (page_size - 1);
> +			dma_addr &= page_mask;
> +		}
> +
> +		while (npages) {
> +			phys_pg_pack->pages[j++] = dma_addr;
> +			dma_addr += page_size;
> +
> +			if (is_2mb_opt)
> +				npages -= PGS_IN_HPAGE;
> +			else
> +				npages--;
> +		}
> +	}
> +
> +	*pphys_pg_pack = phys_pg_pack;
> +
> +	return 0;
> +
> +page_pack_arr_mem_err:
> +	kfree(phys_pg_pack);
> +
> +	return rc;
> +}
> +

[ ... ]

> diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/mmu.c
> new file mode 100644
> index 000000000000..054b72e48a49
> --- /dev/null
> +++ b/drivers/misc/habanalabs/mmu.c
> @@ -0,0 +1,604 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright 2016-2018 HabanaLabs, Ltd.
> + * All Rights Reserved.
> + */
> +
> +#include "habanalabs.h"
> +#include "include/hw_ip/mmu/mmu_general.h"
> +
> +#include <linux/genalloc.h>
> +
> +#define HOP_NOT_FREED 0
> +#define HOP_FREED 1
> +
> +static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 addr)
> +{
> +	struct pgt_info *pgt_info = NULL;
> +
> +	hash_for_each_possible(ctx->mmu_hash, pgt_info, node,
> +				(unsigned long) addr)
> +		if (addr == pgt_info->addr)
> +			break;
> +
> +	return pgt_info;
> +}
> +
> +static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
> +{
> +	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
> +
> +	gen_pool_free(pgt_info->ctx->hdev->mmu_pgt_pool, pgt_info->addr,
> +			ctx->hdev->asic_prop.mmu_hop_table_size);
> +	hash_del(&pgt_info->node);
> +
> +	kfree(pgt_info);
> +}
> +
> +static u64 alloc_hop(struct hl_ctx *ctx)
> +{
> +	struct hl_device *hdev = ctx->hdev;
> +	struct pgt_info *pgt_info;
> +	u64 addr;
> +
> +	pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
> +	if (!pgt_info)
> +		return ULLONG_MAX;
> +
> +	addr = (u64) gen_pool_alloc(hdev->mmu_pgt_pool,
> +			hdev->asic_prop.mmu_hop_table_size);
> +	if (!addr) {
> +		dev_err(hdev->dev, "failed to allocate page\n");
> +		kfree(pgt_info);
> +		return ULLONG_MAX;
> +	}
> +
> +	pgt_info->addr = addr;
> +	pgt_info->ctx = ctx;
> +	pgt_info->num_of_ptes = 0;
> +	hash_add(ctx->mmu_hash, &pgt_info->node, addr);
> +
> +	return addr;
> +}
> +
> +static inline void clear_pte(struct hl_device *hdev, u64 pte_addr)
> +{
> +	/* clear the last and present bits */
> +	hdev->asic_funcs->write_pte(hdev, pte_addr, 0);
> +}
> +
> +static inline void inc_num_of_ptes(struct hl_ctx *ctx, u64 hop_addr)
> +{
> +	get_pgt_info(ctx, hop_addr)->num_of_ptes++;
> +}
> +
> +/*
> + * dec_num_of_ptes - decrement the num of ptes and free the hop if possible
> + *
> + * @ctx: pointer to the context structure
> + * @hop_addr: addr of the hop
> + *
> + * This function returns HOP_FREED if the hop was freed or HOP_NOT_FREED if the
> + * num of ptes was decremented without freeing the hop.
> + */
> +static inline int dec_num_of_ptes(struct hl_ctx *ctx, u64 hop_addr)

I think it's worth emphasizing that this function may free the hop
itself, not just a pte. Maybe even use {get,put}_pte instead of
{inc,dec}_num_of_ptes.

> +{
> +	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
> +
> +	pgt_info->num_of_ptes--;
> +
> +	if (pgt_info->num_of_ptes == 0) {
> +		free_hop(ctx, hop_addr);
> +		return HOP_FREED;
> +	}
> +
> +	return HOP_NOT_FREED;

Why not simply return pgt_info->num_of_ptes?
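Something like this, perhaps (untested; note that the count has to be
read before free_hop() releases the pgt_info):

	/*
	 * put_pte - decrement the num of ptes in a hop and free the hop when
	 * no ptes are left, i.e. a return value of 0 means the hop is gone
	 */
	static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
	{
		struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
		int num_of_ptes_left;

		pgt_info->num_of_ptes--;

		num_of_ptes_left = pgt_info->num_of_ptes;
		if (!num_of_ptes_left)
			free_hop(ctx, hop_addr);

		return num_of_ptes_left;
	}

Then HOP_FREED/HOP_NOT_FREED can go away and the callers simply check
for 0.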
> +}
> +
> +static inline u64 get_hop0_addr(struct hl_ctx *ctx)
> +{
> +	return ctx->hdev->asic_prop.mmu_pgt_addr +
> +			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
> +}
> +
> +static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
> +					u64 virt_addr)
> +{
> +	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
> +			((virt_addr & HOP0_MASK) >> HOP0_SHIFT);
> +}
> +
> +static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
> +					u64 virt_addr)
> +{
> +	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
> +			((virt_addr & HOP1_MASK) >> HOP1_SHIFT);
> +}
> +
> +static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
> +					u64 virt_addr)
> +{
> +	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
> +			((virt_addr & HOP2_MASK) >> HOP2_SHIFT);
> +}
> +
> +static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
> +					u64 virt_addr)
> +{
> +	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
> +			((virt_addr & HOP3_MASK) >> HOP3_SHIFT);
> +}
> +
> +static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
> +					u64 virt_addr)
> +{
> +	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
> +			((virt_addr & HOP4_MASK) >> HOP4_SHIFT);
> +}

These are almost clones of each other. I'd factor out

	static inline u64 get_hopN_pte(struct hl_ctx *ctx, u64 hop_addr,
					u64 vaddr, u64 mask, u64 shift);

and alias it for each level.
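i.e. something like (untested):

	static inline u64 get_hopN_pte(struct hl_ctx *ctx, u64 hop_addr,
					u64 vaddr, u64 mask, u64 shift)
	{
		return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
				((vaddr & mask) >> shift);
	}

	static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
						u64 vaddr)
	{
		return get_hopN_pte(ctx, hop_addr, vaddr, HOP0_MASK,
					HOP0_SHIFT);
	}

	/* ... and the same one-liner for hops 1-4 */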
> +static inline u64 get_next_hop_addr(u64 curr_pte)
> +{
> +	if (curr_pte & PAGE_PRESENT_MASK)
> +		return curr_pte & PHYS_ADDR_MASK;
> +	else
> +		return ULLONG_MAX;
> +}
> +
> +/*
> + * hl_mmu_init - init the mmu module
> + *
> + * @hdev: pointer to the habanalabs device structure
> + *
> + * This function does the following:
> + * - Allocate max_asid zeroed hop0 pgts so no mapping is available
> + * - Enable mmu in hw
> + * - Invalidate the mmu cache
> + * - Create a pool of pages for pgts
> + * - Returns 0 on success
> + *
> + * This function depends on DMA QMAN to be working!
> + */
> +int hl_mmu_init(struct hl_device *hdev)
> +{
> +	struct asic_fixed_properties *prop = &hdev->asic_prop;
> +	int rc;
> +
> +	if (!hdev->mmu_enable)
> +		return 0;
> +
> +	/* MMU HW init was already done in device hw_init() */
> +
> +	mutex_init(&hdev->mmu_cache_lock);
> +
> +	hdev->mmu_pgt_pool =
> +			gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
> +
> +	if (!hdev->mmu_pgt_pool) {
> +		dev_err(hdev->dev, "Failed to create page gen pool\n");
> +		rc = -ENOMEM;
> +		goto err_pool_create;
> +	}
> +
> +	rc = gen_pool_add(hdev->mmu_pgt_pool, prop->mmu_pgt_addr +
> +			prop->mmu_hop0_tables_total_size,
> +			prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
> +			-1);
> +	if (rc) {
> +		dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
> +		goto err_pool_add;
> +	}
> +
> +	return 0;
> +
> +err_pool_add:
> +	gen_pool_destroy(hdev->mmu_pgt_pool);
> +err_pool_create:
> +	mutex_destroy(&hdev->mmu_cache_lock);
> +
> +	return rc;
> +}
> +
> +/*
> + * hl_mmu_fini - release the mmu module.
> + *
> + * @hdev: pointer to the habanalabs device structure
> + *
> + * This function does the following:
> + * - Disable mmu in hw
> + * - free the pgts pool
> + *
> + * All ctxs should be freed before calling this func
> + */
> +void hl_mmu_fini(struct hl_device *hdev)
> +{
> +	if (!hdev->mmu_enable)
> +		return;
> +
> +	gen_pool_destroy(hdev->mmu_pgt_pool);
> +
> +	mutex_destroy(&hdev->mmu_cache_lock);
> +
> +	/* MMU HW fini will be done in device hw_fini() */
> +}
> +
> +/*
> + * hl_mmu_ctx_init - init a ctx for using the mmu module
> + *
> + * @ctx: pointer to the context structure
> + *
> + * This function does the following:
> + * - Init a mutex to protect the concurrent mapping flow
> + * - Init a hash to hold all pgts related to this ctx
> + */
> +void hl_mmu_ctx_init(struct hl_ctx *ctx)
> +{
> +	if (!ctx->hdev->mmu_enable)
> +		return;
> +
> +	mutex_init(&ctx->mmu_lock);
> +	hash_init(ctx->mmu_hash);
> +}
> +
> +/*
> + * hl_mmu_ctx_fini - disable a ctx from using the mmu module
> + *
> + * @ctx: pointer to the context structure
> + *
> + * This function does the following:
> + * - Free any pgts which were not freed yet
> + * - Free the mutex
> + */
> +void hl_mmu_ctx_fini(struct hl_ctx *ctx)
> +{
> +	struct pgt_info *pgt_info;
> +	struct hlist_node *tmp;
> +	int i;
> +
> +	if (!ctx->hdev->mmu_enable)
> +		return;
> +
> +	if (!hash_empty(ctx->mmu_hash))
> +		dev_err(ctx->hdev->dev,
> +			"ctx is freed while it has pgts in use\n");
> +
> +	hash_for_each_safe(ctx->mmu_hash, i, tmp, pgt_info, node) {
> +		dev_err(ctx->hdev->dev,
> +			"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
> +			pgt_info->addr, ctx->asid, pgt_info->num_of_ptes);
> +		free_hop(ctx, pgt_info->addr);
> +	}
> +
> +	mutex_destroy(&ctx->mmu_lock);
> +}
> +
> +/*
> + * hl_mmu_map - maps a virtual addr to physical addr
> + *
> + * @ctx: pointer to the context structure
> + * @virt_addr: virt addr to map from
> + * @phys_addr: phys addr to map to
> + * @page_size: physical page size
> + *
> + * This function does the following:
> + * - Check that the virt addr is not mapped
> + * - Allocate pgts as necessary in order to map the virt addr to the phys
> + * - Returns 0 on success, -EINVAL if addr is already mapped, or -ENOMEM.
> + *
> + * Because this function changes the page tables in the device and because it
> + * changes the MMU hash, it must be protected by a lock.
> + * However, because it maps only a single page, the lock should be implemented
> + * in a higher level in order to protect the entire mapping of the memory area
> + */
> +int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size)
> +{
> +	struct hl_device *hdev = ctx->hdev;
> +	u64 hop0_addr = 0, hop0_pte_addr = 0,
> +		hop1_addr = 0, hop1_pte_addr = 0,
> +		hop2_addr = 0, hop2_pte_addr = 0,
> +		hop3_addr = 0, hop3_pte_addr = 0,
> +		hop4_addr = 0, hop4_pte_addr = 0,
> +		curr_pte = 0;
> +	int hop1_new = 0, hop2_new = 0, hop3_new = 0, hop4_new = 0,
> +		rc = -ENOMEM;
> +	bool is_huge;
> +
> +	if (!hdev->mmu_enable)
> +		return 0;
> +
> +	/*
> +	 * This mapping function can map a 4KB/2MB page. For 2MB page there are
> +	 * only 3 hops rather than 4. Currently the DRAM allocation uses 2MB
> +	 * pages only but user memory could have been allocated with one of the
> +	 * two page sizes. Since this is a common code for all the three cases,
> +	 * we need this hugs page check.
> +	 */
> +	is_huge = page_size == PAGE_SIZE_2MB;
> +
> +	hop0_addr = get_hop0_addr(ctx);
> +
> +	hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
> +
> +	curr_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
> +
> +	hop1_addr = get_next_hop_addr(curr_pte);
> +
> +	if (hop1_addr == ULLONG_MAX) {
> +		hop1_addr = alloc_hop(ctx);
> +		if (hop1_addr == ULLONG_MAX)
> +			goto err;
> +		else
> +			hop1_new = 1;
> +	}

The allocation can be folded into get_alloc_next_hop_addr() along with
get_next_hop_addr().
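e.g. (untested):

	/* returns ULLONG_MAX when the next hop is not present and cannot be
	 * allocated; *is_new_hop is set when a new hop was allocated */
	static u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
						int *is_new_hop)
	{
		u64 hop_addr = get_next_hop_addr(curr_pte);

		if (hop_addr == ULLONG_MAX) {
			hop_addr = alloc_hop(ctx);
			*is_new_hop = (hop_addr != ULLONG_MAX);
		}

		return hop_addr;
	}

which would turn each of the four blocks here into

	hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
	if (hop1_addr == ULLONG_MAX)
		goto err;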
> +
> +	hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
> +
> +	curr_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
> +
> +	hop2_addr = get_next_hop_addr(curr_pte);
> +
> +	if (hop2_addr == ULLONG_MAX) {
> +		hop2_addr = alloc_hop(ctx);
> +		if (hop2_addr == ULLONG_MAX)
> +			goto err;
> +		else
> +			hop2_new = 1;
> +	}
> +
> +	hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
> +
> +	curr_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
> +
> +	hop3_addr = get_next_hop_addr(curr_pte);
> +
> +	if (hop3_addr == ULLONG_MAX) {
> +		hop3_addr = alloc_hop(ctx);
> +		if (hop3_addr == ULLONG_MAX)
> +			goto err;
> +		else
> +			hop3_new = 1;
> +	}
> +
> +	hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
> +
> +	curr_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
> +
> +	if (!is_huge) {
> +		hop4_addr = get_next_hop_addr(curr_pte);
> +
> +		if (hop4_addr == ULLONG_MAX) {
> +			hop4_addr = alloc_hop(ctx);
> +			if (hop4_addr == ULLONG_MAX)
> +				goto err;
> +			else
> +				hop4_new = 1;
> +		}
> +
> +		hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
> +
> +		curr_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
> +	}
> +
> +	if (curr_pte & PAGE_PRESENT_MASK) {
> +		dev_err(hdev->dev,
> +			"mapping already exists for virt_addr 0x%llx\n",
> +			virt_addr);
> +
> +		dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n",
> +			hdev->asic_funcs->read_pte(hdev, hop0_pte_addr),
> +			hop0_pte_addr);
> +		dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n",
> +			hdev->asic_funcs->read_pte(hdev, hop1_pte_addr),
> +			hop1_pte_addr);
> +		dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n",
> +			hdev->asic_funcs->read_pte(hdev, hop2_pte_addr),
> +			hop2_pte_addr);
> +		dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n",
> +			hdev->asic_funcs->read_pte(hdev, hop3_pte_addr),
> +			hop3_pte_addr);
> +
> +		if (!is_huge)
> +			dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n",
> +				hdev->asic_funcs->read_pte(hdev,
> +							hop4_pte_addr),
> +				hop4_pte_addr);
> +
> +		rc = EINVAL;
> +		goto err;
> +	}
> +
> +	curr_pte = (phys_addr & PTE_PHYS_ADDR_MASK) | LAST_MASK
> +			| PAGE_PRESENT_MASK;
> +
> +	hdev->asic_funcs->write_pte(hdev,
> +				is_huge ? hop3_pte_addr : hop4_pte_addr,
> +				curr_pte);
> +
> +	if (hop1_new) {
> +		curr_pte = (hop1_addr & PTE_PHYS_ADDR_MASK) |
> +				PAGE_PRESENT_MASK;
> +		ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop0_pte_addr,
> +				curr_pte);
> +	}
> +	if (hop2_new) {
> +		curr_pte = (hop2_addr & PTE_PHYS_ADDR_MASK) |
> +				PAGE_PRESENT_MASK;
> +		ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop1_pte_addr,
> +				curr_pte);
> +		inc_num_of_ptes(ctx, hop1_addr);
> +	}
> +	if (hop3_new) {
> +		curr_pte = (hop3_addr & PTE_PHYS_ADDR_MASK) |
> +				PAGE_PRESENT_MASK;
> +		ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop2_pte_addr,
> +				curr_pte);
> +		inc_num_of_ptes(ctx, hop2_addr);
> +	}
> +
> +	if (!is_huge) {
> +		if (hop4_new) {
> +			curr_pte = (hop4_addr & PTE_PHYS_ADDR_MASK) |
> +					PAGE_PRESENT_MASK;
> +			ctx->hdev->asic_funcs->write_pte(ctx->hdev,
> +					hop3_pte_addr, curr_pte);
> +			inc_num_of_ptes(ctx, hop3_addr);
> +		}
> +
> +		inc_num_of_ptes(ctx, hop4_addr);
> +	} else
> +		inc_num_of_ptes(ctx, hop3_addr);

Wouldn't hop3 be incremented twice for 2M pages?

> +
> +	/* flush all writes from all cores to reach PCI */
> +	mb();
> +
> +	hdev->asic_funcs->read_pte(hdev,
> +				is_huge ? hop3_pte_addr : hop4_pte_addr);
> +
> +	return 0;
> +
> +err:
> +	if (hop4_new)
> +		free_hop(ctx, hop4_addr);
> +	if (hop3_new)
> +		free_hop(ctx, hop3_addr);
> +	if (hop2_new)
> +		free_hop(ctx, hop2_addr);
> +	if (hop1_new)
> +		free_hop(ctx, hop1_addr);
> +
> +	return rc;
> +}
> +
> +/*
> + * hl_mmu_unmap - unmaps a virtual addr
> + *
> + * @ctx: pointer to the context structure
> + * @virt_addr: virt addr to map from
> + *
> + * This function does the following:
> + * - Check that the virt addr is mapped
> + * - Unmap the vurt addr and frees pgts if possible
> + * - Returns 0 on success, -EINVAL if the given addr is not mapped
> + *
> + * Because this function changes the page tables in the device and because it
> + * changes the MMU hash, it must be protected by a lock.
> + * However, because it maps only a single page, the lock should be implemented
> + * in a higher level in order to protect the entire mapping of the memory area
> + */
> +int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
> +{
> +	struct hl_device *hdev = ctx->hdev;
> +	u64 hop0_addr = 0, hop0_pte_addr = 0,
> +		hop1_addr = 0, hop1_pte_addr = 0,
> +		hop2_addr = 0, hop2_pte_addr = 0,
> +		hop3_addr = 0, hop3_pte_addr = 0,
> +		hop4_addr = 0, hop4_pte_addr = 0,
> +		curr_pte;
> +	int clear_hop3 = 1;
> +
> +	if (!hdev->mmu_enable)
> +		return 0;
> +
> +	hop0_addr = get_hop0_addr(ctx);
> +
> +	hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
> +
> +	curr_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
> +
> +	hop1_addr = get_next_hop_addr(curr_pte);
> +
> +	if (hop1_addr == ULLONG_MAX)
> +		goto not_mapped;
> +
> +	hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
> +
> +	curr_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
> +
> +	hop2_addr = get_next_hop_addr(curr_pte);
> +
> +	if (hop2_addr == ULLONG_MAX)
> +		goto not_mapped;
> +
> +	hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
> +
> +	curr_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
> +
> +	hop3_addr = get_next_hop_addr(curr_pte);
> +
> +	if (hop3_addr == ULLONG_MAX)
> +		goto not_mapped;
> +
> +	hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
> +
> +	curr_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
> +
> +	if (!(curr_pte & LAST_MASK)) {
> +		hop4_addr = get_next_hop_addr(curr_pte);
> +
> +		if (hop4_addr == ULLONG_MAX)
> +			goto not_mapped;
> +
> +		hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
> +
> +		curr_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
> +
> +		clear_hop3 = 0;
> +	}
> +
> +	if (!(curr_pte & PAGE_PRESENT_MASK))
> +		goto not_mapped;
> +
> +	clear_pte(hdev, hop4_addr ? hop4_pte_addr : hop3_pte_addr);
> +
> +	if (hop4_addr && dec_num_of_ptes(ctx, hop4_addr) == HOP_FREED)
> +		clear_hop3 = 1;
> +
> +	if (clear_hop3) {
> +		clear_pte(hdev, hop3_pte_addr);
> +		if (dec_num_of_ptes(ctx, hop3_addr) == HOP_FREED) {
> +			clear_pte(hdev, hop2_pte_addr);
> +			if (dec_num_of_ptes(ctx, hop2_addr) == HOP_FREED) {
> +				clear_pte(hdev, hop1_pte_addr);
> +				if (dec_num_of_ptes(ctx, hop1_addr) ==
> +						HOP_FREED)
> +					clear_pte(hdev, hop0_pte_addr);
> +			}
> +		}
> +	}

Consider inverting the logic and doing something like

	if (dec_num_of_ptes(...) != HOP_FREED)
		goto flush;
	clear_pte(...);
	if (dec_num_of_ptes(...) != HOP_FREED)
		goto flush;
	...
flush:

> +
> +	/* flush all writes from all cores to reach PCI */
> +	mb();
> +
> +	hdev->asic_funcs->read_pte(hdev,
> +				hop4_addr ? hop4_pte_addr : hop3_pte_addr);
> +
> +	return 0;
> +
> +not_mapped:
> +	dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
> +		virt_addr);
> +
> +	return -EINVAL;
> +}

[ ... ]

> --
> 2.17.1
>

-- 
Sincerely yours,
Mike.