On Wed, Jan 23, 2019 at 02:00:53AM +0200, Oded Gabbay wrote:
> This patch adds the main flow for the user to submit work to the device.
>
> Each work is described by a command submission object (CS). The CS contains
> 3 arrays of command buffers: One for execution, and two for context-switch
> (store and restore).
>
> For each CB, the user specifies on which queue to put that CB. In case of
> an internal queue, the entry doesn't contain a pointer to the CB but the
> address in the on-chip memory that the CB resides at.
>
> The driver parses some of the CBs to enforce security restrictions.
>
> The user receives a sequence number that represents the CS object. The user
> can then query the driver regarding the status of the CS, using that
> sequence number.
>
> In case the CS doesn't finish before the timeout expires, the driver will
> perform a soft-reset of the device.
>
> Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
> ---
>  drivers/misc/habanalabs/Makefile             |    3 +-
>  drivers/misc/habanalabs/command_submission.c |  787 +++++++++++++
>  drivers/misc/habanalabs/context.c            |   52 +-
>  drivers/misc/habanalabs/device.c             |   16 +
>  drivers/misc/habanalabs/goya/goya.c          | 1082 ++++++++++++++++++
>  drivers/misc/habanalabs/habanalabs.h         |  274 +++++
>  drivers/misc/habanalabs/habanalabs_drv.c     |   23 +
>  drivers/misc/habanalabs/habanalabs_ioctl.c   |    4 +-
>  drivers/misc/habanalabs/hw_queue.c           |  250 ++++
>  drivers/misc/habanalabs/memory.c             |  200 ++++
>  include/uapi/misc/habanalabs.h               |  158 ++-
>  11 files changed, 2842 insertions(+), 7 deletions(-)
>  create mode 100644 drivers/misc/habanalabs/command_submission.c
>  create mode 100644 drivers/misc/habanalabs/memory.c
>
> diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
> index b5607233d216..d2fd0e18b1eb 100644
> --- a/drivers/misc/habanalabs/Makefile
> +++ b/drivers/misc/habanalabs/Makefile
> @@ -5,7 +5,8 @@
>  obj-m := habanalabs.o
>  
>  habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
> -		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o
> +		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
> +		command_submission.o
>  
>  include $(src)/goya/Makefile
>  habanalabs-y += $(HL_GOYA_FILES)
> diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
> new file mode 100644
> index 000000000000..0116c2262f17
> --- /dev/null
> +++ b/drivers/misc/habanalabs/command_submission.c
> @@ -0,0 +1,787 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright 2016-2018 HabanaLabs, Ltd.
> + * All Rights Reserved.
> + */
> +
> +#include <uapi/misc/habanalabs.h>
> +#include "habanalabs.h"
> +
> +#include <linux/sched/mm.h>
> +#include <linux/sched/task.h>
> +#include <linux/sched/signal.h>
> +#include <linux/wait.h>
> +#include <linux/mm.h>
> +#include <linux/highmem.h>
[ ... ]

> +static void cs_do_release(struct kref *ref)
> +{
> +	struct hl_cs *cs = container_of(ref, struct hl_cs,
> +						refcount);
> +	struct hl_device *hdev = cs->ctx->hdev;
> +	struct hl_cs_job *job, *tmp;
> +
> +	cs->completed = true;
> +
> +	/*
> +	 * Although if we reached here it means that all external jobs have
> +	 * finished, because each one of them took refcnt to CS, we still
> +	 * need to go over the internal jobs and free them. Otherwise, we
> +	 * will have leaked memory and what's worse, the CS object (and
> +	 * potentially the CTX object) could be released, while the JOB
> +	 * still holds a pointer to them (but no reference).
> +	 */
> +	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
> +		free_job(hdev, job);
> +
> +	/* We also need to update CI for internal queues */
> +	if (cs->submitted) {
> +		hl_int_hw_queue_update_ci(cs);
> +
> +		spin_lock(&hdev->hw_queues_mirror_lock);
> +		/* remove CS from hw_queues mirror list */
> +		list_del_init(&cs->mirror_node);
> +		spin_unlock(&hdev->hw_queues_mirror_lock);
> +
> +		/*
> +		 * Don't cancel TDR in case this CS was timedout because we
> +		 * might be running from the TDR context
> +		 */
> +		if ((!cs->timedout) &&
> +			(hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT)) {
> +			struct hl_cs *next;
> +
> +			if (cs->tdr_active)
> +				cancel_delayed_work_sync(&cs->work_tdr);
> +
> +			spin_lock(&hdev->hw_queues_mirror_lock);
> +			/* queue TDR for next CS */
> +			next = list_first_entry_or_null(
> +					&hdev->hw_queues_mirror_list,
> +					struct hl_cs, mirror_node);
> +			if ((next) && (!next->tdr_active)) {
> +				next->tdr_active = true;
> +				schedule_delayed_work(&next->work_tdr,
> +						hdev->timeout_jiffies);
> +				spin_unlock(&hdev->hw_queues_mirror_lock);
> +			} else {
> +				spin_unlock(&hdev->hw_queues_mirror_lock);
> +			}

The 'else' can be dropped: just move the spin_unlock() below the 'if'
so both paths share it.

> +		}
> +	}
> +
> +	hl_ctx_put(cs->ctx);
> +
> +	if (cs->timedout)
> +		dma_fence_set_error(cs->fence, -ETIMEDOUT);
> +	else if (cs->aborted)
> +		dma_fence_set_error(cs->fence, -EIO);
> +
> +	dma_fence_signal(cs->fence);
> +	dma_fence_put(cs->fence);
> +
> +	kfree(cs);
> +}

[ ... ]

> +static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
> +			struct hl_cs **cs_new)
> +{
> +	struct hl_dma_fence *fence;
> +	struct dma_fence *other = NULL;
> +	struct hl_cs *cs;
> +	int rc;
> +
> +	cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
> +	if (!cs)
> +		return -ENOMEM;

Does this ever run from a context that cannot use GFP_KERNEL? The same
applies to the other allocations below.

> +
> +	cs->ctx = ctx;
> +	cs->submitted = false;
> +	cs->completed = false;
> +	INIT_LIST_HEAD(&cs->job_list);
> +	INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
> +	kref_init(&cs->refcount);
> +	spin_lock_init(&cs->job_lock);
> +
> +	fence = kmalloc(sizeof(*fence), GFP_ATOMIC);

kzalloc?
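If this never runs from atomic context, something like

	fence = kzalloc(sizeof(*fence), GFP_KERNEL);

(untested, assumes process context only) would both zero the struct
and be friendlier to the allocator.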
> +	if (!fence) {
> +		rc = -ENOMEM;
> +		goto free_cs;
> +	}
> +
> +	fence->hdev = hdev;
> +	spin_lock_init(&fence->lock);
> +	cs->fence = &fence->base_fence;
> +
> +	spin_lock(&ctx->cs_lock);
> +
> +	fence->cs_seq = ctx->cs_sequence;
> +	other = ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)];
> +	if ((other) && (!dma_fence_is_signaled(other))) {
> +		spin_unlock(&ctx->cs_lock);
> +		rc = -EAGAIN;
> +		goto free_fence;
> +	}
> +
> +	dma_fence_init(&fence->base_fence, &hl_fence_ops, &fence->lock,
> +			ctx->asid, ctx->cs_sequence);
> +
> +	cs->sequence = fence->cs_seq;
> +
> +	ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)] =
> +							&fence->base_fence;
> +	ctx->cs_sequence++;
> +
> +	dma_fence_get(&fence->base_fence);
> +
> +	dma_fence_put(other);
> +
> +	spin_unlock(&ctx->cs_lock);
> +
> +	*cs_new = cs;
> +
> +	return 0;
> +
> +free_fence:
> +	kfree(fence);
> +free_cs:
> +	kfree(cs);
> +	return rc;
> +}
> +

[ ... ]

> +
> +static int goya_validate_cb(struct hl_device *hdev,
> +			struct hl_cs_parser *parser, bool is_mmu)
> +{
> +	u32 cb_parsed_length = 0;
> +	int rc = 0;
> +
> +	parser->patched_cb_size = 0;
> +
> +	/* cb_user_size is more than 0 so loop will always be executed */
> +	while ((cb_parsed_length < parser->user_cb_size) && (!rc)) {
> +		enum packet_id pkt_id;
> +		u16 pkt_size;
> +		void *user_pkt;
> +
> +		user_pkt = (void *) (parser->user_cb->kernel_address +
> +							cb_parsed_length);
> +
> +		pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
> +				PACKET_HEADER_PACKET_ID_MASK) >>
> +					PACKET_HEADER_PACKET_ID_SHIFT);
> +
> +		pkt_size = goya_packet_sizes[pkt_id];
> +		cb_parsed_length += pkt_size;
> +		if (cb_parsed_length > parser->user_cb_size) {
> +			dev_err(hdev->dev,
> +				"packet 0x%x is out of CB boundary\n", pkt_id);
> +			rc = -EINVAL;
> +			continue;

The '!rc' in the while condition is easy to miss. Please consider a
'break' here, and an 'if (rc) break;' after the switch; then the while
condition only has to track the parsed length (see the sketch after
this hunk).

> +		}
> +
> +		switch (pkt_id) {
> +		case PACKET_WREG_32:
> +			/*
> +			 * Although it is validated after copy in patch_cb(),
> +			 * need to validate here as well because patch_cb() is
> +			 * not called in MMU path while this function is called
> +			 */
> +			rc = goya_validate_wreg32(hdev, parser, user_pkt);
> +			break;
> +
> +		case PACKET_WREG_BULK:
> +			dev_err(hdev->dev,
> +				"User not allowed to use WREG_BULK\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_MSG_PROT:
> +			dev_err(hdev->dev,
> +				"User not allowed to use MSG_PROT\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_CP_DMA:
> +			dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_STOP:
> +			dev_err(hdev->dev, "User not allowed to use STOP\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_LIN_DMA:
> +			if (is_mmu)
> +				rc = goya_validate_dma_pkt_mmu(hdev, parser,
> +						user_pkt);
> +			else
> +				rc = goya_validate_dma_pkt_no_mmu(hdev, parser,
> +						user_pkt);
> +			break;
> +
> +		case PACKET_MSG_LONG:
> +		case PACKET_MSG_SHORT:
> +		case PACKET_FENCE:
> +		case PACKET_NOP:
> +			parser->patched_cb_size += pkt_size;
> +			break;
> +
> +		default:
> +			dev_err(hdev->dev, "Invalid packet header 0x%x\n",
> +				pkt_id);
> +			rc = -EINVAL;
> +			break;
> +		}
> +	}
> +
> +	/*
> +	 * The new CB should have space at the end for two MSG_PROT packets:
> +	 * 1. A packet that will act as a completion packet
> +	 * 2. A packet that will generate MSI-X interrupt
> +	 */
> +	parser->patched_cb_size += sizeof(struct packet_msg_prot) * 2;
> +
> +	return rc;
> +}

[ ... ]
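To illustrate the suggestion for goya_validate_cb() above, something
along these lines (untested, only to show the control flow):

	while (cb_parsed_length < parser->user_cb_size) {
		enum packet_id pkt_id;
		u16 pkt_size;
		void *user_pkt;

		/* decode pkt_id and pkt_size as in the original */

		cb_parsed_length += pkt_size;
		if (cb_parsed_length > parser->user_cb_size) {
			dev_err(hdev->dev,
				"packet 0x%x is out of CB boundary\n", pkt_id);
			rc = -EINVAL;
			break;
		}

		switch (pkt_id) {
		/* same cases as in the original, each setting rc */
		}

		if (rc)
			break;
	}

That way the loop condition is only about parsing progress and every
error exit is an explicit break.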
> +static int goya_patch_cb(struct hl_device *hdev,
> +				struct hl_cs_parser *parser)
> +{
> +	u32 cb_parsed_length = 0;
> +	u32 cb_patched_cur_length = 0;
> +	int rc = 0;
> +
> +	/* cb_user_size is more than 0 so loop will always be executed */
> +	while ((cb_parsed_length < parser->user_cb_size) && (!rc)) {
> +		enum packet_id pkt_id;
> +		u16 pkt_size;
> +		u32 new_pkt_size = 0;
> +		void *user_pkt, *kernel_pkt;
> +
> +		user_pkt = (void *) (parser->user_cb->kernel_address +
> +							cb_parsed_length);
> +		kernel_pkt = (void *) (parser->patched_cb->kernel_address +
> +							cb_patched_cur_length);
> +
> +		pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
> +				PACKET_HEADER_PACKET_ID_MASK) >>
> +					PACKET_HEADER_PACKET_ID_SHIFT);
> +
> +		pkt_size = goya_packet_sizes[pkt_id];
> +		cb_parsed_length += pkt_size;
> +		if (cb_parsed_length > parser->user_cb_size) {
> +			dev_err(hdev->dev,
> +				"packet 0x%x is out of CB boundary\n", pkt_id);
> +			rc = -EINVAL;
> +			continue;

Ditto, same loop restructuring as suggested for goya_validate_cb()
above.

> +		}
> +
> +		switch (pkt_id) {
> +		case PACKET_LIN_DMA:
> +			rc = goya_patch_dma_packet(hdev, parser, user_pkt,
> +						kernel_pkt, &new_pkt_size);
> +			cb_patched_cur_length += new_pkt_size;
> +			break;
> +
> +		case PACKET_WREG_32:
> +			memcpy(kernel_pkt, user_pkt, pkt_size);
> +			cb_patched_cur_length += pkt_size;
> +			rc = goya_validate_wreg32(hdev, parser, kernel_pkt);
> +			break;
> +
> +		case PACKET_WREG_BULK:
> +			dev_err(hdev->dev,
> +				"User not allowed to use WREG_BULK\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_MSG_PROT:
> +			dev_err(hdev->dev,
> +				"User not allowed to use MSG_PROT\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_CP_DMA:
> +			dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_STOP:
> +			dev_err(hdev->dev, "User not allowed to use STOP\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_MSG_LONG:
> +		case PACKET_MSG_SHORT:
> +		case PACKET_FENCE:
> +		case PACKET_NOP:
> +			memcpy(kernel_pkt, user_pkt, pkt_size);
> +			cb_patched_cur_length += pkt_size;
> +			break;
> +
> +		default:
> +			dev_err(hdev->dev, "Invalid packet header 0x%x\n",
> +				pkt_id);
> +			rc = -EINVAL;
> +			break;
> +		}
> +	}
> +
> +	return rc;
> +}

[ ... ]
>  static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id,
>  		u16 event_type, char *axi_name, int len)
>  {
> @@ -4645,6 +5677,48 @@ static void goya_disable_clock_gating(struct hl_device *hdev)
>  
>  }
>  
> +static bool goya_is_device_idle(struct hl_device *hdev)
> +{
> +	u64 offset, dma_qm_reg, tpc_qm_reg, tpc_cmdq_reg, tpc_cfg_reg;
> +	bool val = true;
> +	int i;
> +
> +	offset = mmDMA_QM_1_GLBL_STS0 - mmDMA_QM_0_GLBL_STS0;
> +
> +	for (i = 0 ; i < DMA_MAX_NUM ; i++) {
> +		dma_qm_reg = mmDMA_QM_0_GLBL_STS0 + i * offset;
> +
> +		val = val && ((RREG32(dma_qm_reg) & DMA_QM_IDLE_MASK) ==
> +				DMA_QM_IDLE_MASK);
> +	}
> +
> +	offset = mmTPC1_QM_GLBL_STS0 - mmTPC0_QM_GLBL_STS0;
> +
> +	for (i = 0 ; i < TPC_MAX_NUM ; i++) {
> +		tpc_qm_reg = mmTPC0_QM_GLBL_STS0 + i * offset;
> +		tpc_cmdq_reg = mmTPC0_CMDQ_GLBL_STS0 + i * offset;
> +		tpc_cfg_reg = mmTPC0_CFG_STATUS + i * offset;
> +
> +		val = val && ((RREG32(tpc_qm_reg) & TPC_QM_IDLE_MASK) ==
> +				TPC_QM_IDLE_MASK);
> +		val = val && ((RREG32(tpc_cmdq_reg) & TPC_CMDQ_IDLE_MASK) ==
> +				TPC_CMDQ_IDLE_MASK);
> +		val = val && ((RREG32(tpc_cfg_reg) & TPC_CFG_IDLE_MASK) ==
> +				TPC_CFG_IDLE_MASK);
> +	}
> +
> +	val = val && ((RREG32(mmMME_QM_GLBL_STS0) & MME_QM_IDLE_MASK) ==
> +			MME_QM_IDLE_MASK);
> +	val = val && ((RREG32(mmMME_CMDQ_GLBL_STS0) & MME_CMDQ_IDLE_MASK) ==
> +			MME_CMDQ_IDLE_MASK);
> +	val = val && ((RREG32(mmMME_ARCH_STATUS) & MME_ARCH_IDLE_MASK) ==
> +			MME_ARCH_IDLE_MASK);
> +	val = val && ((RREG32(mmMME_SHADOW_0_STATUS) & MME_SHADOW_IDLE_MASK) ==
> +			0);

Huh, these are neat, but IMHO a plain early return is more readable:

	if ((RREG32(reg) & mask) != mask)
		return false;

(with the test inverted for the SHADOW status, which compares against
0)...

> +
> +	return val;
> +}
> +

[ ... ]

-- 
Sincerely yours,
Mike.