On Wed, Jan 23, 2019 at 02:00:53AM +0200, Oded Gabbay wrote:
> This patch adds the main flow for the user to submit work to the device.
>
> Each work is described by a command submission object (CS). The CS contains
> 3 arrays of command buffers: One for execution, and two for context-switch
> (store and restore).
>
> For each CB, the user specifies on which queue to put that CB. In case of
> an internal queue, the entry doesn't contain a pointer to the CB but the
> address in the on-chip memory that the CB resides at.
>
> The driver parses some of the CBs to enforce security restrictions.
>
> The user receives a sequence number that represents the CS object. The user
> can then query the driver regarding the status of the CS, using that
> sequence number.
>
> In case the CS doesn't finish before the timeout expires, the driver will
> perform a soft-reset of the device.
>
> Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
> ---
>  drivers/misc/habanalabs/Makefile             |    3 +-
>  drivers/misc/habanalabs/command_submission.c |  787 +++++++++++++
>  drivers/misc/habanalabs/context.c            |   52 +-
>  drivers/misc/habanalabs/device.c             |   16 +
>  drivers/misc/habanalabs/goya/goya.c          | 1082 ++++++++++++++++++
>  drivers/misc/habanalabs/habanalabs.h         |  274 +++++
>  drivers/misc/habanalabs/habanalabs_drv.c     |   23 +
>  drivers/misc/habanalabs/habanalabs_ioctl.c   |    4 +-
>  drivers/misc/habanalabs/hw_queue.c           |  250 ++++
>  drivers/misc/habanalabs/memory.c             |  200 ++++
>  include/uapi/misc/habanalabs.h               |  158 ++-
>  11 files changed, 2842 insertions(+), 7 deletions(-)
>  create mode 100644 drivers/misc/habanalabs/command_submission.c
>  create mode 100644 drivers/misc/habanalabs/memory.c
>
> diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
> index b5607233d216..d2fd0e18b1eb 100644
> --- a/drivers/misc/habanalabs/Makefile
> +++ b/drivers/misc/habanalabs/Makefile
> @@ -5,7 +5,8 @@
>  obj-m := habanalabs.o
>  
>  habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
> -		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o
> +		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
> +		command_submission.o
>  
>  include $(src)/goya/Makefile
>  habanalabs-y += $(HL_GOYA_FILES)
> diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
> new file mode 100644
> index 000000000000..0116c2262f17
> --- /dev/null
> +++ b/drivers/misc/habanalabs/command_submission.c
> @@ -0,0 +1,787 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright 2016-2018 HabanaLabs, Ltd.
> + * All Rights Reserved.
> + */
> +
> +#include <uapi/misc/habanalabs.h>
> +#include "habanalabs.h"
> +
> +#include <linux/sched/mm.h>
> +#include <linux/sched/task.h>
> +#include <linux/sched/signal.h>
> +#include <linux/wait.h>
> +#include <linux/mm.h>
> +#include <linux/highmem.h>
[ ... ]

> +static void cs_do_release(struct kref *ref)
> +{
> +	struct hl_cs *cs = container_of(ref, struct hl_cs,
> +						refcount);
> +	struct hl_device *hdev = cs->ctx->hdev;
> +	struct hl_cs_job *job, *tmp;
> +
> +	cs->completed = true;
> +
> +	/*
> +	 * Although if we reached here it means that all external jobs have
> +	 * finished, because each one of them took refcnt to CS, we still
> +	 * need to go over the internal jobs and free them. Otherwise, we
> +	 * will have leaked memory and what's worse, the CS object (and
> +	 * potentially the CTX object) could be released, while the JOB
> +	 * still holds a pointer to them (but no reference).
> +	 */
> +	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
> +		free_job(hdev, job);
> +
> +	/* We also need to update CI for internal queues */
> +	if (cs->submitted) {
> +		hl_int_hw_queue_update_ci(cs);
> +
> +		spin_lock(&hdev->hw_queues_mirror_lock);
> +		/* remove CS from hw_queues mirror list */
> +		list_del_init(&cs->mirror_node);
> +		spin_unlock(&hdev->hw_queues_mirror_lock);
> +
> +		/*
> +		 * Don't cancel TDR in case this CS was timedout because we
> +		 * might be running from the TDR context
> +		 */
> +		if ((!cs->timedout) &&
> +			(hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT)) {
> +			struct hl_cs *next;
> +
> +			if (cs->tdr_active)
> +				cancel_delayed_work_sync(&cs->work_tdr);
> +
> +			spin_lock(&hdev->hw_queues_mirror_lock);
> +			/* queue TDR for next CS */
> +			next = list_first_entry_or_null(
> +					&hdev->hw_queues_mirror_list,
> +					struct hl_cs, mirror_node);
> +			if ((next) && (!next->tdr_active)) {
> +				next->tdr_active = true;
> +				schedule_delayed_work(&next->work_tdr,
> +						hdev->timeout_jiffies);
> +				spin_unlock(&hdev->hw_queues_mirror_lock);
> +			} else {
> +				spin_unlock(&hdev->hw_queues_mirror_lock);
> +			}

The 'else' can be dropped: just move the spin_unlock() below the 'if'
so both paths share it.

> +		}
> +	}
> +
> +	hl_ctx_put(cs->ctx);
> +
> +	if (cs->timedout)
> +		dma_fence_set_error(cs->fence, -ETIMEDOUT);
> +	else if (cs->aborted)
> +		dma_fence_set_error(cs->fence, -EIO);
> +
> +	dma_fence_signal(cs->fence);
> +	dma_fence_put(cs->fence);
> +
> +	kfree(cs);
> +}

[ ... ]

> +static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
> +			struct hl_cs **cs_new)
> +{
> +	struct hl_dma_fence *fence;
> +	struct dma_fence *other = NULL;
> +	struct hl_cs *cs;
> +	int rc;
> +
> +	cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
> +	if (!cs)
> +		return -ENOMEM;

Does this ever run from a context that cannot use GFP_KERNEL? The same
applies to the other allocations below.

> +
> +	cs->ctx = ctx;
> +	cs->submitted = false;
> +	cs->completed = false;
> +	INIT_LIST_HEAD(&cs->job_list);
> +	INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
> +	kref_init(&cs->refcount);
> +	spin_lock_init(&cs->job_lock);
> +
> +	fence = kmalloc(sizeof(*fence), GFP_ATOMIC);

kzalloc?
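If this never runs from atomic context, something like

	fence = kzalloc(sizeof(*fence), GFP_KERNEL);

(untested, assumes process context only) would both zero the struct
and be friendlier to the allocator.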
> +	if (!fence) {
> +		rc = -ENOMEM;
> +		goto free_cs;
> +	}
> +
> +	fence->hdev = hdev;
> +	spin_lock_init(&fence->lock);
> +	cs->fence = &fence->base_fence;
> +
> +	spin_lock(&ctx->cs_lock);
> +
> +	fence->cs_seq = ctx->cs_sequence;
> +	other = ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)];
> +	if ((other) && (!dma_fence_is_signaled(other))) {
> +		spin_unlock(&ctx->cs_lock);
> +		rc = -EAGAIN;
> +		goto free_fence;
> +	}
> +
> +	dma_fence_init(&fence->base_fence, &hl_fence_ops, &fence->lock,
> +			ctx->asid, ctx->cs_sequence);
> +
> +	cs->sequence = fence->cs_seq;
> +
> +	ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)] =
> +							&fence->base_fence;
> +	ctx->cs_sequence++;
> +
> +	dma_fence_get(&fence->base_fence);
> +
> +	dma_fence_put(other);
> +
> +	spin_unlock(&ctx->cs_lock);
> +
> +	*cs_new = cs;
> +
> +	return 0;
> +
> +free_fence:
> +	kfree(fence);
> +free_cs:
> +	kfree(cs);
> +	return rc;
> +}
> +

[ ... ]

> +
> +static int goya_validate_cb(struct hl_device *hdev,
> +			struct hl_cs_parser *parser, bool is_mmu)
> +{
> +	u32 cb_parsed_length = 0;
> +	int rc = 0;
> +
> +	parser->patched_cb_size = 0;
> +
> +	/* cb_user_size is more than 0 so loop will always be executed */
> +	while ((cb_parsed_length < parser->user_cb_size) && (!rc)) {
> +		enum packet_id pkt_id;
> +		u16 pkt_size;
> +		void *user_pkt;
> +
> +		user_pkt = (void *) (parser->user_cb->kernel_address +
> +							cb_parsed_length);
> +
> +		pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
> +				PACKET_HEADER_PACKET_ID_MASK) >>
> +					PACKET_HEADER_PACKET_ID_SHIFT);
> +
> +		pkt_size = goya_packet_sizes[pkt_id];
> +		cb_parsed_length += pkt_size;
> +		if (cb_parsed_length > parser->user_cb_size) {
> +			dev_err(hdev->dev,
> +				"packet 0x%x is out of CB boundary\n", pkt_id);
> +			rc = -EINVAL;
> +			continue;

The '!rc' in the while condition is easy to miss. Please consider a
'break' here, and an 'if (rc) break;' after the switch; then the while
condition only has to track the parsed length (see the sketch after
this hunk).

> +		}
> +
> +		switch (pkt_id) {
> +		case PACKET_WREG_32:
> +			/*
> +			 * Although it is validated after copy in patch_cb(),
> +			 * need to validate here as well because patch_cb() is
> +			 * not called in MMU path while this function is called
> +			 */
> +			rc = goya_validate_wreg32(hdev, parser, user_pkt);
> +			break;
> +
> +		case PACKET_WREG_BULK:
> +			dev_err(hdev->dev,
> +				"User not allowed to use WREG_BULK\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_MSG_PROT:
> +			dev_err(hdev->dev,
> +				"User not allowed to use MSG_PROT\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_CP_DMA:
> +			dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_STOP:
> +			dev_err(hdev->dev, "User not allowed to use STOP\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_LIN_DMA:
> +			if (is_mmu)
> +				rc = goya_validate_dma_pkt_mmu(hdev, parser,
> +						user_pkt);
> +			else
> +				rc = goya_validate_dma_pkt_no_mmu(hdev, parser,
> +						user_pkt);
> +			break;
> +
> +		case PACKET_MSG_LONG:
> +		case PACKET_MSG_SHORT:
> +		case PACKET_FENCE:
> +		case PACKET_NOP:
> +			parser->patched_cb_size += pkt_size;
> +			break;
> +
> +		default:
> +			dev_err(hdev->dev, "Invalid packet header 0x%x\n",
> +				pkt_id);
> +			rc = -EINVAL;
> +			break;
> +		}
> +	}
> +
> +	/*
> +	 * The new CB should have space at the end for two MSG_PROT packets:
> +	 * 1. A packet that will act as a completion packet
> +	 * 2. A packet that will generate MSI-X interrupt
> +	 */
> +	parser->patched_cb_size += sizeof(struct packet_msg_prot) * 2;
> +
> +	return rc;
> +}

[ ... ]
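To illustrate the suggestion for goya_validate_cb() above, something
along these lines (untested, only to show the control flow):

	while (cb_parsed_length < parser->user_cb_size) {
		enum packet_id pkt_id;
		u16 pkt_size;
		void *user_pkt;

		/* decode pkt_id and pkt_size as in the original */

		cb_parsed_length += pkt_size;
		if (cb_parsed_length > parser->user_cb_size) {
			dev_err(hdev->dev,
				"packet 0x%x is out of CB boundary\n", pkt_id);
			rc = -EINVAL;
			break;
		}

		switch (pkt_id) {
		/* same cases as in the original, each setting rc */
		}

		if (rc)
			break;
	}

That way the loop condition is only about parsing progress and every
error exit is an explicit break.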
> +static int goya_patch_cb(struct hl_device *hdev,
> +				struct hl_cs_parser *parser)
> +{
> +	u32 cb_parsed_length = 0;
> +	u32 cb_patched_cur_length = 0;
> +	int rc = 0;
> +
> +	/* cb_user_size is more than 0 so loop will always be executed */
> +	while ((cb_parsed_length < parser->user_cb_size) && (!rc)) {
> +		enum packet_id pkt_id;
> +		u16 pkt_size;
> +		u32 new_pkt_size = 0;
> +		void *user_pkt, *kernel_pkt;
> +
> +		user_pkt = (void *) (parser->user_cb->kernel_address +
> +							cb_parsed_length);
> +		kernel_pkt = (void *) (parser->patched_cb->kernel_address +
> +							cb_patched_cur_length);
> +
> +		pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
> +				PACKET_HEADER_PACKET_ID_MASK) >>
> +					PACKET_HEADER_PACKET_ID_SHIFT);
> +
> +		pkt_size = goya_packet_sizes[pkt_id];
> +		cb_parsed_length += pkt_size;
> +		if (cb_parsed_length > parser->user_cb_size) {
> +			dev_err(hdev->dev,
> +				"packet 0x%x is out of CB boundary\n", pkt_id);
> +			rc = -EINVAL;
> +			continue;

Ditto, same loop restructuring as suggested for goya_validate_cb()
above.

> +		}
> +
> +		switch (pkt_id) {
> +		case PACKET_LIN_DMA:
> +			rc = goya_patch_dma_packet(hdev, parser, user_pkt,
> +						kernel_pkt, &new_pkt_size);
> +			cb_patched_cur_length += new_pkt_size;
> +			break;
> +
> +		case PACKET_WREG_32:
> +			memcpy(kernel_pkt, user_pkt, pkt_size);
> +			cb_patched_cur_length += pkt_size;
> +			rc = goya_validate_wreg32(hdev, parser, kernel_pkt);
> +			break;
> +
> +		case PACKET_WREG_BULK:
> +			dev_err(hdev->dev,
> +				"User not allowed to use WREG_BULK\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_MSG_PROT:
> +			dev_err(hdev->dev,
> +				"User not allowed to use MSG_PROT\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_CP_DMA:
> +			dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_STOP:
> +			dev_err(hdev->dev, "User not allowed to use STOP\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_MSG_LONG:
> +		case PACKET_MSG_SHORT:
> +		case PACKET_FENCE:
> +		case PACKET_NOP:
> +			memcpy(kernel_pkt, user_pkt, pkt_size);
> +			cb_patched_cur_length += pkt_size;
> +			break;
> +
> +		default:
> +			dev_err(hdev->dev, "Invalid packet header 0x%x\n",
> +				pkt_id);
> +			rc = -EINVAL;
> +			break;
> +		}
> +	}
> +
> +	return rc;
> +}

[ ... ]
>  static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id,
>  		u16 event_type, char *axi_name, int len)
>  {
> @@ -4645,6 +5677,48 @@ static void goya_disable_clock_gating(struct hl_device *hdev)
>  
>  }
>  
> +static bool goya_is_device_idle(struct hl_device *hdev)
> +{
> +	u64 offset, dma_qm_reg, tpc_qm_reg, tpc_cmdq_reg, tpc_cfg_reg;
> +	bool val = true;
> +	int i;
> +
> +	offset = mmDMA_QM_1_GLBL_STS0 - mmDMA_QM_0_GLBL_STS0;
> +
> +	for (i = 0 ; i < DMA_MAX_NUM ; i++) {
> +		dma_qm_reg = mmDMA_QM_0_GLBL_STS0 + i * offset;
> +
> +		val = val && ((RREG32(dma_qm_reg) & DMA_QM_IDLE_MASK) ==
> +				DMA_QM_IDLE_MASK);
> +	}
> +
> +	offset = mmTPC1_QM_GLBL_STS0 - mmTPC0_QM_GLBL_STS0;
> +
> +	for (i = 0 ; i < TPC_MAX_NUM ; i++) {
> +		tpc_qm_reg = mmTPC0_QM_GLBL_STS0 + i * offset;
> +		tpc_cmdq_reg = mmTPC0_CMDQ_GLBL_STS0 + i * offset;
> +		tpc_cfg_reg = mmTPC0_CFG_STATUS + i * offset;
> +
> +		val = val && ((RREG32(tpc_qm_reg) & TPC_QM_IDLE_MASK) ==
> +				TPC_QM_IDLE_MASK);
> +		val = val && ((RREG32(tpc_cmdq_reg) & TPC_CMDQ_IDLE_MASK) ==
> +				TPC_CMDQ_IDLE_MASK);
> +		val = val && ((RREG32(tpc_cfg_reg) & TPC_CFG_IDLE_MASK) ==
> +				TPC_CFG_IDLE_MASK);
> +	}
> +
> +	val = val && ((RREG32(mmMME_QM_GLBL_STS0) & MME_QM_IDLE_MASK) ==
> +			MME_QM_IDLE_MASK);
> +	val = val && ((RREG32(mmMME_CMDQ_GLBL_STS0) & MME_CMDQ_IDLE_MASK) ==
> +			MME_CMDQ_IDLE_MASK);
> +	val = val && ((RREG32(mmMME_ARCH_STATUS) & MME_ARCH_IDLE_MASK) ==
> +			MME_ARCH_IDLE_MASK);
> +	val = val && ((RREG32(mmMME_SHADOW_0_STATUS) & MME_SHADOW_IDLE_MASK) ==
> +			0);

Huh, these are neat, but IMHO a plain early return is more readable:

	if ((RREG32(reg) & mask) != mask)
		return false;

(with the test inverted for the SHADOW status, which compares against
0)...

> +
> +	return val;
> +}
> +

[ ... ]

-- 
Sincerely yours,
Mike.