From: Ofir Bitton <obit...@habana.ai>

Training schemes require many more concurrent command submissions than
inference does. In addition, training command submissions can be completed
in a non-serialized manner. Hence, we add support in which each ASIC will
be able to configure the number of concurrent pending command submissions,
rather than use a predefined number. This change will enhance performance
by allowing the user to add more concurrent work without waiting for the
previous work to be completed.

Signed-off-by: Ofir Bitton <obit...@habana.ai>
Reviewed-by: Oded Gabbay <oded.gab...@gmail.com>
Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
---
 drivers/misc/habanalabs/command_submission.c |  6 ++++--
 drivers/misc/habanalabs/context.c            | 14 +++++++++++---
 drivers/misc/habanalabs/gaudi/gaudi.c        |  2 ++
 drivers/misc/habanalabs/gaudi/gaudiP.h       |  6 ++++++
 drivers/misc/habanalabs/goya/goya.c          |  2 ++
 drivers/misc/habanalabs/goya/goyaP.h         |  6 ++++++
 drivers/misc/habanalabs/habanalabs.h         |  9 +++++----
 drivers/misc/habanalabs/hw_queue.c           |  2 +-
 8 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index f82974a916c3..e156803f4a99 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -405,7 +405,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
        spin_lock(&ctx->cs_lock);
 
        cs_cmpl->cs_seq = ctx->cs_sequence;
-       other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)];
+       other = ctx->cs_pending[cs_cmpl->cs_seq &
+                               (hdev->asic_prop.max_pending_cs - 1)];
        if ((other) && (!dma_fence_is_signaled(other))) {
                spin_unlock(&ctx->cs_lock);
                dev_dbg(hdev->dev,
@@ -419,7 +420,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 
        cs->sequence = cs_cmpl->cs_seq;
 
-       ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+       ctx->cs_pending[cs_cmpl->cs_seq &
+                       (hdev->asic_prop.max_pending_cs - 1)] =
                                                        &cs_cmpl->base_fence;
        ctx->cs_sequence++;
 
diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c
index ec92b3506b1f..1b96fefa4a65 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/context.c
@@ -22,9 +22,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
         * to this function unless the ref count is 0
         */
 
-       for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
+       for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
                dma_fence_put(ctx->cs_pending[i]);
 
+       kfree(ctx->cs_pending);
+
        if (ctx->asid != HL_KERNEL_ASID_ID) {
                /* The engines are stopped as there is no executing CS, but the
                 * Coresight might be still working by accessing addresses
@@ -126,6 +128,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
        spin_lock_init(&ctx->cs_lock);
        atomic_set(&ctx->thread_ctx_switch_token, 1);
        ctx->thread_ctx_switch_wait_token = 0;
+       ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
+                               sizeof(struct dma_fence *),
+                               GFP_KERNEL);
+       if (!ctx->cs_pending)
+               return -ENOMEM;
 
        if (is_kernel_ctx) {
                ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */
@@ -170,6 +177,7 @@ int hl_ctx_put(struct hl_ctx *ctx)
 
 struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
 {
+       struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
        struct dma_fence *fence;
 
        spin_lock(&ctx->cs_lock);
@@ -179,13 +187,13 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
                return ERR_PTR(-EINVAL);
        }
 
-       if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
+       if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) {
                spin_unlock(&ctx->cs_lock);
                return NULL;
        }
 
        fence = dma_fence_get(
-                       ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]);
+                       ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
        spin_unlock(&ctx->cs_lock);
 
        return fence;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4d69727bb53b..35e9080f6976 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -429,6 +429,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
        strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
                                        CARD_NAME_MAX_LEN);
 
+       prop->max_pending_cs = GAUDI_MAX_PENDING_CS;
+
        return 0;
 }
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index a46530d375fa..76c3f840e05a 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -57,6 +57,12 @@
 
 #define GAUDI_DEFAULT_CARD_NAME                "HL2000"
 
+#define GAUDI_MAX_PENDING_CS           1024
+
+#if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS)
+#error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
 #define PCI_DMA_NUMBER_OF_CHNLS                3
 #define HBM_DMA_NUMBER_OF_CHNLS                5
 #define DMA_NUMBER_OF_CHNLS            (PCI_DMA_NUMBER_OF_CHNLS + \
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 0d2952bb58df..e872099a3f7a 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -426,6 +426,8 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 
        strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
                CARD_NAME_MAX_LEN);
+
+       prop->max_pending_cs = GOYA_MAX_PENDING_CS;
 }
 
 /*
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index d36f8d90c9c9..9d8a1761252d 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -57,6 +57,12 @@
 
 #define GOYA_DEFAULT_CARD_NAME         "HL1000"
 
+#define GOYA_MAX_PENDING_CS            64
+
+#if !IS_MAX_PENDING_CS_VALID(GOYA_MAX_PENDING_CS)
+#error "GOYA_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
 /* DRAM Memory Map */
 
 #define CPU_FW_IMAGE_SIZE              0x10000000      /* 256MB */
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 1ecdcf8b763a..64d9b2dd3e19 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -42,9 +42,6 @@
 
 #define HL_MAX_QUEUES                  128
 
-/* MUST BE POWER OF 2 and larger than 1 */
-#define HL_MAX_PENDING_CS              64
-
 #define HL_IDLE_BUSY_TS_ARR_SIZE       4096
 
 /* Memory */
@@ -61,6 +58,9 @@
 
 #define HL_MAX_SOB_VAL                 (1 << 15)
 
+#define IS_POWER_OF_2(n)               (n != 0 && ((n & (n - 1)) == 0))
+#define IS_MAX_PENDING_CS_VALID(n)     (IS_POWER_OF_2(n) && (n > 1))
+
 /**
  * struct pgt_info - MMU hop page info.
  * @node: hash linked-list node for the pgts shadow hash of pgts.
@@ -285,6 +285,7 @@ struct asic_fixed_properties {
        u32                             high_pll;
        u32                             cb_pool_cb_cnt;
        u32                             cb_pool_cb_size;
+       u32                             max_pending_cs;
        u8                              tpc_enabled_mask;
        u8                              completion_queues_count;
 };
@@ -782,7 +783,7 @@ struct hl_ctx {
        struct hl_fpriv         *hpriv;
        struct hl_device        *hdev;
        struct kref             refcount;
-       struct dma_fence        *cs_pending[HL_MAX_PENDING_CS];
+       struct dma_fence        **cs_pending;
        struct hl_va_range      *host_va_range;
        struct hl_va_range      *host_huge_va_range;
        struct hl_va_range      *dram_va_range;
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index f4434b39ef1b..29b96d24edc2 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -376,7 +376,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
         * write address offset in the SM block (QMAN LBW message).
         * The write address offset is calculated as "COMP_OFFSET << 2".
         */
-       offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
+       offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1);
        ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
                ((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);
 
-- 
2.17.1

Reply via email to