From: Junyan He <junyan...@intel.com>

Delete all the obsolete code in command_queue_gen7.c. Make the code logic cleaner and use the ELF info to do the job. After that, we can completely split the GBE backend from the runtime: we no longer need to query the GBE backend for kernel info at runtime.
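For context, the point of the change is that per-kernel metadata (argument offsets, curbe layout, SIMD width, and so on) is deserialized from the program's ELF image into cl_kernel_gen when the binary is loaded, and consumed directly at enqueue time. The sketch below only illustrates that lookup pattern; the struct and function names are hypothetical, not the runtime's actual API — the patch itself does the equivalent scan over kernel_gen->virt_reg_phy_offset[] (e.g. in the UPLOAD() macro of gen_gpgpu_setup_curbe() and in gen_gpgpu_upload_final_curbe()).

/* Illustrative sketch only: "consume ELF-derived kernel metadata at enqueue
 * time" as described in the commit message.  Names here are hypothetical. */
#include <stdint.h>
#include <stddef.h>

struct virt_phy_entry {      /* one row of the ELF-provided mapping table   */
  uint32_t virt_reg;         /* virtual register id (e.g. LOCAL_SIZE_X)     */
  uint32_t phy_offset;       /* byte offset of that value inside the curbe  */
  uint32_t size;             /* payload size in bytes                       */
};

/* Return the curbe offset of a virtual register, or -1 if the kernel does
 * not use it.  With this table available from the ELF, the runtime can patch
 * the curbe buffer without calling back into the GBE backend. */
static int32_t
find_curbe_offset(const struct virt_phy_entry *table, size_t n, uint32_t virt_reg)
{
  for (size_t i = 0; i < n; i++)
    if (table[i].virt_reg == virt_reg)
      return (int32_t)table[i].phy_offset;
  return -1;
}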
Signed-off-by: Junyan He <junyan...@intel.com> --- src/gen/cl_command_queue_gen.c | 876 +++++++++++++++++++++++++++++++++++++++++ src/gen/cl_gen.h | 9 + 2 files changed, 885 insertions(+) create mode 100644 src/gen/cl_command_queue_gen.c diff --git a/src/gen/cl_command_queue_gen.c b/src/gen/cl_command_queue_gen.c new file mode 100644 index 0000000..d12ced8 --- /dev/null +++ b/src/gen/cl_command_queue_gen.c @@ -0,0 +1,876 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "cl_gen.h" +#include "gen_device_pci_id.h" + +#include "intel_defines.h" +#include "intel_structs.h" +#include "intel_batchbuffer.h" + +#include <i915_drm.h> +#include <drm.h> +#include <intel_bufmgr.h> +#include <assert.h> +#include <string.h> + +/* We can bind only a limited number of buffers */ +enum { max_buf_n = 128 }; +enum { max_img_n = 128 }; +enum { max_sampler_n = 16 }; + +typedef struct gen_gpgpu { + drm_intel_bufmgr *bufmgr; // The drm buffer mgr + cl_device_id device; // The device of this gpu + cl_kernel kernel; // The kernel we are executing + drm_intel_bo *kernel_bo; // The buffer object holding kernel bitcode + uint32_t simd_size; // The simd size we are executing. + uint32_t atomic_test_result; + + struct intel_batchbuffer *batch; // The batch buffer holding GPU command + + struct { + drm_intel_bo *aux_bo; // Aux buffer needed by GPU command + uint32_t surface_heap_offset; + uint32_t curbe_offset; + uint32_t idrt_offset; + uint32_t sampler_state_offset; + uint32_t sampler_border_color_state_offset; + } aux; // All aux setting info + + struct { + uint32_t local_mem_size; // The total local memory size + + uint32_t max_bti; /* Max bti number */ + uint32_t binded_n; /* Number of buffers binded */ + drm_intel_bo *binded_buf[max_buf_n]; /* All buffers binded for the kernel, e.g. kernel's arg */ + uint32_t binded_offset[max_buf_n]; /* The offset in the curbe buffer */ + uint32_t target_buf_offset[max_buf_n]; /* The offset within the buffers to be binded */ + + uint32_t per_thread_scratch_size; + uint32_t total_scratch_size; + drm_intel_bo *scratch_bo; /* Scratch buffer */ + + drm_intel_bo *const_bo; /* Constant buffer */ + + drm_intel_bo *stack_bo; /* stack buffer */ + + drm_intel_bo *time_stamp_bo; /* The buffer to record exec timestamps */ + } mem; + + struct { + uint64_t sampler_bitmap; /* sampler usage bitmap. 
*/ + } sampler; + + struct { + uint32_t barrier_slm_used; /* Use barrier or slm */ + uint32_t thread_num; // Total thread number we need for this kernel + uint32_t max_thread_num; // Max thread number we can run at same time + uint32_t per_thread_scratch; // Scratch buffer size for each thread + uint32_t num_cs_entries; /* Curbe entry number */ + uint32_t size_cs_entry; /* size of one entry in 512bit elements */ + char *curbe; /* Curbe content */ + uint32_t curbe_size; /* Curbe size */ + } thread; + +} gen_gpgpu; + +#define MAX_IF_DESC 32 + +typedef struct surface_heap { + uint32_t binding_table[256]; + char surface[256 * sizeof(gen_surface_state_t)]; +} surface_heap_t; + +#include "gen_gpgpu_func.c" + +static cl_int +check_work_group_capability(cl_command_queue queue, cl_kernel kernel, + const size_t *local_wk_sz, uint32_t wk_dim) +{ + size_t sz = 0; + int i; + + sz = local_wk_sz[0]; + for (i = 1; i < wk_dim; ++i) + sz *= local_wk_sz[i]; + + if (sz > cl_kernel_get_max_workgroup_size_gen(kernel, queue->device)) + return CL_INVALID_WORK_ITEM_SIZE; + + return CL_SUCCESS; +} + +static cl_int +gen_gpgpu_setup_curbe(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu, + const uint32_t work_dim, const size_t *global_wk_off, + const size_t *global_wk_sz, const size_t *local_wk_sz, + const size_t *enqueued_local_wk_sz) +{ + int curbe_size = 0; + char *curbe = NULL; + int i; + int sz; + uint32_t slm_offset; + + /* Calculate the total size needed */ + for (i = 0; i < kernel->arg_n; i++) { + if (kernel->args[i].arg_size + kernel_gen->arg_extra_info[i].arg_offset > curbe_size) + curbe_size = kernel->args[i].arg_size + kernel_gen->arg_extra_info[i].arg_offset; + } + for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) { + sz = kernel_gen->virt_reg_phy_offset[i].phy_offset + + kernel_gen->virt_reg_phy_offset[i].size; + if (sz > curbe_size) + curbe_size = sz; + } + for (i = 0; i < kernel_gen->image_info_num; i++) { + if (kernel_gen->image_info[i].width > curbe_size) + curbe_size = sz; + if (kernel_gen->image_info[i].height > curbe_size) + curbe_size = sz; + if (kernel_gen->image_info[i].depth > curbe_size) + curbe_size = sz; + if (kernel_gen->image_info[i].data_type > curbe_size) + curbe_size = sz; + if (kernel_gen->image_info[i].channel_order > curbe_size) + curbe_size = sz; + } + + curbe_size = ALIGN(curbe_size, 32); + + gpu->thread.curbe_size = curbe_size; + + if (curbe_size == 0) { + assert(kernel->arg_n == 0); + return CL_SUCCESS; + } + + curbe = cl_malloc(curbe_size); + if (curbe == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + gpu->thread.curbe = curbe; + memset(curbe, 0, curbe_size); + + slm_offset = kernel_gen->local_mem_size; + for (i = 0; i < kernel->arg_n; i++) { + if (kernel_gen->arg_extra_info[i].arg_offset < 0) // no usage argument + continue; + + if (kernel->args[i].arg_type == ArgTypePointer && + kernel->args[i].arg_addrspace == AddressSpaceLocal) { // SLM setting + assert(kernel->args[i].val_size > 0); + assert(kernel->args[i].arg_size == sizeof(uint32_t)); + assert(kernel_gen->arg_extra_info[i].arg_align > 0); + // Need to be aligned address + slm_offset = ALIGN(slm_offset, kernel_gen->arg_extra_info[i].arg_align); + *((uint32_t *)(curbe + kernel_gen->arg_extra_info[i].arg_offset)) = slm_offset; + slm_offset += kernel->args[i].val_size; + continue; + } + + if (kernel->args[i].arg_type == ArgTypePointer) { + assert(kernel->args[i].arg_addrspace == AddressSpaceConstant || + kernel->args[i].arg_addrspace == AddressSpaceGlobal); + /* For other buffer, we will set this value in 
surface binding */ + continue; + } + + if (kernel->args[i].arg_type == ArgTypeSampler) { + continue; + } + + if (kernel->args[i].arg_type == ArgTypeImage) { + continue; + } + + /* Common value or struct data, just copy the content */ + assert(kernel->args[i].val_size == kernel->args[i].arg_size); + if (kernel->args[i].arg_type == ArgTypeValue && kernel->args[i].arg_size <= sizeof(cl_double)) + memcpy(curbe + kernel_gen->arg_extra_info[i].arg_offset, &kernel->args[i].val, kernel->args[i].arg_size); + else + memcpy(curbe + kernel_gen->arg_extra_info[i].arg_offset, kernel->args[i].val.val_ptr, kernel->args[i].arg_size); + } + +#define UPLOAD(ENUM, VALUE) \ + if (kernel_gen->virt_reg_phy_offset[i].virt_reg == ENUM) { \ + assert(kernel_gen->virt_reg_phy_offset[i].size == sizeof(uint32_t)); \ + *((uint32_t *)(curbe + kernel_gen->virt_reg_phy_offset[i].phy_offset)) = VALUE; \ + continue; \ + } + + for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) { + UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]); + UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]); + UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]); + UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_X, enqueued_local_wk_sz[0]); + UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y, enqueued_local_wk_sz[1]); + UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z, enqueued_local_wk_sz[2]); + UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]); + UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]); + UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]); + UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]); + UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]); + UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]); + UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0] / enqueued_local_wk_sz[0] + (global_wk_sz[0] % enqueued_local_wk_sz[0] ? 1 : 0)); + UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1] / enqueued_local_wk_sz[1] + (global_wk_sz[1] % enqueued_local_wk_sz[1] ? 1 : 0)); + UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2] / enqueued_local_wk_sz[2] + (global_wk_sz[2] % enqueued_local_wk_sz[2] ? 1 : 0)); + UPLOAD(GBE_CURBE_THREAD_NUM, gpu->thread.thread_num); + UPLOAD(GBE_CURBE_WORK_DIM, work_dim); + } +#undef UPLOAD + + return CL_SUCCESS; +} + +static void +gen_gpgpu_bind_one_bo(gen_gpgpu *gpu, drm_intel_bo *buf, uint32_t offset, + uint32_t internal_offset, size_t size, uint8_t bti) +{ + if (buf == NULL) + return; + + assert(gpu->mem.binded_n < max_buf_n); + if (offset != -1) { + gpu->mem.binded_buf[gpu->mem.binded_n] = buf; + gpu->mem.target_buf_offset[gpu->mem.binded_n] = internal_offset; + gpu->mem.binded_offset[gpu->mem.binded_n] = offset; + gpu->mem.binded_n++; + } + gen_gpgpu_setup_bti(gpu, buf, internal_offset, size, bti, I965_SURFACEFORMAT_RAW); +} + +static void +gen_gpgpu_setup_global_mem(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu) +{ + int i; + int32_t offset = 0; + cl_mem mem; + uint32_t bti; + + for (i = 0; i < kernel->arg_n; i++) { + if (kernel->args[i].arg_type != ArgTypePointer) + continue; + + if (kernel->args[i].arg_addrspace != AddressSpaceGlobal) + continue; + + mem = NULL; + offset = -1; + if (kernel->args[i].val.val_mem != NULL) { + mem = (cl_mem)kernel->args[i].val.val_mem; + offset = mem->offset; + } + + bti = kernel_gen->arg_extra_info[i].arg_misc; + if (CL_OBJECT_IS_BUFFER(mem)) + offset += ((struct _cl_mem_buffer *)mem)->sub_offset; + + if (gpu->mem.max_bti < bti) + gpu->mem.max_bti = bti; + + gen_gpgpu_bind_one_bo(gpu, mem ? (drm_intel_bo *)mem->bo : NULL, + kernel_gen->arg_extra_info[i].arg_offset, offset, + mem ? 
mem->size : 0, bti); + } +} + +static cl_int +gen_gpgpu_setup_image(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu) +{ + int i; + cl_mem mem; + struct _cl_mem_image *image; + cl_gen_image_info_offset info; + + for (i = 0; i < kernel->arg_n; i++) { + if (kernel->args[i].arg_type != ArgTypeImage) + continue; + + assert(kernel->args[i].val.val_ptr != NULL); + mem = (cl_mem)kernel->args[i].val.val_ptr; + image = cl_mem_image(mem); + + info = &kernel_gen->image_info[kernel_gen->arg_extra_info[i].arg_misc]; + + /* Set the image info to the curbe */ + if (info->width >= 0) + *(uint32_t *)(gpu->thread.curbe + info->width) = image->w; + if (info->height >= 0) + *(uint32_t *)(gpu->thread.curbe + info->height) = image->h; + if (info->depth >= 0) + *(uint32_t *)(gpu->thread.curbe + info->depth) = image->depth; + if (info->channel_order >= 0) + *(uint32_t *)(gpu->thread.curbe + info->channel_order) = + image->fmt.image_channel_order; + if (info->data_type >= 0) + *(uint32_t *)(gpu->thread.curbe + info->data_type) = + image->fmt.image_channel_data_type; + + if (gpu->mem.max_bti < info->bti) + gpu->mem.max_bti = info->bti; + + gen_gpgpu_bind_image(gpu, info->bti, (drm_intel_bo *)image->base.bo, + image->offset + mem->offset, image->intel_fmt, + image->image_type, image->bpp, image->w, image->h, + image->depth, image->row_pitch, image->slice_pitch, + (cl_gpgpu_tiling)image->tiling); + + // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer + // on demand. + if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) + gen_gpgpu_bind_image(gpu, info->bti + BTI_WORKAROUND_IMAGE_OFFSET, + (drm_intel_bo *)image->base.bo, image->offset + mem->offset, + image->intel_fmt, image->image_type, image->bpp, + image->w, image->h, image->depth, + image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling); + } + + return CL_SUCCESS; +} + +static cl_int +gen_gpgpu_setup_scratch(gen_gpgpu *gpu) +{ + drm_intel_bufmgr *bufmgr = gpu->bufmgr; + cl_uint device_id = gpu->device->device_id; + + gpu->mem.total_scratch_size = gpu->mem.per_thread_scratch_size * gpu->thread.max_thread_num; + /* Per Bspec, scratch should 2X the desired size when EU index is not continuous */ + if (IS_HASWELL(device_id) || IS_CHERRYVIEW(device_id) || + device_id == PCI_CHIP_BROXTON_1 || device_id == PCI_CHIP_BROXTON_3) { + gpu->mem.total_scratch_size = gpu->mem.total_scratch_size * 2; + } + + if (gpu->mem.total_scratch_size) { + gpu->mem.scratch_bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", + gpu->mem.total_scratch_size, 4096); + if (gpu->mem.scratch_bo == NULL) + return CL_OUT_OF_RESOURCES; + } + return CL_SUCCESS; +} + +static cl_int +gen_setup_constant_buffer(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu) +{ + cl_program_gen prog_gen; + cl_uint const_buf_size = 0; + cl_uint aligned_const_buf_size = 0; + cl_mem mem; + cl_uint addr_offset; + char *const_buf_addr = NULL; + int i; + DEV_PRIVATE_DATA(kernel->program, gpu->device, prog_gen); + + if (prog_gen->rodata) { + const_buf_size = prog_gen->rodata_data->d_size; + aligned_const_buf_size = ALIGN(const_buf_size, 4); + } else { + /* Reserve 8 bytes to get rid of 0 address */ + aligned_const_buf_size = 8; + } + + /* Calculate all the constant mem size */ + for (i = 0; i < kernel->arg_n; i++) { + if (kernel->args[i].arg_type != ArgTypePointer) + continue; + if (kernel->args[i].arg_addrspace != AddressSpaceConstant) + continue; + + if (kernel->args[i].val.val_ptr == NULL) + continue; + + assert(kernel_gen->arg_extra_info[i].arg_align != 
0); + mem = (cl_mem)kernel->args[i].val.val_ptr; + const_buf_size += mem->size; + aligned_const_buf_size = ALIGN(aligned_const_buf_size, kernel_gen->arg_extra_info[i].arg_align); + aligned_const_buf_size += mem->size; + } + + if (const_buf_size == 0) // No need for constant buffer. + return CL_SUCCESS; + + gen_gpgpu_alloc_constant_buffer(gpu, aligned_const_buf_size, BTI_CONSTANT); + if (gpu->mem.const_bo == NULL) + return CL_OUT_OF_RESOURCES; + + drm_intel_bo_map(gpu->mem.const_bo, 1); + + const_buf_addr = gpu->mem.const_bo->virtual; + if (const_buf_addr == NULL) + return CL_OUT_OF_RESOURCES; + + addr_offset = 0; + /* upload the global constant data, in rodata */ + if (prog_gen->rodata && prog_gen->rodata_data->d_size > 0) { + memcpy(const_buf_addr, prog_gen->rodata_data->d_buf, prog_gen->rodata_data->d_size); + addr_offset += prog_gen->rodata_data->d_size; + } else { + addr_offset = 8; + } + + /* Upload constant ptr content */ + for (i = 0; i < kernel->arg_n; i++) { + if (kernel->args[i].arg_type != ArgTypePointer) + continue; + if (kernel->args[i].arg_addrspace != AddressSpaceConstant) + continue; + + addr_offset = ALIGN(addr_offset, kernel_gen->arg_extra_info[i].arg_align); + + /* Set curbe */ + *(uint32_t *)(gpu->thread.curbe + kernel_gen->arg_extra_info[i].arg_offset) = addr_offset; + + mem = (cl_mem)kernel->args[i].val.val_ptr; + drm_intel_bo_map((drm_intel_bo *)mem->bo, 1); + memcpy(const_buf_addr + addr_offset, ((drm_intel_bo *)(mem->bo))->virtual, mem->size); + drm_intel_bo_unmap((drm_intel_bo *)mem->bo); + addr_offset += mem->size; + } + + drm_intel_bo_unmap(gpu->mem.const_bo); + return CL_SUCCESS; +} + +static cl_int +gen_gpgpu_upload_final_curbe(cl_kernel kernel, cl_kernel_gen kernel_gen, + gen_gpgpu *gpu, const size_t *local_wk_sz) +{ + char *final_curbe = NULL; + char *final_curbe_ptr = NULL; + cl_gen_virt_phy_offset map = kernel_gen->virt_reg_phy_offset; + int i, j, k, curr = 0; + uint32_t *ids[3] = {NULL, NULL, NULL}; + int32_t id_offset[3], ip_offset, tid_offset; + uint16_t *block_ips = NULL; + uint32_t *thread_ids = NULL; + int32_t dw_ip_offset = -1; + + if (gpu->thread.curbe_size == 0) { + assert(gpu->thread.curbe == NULL); + return CL_SUCCESS; + } + + assert(gpu->thread.thread_num > 0); + final_curbe = cl_malloc(gpu->thread.thread_num * gpu->thread.curbe_size); + if (final_curbe == NULL) + return CL_OUT_OF_HOST_MEMORY; + + for (i = 0; i < gpu->thread.thread_num; ++i) { + memcpy(final_curbe + gpu->thread.curbe_size * i, + gpu->thread.curbe, gpu->thread.curbe_size); + } + + id_offset[0] = id_offset[1] = id_offset[2] = -1; + ip_offset = -1; + tid_offset = -1; + if (map) { + for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) { + if (map[i].virt_reg == GBE_CURBE_LOCAL_ID_X || + map[i].virt_reg == GBE_CURBE_LOCAL_ID_Y || + map[i].virt_reg == GBE_CURBE_LOCAL_ID_Z) { + id_offset[map[i].virt_reg - GBE_CURBE_LOCAL_ID_X] = map[i].phy_offset; + assert(map[i].phy_offset >= 0); + assert(map[i].size / gpu->simd_size == sizeof(uint32_t)); + continue; + } + if (map[i].virt_reg == GBE_CURBE_BLOCK_IP) { + ip_offset = map[i].phy_offset; + assert(map[i].phy_offset >= 0); + assert(map[i].size / gpu->simd_size == sizeof(uint16_t)); + continue; + } + if (map[i].virt_reg == GBE_CURBE_THREAD_ID) { + tid_offset = map[i].phy_offset; + assert(map[i].phy_offset >= 0); + assert(map[i].size == sizeof(uint32_t)); + continue; + } + if (map[i].virt_reg == GBE_CURBE_DW_BLOCK_IP) { + dw_ip_offset = map[i].phy_offset; + assert(map[i].phy_offset >= 0); + assert(map[i].size / gpu->simd_size == 
sizeof(uint32_t)); + continue; + } + } + + assert(ip_offset < 0 || dw_ip_offset < 0); + assert(ip_offset >= 0 || dw_ip_offset >= 0); + + if (id_offset[0] >= 0) { + ids[0] = (uint32_t *)alloca(sizeof(uint32_t) * gpu->thread.thread_num * gpu->simd_size); + assert(id_offset[0] >= 0); + } + if (id_offset[1] >= 0) { + ids[1] = (uint32_t *)alloca(sizeof(uint32_t) * gpu->thread.thread_num * gpu->simd_size); + assert(id_offset[1] >= 0); + } + if (id_offset[2] >= 0) { + ids[2] = (uint32_t *)alloca(sizeof(uint32_t) * gpu->thread.thread_num * gpu->simd_size); + assert(id_offset[2] >= 0); + } + + block_ips = (uint16_t *)alloca(sizeof(uint16_t) * gpu->thread.thread_num * gpu->simd_size); + assert(block_ips >= 0); + memset(block_ips, 0xff, sizeof(int16_t) * gpu->thread.thread_num * gpu->simd_size); + + if (tid_offset >= 0) { + thread_ids = (uint32_t *)alloca(sizeof(uint32_t) * gpu->thread.thread_num); + assert(thread_ids >= 0); + memset(thread_ids, 0, sizeof(uint32_t) * gpu->thread.thread_num); + } + /* Compute the IDs and the block IPs */ + for (k = 0; k < local_wk_sz[2]; ++k) { + for (j = 0; j < local_wk_sz[1]; ++j) { + for (i = 0; i < local_wk_sz[0]; ++i, ++curr) { + if (id_offset[0] >= 0) + ids[0][curr] = i; + if (id_offset[1] >= 0) + ids[1][curr] = j; + if (id_offset[2] >= 0) + ids[2][curr] = k; + block_ips[curr] = 0; + if (thread_ids) + thread_ids[curr / gpu->simd_size] = curr / gpu->simd_size; + } + } + } + + /* Set the vary part of curbe */ + curr = 0; + final_curbe_ptr = final_curbe; + for (i = 0; i < gpu->thread.thread_num; ++i, final_curbe_ptr += gpu->thread.curbe_size) { + uint32_t *ids0 = (uint32_t *)(final_curbe_ptr + id_offset[0]); + uint32_t *ids1 = (uint32_t *)(final_curbe_ptr + id_offset[1]); + uint32_t *ids2 = (uint32_t *)(final_curbe_ptr + id_offset[2]); + uint16_t *ips = (uint16_t *)(final_curbe_ptr + ip_offset); + uint32_t *dw_ips = (uint32_t *)(final_curbe_ptr + dw_ip_offset); + + if (thread_ids) + *(uint32_t *)(final_curbe_ptr + tid_offset) = thread_ids[i]; + + for (j = 0; j < gpu->simd_size; ++j, ++curr) { + if (id_offset[0] >= 0) + ids0[j] = ids[0][curr]; + if (id_offset[1] >= 0) + ids1[j] = ids[1][curr]; + if (id_offset[2] >= 0) + ids2[j] = ids[2][curr]; + if (ip_offset >= 0) + ips[j] = block_ips[curr]; + if (dw_ip_offset >= 0) + dw_ips[j] = block_ips[curr]; + } + } + } + + /* All settings are OK, upload it to GPU */ + gen_gpgpu_upload_curbes(gpu, final_curbe, gpu->thread.thread_num * gpu->thread.curbe_size); + cl_free(final_curbe); + return CL_SUCCESS; +} + +static cl_int +gen_gpgu_bind_stack(gen_gpgpu *gpu, cl_kernel kernel, cl_kernel_gen kernel_gen) +{ + int32_t stack_sz = kernel_gen->stack_size; + int32_t stack_offset = -1; + int32_t stack_size_offset = -1; + int i; + + if (stack_sz == 0) + return CL_SUCCESS; + + stack_sz *= kernel_gen->simd_width; + stack_sz *= gpu->thread.max_thread_num; + + if (IS_GEN75(gpu->device->device_id)) + stack_sz = stack_sz * 4; + else if (gpu->device->device_id == PCI_CHIP_BROXTON_1 || gpu->device->device_id == PCI_CHIP_BROXTON_3 || + IS_CHERRYVIEW(gpu->device->device_id)) + stack_sz = stack_sz * 2; + + for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) { + if (kernel_gen->virt_reg_phy_offset[i].virt_reg == GBE_CURBE_STACK_SIZE) { + assert(kernel_gen->virt_reg_phy_offset[i].size == sizeof(uint32_t)); + stack_size_offset = kernel_gen->virt_reg_phy_offset[i].phy_offset; + continue; + } + if (kernel_gen->virt_reg_phy_offset[i].virt_reg == GBE_CURBE_EXTRA_ARGUMENT) { + assert(kernel_gen->virt_reg_phy_offset[i].size == sizeof(uint64_t)); + 
stack_offset = kernel_gen->virt_reg_phy_offset[i].phy_offset; + continue; + } + } + assert(stack_offset >= 0); + + if (stack_size_offset >= 0) + *((uint32_t *)(gpu->thread.curbe + stack_offset)) = stack_sz; + + gpu->mem.stack_bo = drm_intel_bo_alloc(gpu->bufmgr, "STACK", stack_sz, 64); + if (gpu->mem.stack_bo == NULL) + return CL_OUT_OF_RESOURCES; + + gen_gpgpu_bind_one_bo(gpu, gpu->mem.stack_bo, stack_offset, 0, stack_sz, BTI_PRIVATE); + return CL_SUCCESS; +} + +#define MAX_GROUP_SIZE_IN_HALFSLICE 512 +static size_t +gen_gpu_compute_batch_sz(cl_kernel k) +{ + return 256 + 256; +} + +/* This is a very important function. It is responsible for loading and setting GPU + execution context based on the cl_kernel and kernel's arguments. */ +LOCAL cl_int +cl_command_queue_ND_range_gen(cl_command_queue queue, cl_kernel kernel, cl_event event, + const uint32_t work_dim, const size_t *global_wk_off, + const size_t *global_dim_off, const size_t *global_wk_sz, + const size_t *global_wk_sz_use, const size_t *local_wk_sz, + const size_t *local_wk_sz_use) +{ + cl_int ret = CL_SUCCESS; + gen_gpgpu *gpu = NULL; + size_t local_size = local_wk_sz_use[0] * local_wk_sz_use[1] * local_wk_sz_use[2]; + cl_kernel_gen kernel_gen; + cl_context_gen ctx_gen; + int i; + drm_intel_bufmgr *bufmgr = NULL; + + DEV_PRIVATE_DATA(kernel, queue->device, kernel_gen); + DEV_PRIVATE_DATA(queue->ctx, queue->device, ctx_gen); + bufmgr = ctx_gen->drv->bufmgr; + assert(bufmgr); + + ret = check_work_group_capability(queue, kernel, local_wk_sz_use, 3); + if (ret != CL_SUCCESS) + return ret; + + if (kernel_gen->scratch_size > queue->device->scratch_mem_size) + return CL_OUT_OF_RESOURCES; + + gpu = cl_calloc(1, sizeof(gen_gpgpu)); + if (gpu == NULL) + return CL_OUT_OF_HOST_MEMORY; + + do { + /* Init the gpu parameters */ + gpu->bufmgr = bufmgr; + gpu->simd_size = kernel_gen->simd_width; + gpu->kernel = kernel; + gpu->device = queue->device; + gpu->thread.max_thread_num = queue->device->max_compute_unit * queue->device->max_thread_per_unit; + gpu->thread.thread_num = (local_size + gpu->simd_size - 1) / gpu->simd_size; + gpu->sampler.sampler_bitmap = ~((1 << max_sampler_n) - 1); + gpu->mem.max_bti = 0; + gpu->mem.per_thread_scratch_size = kernel_gen->scratch_size; + gpu->mem.total_scratch_size = 0; + gpu->atomic_test_result = gpu->device->atomic_test_result; + gpu->thread.barrier_slm_used = kernel_gen->barrier_slm_used; + + gpu->mem.local_mem_size = kernel_gen->local_mem_size; + for (i = 0; i < kernel->arg_n; i++) { + if (kernel->args[i].arg_type == ArgTypePointer && + kernel->args[i].arg_addrspace == AddressSpaceLocal) { + assert(kernel->args[i].is_set); + assert(kernel_gen->arg_extra_info[i].arg_align > 0); + gpu->mem.local_mem_size = ALIGN(gpu->mem.local_mem_size, kernel_gen->arg_extra_info[i].arg_align); + gpu->mem.local_mem_size += kernel->args[i].val_size; + } + } + + if (gpu->mem.local_mem_size > queue->device->local_mem_size) { + ret = CL_OUT_OF_HOST_MEMORY; + break; + } + + // Setup the kernel bitcode and upload it to GPU side + gpu->kernel_bo = drm_intel_bo_alloc(bufmgr, "CL kernel", kernel_gen->kern_base.exec_code_sz, 64u); + if (gpu->kernel_bo == NULL) { + ret = CL_OUT_OF_RESOURCES; + break; + } + /* Upload the bitcode */ + drm_intel_bo_subdata(gpu->kernel_bo, 0, kernel_gen->kern_base.exec_code_sz, + kernel_gen->kern_base.exec_code); + + ret = gen_gpgpu_setup_curbe(kernel, kernel_gen, gpu, work_dim, global_wk_off, global_wk_sz, + local_wk_sz_use, local_wk_sz); + if (ret != CL_SUCCESS) + break; + + gpu->thread.num_cs_entries = 
64; + gpu->thread.size_cs_entry = gpu->thread.curbe_size / 32; + + ret = gen_gpgpu_setup_aux(gpu); + if (ret != CL_SUCCESS) + break; + + if (queue->props & CL_QUEUE_PROFILING_ENABLE) { // Need to alloc profiling buffer + gpu->mem.time_stamp_bo = dri_bo_alloc(bufmgr, "timestamp query", 4096, 4096); + + if (gpu->mem.time_stamp_bo == NULL) { + ret = CL_OUT_OF_RESOURCES; + break; + } + } + + /* Bind user buffers */ + gen_gpgpu_setup_global_mem(kernel, kernel_gen, gpu); + + ret = gen_gpgpu_setup_image(kernel, kernel_gen, gpu); + if (ret != CL_SUCCESS) + break; + + gen_gpgpu_setup_sampler(gpu, kernel_gen->samper_info, kernel_gen->samper_info_num); + + ret = gen_gpgpu_setup_scratch(gpu); + if (ret != CL_SUCCESS) + break; + + /* Bind a stack if needed */ + ret = gen_gpgu_bind_stack(gpu, kernel, kernel_gen); + if (ret != CL_SUCCESS) + break; + + ret = gen_setup_constant_buffer(kernel, kernel_gen, gpu); + if (ret != CL_SUCCESS) + break; + + gen_gpgpu_build_idrt(gpu); + gen_gpgpu_upload_final_curbe(kernel, kernel_gen, gpu, local_wk_sz_use); + gen_gpgpu_finish_aux(gpu); + + /* Start a new batch buffer */ + gpu->batch = intel_batchbuffer_create((struct intel_driver *)queue->ctx->drv, + gen_gpu_compute_batch_sz(kernel)); + if (gpu->batch == NULL) { + ret = CL_OUT_OF_RESOURCES; + break; + } + + gen_gpgpu_batch_start(gpu); + gen_gpgpu_walker(gpu, gpu->simd_size, gpu->thread.thread_num, + global_wk_off, global_dim_off, global_wk_sz_use, local_wk_sz_use); + gen_gpgpu_batch_end(gpu, 0); + + event->exec_data.gpgpu = gpu; + event->exec_data.type = EnqueueNDRangeKernel; + } while (0); + + if (ret != CL_SUCCESS) { + gen_gpgpu_finish_aux(gpu); + cl_command_queue_delete_gpgpu(gpu); + } + + return ret; +} + +LOCAL int +cl_command_queue_flush_gpgpu(void *gpgpu) +{ + gen_gpgpu *gpu = gpgpu; + + if (!gpu->batch || !gpu->batch->buffer) + return CL_INVALID_VALUE; + + if (intel_batchbuffer_flush(gpu->batch) < 0) + return CL_INVALID_VALUE; + + return CL_SUCCESS; + + /* FIXME: + Remove old assert here for binded buffer offset 0 which + tried to guard possible NULL buffer pointer check in kernel, as + in case like "runtime_null_kernel_arg", but that's wrong to just + take buffer offset 0 as NULL, and cause failure for normal + kernels which has no such NULL ptr check but with buffer offset 0 + (which is possible now and will be normal if full PPGTT is on). + + Need to fix NULL ptr check otherwise. 
+ */ +} + +LOCAL int +cl_command_queue_finish_gpgpu(void *gpgpu) +{ + gen_gpgpu *gpu = gpgpu; + + if (!gpu->batch || !gpu->batch->buffer) + return CL_INVALID_VALUE; + + intel_batchbuffer_finish(gpu->batch); + return CL_SUCCESS; +} + +LOCAL void +cl_command_queue_delete_gpgpu(void *gpgpu) +{ + gen_gpgpu *gpu = gpgpu; + + if (gpgpu == NULL) + return; + + if (gpu->kernel_bo) { + drm_intel_bo_unreference(gpu->kernel_bo); + gpu->kernel_bo = NULL; + } + + if (gpu->thread.curbe) { + cl_free(gpu->thread.curbe); + gpu->thread.curbe = NULL; + } + + if (gpu->aux.aux_bo != NULL) { + assert(gpu->aux.aux_bo->virtual == NULL); + drm_intel_bo_unreference(gpu->aux.aux_bo); + gpu->aux.aux_bo = NULL; + } + + if (gpu->mem.scratch_bo) { + drm_intel_bo_unreference(gpu->mem.scratch_bo); + gpu->mem.scratch_bo = NULL; + } + + if (gpu->mem.stack_bo) { + drm_intel_bo_unreference(gpu->mem.stack_bo); + gpu->mem.stack_bo = NULL; + } + + if (gpu->mem.const_bo) { + drm_intel_bo_unreference(gpu->mem.const_bo); + gpu->mem.const_bo = NULL; + } + + if (gpu->mem.time_stamp_bo) { + drm_intel_bo_unreference(gpu->mem.time_stamp_bo); + gpu->mem.time_stamp_bo = NULL; + } + + if (gpu->batch) { + intel_batchbuffer_delete(gpu->batch); + gpu->batch = NULL; + } + + cl_free(gpu); + return; +} diff --git a/src/gen/cl_gen.h b/src/gen/cl_gen.h index f761652..867d84c 100644 --- a/src/gen/cl_gen.h +++ b/src/gen/cl_gen.h @@ -157,4 +157,13 @@ extern cl_int cl_program_load_binary_gen(cl_device_id device, cl_program prog); extern cl_int cl_program_get_info_gen(cl_device_id device, cl_program program, cl_uint param_name, void *param_value); +/******************************** Command Queue *****************************************/ +extern cl_int cl_command_queue_ND_range_gen(cl_command_queue queue, cl_kernel ker, + cl_event event, const uint32_t work_dim, + const size_t *global_wk_off, const size_t *global_wk_sz, + const size_t *local_wk_sz); +/************************************ Compiler ******************************************/ +extern cl_int cl_compiler_load_gen(cl_device_id device); +extern cl_int cl_compiler_unload_gen(cl_device_id device); + #endif /* End of __CL_GEN_H__ */ -- 2.7.4 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet