From: Junyan He <junyan...@intel.com> Signed-off-by: Junyan He <junyan...@intel.com> --- runtime/gen/cl_command_queue_gen.c | 1721 ++++++++++++++++++++++++++++++++++++ 1 file changed, 1721 insertions(+) create mode 100644 runtime/gen/cl_command_queue_gen.c
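The non-uniform work-group support in cl_command_queue_ND_range() below splits the
NDRange, per dimension, into a body that is divisible by the local size plus a
remainder, and enqueues up to 2^3 = 8 sub-ranges. Illustrative values for a 1-D
case (example values, not from the patch itself):

    /* global_wk_sz = {1000, 1, 1}, local_wk_sz = {64, 1, 1} */
    global_wk_sz_div = {960, 1, 1}; /* 15 full work-groups of 64 */
    global_wk_sz_rem = {40, 1, 1};  /* one trailing work-group of 40 */
    /* => two GPGPU walkers are emitted: 960 items with local size 64 at
       group offset 0, and 40 items with local size 40 at group offset 15 */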
diff --git a/runtime/gen/cl_command_queue_gen.c b/runtime/gen/cl_command_queue_gen.c
new file mode 100644
index 0000000..4c18a52
--- /dev/null
+++ b/runtime/gen/cl_command_queue_gen.c
@@ -0,0 +1,1721 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_gen.h"
+#include "gen_device_pci_id.h"
+
+#include "intel_defines.h"
+#include "intel_structs.h"
+#include "intel_batchbuffer.h"
+
+#include <i915_drm.h>
+#include <drm.h>
+#include <intel_bufmgr.h>
+#include <assert.h>
+#include <string.h>
+
+/* We can bind only a limited number of buffers */
+enum { max_buf_n = 128 };
+enum { max_img_n = 128 };
+enum { max_sampler_n = 16 };
+
+// BTI magic numbers
+#define BTI_CONSTANT 0
+#define BTI_PRIVATE 1
+#define BTI_RESERVED_NUM 2
+#define BTI_MAX_READ_IMAGE_ARGS 128
+#define BTI_MAX_WRITE_IMAGE_ARGS 8
+#define BTI_WORKAROUND_IMAGE_OFFSET 128
+#define BTI_MAX_ID 253
+#define BTI_LOCAL 0xfe
+
+typedef struct gen_gpgpu {
+  drm_intel_bufmgr *bufmgr; // The drm buffer manager
+  cl_device_id device;      // The device of this gpu
+  drm_intel_bo *kernel_bo;  // The buffer object holding the kernel bitcode
+  uint32_t simd_size;       // The simd size we are executing
+  uint32_t atomic_test_result;
+
+  struct intel_batchbuffer *batch; // The batch buffer holding the GPU commands
+
+  struct {
+    drm_intel_bo *aux_bo; // Aux buffer needed by the GPU commands
+    uint32_t surface_heap_offset;
+    uint32_t curbe_offset;
+    uint32_t idrt_offset;
+    uint32_t sampler_state_offset;
+    uint32_t sampler_border_color_state_offset;
+  } aux; // All aux setting info
+
+  struct {
+    uint32_t local_mem_size; // The total local memory size
+
+    uint32_t max_bti;                      /* Max bti number */
+    uint32_t binded_n;                     /* Number of buffers bound */
+    drm_intel_bo *binded_buf[max_buf_n];   /* All buffers bound for the kernel, e.g. the kernel's args */
+    uint32_t binded_offset[max_buf_n];     /* The offset in the curbe buffer */
+    uint32_t target_buf_offset[max_buf_n]; /* The offset within the buffers to be bound */
+
+    uint32_t per_thread_scratch_size;
+    uint32_t total_scratch_size;
+    drm_intel_bo *scratch_bo; /* Scratch buffer */
+
+    drm_intel_bo *const_bo; /* Constant buffer */
+    drm_intel_bo *stack_bo; /* Stack buffer */
+
+    drm_intel_bo *time_stamp_bo; /* The buffer to record exec timestamps */
+  } mem;
+
+  struct {
+    uint64_t sampler_bitmap; /* Sampler usage bitmap. */
+  } sampler;
+
+  struct {
+    uint32_t barrier_slm_used;   /* Use barrier or SLM */
+    uint32_t thread_num;         // Total thread number we need for this kernel
+    uint32_t max_thread_num;     // Max thread number we can run at the same time
+    uint32_t per_thread_scratch; // Scratch buffer size for each thread
+    uint32_t num_cs_entries;     /* Curbe entry number */
+    uint32_t size_cs_entry;      /* Size of one entry in 512bit elements */
+    char *curbe;                 /* Curbe content */
+    uint32_t curbe_size;         /* Curbe size */
+  } thread;
+
+} gen_gpgpu;
+
+typedef struct gen_gpgpu_exec_ctx {
+  void *device_enqueue_helper_ptr;
+  drm_intel_bo *device_enqueue_helper_bo;
+  size_t helper_bo_size;
+  cl_int gpu_num;
+  gen_gpgpu *all_gpu[8];
+} gen_gpgpu_exec_ctx;
+
+#define MAX_IF_DESC 32
+
+typedef struct surface_heap {
+  uint32_t binding_table[256];
+  char surface[256 * sizeof(gen_surface_state_t)];
+} surface_heap_t;
+
+#include "gen_gpgpu_func.c"
+
+static cl_int
+check_work_group_capability(cl_command_queue queue, cl_kernel kernel,
+                            const size_t *local_wk_sz, uint32_t wk_dim)
+{
+  size_t sz = 0;
+  int i;
+
+  sz = local_wk_sz[0];
+  for (i = 1; i < wk_dim; ++i)
+    sz *= local_wk_sz[i];
+
+  if (sz > cl_kernel_get_max_workgroup_size_gen(kernel, queue->device))
+    return CL_INVALID_WORK_ITEM_SIZE;
+
+  return CL_SUCCESS;
+}
+
+static cl_int
+gen_gpgpu_setup_curbe(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu,
+                      const uint32_t work_dim, const size_t *global_wk_off,
+                      const size_t *global_wk_sz, const size_t *local_wk_sz,
+                      const size_t *enqueued_local_wk_sz, uint64_t device_enqueue_helper)
+{
+  int curbe_size = 0;
+  char *curbe = NULL;
+  int i;
+  int sz;
+  uint32_t slm_offset;
+
+  /* Calculate the total size needed */
+  for (i = 0; i < kernel->arg_n; i++) {
+    if (kernel->args[i].arg_size + kernel_gen->arg_extra_info[i].arg_offset > curbe_size)
+      curbe_size = kernel->args[i].arg_size + kernel_gen->arg_extra_info[i].arg_offset;
+  }
+  for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) {
+    sz = kernel_gen->virt_reg_phy_offset[i].phy_offset +
+         kernel_gen->virt_reg_phy_offset[i].size;
+    if (sz > curbe_size)
+      curbe_size = sz;
+  }
+  /* The image info fields are curbe offsets where uint32_t values will be
+     written later, so the curbe must cover each of them */
+  for (i = 0; i < kernel_gen->image_info_num; i++) {
+    if (kernel_gen->image_info[i].width > curbe_size)
+      curbe_size = kernel_gen->image_info[i].width;
+    if (kernel_gen->image_info[i].height > curbe_size)
+      curbe_size = kernel_gen->image_info[i].height;
+    if (kernel_gen->image_info[i].depth > curbe_size)
+      curbe_size = kernel_gen->image_info[i].depth;
+    if (kernel_gen->image_info[i].data_type > curbe_size)
+      curbe_size = kernel_gen->image_info[i].data_type;
+    if (kernel_gen->image_info[i].channel_order > curbe_size)
+      curbe_size = kernel_gen->image_info[i].channel_order;
+  }
+
+  curbe_size = ALIGN(curbe_size, 32);
+
+  gpu->thread.curbe_size = curbe_size;
+
+  if (curbe_size == 0) {
+    assert(kernel->arg_n == 0);
+    return CL_SUCCESS;
+  }
+
+  curbe = CL_MALLOC(curbe_size);
+  if (curbe == NULL) {
+    return CL_OUT_OF_HOST_MEMORY;
+  }
+  gpu->thread.curbe = curbe;
+  memset(curbe, 0, curbe_size);
+
+  slm_offset = kernel_gen->local_mem_size;
+  for (i = 0; i < kernel->arg_n; i++) {
+    if (kernel_gen->arg_extra_info[i].arg_offset < 0) // unused argument
+      continue;
+
+    if (kernel->args[i].arg_type == ArgTypePointer &&
+        kernel->args[i].arg_addrspace == AddressSpaceLocal) { // SLM setting
+      assert(kernel->args[i].val_size > 0);
+      assert(kernel->args[i].arg_size == sizeof(uint32_t) || kernel->args[i].arg_size == sizeof(uint64_t));
+      assert(kernel_gen->arg_extra_info[i].arg_align > 0);
+      // The SLM address needs to be aligned
+      slm_offset = ALIGN(slm_offset, kernel_gen->arg_extra_info[i].arg_align);
+      if (kernel->args[i].arg_size == sizeof(uint32_t)) {
+        *((uint32_t *)(curbe +
kernel_gen->arg_extra_info[i].arg_offset)) = slm_offset; + } else { + *((uint64_t *)(curbe + kernel_gen->arg_extra_info[i].arg_offset)) = slm_offset; + } + slm_offset += kernel->args[i].val_size; + continue; + } + + if (kernel->args[i].arg_type == ArgTypePointer) { + assert(kernel->args[i].arg_addrspace == AddressSpaceConstant || + kernel->args[i].arg_addrspace == AddressSpaceGlobal); + /* For other buffer, we will set this value in surface binding */ + continue; + } + + if (kernel->args[i].arg_type == ArgTypeSampler) { + continue; + } + + if (kernel->args[i].arg_type == ArgTypeImage) { + continue; + } + + /* Common value or struct data, just copy the content */ + assert(kernel->args[i].val_size == kernel->args[i].arg_size); + if (kernel->args[i].arg_type == ArgTypeValue && kernel->args[i].arg_size <= sizeof(cl_double)) + memcpy(curbe + kernel_gen->arg_extra_info[i].arg_offset, &kernel->args[i].val, kernel->args[i].arg_size); + else + memcpy(curbe + kernel_gen->arg_extra_info[i].arg_offset, kernel->args[i].val.val_ptr, kernel->args[i].arg_size); + } + +#define UPLOAD(ENUM, VALUE, SIZE) \ + if (kernel_gen->virt_reg_phy_offset[i].virt_reg == ENUM) { \ + assert(kernel_gen->virt_reg_phy_offset[i].size == sizeof(SIZE)); \ + *((SIZE *)(curbe + kernel_gen->virt_reg_phy_offset[i].phy_offset)) = VALUE; \ + continue; \ + } + + for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) { + UPLOAD(CL_GEN_VIRT_REG_ENQUEUE_BUF_POINTER, device_enqueue_helper, uint64_t); + UPLOAD(CL_GEN_VIRT_REG_LOCAL_SIZE_X, local_wk_sz[0], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_LOCAL_SIZE_Y, local_wk_sz[1], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_LOCAL_SIZE_Z, local_wk_sz[2], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_ENQUEUED_LOCAL_SIZE_X, enqueued_local_wk_sz[0], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_ENQUEUED_LOCAL_SIZE_Y, enqueued_local_wk_sz[1], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_ENQUEUED_LOCAL_SIZE_Z, enqueued_local_wk_sz[2], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_GLOBAL_SIZE_X, global_wk_sz[0], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_GLOBAL_SIZE_Y, global_wk_sz[1], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_GLOBAL_SIZE_Z, global_wk_sz[2], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_GLOBAL_OFFSET_X, global_wk_off[0], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_GLOBAL_OFFSET_Y, global_wk_off[1], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_GLOBAL_OFFSET_Z, global_wk_off[2], uint32_t); + UPLOAD(CL_GEN_VIRT_REG_GROUP_NUM_X, + global_wk_sz[0] / enqueued_local_wk_sz[0] + (global_wk_sz[0] % enqueued_local_wk_sz[0] ? 1 : 0), + uint32_t); + UPLOAD(CL_GEN_VIRT_REG_GROUP_NUM_Y, + global_wk_sz[1] / enqueued_local_wk_sz[1] + (global_wk_sz[1] % enqueued_local_wk_sz[1] ? 1 : 0), + uint32_t); + UPLOAD(CL_GEN_VIRT_REG_GROUP_NUM_Z, + global_wk_sz[2] / enqueued_local_wk_sz[2] + (global_wk_sz[2] % enqueued_local_wk_sz[2] ? 
1 : 0), + uint32_t); + UPLOAD(CL_GEN_VIRT_REG_THREAD_NUM, gpu->thread.thread_num, + uint32_t); + UPLOAD(CL_GEN_VIRT_REG_WORK_DIM, work_dim, uint32_t); + } +#undef UPLOAD + + return CL_SUCCESS; +} + +static void +gen_gpgpu_bind_one_bo(gen_gpgpu *gpu, drm_intel_bo *buf, uint32_t offset, + uint32_t internal_offset, size_t size, uint8_t bti) +{ + if (buf == NULL) + return; + + assert(gpu->mem.binded_n < max_buf_n); + if (offset != -1) { + gpu->mem.binded_buf[gpu->mem.binded_n] = buf; + gpu->mem.target_buf_offset[gpu->mem.binded_n] = internal_offset; + gpu->mem.binded_offset[gpu->mem.binded_n] = offset; + gpu->mem.binded_n++; + } + gen_gpgpu_setup_bti(gpu, buf, internal_offset, size, bti, I965_SURFACEFORMAT_RAW); +} + +static void +gen_gpgpu_setup_global_mem(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu) +{ + int i; + int32_t offset = 0; + cl_mem mem; + uint32_t bti; + cl_program_gen prog_gen; + cl_mem_gen mem_gen; + + DEV_PRIVATE_DATA(kernel->program, gpu->device, prog_gen); + + for (i = 0; i < kernel->arg_n; i++) { + if (kernel->args[i].arg_type != ArgTypePointer && kernel->args[i].arg_type != ArgTypePipe) + continue; + + if (kernel->args[i].arg_addrspace != AddressSpaceGlobal && + kernel->args[i].arg_addrspace != AddressSpaceConstant) + continue; + + if (prog_gen->cl_version < 200 && kernel->args[i].arg_addrspace == AddressSpaceConstant) + continue; + + mem = NULL; + mem_gen = NULL; + offset = 0; + bti = kernel_gen->arg_extra_info[i].arg_misc; + + if (kernel->args[i].use_svm) { + assert(CL_OBJECT_IS_SVM(kernel->args[i].val.val_svm.svm)); + mem = kernel->args[i].val.val_svm.svm; + DEV_PRIVATE_DATA(mem, gpu->device, mem_gen); + assert(mem_gen->drm_bo); + assert(mem_gen->mem_base.device == gpu->device); + assert(mem->host_ptr); + gen_gpgpu_bind_one_bo(gpu, mem_gen->drm_bo->bo, kernel_gen->arg_extra_info[i].arg_offset, + kernel->args[i].val.val_svm.ptr - mem->host_ptr, + mem_gen->drm_bo->gpu_size, bti); + } else { + if (kernel->args[i].val.val_mem != NULL) { + mem = (cl_mem)kernel->args[i].val.val_mem; + mem_gen = (cl_mem_gen)mem->each_device[0]; + assert(mem_gen); + assert(mem_gen->drm_bo); + assert(mem_gen->mem_base.device == gpu->device); + offset = mem_gen->drm_bo->in_page_offset; + } + + if (CL_OBJECT_IS_BUFFER(mem) && cl_mem_to_buffer(mem)->svm_buf) { + offset += cl_mem_to_buffer(mem)->svm_offset; + } else if (CL_OBJECT_IS_SUB_BUFFER(mem)) { + offset += cl_mem_to_buffer(mem)->sub_offset; + } + + gen_gpgpu_bind_one_bo(gpu, mem ? mem_gen->drm_bo->bo : NULL, + kernel_gen->arg_extra_info[i].arg_offset, offset, + mem ? 
mem_gen->drm_bo->gpu_size : 0, bti); + } + + if (gpu->mem.max_bti < bti) + gpu->mem.max_bti = bti; + } +} + +static cl_int +gen_gpgpu_setup_kernel_exec_svm_mem(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu) +{ + int i; + int32_t offset = 0; + cl_mem mem; + uint32_t bti; + cl_mem_gen mem_gen; + + if (kernel->exec_info == NULL) + return CL_SUCCESS; + + assert(kernel->exec_info_n > 0); + for (i = 0; i < kernel->exec_info_n; i++) { + offset = kernel->exec_info[i].offset; + mem = kernel->exec_info[i].svm; + DEV_PRIVATE_DATA(mem, gpu->device, mem_gen); + + if (gpu->mem.max_bti == BTI_MAX_ID) + return CL_OUT_OF_RESOURCES; + + bti = gpu->mem.max_bti; + gpu->mem.max_bti++; + + /* No need to setup the offset in curbe, just setup bti */ + gen_gpgpu_setup_bti(gpu, mem_gen->drm_bo->bo, offset, mem->size, bti, I965_SURFACEFORMAT_RAW); + } + return CL_SUCCESS; +} + +static cl_int +gen_gpgpu_setup_image(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu) +{ + int i; + cl_mem mem; + cl_mem_image image; + cl_mem_gen image_gen; + cl_gen_image_info_offset info; + + for (i = 0; i < kernel->arg_n; i++) { + if (kernel->args[i].arg_type != ArgTypeImage) + continue; + + assert(kernel->args[i].val.val_ptr != NULL); + mem = (cl_mem)kernel->args[i].val.val_ptr; + image = cl_mem_to_image(mem); + image_gen = (cl_mem_gen)mem->each_device[0]; + assert(image_gen); + assert(image_gen->drm_bo); + + info = &kernel_gen->image_info[kernel_gen->arg_extra_info[i].arg_misc]; + + /* Set the image info to the curbe */ + if (info->width >= 0) + *(uint32_t *)(gpu->thread.curbe + info->width) = image->w; + if (info->height >= 0) + *(uint32_t *)(gpu->thread.curbe + info->height) = image->h; + if (info->depth >= 0) + *(uint32_t *)(gpu->thread.curbe + info->depth) = image->depth; + if (info->channel_order >= 0) + *(uint32_t *)(gpu->thread.curbe + info->channel_order) = + image->fmt.image_channel_order; + if (info->data_type >= 0) + *(uint32_t *)(gpu->thread.curbe + info->data_type) = + image->fmt.image_channel_data_type; + + if (gpu->mem.max_bti < info->bti) + gpu->mem.max_bti = info->bti; + + gen_gpgpu_bind_image(gpu, info->bti, image_gen->drm_bo->bo, + image_gen->image.sub_offset + image_gen->drm_bo->in_page_offset, + image_gen->image.intel_fmt, image->image_type, image->bpp, image->w, + image->h, image->depth, image_gen->image.gpu_row_pitch, + image_gen->image.gpu_slice_pitch, image_gen->drm_bo->tiling); + + // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer + // on demand. 
+      if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+        gen_gpgpu_bind_image(gpu, info->bti + BTI_WORKAROUND_IMAGE_OFFSET,
+                             image_gen->drm_bo->bo,
+                             image_gen->image.sub_offset + image_gen->drm_bo->in_page_offset,
+                             image_gen->image.intel_fmt, image->image_type, image->bpp, image->w,
+                             image->h, image->depth, image_gen->image.gpu_row_pitch,
+                             image_gen->image.gpu_slice_pitch, image_gen->drm_bo->tiling);
+  }
+
+  return CL_SUCCESS;
+}
+
+static cl_int
+gen_gpgpu_setup_scratch(gen_gpgpu *gpu)
+{
+  drm_intel_bufmgr *bufmgr = gpu->bufmgr;
+  cl_uint device_id = gpu->device->device_id;
+
+  gpu->mem.total_scratch_size = gpu->mem.per_thread_scratch_size * gpu->thread.max_thread_num;
+  /* Per Bspec, scratch should be 2X the desired size when the EU index is not continuous */
+  if (IS_HASWELL(device_id) || IS_CHERRYVIEW(device_id) ||
+      device_id == PCI_CHIP_BROXTON_1 || device_id == PCI_CHIP_BROXTON_3) {
+    gpu->mem.total_scratch_size = gpu->mem.total_scratch_size * 2;
+  }
+
+  if (gpu->mem.total_scratch_size) {
+    gpu->mem.scratch_bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO",
+                                             gpu->mem.total_scratch_size, 4096);
+    if (gpu->mem.scratch_bo == NULL)
+      return CL_OUT_OF_RESOURCES;
+  }
+  return CL_SUCCESS;
+}
+
+static cl_int
+gen_setup_constant_buffer_for_20(cl_kernel kernel, cl_kernel_gen kernel_gen,
+                                 cl_program_gen prog_gen, gen_gpgpu *gpu)
+{
+#ifndef HAS_BO_SET_SOFTPIN
+  return CL_OUT_OF_RESOURCES;
+#else
+  int i;
+  cl_bool need_const_buf = CL_FALSE;
+  cl_int const_addr_curbe_offset = -1;
+  cl_gen_virt_phy_offset map = kernel_gen->virt_reg_phy_offset;
+
+  for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) {
+    if (map[i].virt_reg == CL_GEN_VIRT_REG_CONSTANT_ADDRSPACE) {
+      need_const_buf = CL_TRUE;
+      const_addr_curbe_offset = map[i].phy_offset;
+      assert(map[i].size == 8);
+      break;
+    }
+  }
+
+  if (need_const_buf == CL_FALSE)
+    return CL_SUCCESS;
+
+  assert(prog_gen->global_mem_data); // Should always have something
+  assert(const_addr_curbe_offset >= 0);
+
+  gpu->mem.const_bo = intel_buffer_alloc_userptr(gpu->bufmgr, "program global data",
+                                                 prog_gen->global_mem_data, prog_gen->global_mem_data_size, 0);
+  drm_intel_bo_set_softpin_offset(gpu->mem.const_bo, (size_t)prog_gen->global_mem_data);
+  drm_intel_bo_use_48b_address_range(gpu->mem.const_bo, 1);
+  *(char **)(gpu->thread.curbe + const_addr_curbe_offset) = prog_gen->global_mem_data;
+  gen_gpgpu_bind_one_bo(gpu, gpu->mem.const_bo, const_addr_curbe_offset, 0,
+                        prog_gen->global_mem_data_size, BTI_CONSTANT);
+  return CL_SUCCESS;
+#endif
+}
+
+static cl_int
+gen_setup_constant_buffer(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu)
+{
+  cl_program_gen prog_gen;
+  cl_uint const_buf_size = 0;
+  cl_uint aligned_const_buf_size = 0;
+  cl_mem mem;
+  cl_uint addr_offset;
+  char *const_buf_addr = NULL;
+  int i;
+  DEV_PRIVATE_DATA(kernel->program, gpu->device, prog_gen);
+
+  /* OpenCL 2.0 handles the constant buffer differently */
+  if (prog_gen->cl_version >= 200) {
+    return gen_setup_constant_buffer_for_20(kernel, kernel_gen, prog_gen, gpu);
+  }
+
+  if (prog_gen->rodata) {
+    const_buf_size = prog_gen->rodata_data->d_size;
+    aligned_const_buf_size = ALIGN(const_buf_size, 8);
+  } else {
+    /* Reserve 8 bytes to avoid a 0 address */
+    aligned_const_buf_size = 8;
+  }
+
+  /* Calculate the total constant memory size */
+  for (i = 0; i < kernel->arg_n; i++) {
+    if (kernel->args[i].arg_type != ArgTypePointer)
+      continue;
+    if (kernel->args[i].arg_addrspace != AddressSpaceConstant)
+      continue;
+
+    if (kernel->args[i].val.val_ptr == NULL)
+      continue;
+
+
assert(kernel_gen->arg_extra_info[i].arg_align != 0); + mem = (cl_mem)kernel->args[i].val.val_ptr; + const_buf_size += mem->size; + aligned_const_buf_size = ALIGN(aligned_const_buf_size, kernel_gen->arg_extra_info[i].arg_align); + aligned_const_buf_size += mem->size; + } + + if (const_buf_size == 0) // No need for constant buffer. + return CL_SUCCESS; + + gen_gpgpu_alloc_constant_buffer(gpu, aligned_const_buf_size, BTI_CONSTANT); + if (gpu->mem.const_bo == NULL) + return CL_OUT_OF_RESOURCES; + + drm_intel_bo_map(gpu->mem.const_bo, 1); + + const_buf_addr = gpu->mem.const_bo->virtual; + if (const_buf_addr == NULL) + return CL_OUT_OF_RESOURCES; + + addr_offset = 0; + /* upload the global constant data, in rodata */ + if (prog_gen->rodata && prog_gen->rodata_data->d_size > 0) { + memcpy(const_buf_addr, prog_gen->rodata_data->d_buf, prog_gen->rodata_data->d_size); + addr_offset = prog_gen->rodata_data->d_size; + addr_offset = ALIGN(addr_offset, 8); + } else { + addr_offset = 8; + } + + /* Upload constant ptr content */ + for (i = 0; i < kernel->arg_n; i++) { + cl_uint ptr_val = 0; + + if (kernel->args[i].arg_type != ArgTypePointer) + continue; + if (kernel->args[i].arg_addrspace != AddressSpaceConstant) + continue; + + assert(kernel_gen->arg_extra_info[i].arg_align > 0); + addr_offset = ALIGN(addr_offset, kernel_gen->arg_extra_info[i].arg_align); + assert(kernel->args[i].arg_size == sizeof(uint32_t) || kernel->args[i].arg_size == sizeof(uint64_t)); + + mem = (cl_mem)kernel->args[i].val.val_ptr; + if (mem) { + cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0]; + void *cst_ptr = NULL; + assert(mem_gen); + assert(mem_gen->drm_bo); + cst_ptr = cl_mem_gen_drm_bo_map(mem_gen->drm_bo, CL_FALSE); + memcpy(const_buf_addr + addr_offset, cst_ptr, mem->size); + cl_mem_gen_drm_bo_unmap(mem_gen->drm_bo); + ptr_val = addr_offset; + addr_offset += mem->size; + addr_offset = ALIGN(addr_offset, kernel_gen->arg_extra_info[i].arg_align); + } + + /* Set curbe */ + if (kernel_gen->arg_extra_info[i].arg_offset >= 0) { + if (kernel->args[i].arg_size == sizeof(uint32_t)) { + *(uint32_t *)(gpu->thread.curbe + kernel_gen->arg_extra_info[i].arg_offset) = ptr_val; + } else { + *(uint64_t *)(gpu->thread.curbe + kernel_gen->arg_extra_info[i].arg_offset) = ptr_val; + } + } + } + + drm_intel_bo_unmap(gpu->mem.const_bo); + return CL_SUCCESS; +} + +static cl_int +gen_gpgpu_upload_final_curbe(cl_kernel kernel, cl_kernel_gen kernel_gen, + gen_gpgpu *gpu, const size_t *local_wk_sz) +{ + char *final_curbe = NULL; + char *final_curbe_ptr = NULL; + cl_gen_virt_phy_offset map = kernel_gen->virt_reg_phy_offset; + int i, j, k, curr = 0; + uint32_t *ids[3] = {NULL, NULL, NULL}; + int32_t id_offset[3], ip_offset, tid_offset; + uint16_t *block_ips = NULL; + uint32_t *thread_ids = NULL; + int32_t dw_ip_offset = -1; + + if (gpu->thread.curbe_size == 0) { + assert(gpu->thread.curbe == NULL); + return CL_SUCCESS; + } + + assert(gpu->thread.thread_num > 0); + final_curbe = CL_MALLOC(gpu->thread.thread_num * gpu->thread.curbe_size); + if (final_curbe == NULL) + return CL_OUT_OF_HOST_MEMORY; + + for (i = 0; i < gpu->thread.thread_num; ++i) { + memcpy(final_curbe + gpu->thread.curbe_size * i, + gpu->thread.curbe, gpu->thread.curbe_size); + } + + id_offset[0] = id_offset[1] = id_offset[2] = -1; + ip_offset = -1; + tid_offset = -1; + if (map) { + for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) { + if (map[i].virt_reg == CL_GEN_VIRT_REG_LOCAL_ID_X || + map[i].virt_reg == CL_GEN_VIRT_REG_LOCAL_ID_Y || + map[i].virt_reg == 
CL_GEN_VIRT_REG_LOCAL_ID_Z) { + id_offset[map[i].virt_reg - CL_GEN_VIRT_REG_LOCAL_ID_X] = map[i].phy_offset; + assert(map[i].phy_offset >= 0); + assert(map[i].size / gpu->simd_size == sizeof(uint32_t)); + continue; + } + if (map[i].virt_reg == CL_GEN_VIRT_REG_BLOCK_IP) { + ip_offset = map[i].phy_offset; + assert(map[i].phy_offset >= 0); + assert(map[i].size / gpu->simd_size == sizeof(uint16_t)); + continue; + } + if (map[i].virt_reg == CL_GEN_VIRT_REG_THREAD_ID) { + tid_offset = map[i].phy_offset; + assert(map[i].phy_offset >= 0); + assert(map[i].size == sizeof(uint32_t)); + continue; + } + if (map[i].virt_reg == CL_GEN_VIRT_REG_DW_BLOCK_IP) { + dw_ip_offset = map[i].phy_offset; + assert(map[i].phy_offset >= 0); + assert(map[i].size / gpu->simd_size == sizeof(uint32_t)); + continue; + } + } + + assert(ip_offset < 0 || dw_ip_offset < 0); + assert(ip_offset >= 0 || dw_ip_offset >= 0); + + if (id_offset[0] >= 0) { + ids[0] = (uint32_t *)alloca(sizeof(uint32_t) * gpu->thread.thread_num * gpu->simd_size); + assert(id_offset[0] >= 0); + } + if (id_offset[1] >= 0) { + ids[1] = (uint32_t *)alloca(sizeof(uint32_t) * gpu->thread.thread_num * gpu->simd_size); + assert(id_offset[1] >= 0); + } + if (id_offset[2] >= 0) { + ids[2] = (uint32_t *)alloca(sizeof(uint32_t) * gpu->thread.thread_num * gpu->simd_size); + assert(id_offset[2] >= 0); + } + + block_ips = (uint16_t *)alloca(sizeof(uint16_t) * gpu->thread.thread_num * gpu->simd_size); + assert(block_ips >= 0); + memset(block_ips, 0xff, sizeof(int16_t) * gpu->thread.thread_num * gpu->simd_size); + + if (tid_offset >= 0) { + thread_ids = (uint32_t *)alloca(sizeof(uint32_t) * gpu->thread.thread_num); + assert(thread_ids >= 0); + memset(thread_ids, 0, sizeof(uint32_t) * gpu->thread.thread_num); + } + /* Compute the IDs and the block IPs */ + for (k = 0; k < local_wk_sz[2]; ++k) { + for (j = 0; j < local_wk_sz[1]; ++j) { + for (i = 0; i < local_wk_sz[0]; ++i, ++curr) { + if (id_offset[0] >= 0) + ids[0][curr] = i; + if (id_offset[1] >= 0) + ids[1][curr] = j; + if (id_offset[2] >= 0) + ids[2][curr] = k; + block_ips[curr] = 0; + if (thread_ids) + thread_ids[curr / gpu->simd_size] = curr / gpu->simd_size; + } + } + } + + /* Set the vary part of curbe */ + curr = 0; + final_curbe_ptr = final_curbe; + for (i = 0; i < gpu->thread.thread_num; ++i, final_curbe_ptr += gpu->thread.curbe_size) { + uint32_t *ids0 = (uint32_t *)(final_curbe_ptr + id_offset[0]); + uint32_t *ids1 = (uint32_t *)(final_curbe_ptr + id_offset[1]); + uint32_t *ids2 = (uint32_t *)(final_curbe_ptr + id_offset[2]); + uint16_t *ips = (uint16_t *)(final_curbe_ptr + ip_offset); + uint32_t *dw_ips = (uint32_t *)(final_curbe_ptr + dw_ip_offset); + + if (thread_ids) + *(uint32_t *)(final_curbe_ptr + tid_offset) = thread_ids[i]; + + for (j = 0; j < gpu->simd_size; ++j, ++curr) { + if (id_offset[0] >= 0) + ids0[j] = ids[0][curr]; + if (id_offset[1] >= 0) + ids1[j] = ids[1][curr]; + if (id_offset[2] >= 0) + ids2[j] = ids[2][curr]; + if (ip_offset >= 0) + ips[j] = block_ips[curr]; + if (dw_ip_offset >= 0) + dw_ips[j] = block_ips[curr]; + } + } + } + + /* All settings are OK, upload it to GPU */ + gen_gpgpu_upload_curbes(gpu, final_curbe, gpu->thread.thread_num * gpu->thread.curbe_size); + CL_FREE(final_curbe); + return CL_SUCCESS; +} + +static cl_int +gen_gpgu_bind_stack(gen_gpgpu *gpu, cl_kernel kernel, cl_kernel_gen kernel_gen) +{ + int32_t stack_sz = kernel_gen->stack_size; + int32_t stack_offset = -1; + int32_t stack_size_offset = -1; + int i; + + if (stack_sz == 0) + return CL_SUCCESS; + + stack_sz *= 
kernel_gen->simd_width;
+  stack_sz *= gpu->thread.max_thread_num;
+
+  if (IS_GEN75(gpu->device->device_id))
+    stack_sz = stack_sz * 4;
+  else if (gpu->device->device_id == PCI_CHIP_BROXTON_1 || gpu->device->device_id == PCI_CHIP_BROXTON_3 ||
+           IS_CHERRYVIEW(gpu->device->device_id))
+    stack_sz = stack_sz * 2;
+
+  for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) {
+    if (kernel_gen->virt_reg_phy_offset[i].virt_reg == CL_GEN_VIRT_REG_STACK_SIZE) {
+      assert(kernel_gen->virt_reg_phy_offset[i].size == sizeof(uint32_t));
+      stack_size_offset = kernel_gen->virt_reg_phy_offset[i].phy_offset;
+      continue;
+    }
+    if (kernel_gen->virt_reg_phy_offset[i].virt_reg == CL_GEN_VIRT_REG_EXTRA_ARGUMENT) {
+      assert(kernel_gen->virt_reg_phy_offset[i].size == sizeof(uint64_t));
+      stack_offset = kernel_gen->virt_reg_phy_offset[i].phy_offset;
+      continue;
+    }
+  }
+  assert(stack_offset >= 0);
+
+  /* Write the stack size to its own curbe slot */
+  if (stack_size_offset >= 0)
+    *((uint32_t *)(gpu->thread.curbe + stack_size_offset)) = stack_sz;
+
+  gpu->mem.stack_bo = drm_intel_bo_alloc(gpu->bufmgr, "STACK", stack_sz, 64);
+  if (gpu->mem.stack_bo == NULL)
+    return CL_OUT_OF_RESOURCES;
+
+  gen_gpgpu_bind_one_bo(gpu, gpu->mem.stack_bo, stack_offset, 0, stack_sz, BTI_PRIVATE);
+  return CL_SUCCESS;
+}
+
+#define MAX_GROUP_SIZE_IN_HALFSLICE 512
+static size_t
+gen_gpu_compute_batch_sz(cl_kernel k)
+{
+  return 256 + 256;
+}
+
+static void
+cl_command_queue_delete_gpgpu(void *gpgpu)
+{
+  gen_gpgpu *gpu = gpgpu;
+
+  if (gpgpu == NULL)
+    return;
+
+  if (gpu->kernel_bo) {
+    drm_intel_bo_unreference(gpu->kernel_bo);
+    gpu->kernel_bo = NULL;
+  }
+
+  if (gpu->thread.curbe) {
+    CL_FREE(gpu->thread.curbe);
+    gpu->thread.curbe = NULL;
+  }
+
+  if (gpu->aux.aux_bo != NULL) {
+    assert(gpu->aux.aux_bo->virtual == NULL);
+    drm_intel_bo_unreference(gpu->aux.aux_bo);
+    gpu->aux.aux_bo = NULL;
+  }
+
+  if (gpu->mem.scratch_bo) {
+    drm_intel_bo_unreference(gpu->mem.scratch_bo);
+    gpu->mem.scratch_bo = NULL;
+  }
+
+  if (gpu->mem.stack_bo) {
+    drm_intel_bo_unreference(gpu->mem.stack_bo);
+    gpu->mem.stack_bo = NULL;
+  }
+
+  if (gpu->mem.const_bo) {
+    drm_intel_bo_unreference(gpu->mem.const_bo);
+    gpu->mem.const_bo = NULL;
+  }
+
+  if (gpu->mem.time_stamp_bo) {
+    drm_intel_bo_unreference(gpu->mem.time_stamp_bo);
+    gpu->mem.time_stamp_bo = NULL;
+  }
+
+  if (gpu->batch) {
+    intel_batchbuffer_delete(gpu->batch);
+    gpu->batch = NULL;
+  }
+
+  CL_FREE(gpu);
+  return;
+}
+
+static void
+gen_gpgpu_setup_sampler(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu)
+{
+  cl_uint i;
+  clk_sampler_type *spt;
+  cl_uint *samper_info = NULL;
+
+  if (kernel_gen->samper_info == NULL) {
+    assert(kernel_gen->samper_info_num == 0);
+    return;
+  }
+
+  samper_info = CL_MALLOC(sizeof(cl_uint) * kernel_gen->samper_info_num);
+  assert(samper_info);
+  memcpy(samper_info, kernel_gen->samper_info, sizeof(cl_uint) * kernel_gen->samper_info_num);
+
+  for (i = 0; i < kernel->arg_n; i++) {
+    if (kernel->args[i].arg_type != ArgTypeSampler)
+      continue;
+
+    assert(kernel_gen->arg_extra_info != NULL);
+    assert(kernel_gen->samper_info_num > kernel_gen->arg_extra_info[i].arg_misc);
+    spt = &(samper_info[kernel_gen->arg_extra_info[i].arg_misc]);
+    assert(GEN_IS_SAMPLER_ARG(*spt));
+    assert(GEN_SAMPLER_ARG_ID(*spt) == i);
+    *spt = kernel->args[i].val.val_sampler->clkSamplerValue;
+
+    /* Set its value in the curbe */
+    if (kernel_gen->arg_extra_info[i].arg_offset >= 0)
+      *(uint32_t *)(gpu->thread.curbe + kernel_gen->arg_extra_info[i].arg_offset) = *spt;
+  }
+
+  gen_gpgpu_bind_sampler(gpu, samper_info, kernel_gen->samper_info_num);
+
CL_FREE(samper_info); +} + +/* This is a very important function. It is responsible for loading and setting GPU + execution context based on the cl_kernel and kernel's arguments. */ +static gen_gpgpu * +cl_command_queue_ND_range_gen_once(cl_command_queue queue, cl_kernel kernel, cl_int *err, + const uint32_t work_dim, const size_t *global_wk_off, + const size_t *global_dim_off, const size_t *global_wk_sz, + const size_t *global_wk_sz_use, const size_t *local_wk_sz, + const size_t *local_wk_sz_use, gen_gpgpu_exec_ctx *gpu_exec_ctx) +{ + cl_int ret = CL_SUCCESS; + gen_gpgpu *gpu = NULL; + size_t local_size = local_wk_sz_use[0] * local_wk_sz_use[1] * local_wk_sz_use[2]; + cl_kernel_gen kernel_gen; + cl_context_gen ctx_gen; + int i; + drm_intel_bufmgr *bufmgr = NULL; + + DEV_PRIVATE_DATA(kernel, queue->device, kernel_gen); + DEV_PRIVATE_DATA(queue->ctx, queue->device, ctx_gen); + bufmgr = ctx_gen->drv->bufmgr; + assert(bufmgr); + + ret = check_work_group_capability(queue, kernel, local_wk_sz_use, 3); + if (ret != CL_SUCCESS) { + *err = ret; + return NULL; + } + + if (kernel_gen->scratch_size > queue->device->scratch_mem_size) { + *err = CL_OUT_OF_RESOURCES; + return NULL; + } + + gpu = CL_CALLOC(1, sizeof(gen_gpgpu)); + if (gpu == NULL) { + *err = CL_OUT_OF_HOST_MEMORY; + return NULL; + } + + do { + /* Init the gpu parameters */ + gpu->bufmgr = bufmgr; + gpu->simd_size = kernel_gen->simd_width; + gpu->device = queue->device; + gpu->thread.max_thread_num = queue->device->max_compute_unit * queue->device->max_thread_per_unit; + gpu->thread.thread_num = (local_size + gpu->simd_size - 1) / gpu->simd_size; + gpu->sampler.sampler_bitmap = ~((1 << max_sampler_n) - 1); + gpu->mem.max_bti = 0; + gpu->mem.per_thread_scratch_size = kernel_gen->scratch_size; + gpu->mem.total_scratch_size = 0; + gpu->atomic_test_result = gpu->device->atomic_test_result; + gpu->thread.barrier_slm_used = kernel_gen->barrier_slm_used; + + gpu->mem.local_mem_size = kernel_gen->local_mem_size; + for (i = 0; i < kernel->arg_n; i++) { + if (kernel->args[i].arg_type == ArgTypePointer && + kernel->args[i].arg_addrspace == AddressSpaceLocal) { + assert(kernel->args[i].is_set); + assert(kernel_gen->arg_extra_info[i].arg_align > 0); + gpu->mem.local_mem_size = ALIGN(gpu->mem.local_mem_size, kernel_gen->arg_extra_info[i].arg_align); + gpu->mem.local_mem_size += kernel->args[i].val_size; + } + } + + if (gpu->mem.local_mem_size > queue->device->local_mem_size) { + ret = CL_OUT_OF_HOST_MEMORY; + break; + } + + // Setup the kernel bitcode and upload it to GPU side + gpu->kernel_bo = drm_intel_bo_alloc(bufmgr, "CL kernel", kernel_gen->kern_base.exec_code_sz, 64u); + if (gpu->kernel_bo == NULL) { + ret = CL_OUT_OF_RESOURCES; + break; + } + /* Upload the bitcode */ + drm_intel_bo_subdata(gpu->kernel_bo, 0, kernel_gen->kern_base.exec_code_sz, + kernel_gen->kern_base.exec_code); + + ret = gen_gpgpu_setup_curbe(kernel, kernel_gen, gpu, work_dim, global_wk_off, global_wk_sz, + local_wk_sz_use, local_wk_sz, (uint64_t)gpu_exec_ctx->device_enqueue_helper_ptr); + if (ret != CL_SUCCESS) + break; + + gpu->thread.num_cs_entries = 64; + gpu->thread.size_cs_entry = gpu->thread.curbe_size / 32; + + ret = gen_gpgpu_setup_aux(gpu); + if (ret != CL_SUCCESS) + break; + + if (queue->props & CL_QUEUE_PROFILING_ENABLE) { // Need to alloc profiling buffer + gpu->mem.time_stamp_bo = dri_bo_alloc(bufmgr, "timestamp query", 4096, 4096); + + if (gpu->mem.time_stamp_bo == NULL) { + ret = CL_OUT_OF_RESOURCES; + break; + } + } + + /* Bind user buffers */ + 
gen_gpgpu_setup_global_mem(kernel, kernel_gen, gpu);
+
+    ret = gen_gpgpu_setup_image(kernel, kernel_gen, gpu);
+    if (ret != CL_SUCCESS)
+      break;
+
+    ret = gen_gpgpu_setup_kernel_exec_svm_mem(kernel, kernel_gen, gpu);
+    if (ret != CL_SUCCESS)
+      break;
+
+    /* Also setup the device enqueue helper bo if it exists */
+    if (gpu_exec_ctx->device_enqueue_helper_bo) {
+      gen_gpgpu_setup_bti(gpu, gpu_exec_ctx->device_enqueue_helper_bo, 0,
+                          gpu_exec_ctx->helper_bo_size, gpu->mem.max_bti, I965_SURFACEFORMAT_RAW);
+      gpu->mem.max_bti++;
+    }
+
+    gen_gpgpu_setup_sampler(kernel, kernel_gen, gpu);
+
+    ret = gen_gpgpu_setup_scratch(gpu);
+    if (ret != CL_SUCCESS)
+      break;
+
+    /* Bind a stack if needed */
+    ret = gen_gpgu_bind_stack(gpu, kernel, kernel_gen);
+    if (ret != CL_SUCCESS)
+      break;
+
+    ret = gen_setup_constant_buffer(kernel, kernel_gen, gpu);
+    if (ret != CL_SUCCESS)
+      break;
+
+    gen_gpgpu_build_idrt(gpu);
+    ret = gen_gpgpu_upload_final_curbe(kernel, kernel_gen, gpu, local_wk_sz_use);
+    if (ret != CL_SUCCESS)
+      break;
+    gen_gpgpu_finish_aux(gpu);
+
+    /* Start a new batch buffer */
+    gpu->batch = intel_batchbuffer_create(ctx_gen->drv, gen_gpu_compute_batch_sz(kernel));
+    if (gpu->batch == NULL) {
+      ret = CL_OUT_OF_RESOURCES;
+      break;
+    }
+
+    gen_gpgpu_batch_start(gpu);
+    gen_gpgpu_walker(gpu, gpu->simd_size, gpu->thread.thread_num,
+                     global_wk_off, global_dim_off, global_wk_sz_use, local_wk_sz_use);
+    gen_gpgpu_batch_end(gpu, 0);
+  } while (0);
+
+  if (ret != CL_SUCCESS) {
+    gen_gpgpu_finish_aux(gpu);
+    cl_command_queue_delete_gpgpu(gpu);
+    *err = ret;
+    return NULL;
+  }
+
+  *err = CL_SUCCESS;
+  return gpu;
+}
+
+LOCAL cl_int
+cl_command_queue_ND_range(cl_command_queue queue, cl_kernel ker, void *exec_ctx, cl_uint work_dim,
+                          size_t *global_wk_off, size_t *global_wk_sz, size_t *local_wk_sz)
+{
+  /* Used to handle non-uniform work-group sizes */
+  cl_int err = CL_SUCCESS;
+  gen_gpgpu *gpu = NULL;
+  gen_gpgpu_exec_ctx *gpu_exec_ctx = exec_ctx;
+  cl_int n;
+  int i, j, k;
+  const size_t global_wk_sz_div[3] = {
+    global_wk_sz[0] / local_wk_sz[0] * local_wk_sz[0],
+    global_wk_sz[1] / local_wk_sz[1] * local_wk_sz[1],
+    global_wk_sz[2] / local_wk_sz[2] * local_wk_sz[2]};
+
+  const size_t global_wk_sz_rem[3] = {
+    global_wk_sz[0] % local_wk_sz[0],
+    global_wk_sz[1] % local_wk_sz[1],
+    global_wk_sz[2] % local_wk_sz[2]};
+
+  const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
+
+  /* Go through the (at most 8) cases and enqueue where work items remain */
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      for (k = 0; k < 2; k++) {
+        size_t global_wk_sz_use[3] = {global_wk_all[k][0], global_wk_all[j][1], global_wk_all[i][2]};
+        size_t global_dim_off[3] = {
+          k * global_wk_sz_div[0] / local_wk_sz[0],
+          j * global_wk_sz_div[1] / local_wk_sz[1],
+          i * global_wk_sz_div[2] / local_wk_sz[2]};
+        size_t local_wk_sz_use[3] = {
+          k ? global_wk_sz_rem[0] : local_wk_sz[0],
+          j ? global_wk_sz_rem[1] : local_wk_sz[1],
+          i ? global_wk_sz_rem[2] : local_wk_sz[2]};
+        if (local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0)
+          continue;
+
+        gpu = cl_command_queue_ND_range_gen_once(queue, ker, &err, work_dim, global_wk_off, global_dim_off,
+                                                 global_wk_sz, global_wk_sz_use, local_wk_sz, local_wk_sz_use,
+                                                 exec_ctx);
+        if (err != CL_SUCCESS) {
+          assert(gpu == NULL);
+          for (n = 0; n < gpu_exec_ctx->gpu_num; n++) {
+            assert(gpu_exec_ctx->all_gpu[n]);
+            cl_command_queue_delete_gpgpu(gpu_exec_ctx->all_gpu[n]);
+          }
+
+          return err;
+        }
+
+        gpu_exec_ctx->all_gpu[gpu_exec_ctx->gpu_num] = gpu;
+        gpu_exec_ctx->gpu_num++;
+      }
+      if (work_dim < 2)
+        break;
+    }
+    if (work_dim < 3)
+      break;
+  }
+
+  assert(err == CL_SUCCESS);
+  return err;
+}
+
+LOCAL cl_int
+cl_command_queue_ND_range_wrap(cl_command_queue queue, cl_kernel ker, cl_event e, cl_uint work_dim,
+                               size_t *global_wk_off, size_t *global_wk_sz, size_t *local_wk_sz)
+{
+  cl_int err = CL_SUCCESS;
+  cl_kernel_gen kernel_gen;
+  cl_uint i;
+  cl_bool use_device_enqueue = CL_FALSE;
+
+  gen_gpgpu_exec_ctx *exec_ctx = CL_CALLOC(1, sizeof(gen_gpgpu_exec_ctx));
+  if (exec_ctx == NULL)
+    return CL_OUT_OF_HOST_MEMORY;
+
+  DEV_PRIVATE_DATA(ker, queue->device, kernel_gen);
+
+  for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) {
+    if (kernel_gen->virt_reg_phy_offset[i].virt_reg == CL_GEN_VIRT_REG_ENQUEUE_BUF_POINTER) {
+      use_device_enqueue = CL_TRUE;
+      break;
+    }
+  }
+  /* Give every kernel buffer argument a fixed (uniform) address, so that a
+     later device-side enqueue can reuse the same pointers */
+  if (use_device_enqueue) {
+    cl_mem mem;
+    cl_mem_gen mem_gen;
+    size_t buf_size = 32 * 1024 * 1024; /* fixed at 32MB */
+    cl_context_gen ctx_gen;
+    DEV_PRIVATE_DATA(queue->ctx, queue->device, ctx_gen);
+    void *tmp_ptr;
+
+    exec_ctx->device_enqueue_helper_ptr = CL_MEMALIGN(4096, buf_size);
+    if (exec_ctx->device_enqueue_helper_ptr == NULL) {
+      CL_FREE(exec_ctx);
+      return CL_OUT_OF_RESOURCES;
+    }
+    memset(exec_ctx->device_enqueue_helper_ptr, 0, buf_size);
+    exec_ctx->helper_bo_size = buf_size;
+
+    exec_ctx->device_enqueue_helper_bo =
+      intel_buffer_alloc_userptr(ctx_gen->drv->bufmgr, "CL device enqueue helper object",
+                                 exec_ctx->device_enqueue_helper_ptr, buf_size, 0);
+    assert(exec_ctx->device_enqueue_helper_bo);
+
+    drm_intel_bo_set_softpin_offset(exec_ctx->device_enqueue_helper_bo,
+                                    (size_t)exec_ctx->device_enqueue_helper_ptr);
+    drm_intel_bo_use_48b_address_range(exec_ctx->device_enqueue_helper_bo, 1);
+    drm_intel_bo_disable_reuse(exec_ctx->device_enqueue_helper_bo);
+
+    for (i = 0; i < ker->arg_n; i++) {
+      if (ker->args[i].arg_type != ArgTypePointer &&
+          ker->args[i].arg_type != ArgTypePipe && ker->args[i].arg_type != ArgTypeImage)
+        continue;
+
+      if (ker->args[i].arg_type == ArgTypePointer && ker->args[i].arg_addrspace == AddressSpaceLocal)
+        continue;
+
+      mem = ker->args[i].val.val_mem;
+      if (mem == NULL)
+        continue;
+
+      if (ker->args[i].use_svm) // Already SVM
+        continue;
+
+      mem_gen = (cl_mem_gen)mem->each_device[0];
+      assert(mem_gen);
+      assert(mem_gen->drm_bo);
+      assert(mem_gen->mem_base.device == queue->device);
+
+      /* Just find an unused virtual address for binding, so the BO always
+         uses the same address in the GTT */
+      drm_intel_bo_map(mem_gen->drm_bo->bo, 1);
+      tmp_ptr = mem_gen->drm_bo->bo->virtual;
+      drm_intel_bo_set_softpin_offset(mem_gen->drm_bo->bo, (size_t)tmp_ptr);
+      drm_intel_bo_use_48b_address_range(mem_gen->drm_bo->bo, 1);
+      drm_intel_bo_disable_reuse(mem_gen->drm_bo->bo);
+      drm_intel_bo_unmap(mem_gen->drm_bo->bo);
+    }
+  }
+
+  err = cl_command_queue_ND_range(queue, ker, exec_ctx, work_dim,
+                                  global_wk_off, global_wk_sz, local_wk_sz);
+  if (err != CL_SUCCESS) {
+    if (exec_ctx->device_enqueue_helper_bo)
+      drm_intel_bo_unreference(exec_ctx->device_enqueue_helper_bo);
+    if (exec_ctx->device_enqueue_helper_ptr)
+      CL_FREE(exec_ctx->device_enqueue_helper_ptr);
+    CL_FREE(exec_ctx);
+    return err;
+  }
+
+  e->exec_data.exec_ctx = exec_ctx;
+  return err;
+}
+
+LOCAL int
+cl_command_queue_flush_gpgpu(void *gpgpu)
+{
+  gen_gpgpu_exec_ctx *gpu_exec_ctx = gpgpu;
+  gen_gpgpu *gpu;
+  cl_int i;
+
+  for (i = 0; i < gpu_exec_ctx->gpu_num; i++) {
+    gpu = gpu_exec_ctx->all_gpu[i];
+    assert(gpu);
+
+    if (!gpu->batch || !gpu->batch->buffer)
+      return CL_INVALID_VALUE;
+
+    if (intel_batchbuffer_flush(gpu->batch) < 0)
+      return CL_INVALID_VALUE;
+  }
+  return CL_SUCCESS;
+
+  /* FIXME:
+     An old assert on a bound-buffer offset of 0 was removed here. It tried
+     to support NULL-pointer checks in kernels (as in the
+     "runtime_null_kernel_arg" case), but simply treating buffer offset 0 as
+     NULL is wrong, and it causes failures for normal kernels that have no
+     such NULL-pointer check but happen to get buffer offset 0 (which is
+     possible now and will be normal once full PPGTT is on).
+
+     The NULL-pointer check needs to be fixed some other way. */
+}
+
+typedef struct ndrange_info_t {
+  int type;
+  int global_work_size[3];
+  int local_work_size[3];
+  int global_work_offset[3];
+} ndrange_info_t;
+
+typedef struct Block_literal {
+  void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
+  int flags;
+  int reserved;
+  size_t index;
+  struct Block_descriptor_1 {
+    unsigned long int slm_size; // reserved in the Block ABI; used here for the SLM size
+    unsigned long int size;     // sizeof(struct Block_literal_1)
+    // optional helper functions
+    void *copy_helper;    // IFF (1<<25)
+    void *dispose_helper; // IFF (1<<25)
+    // required ABI.2010.3.16
+    const char *signature; // IFF (1<<30)
+  } * descriptor;
+  // imported variables
+} Block_literal;
+
+static cl_int
+cl_command_queue_gen_device_enqueue_once(cl_command_queue queue, cl_kernel kernel, drm_intel_bufmgr *bufmgr,
+                                         const uint32_t work_dim, const size_t *global_wk_off,
+                                         const size_t *global_dim_off, const size_t *global_wk_sz,
+                                         const size_t *global_wk_sz_use, const size_t *local_wk_sz,
+                                         const size_t *local_wk_sz_use, gen_gpgpu_exec_ctx *gpu_ctx)
+{
+  cl_int ret = CL_SUCCESS;
+  gen_gpgpu *gpu = NULL;
+  size_t local_size = local_wk_sz_use[0] * local_wk_sz_use[1] * local_wk_sz_use[2];
+  cl_kernel_gen kernel_gen;
+  cl_program_gen prog_gen;
+  cl_context_gen ctx_gen;
+  gen_gpgpu *parent_gpu = gpu_ctx->all_gpu[0];
+  assert(parent_gpu);
+  cl_uint i;
+
+  DEV_PRIVATE_DATA(kernel, queue->device, kernel_gen);
+  DEV_PRIVATE_DATA(kernel->program, queue->device, prog_gen);
+  DEV_PRIVATE_DATA(queue->ctx, queue->device, ctx_gen);
+
+  ret = check_work_group_capability(queue, kernel, local_wk_sz_use, 3);
+  if (ret != CL_SUCCESS) {
+    return ret;
+  }
+
+  if (kernel_gen->scratch_size > queue->device->scratch_mem_size) {
+    return CL_OUT_OF_RESOURCES;
+  }
+
+  gpu = CL_CALLOC(1, sizeof(gen_gpgpu));
+  if (gpu == NULL) {
+    return CL_OUT_OF_HOST_MEMORY;
+  }
+
+  gpu->bufmgr = bufmgr;
+  gpu->simd_size = kernel_gen->simd_width;
+  gpu->device = queue->device;
+  gpu->thread.max_thread_num = queue->device->max_compute_unit * queue->device->max_thread_per_unit;
+  gpu->thread.thread_num = (local_size + gpu->simd_size - 1) / gpu->simd_size;
+  gpu->sampler.sampler_bitmap = ~((1 << max_sampler_n) - 1);
+  gpu->mem.max_bti = 0;
+  gpu->mem.per_thread_scratch_size = kernel_gen->scratch_size;
+  gpu->mem.total_scratch_size = 0;
+  gpu->atomic_test_result =
gpu->device->atomic_test_result; + gpu->thread.barrier_slm_used = kernel_gen->barrier_slm_used; + + // TODO: Need to handle SLM here + gpu->mem.local_mem_size = 0; + + // Setup the kernel bitcode and upload it to GPU side + gpu->kernel_bo = drm_intel_bo_alloc(bufmgr, "CL kernel", kernel_gen->kern_base.exec_code_sz, 64u); + if (gpu->kernel_bo == NULL) { + cl_command_queue_delete_gpgpu(gpu); + return CL_OUT_OF_RESOURCES; + } + /* Upload the bitcode */ + drm_intel_bo_subdata(gpu->kernel_bo, 0, kernel_gen->kern_base.exec_code_sz, + kernel_gen->kern_base.exec_code); + + ret = gen_gpgpu_setup_curbe(kernel, kernel_gen, gpu, work_dim, global_wk_off, global_wk_sz, + local_wk_sz_use, local_wk_sz, (uint64_t)gpu_ctx->device_enqueue_helper_ptr); + if (ret != CL_SUCCESS) { + cl_command_queue_delete_gpgpu(gpu); + return ret; + } + + gpu->thread.num_cs_entries = 64; + gpu->thread.size_cs_entry = gpu->thread.curbe_size / 32; + + ret = gen_gpgpu_setup_aux(gpu); + if (ret != CL_SUCCESS) { + cl_command_queue_delete_gpgpu(gpu); + return ret; + } + + /* Copy the aux setting of the parent kernel except curbe */ + dri_bo_map(parent_gpu->aux.aux_bo, 1); + memcpy(gpu->aux.aux_bo->virtual + gpu->aux.surface_heap_offset, + parent_gpu->aux.aux_bo->virtual + parent_gpu->aux.surface_heap_offset, + sizeof(surface_heap_t)); + memcpy(gpu->aux.aux_bo->virtual + gpu->aux.sampler_state_offset, + parent_gpu->aux.aux_bo->virtual + parent_gpu->aux.sampler_state_offset, + MAX(GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t), GEN_MAX_VME_STATES * sizeof(gen7_vme_state_t))); + memcpy(gpu->aux.aux_bo->virtual + gpu->aux.sampler_border_color_state_offset, + parent_gpu->aux.aux_bo->virtual + parent_gpu->aux.sampler_border_color_state_offset, + GEN_MAX_SAMPLERS * sizeof(gen7_sampler_border_color_t)); + dri_bo_unmap(parent_gpu->aux.aux_bo); + + /* Setup the kernel arg. 
The first one must be the SVM device-enqueue helper; SLM args come later */
+  for (i = 0; i < kernel->arg_n; i++) {
+    if (i == 0) {
+      assert(kernel->args[i].arg_type == ArgTypePointer);
+      assert(kernel->args[i].arg_addrspace == AddressSpaceGlobal);
+      /* No need to bind a BTI, it is already in the parent's BTI table */
+      *(uint64_t *)(gpu->thread.curbe + kernel_gen->arg_extra_info[i].arg_offset) =
+        (uint64_t)gpu_ctx->device_enqueue_helper_ptr;
+      continue;
+    }
+
+    assert(0); // TODO: SLM setting
+  }
+
+  ret = gen_gpgpu_setup_scratch(gpu);
+  if (ret != CL_SUCCESS) {
+    cl_command_queue_delete_gpgpu(gpu);
+    return ret;
+  }
+  /* Bind a stack if needed */
+  ret = gen_gpgu_bind_stack(gpu, kernel, kernel_gen);
+  if (ret != CL_SUCCESS) {
+    cl_command_queue_delete_gpgpu(gpu);
+    return ret;
+  }
+
+  /* Must be an OpenCL 2.0 program */
+  ret = gen_setup_constant_buffer_for_20(kernel, kernel_gen, prog_gen, gpu);
+  if (ret != CL_SUCCESS) {
+    cl_command_queue_delete_gpgpu(gpu);
+    return ret;
+  }
+
+  gen_gpgpu_build_idrt(gpu);
+  ret = gen_gpgpu_upload_final_curbe(kernel, kernel_gen, gpu, local_wk_sz_use);
+  gen_gpgpu_finish_aux(gpu);
+  if (ret != CL_SUCCESS) {
+    cl_command_queue_delete_gpgpu(gpu);
+    return ret;
+  }
+
+  /* Start a new batch buffer */
+  gpu->batch = intel_batchbuffer_create(ctx_gen->drv, gen_gpu_compute_batch_sz(kernel));
+  if (gpu->batch == NULL) {
+    cl_command_queue_delete_gpgpu(gpu);
+    return CL_OUT_OF_RESOURCES;
+  }
+
+  gen_gpgpu_batch_start(gpu);
+  gen_gpgpu_walker(gpu, gpu->simd_size, gpu->thread.thread_num,
+                   global_wk_off, global_dim_off, global_wk_sz_use, local_wk_sz_use);
+  gen_gpgpu_batch_end(gpu, 0);
+
+  if (intel_batchbuffer_flush(gpu->batch) < 0) {
+    cl_command_queue_delete_gpgpu(gpu);
+    return CL_INVALID_VALUE;
+  }
+
+  intel_batchbuffer_finish(gpu->batch);
+  cl_command_queue_delete_gpgpu(gpu);
+  return CL_SUCCESS;
+}
+
+static cl_int
+cl_command_queue_gen_device_enqueue(cl_command_queue queue, cl_kernel kernel, drm_intel_bufmgr *bufmgr,
+                                    const uint32_t work_dim, const size_t *global_wk_off,
+                                    const size_t *global_wk_sz, const size_t *local_wk_sz,
+                                    gen_gpgpu_exec_ctx *gpu_ctx)
+{
+  cl_int err = CL_SUCCESS;
+  int i, j, k;
+  const size_t global_wk_sz_div[3] = {
+    global_wk_sz[0] / local_wk_sz[0] * local_wk_sz[0],
+    global_wk_sz[1] / local_wk_sz[1] * local_wk_sz[1],
+    global_wk_sz[2] / local_wk_sz[2] * local_wk_sz[2]};
+
+  const size_t global_wk_sz_rem[3] = {
+    global_wk_sz[0] % local_wk_sz[0],
+    global_wk_sz[1] % local_wk_sz[1],
+    global_wk_sz[2] % local_wk_sz[2]};
+
+  const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
+  /* Go through the (at most 8) cases and enqueue where work items remain */
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      for (k = 0; k < 2; k++) {
+        size_t global_wk_sz_use[3] = {global_wk_all[k][0], global_wk_all[j][1], global_wk_all[i][2]};
+        size_t global_dim_off[3] = {
+          k * global_wk_sz_div[0] / local_wk_sz[0],
+          j * global_wk_sz_div[1] / local_wk_sz[1],
+          i * global_wk_sz_div[2] / local_wk_sz[2]};
+        size_t local_wk_sz_use[3] = {
+          k ? global_wk_sz_rem[0] : local_wk_sz[0],
+          j ? global_wk_sz_rem[1] : local_wk_sz[1],
+          i ? global_wk_sz_rem[2] : local_wk_sz[2]};
+        if (local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0)
+          continue;
+
+        err = cl_command_queue_gen_device_enqueue_once(queue, kernel, bufmgr, work_dim, global_wk_off, global_dim_off,
+                                                       global_wk_sz, global_wk_sz_use, local_wk_sz, local_wk_sz_use,
+                                                       gpu_ctx);
+
+        if (err != CL_SUCCESS)
+          return err;
+      }
+      if (work_dim < 2)
+        break;
+    }
+    if (work_dim < 3)
+      break;
+  }
+
+  assert(err == CL_SUCCESS);
+  return err;
+}
+
+/* If device-side enqueues happened, we enqueue further ND-ranges ourselves to emulate them */
+static cl_int
+cl_command_queue_gen_handle_device_enqueue(cl_command_queue queue, cl_kernel kernel, gen_gpgpu_exec_ctx *gpu_ctx)
+{
+  cl_program program = kernel->program;
+  cl_kernel new_kernel;
+  cl_program_gen program_gen;
+  cl_context_gen ctx_gen;
+  cl_int err = CL_SUCCESS;
+  void *ptr;
+  int type;
+  int dim;
+  char *name;
+  int i;
+
+  DEV_PRIVATE_DATA(queue->ctx, queue->device, ctx_gen);
+  DEV_PRIVATE_DATA(program, queue->device, program_gen);
+
+  assert(gpu_ctx->device_enqueue_helper_ptr);
+  assert(gpu_ctx->device_enqueue_helper_bo);
+  drm_intel_bo_wait_rendering(gpu_ctx->device_enqueue_helper_bo);
+
+  /* The helper buffer holds a total size followed by a sequence of records:
+     [ndrange_info_t][Block_literal of descriptor->size bytes][SLM sizes] */
+  int total_size = *(int *)gpu_ctx->device_enqueue_helper_ptr;
+  ptr = gpu_ctx->device_enqueue_helper_ptr;
+  ptr += sizeof(int);
+
+  while (ptr - gpu_ctx->device_enqueue_helper_ptr < total_size) {
+    size_t fixed_global_off[] = {0, 0, 0};
+    size_t fixed_global_sz[] = {1, 1, 1};
+    size_t fixed_local_sz[] = {1, 1, 1};
+    ndrange_info_t *ndrange_info = (ndrange_info_t *)ptr;
+    ptr += sizeof(ndrange_info_t);
+
+    Block_literal *block = (Block_literal *)ptr;
+    ptr += block->descriptor->size;
+
+    /* The high nibble holds the dimension; the low nibble tells whether the
+       local size and global offset were provided */
+    type = ndrange_info->type;
+    dim = (type & 0xf0) >> 4;
+    type = type & 0xf;
+    assert(dim <= 2);
+
+    for (i = 0; i <= dim; i++) {
+      fixed_global_sz[i] = ndrange_info->global_work_size[i];
+      if (type > 1)
+        fixed_local_sz[i] = ndrange_info->local_work_size[i];
+      if (type > 2)
+        fixed_global_off[i] = ndrange_info->global_work_offset[i];
+    }
+
+    // int *slm_sizes = (int *)ptr;
+    int slm_size = block->descriptor->slm_size;
+    ptr += slm_size;
+
+    assert(block->index < program_gen->device_enqueue_info_num);
+    name = program_gen->device_enqueue_info[block->index].kernel_name;
+
+    new_kernel = CL_CALLOC(1, sizeof(struct _cl_kernel));
+    if (new_kernel == NULL)
+      return CL_OUT_OF_HOST_MEMORY;
+
+    CL_OBJECT_INIT_BASE(new_kernel, CL_OBJECT_KERNEL_MAGIC);
+    new_kernel->program = program;
+
+    new_kernel->name = CL_CALLOC(1, strlen(name) + 1);
+    if (new_kernel->name == NULL) {
+      CL_FREE(new_kernel);
+      return CL_OUT_OF_HOST_MEMORY;
+    }
+    memcpy(new_kernel->name, name, strlen(name) + 1);
+
+    new_kernel->each_device = CL_CALLOC(program->each_device_num, sizeof(cl_kernel_for_device));
+    if (new_kernel->each_device == NULL) {
+      CL_FREE(new_kernel->name);
+      CL_FREE(new_kernel);
+      return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    new_kernel->each_device_num = program->each_device_num;
+    /* No need to add it to the program's kernel list */
+    err = cl_kernel_create_gen(queue->device, new_kernel);
+    if (err != CL_SUCCESS) {
+      cl_kernel_delete_gen(queue->device, new_kernel);
+      CL_FREE(new_kernel->each_device);
+      CL_FREE(new_kernel->name);
+      CL_FREE(new_kernel);
+      return err;
+    }
+
+    err = cl_command_queue_gen_device_enqueue(queue, new_kernel, ctx_gen->drv->bufmgr, dim + 1,
+                                              fixed_global_off, fixed_global_sz, fixed_local_sz, gpu_ctx);
+
+    cl_kernel_delete_gen(queue->device, new_kernel);
+    CL_FREE(new_kernel->each_device);
+    CL_FREE(new_kernel->name);
+    CL_FREE(new_kernel);
+
+    if (err != CL_SUCCESS)
+      return err;
+  }
+
+  return CL_SUCCESS;
+}
+
+LOCAL int
+cl_command_queue_finish_gpgpu(void *gpgpu)
+{
+  gen_gpgpu_exec_ctx *gpu_exec_ctx = gpgpu;
+  gen_gpgpu *gpu;
+  cl_int i;
+
+  for (i = 0; i < gpu_exec_ctx->gpu_num; i++) {
+    gpu = gpu_exec_ctx->all_gpu[i];
+    assert(gpu);
+
+    if (!gpu->batch || !gpu->batch->buffer)
+      return CL_INVALID_VALUE;
+
+    intel_batchbuffer_finish(gpu->batch);
+  }
+
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_enqueue_handle_nd_range_gen(cl_event event, cl_int status)
+{
+  cl_int err = CL_SUCCESS;
+
+  assert(event->exec_data.type == EnqueueNDRangeKernel);
+
+  if (status == CL_QUEUED) {
+    size_t fixed_global_off[] = {0, 0, 0};
+    size_t fixed_global_sz[] = {1, 1, 1};
+    size_t fixed_local_sz[] = {1, 1, 1};
+    cl_command_queue queue = event->queue;
+    cl_kernel kernel = event->exec_data.nd_range.kernel;
+    cl_int work_dim = event->exec_data.nd_range.work_dim;
+    size_t *global_wk_off = event->exec_data.nd_range.global_wk_off;
+    size_t *global_wk_sz = event->exec_data.nd_range.global_wk_sz;
+    size_t *local_wk_sz = event->exec_data.nd_range.local_wk_sz;
+    cl_int i;
+
+    if (local_wk_sz[0] != 0 || local_wk_sz[1] != 0 || local_wk_sz[2] != 0) {
+      for (i = 0; i < work_dim; ++i) {
+        fixed_local_sz[i] = local_wk_sz[i];
+      }
+    } else {
+      uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; // MAX_WORK_GROUP_SIZE may be too large
+      size_t realGroupSize = 1;
+      for (i = 0; i < work_dim; i++) {
+        for (j = maxDimSize; j > 1; j--) {
+          if (global_wk_sz[i] % j == 0 && j <= maxGroupSize) {
+            fixed_local_sz[i] = j;
+            maxGroupSize = maxGroupSize / j;
+            maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+            break; // choose the next work_dim
+          }
+        }
+        realGroupSize *= fixed_local_sz[i];
+      }
+
+      /* In a loop of a conformance test (such as test_api repeated_setup_cleanup),
+         each iteration creates a new context and a new command queue, and enqueues
+         the kernel with globalsize[0]=1000 and localsize=NULL. That would trigger
+         the following message many times, so we only print it the first time in
+         the process. A plain static variable is enough: printing it a few extra
+         times in multi-threaded cases does not matter. */
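+      /* Worked example of the heuristic above (illustrative, not from the
+         original code): for globalsize[0]=1000, the largest j <= 64 that
+         divides 1000 is 50, so fixed_local_sz[0] = 50 and realGroupSize = 50,
+         which is not a multiple of 8 and triggers the one-time warning below. */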
+      static int warn_no_good_localsize = 1;
+      if (realGroupSize % 8 != 0 && warn_no_good_localsize) {
+        warn_no_good_localsize = 0;
+        CL_LOG_WARNING("unable to find good values for local_work_size[i], please provide\n"
+                       " local_work_size[] explicitly, you can find good values by\n"
+                       " trial and error.");
+      }
+    }
+
+    for (i = 0; i < work_dim; ++i)
+      fixed_global_sz[i] = global_wk_sz[i];
+
+    if (global_wk_off[0] != 0 || global_wk_off[1] != 0 || global_wk_off[2] != 0)
+      for (i = 0; i < work_dim; ++i)
+        fixed_global_off[i] = global_wk_off[i];
+
+    if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) {
+      if (fixed_local_sz[0] != kernel->compile_wg_sz[0] ||
+          fixed_local_sz[1] != kernel->compile_wg_sz[1] ||
+          fixed_local_sz[2] != kernel->compile_wg_sz[2]) {
+        err = CL_INVALID_WORK_GROUP_SIZE;
+        return err;
+      }
+    }
+
+    err = cl_command_queue_ND_range_wrap(queue, kernel, event, work_dim, fixed_global_off,
+                                         fixed_global_sz, fixed_local_sz);
+    return err;
+  }
+
+  if (status == CL_SUBMITTED) {
+    assert(event->exec_data.exec_ctx);
+    err = cl_command_queue_flush_gpgpu(event->exec_data.exec_ctx);
+    return err;
+  }
+
+  if (status == CL_RUNNING) {
+    /* Nothing to do */
+    return CL_SUCCESS;
+  }
+
+  assert(status == CL_COMPLETE);
+  assert(event->exec_data.exec_ctx);
+  err = cl_command_queue_finish_gpgpu(event->exec_data.exec_ctx);
+
+  if (err == CL_SUCCESS) {
+    if (((gen_gpgpu_exec_ctx *)event->exec_data.exec_ctx)->device_enqueue_helper_ptr) {
+      err = cl_command_queue_gen_handle_device_enqueue(event->queue, event->exec_data.nd_range.kernel,
+                                                       event->exec_data.exec_ctx);
+    }
+  }
+
+  /* When profiling is enabled, delay deleting the GPU state until the event itself is deleted */
+  if ((event->queue->props & CL_QUEUE_PROFILING_ENABLE) == 0) {
+    cl_enqueue_nd_range_delete_gen(event);
+    event->exec_data.exec_ctx = NULL;
+  }
+
+  return err;
+}
+
+LOCAL void
+cl_enqueue_nd_range_delete_gen(cl_event event)
+{
+  gen_gpgpu_exec_ctx *gpu_exec_ctx = event->exec_data.exec_ctx;
+
+  if (gpu_exec_ctx) {
+    gen_gpgpu *gpu;
+    cl_int i;
+
+    if (gpu_exec_ctx->device_enqueue_helper_bo) {
+      drm_intel_bo_unreference(gpu_exec_ctx->device_enqueue_helper_bo);
+      gpu_exec_ctx->device_enqueue_helper_bo = NULL;
+    }
+    if (gpu_exec_ctx->device_enqueue_helper_ptr) {
+      CL_FREE(gpu_exec_ctx->device_enqueue_helper_ptr);
+      gpu_exec_ctx->device_enqueue_helper_ptr = NULL;
+    }
+
+    for (i = 0; i < gpu_exec_ctx->gpu_num; i++) {
+      gpu = gpu_exec_ctx->all_gpu[i];
+      assert(gpu);
+      cl_command_queue_delete_gpgpu(gpu);
+    }
+
+    CL_FREE(gpu_exec_ctx);
+    event->exec_data.exec_ctx = NULL;
+  }
+}
+
+LOCAL cl_int
+cl_command_queue_create_gen(cl_device_id device, cl_command_queue queue)
+{
+  return CL_SUCCESS;
+}
+
+LOCAL void
+cl_command_queue_delete_gen(cl_device_id device, cl_command_queue queue)
+{
+}
-- 
2.7.4