From: Junyan He <junyan...@intel.com>

Signed-off-by: Junyan He <junyan...@intel.com>
---
 runtime/gen/cl_image_gen.c | 1247 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1247 insertions(+)
 create mode 100644 runtime/gen/cl_image_gen.c
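Reviewer note (not part of the commit message): all of the enqueue helpers
below launch a builtin kernel over the fill/copy region and round the global
size up to a multiple of the work-group size (LOCAL_SZ_0/1/2, i.e. 16x4x4 by
default). A minimal standalone sketch of that rounding, with illustrative
names that are not part of this patch:

    #include <stddef.h>

    /* Round each global dimension up to a multiple of the local size,
     * mirroring the (region + local - 1) / local * local pattern used
     * throughout the patch. */
    static void round_global_size(size_t global_sz[3], const size_t region[3],
                                  const size_t local_sz[3])
    {
      int i;
      for (i = 0; i < 3; i++)
        global_sz[i] = ((region[i] + local_sz[i] - 1) / local_sz[i]) * local_sz[i];
    }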
diff --git a/runtime/gen/cl_image_gen.c b/runtime/gen/cl_image_gen.c
new file mode 100644
index 0000000..8f6617a
--- /dev/null
+++ b/runtime/gen/cl_image_gen.c
@@ -0,0 +1,1247 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_gen.h"
+#include "intel_defines.h"
+#include <math.h>
+
+#define LOCAL_SZ_0 16
+#define LOCAL_SZ_1 4
+#define LOCAL_SZ_2 4
+
+LOCAL cl_int
+cl_image_format_support_gen(cl_device_id device, cl_mem_object_type image_type,
+                            cl_image_format *image_format)
+{
+  uint32_t fmt = cl_image_get_gen_format(image_format);
+  if (fmt == INTEL_UNSUPPORTED_FORMAT)
+    return CL_FALSE;
+
+  return CL_TRUE;
+}
+
+LOCAL uint32_t
+cl_image_get_gen_format(const cl_image_format *fmt)
+{
+  const uint32_t type = fmt->image_channel_data_type;
+  const uint32_t order = fmt->image_channel_order;
+  switch (order) {
+  case CL_R:
+#if 0
+  case CL_Rx:
+  case CL_A:
+  case CL_INTENSITY:
+  case CL_LUMINANCE:
+    if ((order == CL_INTENSITY || order == CL_LUMINANCE)
+        && (type != CL_UNORM_INT8 && type != CL_UNORM_INT16
+            && type != CL_SNORM_INT8 && type != CL_SNORM_INT16
+            && type != CL_HALF_FLOAT && type != CL_FLOAT))
+      return INTEL_UNSUPPORTED_FORMAT;
+#endif
+
+    /* XXX It seems we have some accuracy/compatibility issues with
+     * snorm_int8/16, so those formats have to stay disabled for now.
+     */
+
+    switch (type) {
+    case CL_HALF_FLOAT:
+      return I965_SURFACEFORMAT_R16_FLOAT;
+    case CL_FLOAT:
+      return I965_SURFACEFORMAT_R32_FLOAT;
+    // case CL_SNORM_INT16: return I965_SURFACEFORMAT_R16_SNORM;
+    // case CL_SNORM_INT8: return I965_SURFACEFORMAT_R8_SNORM;
+    case CL_UNORM_INT8:
+      return I965_SURFACEFORMAT_R8_UNORM;
+    case CL_UNORM_INT16:
+      return I965_SURFACEFORMAT_R16_UNORM;
+    case CL_SIGNED_INT8:
+      return I965_SURFACEFORMAT_R8_SINT;
+    case CL_SIGNED_INT16:
+      return I965_SURFACEFORMAT_R16_SINT;
+    case CL_SIGNED_INT32:
+      return I965_SURFACEFORMAT_R32_SINT;
+    case CL_UNSIGNED_INT8:
+      return I965_SURFACEFORMAT_R8_UINT;
+    case CL_UNSIGNED_INT16:
+      return I965_SURFACEFORMAT_R16_UINT;
+    case CL_UNSIGNED_INT32:
+      return I965_SURFACEFORMAT_R32_UINT;
+    default:
+      return INTEL_UNSUPPORTED_FORMAT;
+    };
+  case CL_RG:
+    switch (type) {
+    case CL_UNORM_INT8:
+      return I965_SURFACEFORMAT_R8G8_UNORM;
+    case CL_UNORM_INT16:
+      return I965_SURFACEFORMAT_R16G16_UNORM;
+    case CL_UNSIGNED_INT8:
+      return I965_SURFACEFORMAT_R8G8_UINT;
+    case CL_UNSIGNED_INT16:
+      return I965_SURFACEFORMAT_R16G16_UINT;
+    default:
+      return INTEL_UNSUPPORTED_FORMAT;
+    };
+#if 0
+  case CL_RG:
+  case CL_RA:
+    switch (type) {
+    case CL_HALF_FLOAT: return I965_SURFACEFORMAT_R16G16_FLOAT;
+    case CL_FLOAT: return I965_SURFACEFORMAT_R32G32_FLOAT;
+    case CL_SNORM_INT16: return I965_SURFACEFORMAT_R16G16_SNORM;
+    case CL_SNORM_INT8: return I965_SURFACEFORMAT_R8G8_SNORM;
+    case CL_UNORM_INT8: return I965_SURFACEFORMAT_R8G8_UNORM;
+    case CL_UNORM_INT16: return I965_SURFACEFORMAT_R16G16_UNORM;
+    case CL_SIGNED_INT8: return I965_SURFACEFORMAT_R8G8_SINT;
+    case CL_SIGNED_INT16: return I965_SURFACEFORMAT_R16G16_SINT;
+    case CL_SIGNED_INT32: return I965_SURFACEFORMAT_R32G32_SINT;
+    case CL_UNSIGNED_INT8: return I965_SURFACEFORMAT_R8G8_UINT;
+    case CL_UNSIGNED_INT16: return I965_SURFACEFORMAT_R16G16_UINT;
+    case CL_UNSIGNED_INT32: return I965_SURFACEFORMAT_R32G32_UINT;
+    default: return INTEL_UNSUPPORTED_FORMAT;
+    };
+  case CL_RGB:
+  case CL_RGBx:
+    switch (type) {
+    case CL_UNORM_INT_101010: return I965_SURFACEFORMAT_R10G10B10A2_UNORM;
+    case CL_UNORM_SHORT_565:
+    case CL_UNORM_SHORT_555:
+    default: return INTEL_UNSUPPORTED_FORMAT;
+    };
+#endif
+  case CL_RGBA:
+    switch (type) {
+    case CL_HALF_FLOAT:
+      return I965_SURFACEFORMAT_R16G16B16A16_FLOAT;
+    case CL_FLOAT:
+      return I965_SURFACEFORMAT_R32G32B32A32_FLOAT;
+    // case CL_SNORM_INT16: return I965_SURFACEFORMAT_R16G16B16A16_SNORM;
+    // case CL_SNORM_INT8: return I965_SURFACEFORMAT_R8G8B8A8_SNORM;
+    case CL_UNORM_INT8:
+      return I965_SURFACEFORMAT_R8G8B8A8_UNORM;
+    case CL_UNORM_INT16:
+      return I965_SURFACEFORMAT_R16G16B16A16_UNORM;
+    case CL_SIGNED_INT8:
+      return I965_SURFACEFORMAT_R8G8B8A8_SINT;
+    case CL_SIGNED_INT16:
+      return I965_SURFACEFORMAT_R16G16B16A16_SINT;
+    case CL_SIGNED_INT32:
+      return I965_SURFACEFORMAT_R32G32B32A32_SINT;
+    case CL_UNSIGNED_INT8:
+      return I965_SURFACEFORMAT_R8G8B8A8_UINT;
+    case CL_UNSIGNED_INT16:
+      return I965_SURFACEFORMAT_R16G16B16A16_UINT;
+    case CL_UNSIGNED_INT32:
+      return I965_SURFACEFORMAT_R32G32B32A32_UINT;
+    default:
+      return INTEL_UNSUPPORTED_FORMAT;
+    };
+  case CL_ARGB:
+    return INTEL_UNSUPPORTED_FORMAT;
+  case CL_BGRA:
+    switch (type) {
+    case CL_UNORM_INT8:
+      return I965_SURFACEFORMAT_B8G8R8A8_UNORM;
+    default:
+      return INTEL_UNSUPPORTED_FORMAT;
+    };
+  case CL_sRGBA:
+    switch (type) {
+    case CL_UNORM_INT8:
+      return I965_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB;
+    default:
+      return INTEL_UNSUPPORTED_FORMAT;
+    };
+  case CL_sBGRA:
+    switch (type) {
+    case CL_UNORM_INT8:
+      return I965_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
+    default:
+      return INTEL_UNSUPPORTED_FORMAT;
+    };
+  case CL_NV12_INTEL:
+    switch (type) {
+    case CL_UNORM_INT8:
+      return I965_SURFACEFORMAT_PLANAR_420_8;
+    default:
+      return INTEL_UNSUPPORTED_FORMAT;
+    };
+  default:
+    return INTEL_UNSUPPORTED_FORMAT;
+  };
+}
+
+LOCAL cl_int
+cl_enqueue_image_fill_gen(cl_event event, cl_int status)
+{
+  cl_int ret = CL_SUCCESS;
+  assert(event->exec_data.type == EnqueueFillImage);
+
+  if (status == CL_QUEUED) {
+    cl_command_queue queue = event->queue;
+    const void *pattern = event->exec_data.fill_image.pattern;
+    cl_mem mem = event->exec_data.fill_image.image;
+    const size_t *origin = event->exec_data.fill_image.origin;
+    const size_t *region = event->exec_data.fill_image.region;
+
+    cl_kernel ker = NULL;
+    size_t global_off[] = {0, 0, 0};
+    size_t global_sz[] = {1, 1, 1};
+    size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2};
+    cl_mem_image src_image = cl_mem_to_image(mem);
+    cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0];
+    assert(mem_gen);
+    uint32_t savedIntelFmt = mem_gen->image.intel_fmt;
+
+    if (region[1] == 1)
+      local_sz[1] = 1;
+    if (region[2] == 1)
+      local_sz[2] = 1;
+    global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+    if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+      ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_1D);
+    } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+      ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_1D_ARRAY);
+    } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_2D);
+    } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+      ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_2D_ARRAY);
+    } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_3D);
+    } else {
+      return CL_IMAGE_FORMAT_NOT_SUPPORTED;
+    }
+
+    assert(ker);
+
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+    if (src_image->fmt.image_channel_order >= CL_sRGBA) {
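+      /* sRGB surfaces are filled through a plain UNORM_INT8 RGBA view, so
+       * gamma-encode the linear float pattern on the CPU first (and swap
+       * the R/B channels for CL_sBGRA). */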
+#define RGB2sRGB(linear) ((linear) <= 0.0031308f ? 12.92f * (linear) : 1.055f * powf((linear), 1.0f / 2.4f) - 0.055f)
+      cl_image_format fmt;
+      float newpattern[4] = {0.0, 0.0, 0.0, ((float *)pattern)[3]};
+      int i;
+      for (i = 0; i < 3; i++) {
+        if (src_image->fmt.image_channel_order == CL_sRGBA) {
+          newpattern[i] = RGB2sRGB(((float *)pattern)[i]);
+        } else
+          newpattern[2 - i] = RGB2sRGB(((float *)pattern)[i]);
+      }
+      cl_kernel_set_arg(ker, 1, sizeof(float) * 4, newpattern);
+      fmt.image_channel_order = CL_RGBA;
+      fmt.image_channel_data_type = CL_UNORM_INT8;
+      mem_gen->image.intel_fmt = cl_image_get_gen_format(&fmt);
+#undef RGB2sRGB
+    } else
+      cl_kernel_set_arg(ker, 1, sizeof(float) * 4, pattern);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+    cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+    cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+    cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin[0]);
+    cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]);
+    cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]);
+
+    ret = cl_command_queue_ND_range_wrap(queue, ker, event, 3, global_off, global_sz, local_sz);
+    mem_gen->image.intel_fmt = savedIntelFmt;
+    return ret;
+  }
+
+  if (status == CL_SUBMITTED) {
+    assert(event->exec_data.exec_ctx);
+    ret = cl_command_queue_flush_gpgpu(event->exec_data.exec_ctx);
+    return ret;
+  }
+
+  if (status == CL_RUNNING) {
+    /* Nothing to do */
+    return CL_SUCCESS;
+  }
+
+  assert(status == CL_COMPLETE);
+  assert(event->exec_data.exec_ctx);
+  ret = cl_command_queue_finish_gpgpu(event->exec_data.exec_ctx);
+  return ret;
+}
+
+LOCAL cl_int
+cl_enqueue_image_copy_gen(cl_event event, cl_int status)
+{
+  cl_int ret = CL_SUCCESS;
+  assert(event->exec_data.type == EnqueueCopyImage);
+
+  if (status == CL_QUEUED) {
+    cl_command_queue queue = event->queue;
+    cl_mem src = event->exec_data.copy_image.src_image;
+    cl_mem dst = event->exec_data.copy_image.dst_image;
+    const size_t *src_origin = event->exec_data.copy_image.src_origin;
+    const size_t *dst_origin = event->exec_data.copy_image.dst_origin;
+    const size_t *region = event->exec_data.copy_image.region;
+    cl_kernel ker = NULL;
+    size_t global_off[] = {0, 0, 0};
+    size_t global_sz[] = {1, 1, 1};
+    size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2};
+    uint32_t fixupDataType;
+    uint32_t savedIntelFmt;
+    cl_mem_image src_image = cl_mem_to_image(src);
+    cl_mem_image dst_image = cl_mem_to_image(dst);
+    cl_mem_gen src_mem_gen = (cl_mem_gen)src->each_device[0];
+    cl_mem_gen dst_mem_gen = (cl_mem_gen)dst->each_device[0];
+    assert(src_mem_gen);
+    assert(dst_mem_gen);
+
+    if (region[1] == 1)
+      local_sz[1] = 1;
+    if (region[2] == 1)
+      local_sz[2] = 1;
+    global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+    switch (src_image->fmt.image_channel_data_type) {
+    case CL_SNORM_INT8:
+    case CL_UNORM_INT8:
+      fixupDataType = CL_UNSIGNED_INT8;
+      break;
+    case CL_HALF_FLOAT:
+    case CL_SNORM_INT16:
+    case CL_UNORM_INT16:
+      fixupDataType = CL_UNSIGNED_INT16;
+      break;
+    case CL_FLOAT:
+      fixupDataType = CL_UNSIGNED_INT32;
+      break;
+    default:
+      fixupDataType = 0;
+    }
+
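+    /* The copy must be bit-exact, so normalized/float formats are
+     * reinterpreted as same-width unsigned-integer formats for the copy
+     * kernel; the original surface formats are restored afterwards. */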
+    if (fixupDataType) {
+      cl_image_format fmt;
+      if (src_image->fmt.image_channel_order != CL_BGRA &&
+          src_image->fmt.image_channel_order != CL_sBGRA &&
+          src_image->fmt.image_channel_order != CL_sRGBA)
+        fmt.image_channel_order = src_image->fmt.image_channel_order;
+      else
+        fmt.image_channel_order = CL_RGBA;
+
+      fmt.image_channel_data_type = fixupDataType;
+      savedIntelFmt = src_mem_gen->image.intel_fmt;
+      src_mem_gen->image.intel_fmt = cl_image_get_gen_format(&fmt);
+      dst_mem_gen->image.intel_fmt = src_mem_gen->image.intel_fmt;
+    }
+
+    /* We use one kernel to copy the data. The kernel is lazily created. */
+    assert(src_image->base.ctx == dst_image->base.ctx);
+
+    /* Setup the kernel and run. */
+    if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+      if (dst_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_1D_TO_1D);
+      }
+    } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D);
+      } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_2D_TO_3D);
+      } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D_ARRAY);
+      }
+    } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+      if (dst_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+                                                CL_ENQUEUE_COPY_IMAGE_1D_ARRAY_TO_1D_ARRAY);
+      }
+    } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+      if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+                                                CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D_ARRAY);
+      } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+                                                CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D);
+      } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+                                                CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D);
+      }
+    } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D);
+      } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_3D_TO_3D);
+      } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+                                                CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY);
+      }
+    }
+    assert(ker);
+
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+    cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_image);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+    cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+    cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+    cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_origin[0]);
+    cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+    cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+    cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_origin[0]);
+    cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
+    cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);
+
+    ret = cl_command_queue_ND_range_wrap(queue, ker, event, 1, global_off, global_sz, local_sz);
+
+    if (fixupDataType) {
+      src_mem_gen->image.intel_fmt = savedIntelFmt;
+      dst_mem_gen->image.intel_fmt = savedIntelFmt;
+    }
+    return ret;
+  }
+
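+  /* The remaining states follow the same pattern as every enqueue path in
+   * this file: CL_SUBMITTED flushes the GPGPU batch, CL_RUNNING has nothing
+   * to do, and CL_COMPLETE waits for the batch to finish. */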
+  if (status == CL_SUBMITTED) {
+    assert(event->exec_data.exec_ctx);
+    ret = cl_command_queue_flush_gpgpu(event->exec_data.exec_ctx);
+    return ret;
+  }
+
+  if (status == CL_RUNNING) {
+    /* Nothing to do */
+    return CL_SUCCESS;
+  }
+
+  assert(status == CL_COMPLETE);
+  assert(event->exec_data.exec_ctx);
+  ret = cl_command_queue_finish_gpgpu(event->exec_data.exec_ctx);
+  return ret;
+}
+
+LOCAL cl_int
+cl_enqueue_copy_image_to_buffer_gen(cl_event event, cl_int status)
+{
+  cl_int ret = CL_SUCCESS;
+  assert(event->exec_data.type == EnqueueCopyImageToBuffer);
+
+  if (status == CL_QUEUED) {
+    cl_command_queue queue = event->queue;
+    cl_mem the_image = event->exec_data.copy_image_and_buffer.image;
+    cl_mem buffer = event->exec_data.copy_image_and_buffer.buffer;
+    const size_t *src_origin = event->exec_data.copy_image_and_buffer.origin;
+    const size_t *region = event->exec_data.copy_image_and_buffer.region;
+    const size_t dst_offset = event->exec_data.copy_image_and_buffer.offset;
+    cl_kernel ker = NULL;
+    size_t global_off[] = {0, 0, 0};
+    size_t global_sz[] = {1, 1, 1};
+    size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2};
+    uint32_t intel_fmt, bpp;
+    cl_image_format fmt;
+    size_t origin0, region0;
+    size_t kn_dst_offset;
+    int align16 = 0;
+    size_t align_size = 1;
+    size_t w_saved;
+    cl_mem_image image = cl_mem_to_image(the_image);
+    cl_mem_gen image_gen = (cl_mem_gen)the_image->each_device[0];
+    assert(image_gen);
+
+    if (region[1] == 1)
+      local_sz[1] = 1;
+    if (region[2] == 1)
+      local_sz[2] = 1;
+    global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+    /* We use one kernel to copy the data. The kernel is lazily created. */
+    assert(image->base.ctx == buffer->ctx);
+
+    intel_fmt = image_gen->image.intel_fmt;
+    bpp = image->bpp;
+    w_saved = image->w;
+    region0 = region[0] * bpp;
+    kn_dst_offset = dst_offset;
+    if ((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
+        ((src_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (dst_offset % 16 == 0)) {
+      fmt.image_channel_order = CL_RGBA;
+      fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+      align16 = 1;
+      align_size = 16;
+    } else {
+      fmt.image_channel_order = CL_R;
+      fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+      align_size = 1;
+    }
+    image_gen->image.intel_fmt = cl_image_get_gen_format(&fmt);
+    image->w = (image->w * image->bpp) / align_size;
+    image->bpp = align_size;
+    region0 = (region[0] * bpp) / align_size;
+    origin0 = (src_origin[0] * bpp) / align_size;
+    kn_dst_offset /= align_size;
+    global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
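+    /* At this point the image has been re-typed: either as an RGBA32 view
+     * (align16 fast path, one 16-byte texel per element) or as a raw byte
+     * view, so region0/origin0/kn_dst_offset are in units of align_size. */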
+    /* Setup the kernel and run. */
+    if (image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      if (align16) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+                                                CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16);
+      } else {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+                                                CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER);
+      }
+    } else if (image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+                                              CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER);
+    }
+
+    assert(ker);
+
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+    cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
+    cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+    cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+    cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+    cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+    cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+    cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset);
+
+    ret = cl_command_queue_ND_range_wrap(queue, ker, event, 1, global_off, global_sz, local_sz);
+
+    image_gen->image.intel_fmt = intel_fmt;
+    image->bpp = bpp;
+    image->w = w_saved;
+    return ret;
+  }
+
+  if (status == CL_SUBMITTED) {
+    assert(event->exec_data.exec_ctx);
+    ret = cl_command_queue_flush_gpgpu(event->exec_data.exec_ctx);
+    return ret;
+  }
+
+  if (status == CL_RUNNING) {
+    /* Nothing to do */
+    return CL_SUCCESS;
+  }
+
+  assert(status == CL_COMPLETE);
+  assert(event->exec_data.exec_ctx);
+  ret = cl_command_queue_finish_gpgpu(event->exec_data.exec_ctx);
+  return ret;
+}
+
+LOCAL cl_int
+cl_enqueue_copy_buffer_to_image_gen(cl_event event, cl_int status)
+{
+  cl_int ret = CL_SUCCESS;
+  assert(event->exec_data.type == EnqueueCopyBufferToImage);
+
+  if (status == CL_QUEUED) {
+    cl_command_queue queue = event->queue;
+    cl_mem buffer = event->exec_data.copy_image_and_buffer.buffer;
+    cl_mem the_image = event->exec_data.copy_image_and_buffer.image;
+    const size_t src_offset = event->exec_data.copy_image_and_buffer.offset;
+    const size_t *dst_origin = event->exec_data.copy_image_and_buffer.origin;
+    const size_t *region = event->exec_data.copy_image_and_buffer.region;
+    cl_kernel ker = NULL;
+    size_t global_off[] = {0, 0, 0};
+    size_t global_sz[] = {1, 1, 1};
+    size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2};
+    uint32_t intel_fmt, bpp;
+    cl_image_format fmt;
+    size_t origin0, region0;
+    size_t kn_src_offset;
+    int align16 = 0;
+    size_t align_size = 1;
+    size_t w_saved = 0;
+    cl_mem_image image = cl_mem_to_image(the_image);
+    cl_mem_gen image_gen = (cl_mem_gen)the_image->each_device[0];
+    assert(image_gen);
+
+    if (region[1] == 1)
+      local_sz[1] = 1;
+    if (region[2] == 1)
+      local_sz[2] = 1;
+    global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+    /* We use one kernel to copy the data. The kernel is lazily created. */
+    assert(image->base.ctx == buffer->ctx);
+
+    intel_fmt = image_gen->image.intel_fmt;
+    bpp = image->bpp;
+    w_saved = image->w;
+    region0 = region[0] * bpp;
+    kn_src_offset = src_offset;
+    if ((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
+        ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)) {
+      fmt.image_channel_order = CL_RGBA;
+      fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+      align16 = 1;
+      align_size = 16;
+    } else {
+      fmt.image_channel_order = CL_R;
+      fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+      align_size = 1;
+    }
+    image_gen->image.intel_fmt = cl_image_get_gen_format(&fmt);
+    image->w = (image->w * image->bpp) / align_size;
+    image->bpp = align_size;
+    region0 = (region[0] * bpp) / align_size;
+    origin0 = (dst_origin[0] * bpp) / align_size;
+    kn_src_offset /= align_size;
+    global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
+    /* Setup the kernel and run. */
+    if (image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      if (align16) {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+                                                CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16);
+      } else {
+        ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D);
+      }
+    } else if (image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D);
+    }
+
+    assert(ker);
+
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+    cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
+    cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+    cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+    cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+    cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
+    cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
+    cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
+
+    ret = cl_command_queue_ND_range_wrap(queue, ker, event, 1, global_off, global_sz, local_sz);
+
+    image_gen->image.intel_fmt = intel_fmt;
+    image->bpp = bpp;
+    image->w = w_saved;
+
+    return ret;
+  }
+
+  if (status == CL_SUBMITTED) {
+    assert(event->exec_data.exec_ctx);
+    ret = cl_command_queue_flush_gpgpu(event->exec_data.exec_ctx);
+    return ret;
+  }
+
+  if (status == CL_RUNNING) {
+    /* Nothing to do */
+    return CL_SUCCESS;
+  }
+
+  assert(status == CL_COMPLETE);
+  assert(event->exec_data.exec_ctx);
+  ret = cl_command_queue_finish_gpgpu(event->exec_data.exec_ctx);
+  return ret;
+}
+
+static cl_image_gen_tiling
+cl_gen_get_default_tiling(cl_device_id device)
+{
+  static int initialized = 0;
+  static cl_image_gen_tiling tiling = CL_TILE_X;
+
+  if (!initialized) {
+    // FIXME: need to find out the root cause of the performance difference on BDW.
+    // SKL's 3D images can't use TILE_X, so use TILE_Y as the default.
+    if (IS_GEN9(device->device_id) || IS_GEN8(device->device_id))
+      tiling = CL_TILE_Y;
+
+    char *tilingStr = getenv("OCL_TILING");
+    if (tilingStr != NULL) {
+      switch (tilingStr[0]) {
+      case '0':
+        tiling = CL_NO_TILE;
+        break;
+      case '1':
+        tiling = CL_TILE_X;
+        break;
+      case '2':
+        tiling = CL_TILE_Y;
+        break;
+      default:
+        break;
+      }
+    }
+    initialized = 1;
+  }
+
+  return tiling;
+}
+
+static uint32_t
+cl_gen_get_tiling_align(cl_device_id device, uint32_t tiling_mode, uint32_t dim)
+{
+  uint32_t ret = 0;
+
+  switch (tiling_mode) {
+  case CL_TILE_X:
+    if (dim == 0) { // tileX width in bytes
+      ret = 512;
+    } else if (dim == 1) { // tileX height in number of rows
+      ret = 8;
+    } else if (dim == 2) { // height used to calculate the slice pitch
+      if (IS_GEN9(device->device_id)) // SKL: same as tileY height
+        ret = 8;
+      else if (IS_GEN8(device->device_id)) // BDW: same as the CL_NO_TILE vertical alignment
+        ret = 4;
+      else // IVB, HSW
+        ret = 2;
+    } else
+      assert(0);
+    break;
+
+  case CL_TILE_Y:
+    if (dim == 0) { // tileY width in bytes
+      ret = 128;
+    } else if (dim == 1) { // tileY height in number of rows
+      ret = 32;
+    } else if (dim == 2) { // height used to calculate the slice pitch
+      if (IS_GEN9(device->device_id)) // SKL: same as tileY height
+        ret = 32;
+      else if (IS_GEN8(device->device_id)) // BDW: same as the CL_NO_TILE vertical alignment
+        ret = 4;
+      else // IVB, HSW
+        ret = 2;
+    } else
+      assert(0);
+    break;
+
+  case CL_NO_TILE:
+    if (dim == 1 || dim == 2) { // vertical alignment
+      if (IS_GEN8(device->device_id) || IS_GEN9(device->device_id)) // SKL 1D arrays need the qpitch aligned to 4
+        ret = 4;
+      else
+        ret = 2;
+    } else
+      assert(0);
+    break;
+  }
+
+  return ret;
+}
+
+static void
+cl_mem_gen_upload_image(cl_mem_image image, cl_mem_gen mem_gen)
+{
+  cl_mem mem = &image->base;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {image->w, image->h, image->depth};
+  void *dst_ptr;
+
+  assert(mem_gen->drm_bo);
+  assert(mem_gen->drm_bo->host_coherent == CL_FALSE);
+  assert(image->mem_from == NULL); // If the image comes from a buffer, no upload is needed
+
+  if ((mem->flags & CL_MEM_COPY_HOST_PTR) || (mem->flags & CL_MEM_USE_HOST_PTR)) {
+    assert(mem->host_ptr);
+
+    dst_ptr = cl_mem_gen_drm_bo_map(mem_gen->drm_bo, CL_FALSE);
+    assert(dst_ptr);
+    cl_mem_copy_image_region_helper(origin, region,
+                                    dst_ptr, mem_gen->image.gpu_row_pitch, mem_gen->image.gpu_slice_pitch,
+                                    mem->host_ptr, image->row_pitch, image->slice_pitch,
+                                    image->bpp, image->w, image->h, CL_FALSE, CL_FALSE);
+    cl_mem_gen_drm_bo_unmap(mem_gen->drm_bo);
+
+    if (mem->flags & CL_MEM_COPY_HOST_PTR)
+      mem->host_ptr = NULL; // Clear the pointer set by the user
+  }
+}
+
+static void
+cl_mem_gen_image_parameter_init(cl_mem_image image, cl_mem_gen mem_gen)
+{
+  mem_gen->image.gpu_w = image->w;
+  mem_gen->image.gpu_h = image->h;
+  mem_gen->image.gpu_depth = image->depth;
+  mem_gen->image.gpu_row_pitch = image->row_pitch;
+  if (image->slice_pitch == 0)
+    mem_gen->image.gpu_slice_pitch = mem_gen->image.gpu_h * mem_gen->image.gpu_row_pitch;
+}
+
+/* 1D and 1D array images: never tiled and never a real user ptr,
+   because CL_NO_TILE needs a height alignment of at least 2. */
+static cl_int
+cl_mem_allocate_image_gen_1D(cl_device_id device, cl_mem mem)
+{
+  cl_context_gen ctx_gen;
+  cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0];
+  cl_mem_image image = cl_mem_to_image(mem);
+  size_t alignment = 64;
+
+  DEV_PRIVATE_DATA(mem->ctx, device, ctx_gen);
+  assert(ctx_gen);
+  assert(image->mem_from == NULL);
+  assert(mem_gen->drm_bo == NULL);
+
+  /* Allocate the real mem bo */
+  if (mem->flags & CL_MEM_PINNABLE)
+    alignment = 4096;
+
+  cl_mem_gen_image_parameter_init(image, mem_gen);
+  mem_gen->image.gpu_row_pitch = image->w * image->bpp;
+  assert(mem_gen->image.gpu_row_pitch <= image->row_pitch);
+
+  assert(image->h == 1);
+  mem_gen->image.gpu_slice_pitch = image->h * mem_gen->image.gpu_row_pitch;
+
+  if (CL_OBJECT_IS_IMAGE_ARRAY(mem))
+    mem_gen->image.gpu_slice_pitch = mem_gen->image.gpu_row_pitch *
+                                     ALIGN(image->h, cl_gen_get_tiling_align(device, CL_NO_TILE, 2));
+
+  mem_gen->drm_bo = cl_mem_gen_create_drm_bo(ctx_gen->drv->bufmgr,
+                                             mem_gen->image.gpu_slice_pitch * mem_gen->image.gpu_depth,
+                                             alignment, CL_NO_TILE, 0, NULL);
+
+  cl_mem_gen_upload_image(image, mem_gen);
+  return CL_SUCCESS;
+}
+
+/* No tiling and no real host ptr */
+static cl_int
+cl_mem_allocate_image_gen_1D_buffer(cl_device_id device, cl_mem mem)
+{
+  cl_context_gen ctx_gen;
+  cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0];
+  cl_mem_image image = cl_mem_to_image(mem);
+  cl_mem mem_from = image->mem_from;
+  size_t alignment = 64;
+  size_t aligned_h;
+  DEV_PRIVATE_DATA(mem->ctx, device, ctx_gen);
+
+  assert(CL_OBJECT_IS_BUFFER(mem_from));
+  if (CL_OBJECT_IS_SUB_BUFFER(mem_from)) {
+    mem_gen->image.sub_offset = cl_mem_to_buffer(mem_from)->sub_offset;
+  }
+  cl_mem_gen mem_from_gen = (cl_mem_gen)(mem_from->each_device[0]);
+  assert(mem_from_gen);
+  assert(mem_from);
+  assert(ctx_gen);
+  assert(image->h == 1);
+  assert(image->depth == 1);
+  assert(mem_gen->drm_bo == NULL);
+
+  cl_mem_gen_image_parameter_init(image, mem_gen);
+  /* This is an image1d buffer which may exceed the normal image size
+     restriction, so we have to use a 2D image to simulate this 1D image. */
+  mem_gen->image.gpu_h = (image->w + device->image2d_max_width - 1) / device->image2d_max_width;
+  mem_gen->image.gpu_w = image->w > device->image2d_max_width ? device->image2d_max_width : image->w;
+
+  mem_gen->image.gpu_row_pitch = mem_gen->image.gpu_w * image->bpp;
+  assert(mem_gen->image.gpu_row_pitch <= image->row_pitch);
+
+  aligned_h = ALIGN(mem_gen->image.gpu_h, cl_gen_get_tiling_align(device, CL_NO_TILE, 1));
+  mem_gen->image.gpu_slice_pitch = aligned_h * mem_gen->image.gpu_row_pitch;
+
+  /* FIXME: we use a 2D image to imitate the 1D image for an image1d buffer;
+     the drm bo size differs, so we need to replace the old bo with a new one.
+     There is some risk if someone is still using the old bo. */
+  if (mem_from_gen->buffer.already_convert_image) { // The conversion was already done
+    mem_gen->drm_bo = mem_from_gen->drm_bo;
+    cl_mem_gen_drm_bo_ref(mem_gen->drm_bo);
+    assert(mem_gen->drm_bo->tiling == CL_NO_TILE);
+    return CL_SUCCESS;
+  }
+
+  /* Allocate the real mem bo */
+  if (mem->flags & CL_MEM_PINNABLE)
+    alignment = 4096;
+
+  /* Just calculate a size big enough that any later image created from this
+     buffer always fits */
+  size_t max_h = (mem_from->size + device->image2d_max_width - 1) / device->image2d_max_width;
+  size_t max_sz = ALIGN(max_h, cl_gen_get_tiling_align(device, CL_NO_TILE, 1)) * device->image2d_max_width;
+  assert(mem_gen->image.gpu_row_pitch * mem_gen->image.gpu_h <= max_sz);
+
+  if (cl_mem_gen_drm_bo_expand(mem_from_gen->drm_bo, max_sz, alignment) == CL_FALSE) {
+    return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+  }
+
+  mem_from_gen->buffer.already_convert_image = CL_TRUE;
+  mem_gen->drm_bo = mem_from_gen->drm_bo;
+  cl_mem_gen_drm_bo_ref(mem_gen->drm_bo);
+  assert(mem_gen->drm_bo->tiling == CL_NO_TILE);
+  return CL_SUCCESS;
+}
+
+#define MAX_TILING_SIZE (128 * MB)
+
+/* 2D, 3D and 2D array images. */
+static cl_int
+cl_mem_allocate_image_gen_2D_3D(cl_device_id device, cl_mem mem)
+{
+  cl_context_gen ctx_gen;
+  cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0];
+  cl_mem_image image = cl_mem_to_image(mem);
+  int enableUserptr = 0;
+  int enable_true_hostptr = 0;
+  cl_uint cacheline_size = 0;
+  size_t alignment = 64;
+  cl_image_gen_tiling tiling;
+  size_t total_gpu_size = 0;
+  size_t aligned_h;
+
+  DEV_PRIVATE_DATA(mem->ctx, device, ctx_gen);
+  assert(ctx_gen);
+  assert(image->mem_from == NULL);
+  assert(mem_gen->drm_bo == NULL);
+
+  cl_mem_gen_image_parameter_init(image, mem_gen);
+
+#ifdef HAS_USERPTR
+  /* Only enable a real user ptr if the user asks for it */
+  const char *env = getenv("OCL_IMAGE_HOSTPTR");
+  if (env != NULL) {
+    sscanf(env, "%i", &enable_true_hostptr);
+  }
+#endif
+
+  enableUserptr = 0;
+  tiling = cl_gen_get_default_tiling(device);
+
+  if (enable_true_hostptr && device->host_unified_memory && (mem->flags & CL_MEM_USE_HOST_PTR)) {
+    cacheline_size = device->global_mem_cache_line_size;
+    if (ALIGN((unsigned long)mem->host_ptr, cacheline_size) == (unsigned long)mem->host_ptr &&
+        ALIGN(image->h, cl_gen_get_tiling_align(device, CL_NO_TILE, 1)) == image->h &&
+        ALIGN(image->h * image->row_pitch * image->depth, cacheline_size) ==
+          image->h * image->row_pitch * image->depth &&
+        /* For 3D and 2D array images, the slice pitch must match */
+        ((image->image_type == CL_MEM_OBJECT_IMAGE3D || image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) &&
+         image->row_pitch * image->h == image->slice_pitch)) {
+      tiling = CL_NO_TILE;
+      enableUserptr = 1;
+    }
+  }
+
+  if (enableUserptr) {
+    total_gpu_size = mem_gen->image.gpu_slice_pitch * mem_gen->image.gpu_depth;
+    mem_gen->drm_bo = cl_mem_gen_create_drm_bo_from_hostptr(
+      ctx_gen->drv->bufmgr, CL_FALSE, total_gpu_size, cacheline_size, mem->host_ptr);
+  } else { // Recompute the layout parameters for the GPU
+    mem_gen->image.gpu_row_pitch = image->w * image->bpp;
+    if (tiling != CL_NO_TILE)
+      mem_gen->image.gpu_row_pitch = ALIGN(mem_gen->image.gpu_row_pitch,
+                                           cl_gen_get_tiling_align(device, tiling, 0));
+
+    if (CL_OBJECT_IS_IMAGE_ARRAY(mem) || CL_OBJECT_IS_3D_IMAGE(mem))
+      aligned_h = ALIGN(image->h, cl_gen_get_tiling_align(device, tiling, 2));
+    else
+      aligned_h = ALIGN(image->h, cl_gen_get_tiling_align(device, tiling, 1));
+
+    mem_gen->image.gpu_slice_pitch = mem_gen->image.gpu_row_pitch * aligned_h;
+    total_gpu_size = mem_gen->image.gpu_slice_pitch * mem_gen->image.gpu_depth;
+
+    /* If the size is larger than 128MB, mapping the GTT may fail on some
+       systems. Since there is no obvious performance drop, disable tiling
+       in that case. */
+    if (tiling != CL_NO_TILE && total_gpu_size > MAX_TILING_SIZE) {
+      tiling = CL_NO_TILE;
+
+      mem_gen->image.gpu_row_pitch = image->w * image->bpp;
+
+      if (CL_OBJECT_IS_IMAGE_ARRAY(mem) || CL_OBJECT_IS_3D_IMAGE(mem))
+        aligned_h = ALIGN(image->h, cl_gen_get_tiling_align(device, tiling, 2));
+      else
+        aligned_h = ALIGN(image->h, cl_gen_get_tiling_align(device, tiling, 1));
+
+      mem_gen->image.gpu_slice_pitch = mem_gen->image.gpu_row_pitch * aligned_h;
+      total_gpu_size = mem_gen->image.gpu_slice_pitch * mem_gen->image.gpu_depth;
+    }
+  }
+
+  /* Allocate the real mem bo */
+  if (mem->flags & CL_MEM_PINNABLE || tiling != CL_NO_TILE)
+    alignment = 4096;
+
+  if (mem_gen->drm_bo == NULL)
+    mem_gen->drm_bo = cl_mem_gen_create_drm_bo(ctx_gen->drv->bufmgr, total_gpu_size, alignment,
+                                               tiling, mem_gen->image.gpu_row_pitch, NULL);
+
+  assert(mem_gen->drm_bo);
+  cl_mem_gen_upload_image(image, mem_gen);
+
+  return CL_SUCCESS;
+}
+
+static cl_int
+cl_mem_allocate_image_gen_2D_buffer(cl_device_id device, cl_mem mem)
+{
+  cl_context_gen ctx_gen;
+  cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0];
+  cl_mem_image image = cl_mem_to_image(mem);
+  cl_mem mem_from = image->mem_from;
+  if (CL_OBJECT_IS_SUB_BUFFER(mem_from)) {
+    mem_gen->image.sub_offset = cl_mem_to_buffer(mem_from)->sub_offset;
+  }
+  cl_mem_gen mem_from_gen = (cl_mem_gen)(mem_from->each_device[0]);
+
+  DEV_PRIVATE_DATA(mem->ctx, device, ctx_gen);
+  assert(mem_from);
+  assert(ctx_gen);
+  assert(mem_gen->drm_bo == NULL);
+  cl_mem_gen_image_parameter_init(image, mem_gen);
+
+  if (CL_OBJECT_IS_2D_IMAGE(mem_from)) {
+    assert(mem_gen->image.sub_offset == 0);
+    /* According to the spec, an image created from another image may differ
+       only in the channel order, so we can inherit all parameters of the
+       old image */
+    mem_gen->image.gpu_w = mem_from_gen->image.gpu_w;
+    mem_gen->image.gpu_h = mem_from_gen->image.gpu_h;
+    mem_gen->image.gpu_row_pitch = mem_from_gen->image.gpu_row_pitch;
+    mem_gen->image.gpu_slice_pitch = mem_from_gen->image.gpu_slice_pitch;
+    mem_gen->drm_bo = mem_from_gen->drm_bo;
+    cl_mem_gen_drm_bo_ref(mem_gen->drm_bo);
+    return CL_SUCCESS;
+  }
+
+  assert(CL_OBJECT_IS_BUFFER(mem_from));
+  /* Image from a real buffer */
+  mem_gen->image.gpu_row_pitch = image->row_pitch;
+  mem_gen->image.gpu_slice_pitch = mem_gen->image.gpu_row_pitch *
+                                   ALIGN(image->h, cl_gen_get_tiling_align(device, CL_NO_TILE, 1));
+  mem_gen->drm_bo = mem_from_gen->drm_bo;
+  cl_mem_gen_drm_bo_ref(mem_gen->drm_bo);
+  assert(mem_gen->drm_bo->tiling == CL_NO_TILE);
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_mem_allocate_image_gen(cl_device_id device, cl_mem mem)
+{
+  cl_mem_gen mem_gen;
+  cl_mem_image image = cl_mem_to_image(mem);
+  cl_int err = CL_SUCCESS;
+
+  mem_gen = CL_CALLOC(1, sizeof(_cl_mem_gen));
+  if (mem_gen == NULL)
+    return CL_OUT_OF_HOST_MEMORY;
+
+  mem_gen->mem_base.device = device;
+  mem->each_device[0] = (cl_mem_for_device)mem_gen;
+
+  /* Only a sub-set of the formats are supported */
+  mem_gen->image.intel_fmt = cl_image_get_gen_format(&image->fmt);
+  if (mem_gen->image.intel_fmt == INTEL_UNSUPPORTED_FORMAT) {
+    mem->each_device[0] = NULL;
+    CL_FREE(mem_gen);
+    return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+  }
+
+  if (image->image_type == CL_MEM_OBJECT_IMAGE1D ||
+      image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+    err = cl_mem_allocate_image_gen_1D(device, mem);
+  } else if (image->image_type == CL_MEM_OBJECT_IMAGE3D ||
+             image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY ||
+             image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+    if (image->mem_from) {
+      err = cl_mem_allocate_image_gen_2D_buffer(device, mem);
+    } else {
+      err = cl_mem_allocate_image_gen_2D_3D(device, mem);
+    }
+  } else if (image->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+    err = cl_mem_allocate_image_gen_1D_buffer(device, mem);
+  } else
+    assert(0);
+
+#if 0
+  printf("---- Create image with width: %ld, height: %ld, depth %ld, row_pitch: %ld, slice_pitch %ld, \n"
+         "--- GPU real size: width: %ld, height: %ld, depth %ld, row_pitch: %ld, slice_pitch %ld, tiling is %d\n",
+         image->w, image->h, image->depth, image->row_pitch, image->slice_pitch,
+         mem_gen->image.gpu_w, mem_gen->image.gpu_h, mem_gen->image.gpu_depth, mem_gen->image.gpu_row_pitch,
+         mem_gen->image.gpu_slice_pitch, mem_gen->drm_bo->tiling);
+#endif
+
+  if (err != CL_SUCCESS) {
+    mem->each_device[0] = NULL;
+    CL_FREE(mem_gen);
+  }
+
+  return err;
+}
+
+LOCAL cl_int
+cl_enqueue_handle_map_image_gen(cl_event event, cl_int status)
+{
+  cl_mem mem = event->exec_data.map_image.mem_obj;
+  cl_mem_image image = cl_mem_to_image(event->exec_data.map_image.mem_obj);
+  cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0];
+  void *ptr = NULL;
+  assert(mem_gen);
+  assert(event->exec_data.map_image.origin[0] + event->exec_data.map_image.region[0] <= image->w);
+  assert(event->exec_data.map_image.origin[1] + event->exec_data.map_image.region[1] <= image->h);
+  assert(event->exec_data.map_image.origin[2] + event->exec_data.map_image.region[2] <= image->depth);
+
+  if (status == CL_SUBMITTED || status == CL_RUNNING)
+    return CL_SUCCESS;
+
+  if (status == CL_QUEUED) {
+    ptr = cl_mem_gen_drm_bo_map(mem_gen->drm_bo, event->exec_data.map_image.unsync_map);
+    assert(ptr);
+    ptr += mem_gen->image.sub_offset;
+
+    if (mem->flags & CL_MEM_USE_HOST_PTR) {
+      assert(mem->host_ptr);
+      event->exec_data.map_image.ptr = mem->host_ptr + image->bpp * event->exec_data.map_image.origin[0] +
+                                       image->row_pitch * event->exec_data.map_image.origin[1] +
+                                       image->slice_pitch * event->exec_data.map_image.origin[2];
+      event->exec_data.map_image.row_pitch = image->row_pitch;
+      event->exec_data.map_image.slice_pitch = image->slice_pitch;
+    } else {
+      event->exec_data.map_image.ptr = ptr + image->bpp * event->exec_data.map_image.origin[0] +
+                                       mem_gen->image.gpu_row_pitch * event->exec_data.map_image.origin[1] +
+                                       mem_gen->image.gpu_slice_pitch * event->exec_data.map_image.origin[2];
+      event->exec_data.map_image.row_pitch = mem_gen->image.gpu_row_pitch;
+      event->exec_data.map_image.slice_pitch = mem_gen->image.gpu_slice_pitch;
+      if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+        event->exec_data.map_image.row_pitch = event->exec_data.map_image.slice_pitch;
+      if (image->image_type == CL_MEM_OBJECT_IMAGE1D || image->image_type == CL_MEM_OBJECT_IMAGE2D)
+        event->exec_data.map_image.slice_pitch = 0;
+    }
+
+    event->exec_data.exec_ctx = ptr; // Reuse exec_ctx to stash the mapped ptr temporarily
+    return CL_SUCCESS;
+  }
+
+  assert(status == CL_COMPLETE);
+  /* Make sure the map has completed */
+  if (event->exec_data.map_image.unsync_map)
+    cl_mem_gen_drm_bo_sync(mem_gen->drm_bo);
+
+  ptr = event->exec_data.exec_ctx;
+  assert(ptr);
+
+  /* Sync the data back to the host if this is a fake USE_HOST_PTR */
+  if ((mem->flags & CL_MEM_USE_HOST_PTR) && ptr != mem->host_ptr) {
+    assert(event->exec_data.map_image.ptr == ((char *)mem->host_ptr +
+                                              image->bpp * event->exec_data.map_image.origin[0] +
+                                              image->row_pitch * event->exec_data.map_image.origin[1] +
+                                              image->slice_pitch * event->exec_data.map_image.origin[2]));
+
+    cl_mem_copy_image_region_helper(event->exec_data.map_image.origin, event->exec_data.map_image.region,
+                                    event->exec_data.map_image.ptr, image->row_pitch, image->slice_pitch,
+                                    ptr, mem_gen->image.gpu_row_pitch, mem_gen->image.gpu_slice_pitch,
+                                    image->bpp, image->w, image->h, CL_FALSE, CL_TRUE);
+  }
+
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_enqueue_handle_unmap_image_gen(cl_event event, cl_int status)
+{
+  cl_mem mem = event->exec_data.unmap.mem_obj;
+  cl_mem_image image = cl_mem_to_image(event->exec_data.unmap.mem_obj);
+  cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0];
+
+  assert(mem_gen);
+  assert(mem_gen->drm_bo);
+  assert(event->exec_data.unmap.ptr);
+
+  if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED)
+    return CL_SUCCESS;
+
+  /* Sync the content back if this is a fake USE_HOST_PTR */
+  void *host_offset_ptr = mem->host_ptr + image->bpp * event->exec_data.map_image.origin[0] +
+                          image->row_pitch * event->exec_data.map_image.origin[1] +
+                          image->slice_pitch * event->exec_data.map_image.origin[2];
+  if ((mem->flags & CL_MEM_USE_HOST_PTR) && (host_offset_ptr != mem_gen->drm_bo->mapped_ptr)) {
+    assert(mem_gen->drm_bo->mapped_ptr);
+
+    void *dst_ptr = mem_gen->drm_bo->mapped_ptr + mem_gen->image.sub_offset;
+    cl_mem_copy_image_region_helper(event->exec_data.unmap.origin, event->exec_data.unmap.region,
+                                    dst_ptr, mem_gen->image.gpu_row_pitch, mem_gen->image.gpu_slice_pitch,
+                                    event->exec_data.unmap.ptr, image->row_pitch, image->slice_pitch,
+                                    image->bpp, image->w, image->h, CL_TRUE, CL_FALSE);
+  }
+
+  cl_mem_gen_drm_bo_unmap(mem_gen->drm_bo);
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_enqueue_read_image_gen(cl_event event, cl_int status)
+{
+  cl_mem mem = event->exec_data.read_write_image.image;
+  cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0];
+  void *data_ptr = NULL;
+  const size_t *origin = event->exec_data.read_write_image.origin;
+  const size_t *region = event->exec_data.read_write_image.region;
+  cl_mem_image image;
+
+  assert(mem_gen);
+  assert(mem_gen->drm_bo);
+  assert(CL_OBJECT_IS_IMAGE(mem));
+  assert(event->exec_data.type == EnqueueReadImage);
+
+  if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED)
+    return CL_SUCCESS;
+
+  image = cl_mem_to_image(mem);
+  data_ptr = cl_mem_gen_drm_bo_map(mem_gen->drm_bo, CL_FALSE);
+  if (data_ptr == NULL)
+    return CL_OUT_OF_RESOURCES;
+
+  data_ptr += mem_gen->image.sub_offset;
+
+  cl_mem_copy_image_region_helper(origin, region,
+                                  event->exec_data.read_write_image.ptr, event->exec_data.read_write_image.row_pitch,
+                                  event->exec_data.read_write_image.slice_pitch,
+                                  data_ptr, mem_gen->image.gpu_row_pitch, mem_gen->image.gpu_slice_pitch,
+                                  image->bpp, image->w, image->h, CL_FALSE, CL_TRUE);
+
+  cl_mem_gen_drm_bo_unmap(mem_gen->drm_bo);
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_enqueue_write_image_gen(cl_event event, cl_int status)
+{
+  cl_mem mem = event->exec_data.read_write_image.image;
+  cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0];
+  void *data_ptr = NULL;
+  const size_t *origin = event->exec_data.read_write_image.origin;
+  const size_t *region = event->exec_data.read_write_image.region;
+  cl_mem_image image;
+  void *src_ptr = event->exec_data.read_write_image.ptr;
+
+  assert(mem_gen);
+  assert(mem_gen->drm_bo);
+  assert(CL_OBJECT_IS_IMAGE(mem));
+  assert(event->exec_data.type == EnqueueWriteImage);
+
+  if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED)
+    return CL_SUCCESS;
+
+  image = cl_mem_to_image(mem);
+  data_ptr = cl_mem_gen_drm_bo_map(mem_gen->drm_bo, CL_FALSE);
+  if (data_ptr == NULL)
+    return CL_OUT_OF_RESOURCES;
+
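+  /* sub_offset is nonzero when the image aliases a (sub-)buffer; it locates
+   * the image's first byte inside the shared bo. */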
+  data_ptr += mem_gen->image.sub_offset;
+
+  cl_mem_copy_image_region_helper(origin, region,
+                                  data_ptr, mem_gen->image.gpu_row_pitch, mem_gen->image.gpu_slice_pitch,
+                                  src_ptr, event->exec_data.read_write_image.row_pitch,
+                                  event->exec_data.read_write_image.slice_pitch,
+                                  image->bpp, image->w, image->h, CL_TRUE, CL_FALSE);
+
+  cl_mem_gen_drm_bo_unmap(mem_gen->drm_bo);
+  return CL_SUCCESS;
+}
-- 
2.7.4
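P.S. for reviewers: the pitch math in the allocation paths reduces to a
round-up per dimension. A minimal sketch, assuming ALIGN is the usual
round-up helper and using SKL's TILE_Y alignments from
cl_gen_get_tiling_align (the names here are illustrative, not part of the
patch):

    #include <stddef.h>

    /* Round-up helper; a local stand-in for beignet's ALIGN macro. */
    #define ALIGN(v, a) (((v) + (a) - 1) / (a) * (a))

    /* Illustrative only: on SKL a TILE_Y surface needs its row pitch
     * aligned to 128 bytes and its slice height aligned to 32 rows, so a
     * w x h x depth image of bpp-byte pixels occupies: */
    static size_t tile_y_size_skl(size_t w, size_t h, size_t depth, size_t bpp)
    {
      size_t row_pitch = ALIGN(w * bpp, 128);        /* tileY width alignment  */
      size_t slice_pitch = row_pitch * ALIGN(h, 32); /* tileY height alignment */
      return slice_pitch * depth;
    }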