From: Junyan He <junyan...@intel.com> We will use the enqueued kernel N-D range to implement GPU-side mem/image copy and fill. This is gen-specific logic, so it is moved to the gen directory.
Signed-off-by: Junyan He <junyan...@intel.com> --- src/gen/cl_gen.h | 24 +++ src/gen/cl_image_gen.c | 394 +++++++++++++++++++++++++++++++++++++++++++++++++ src/gen/cl_mem_gen.c | 327 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 745 insertions(+) create mode 100644 src/gen/cl_image_gen.c create mode 100644 src/gen/cl_mem_gen.c diff --git a/src/gen/cl_gen.h b/src/gen/cl_gen.h index 710068a..c4294eb 100644 --- a/src/gen/cl_gen.h +++ b/src/gen/cl_gen.h @@ -172,4 +172,28 @@ extern char *cl_internal_built_in_kernel_str; extern size_t cl_internal_built_in_kernel_str_size; extern cl_device_id cl_get_device_id_gen(cl_platform_id platform); + +/*************************************** Mem *******************************************/ +extern cl_int cl_mem_copy_gen(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf, + size_t src_offset, size_t dst_offset, size_t cb); +extern cl_int cl_mem_fill_gen(cl_command_queue queue, cl_event e, const void *pattern, size_t pattern_size, + cl_mem buffer, size_t offset, size_t size); +extern cl_int cl_mem_copy_buffer_rect_gen(cl_command_queue queue, cl_event event, cl_mem src_buf, + cl_mem dst_buf, const size_t *src_origin, const size_t *dst_origin, + const size_t *region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch); + +/*************************************** Image ******************************************/ +extern cl_int cl_image_fill_gen(cl_command_queue queue, cl_event e, const void *pattern, + cl_mem src_image, const size_t *origin, const size_t *region); +extern cl_int cl_image_copy_gen(cl_command_queue queue, cl_event event, cl_mem src_image, + cl_mem dst_image, const size_t *src_origin, + const size_t *dst_origin, const size_t *region); +extern cl_int cl_mem_copy_image_to_buffer_gen(cl_command_queue queue, cl_event event, cl_mem image, + cl_mem buffer, const size_t *src_origin, + const size_t dst_offset, const size_t *region); +extern cl_int 
cl_mem_copy_buffer_to_image_gen(cl_command_queue queue, cl_event event, cl_mem buffer, + cl_mem image, const size_t src_offset, + const size_t *dst_origin, const size_t *region); + #endif /* End of __CL_GEN_H__ */ diff --git a/src/gen/cl_image_gen.c b/src/gen/cl_image_gen.c new file mode 100644 index 0000000..39d3d23 --- /dev/null +++ b/src/gen/cl_image_gen.c @@ -0,0 +1,394 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. 
+ * + */ + +#include "cl_gen.h" +#include <math.h> + +#define LOCAL_SZ_0 16 +#define LOCAL_SZ_1 4 +#define LOCAL_SZ_2 4 + +LOCAL cl_int +cl_image_fill_gen(cl_command_queue queue, cl_event e, const void *pattern, + cl_mem mem, const size_t *origin, const size_t *region) +{ + cl_int ret = CL_SUCCESS; + cl_kernel ker = NULL; + size_t global_off[] = {0, 0, 0}; + size_t global_sz[] = {1, 1, 1}; + size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2}; + struct _cl_mem_image *src_image = cl_mem_image(mem); + uint32_t savedIntelFmt = src_image->intel_fmt; + + if (region[1] == 1) + local_sz[1] = 1; + if (region[2] == 1) + local_sz[2] = 1; + global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; + global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; + global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; + + if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_1D); + } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_1D_ARRAY); + } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_2D); + } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_2D_ARRAY); + } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE3D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_3D); + } else { + return CL_IMAGE_FORMAT_NOT_SUPPORTED; + } + + if (!ker) + return CL_OUT_OF_RESOURCES; + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image); + if (src_image->fmt.image_channel_order >= CL_sRGBA) { +#define RGB2sRGB(linear) (linear <= 0.0031308f) ? 
(12.92f * linear) : (1.055f * powf(linear, 1.0f / 2.4f) - 0.055f); + cl_image_format fmt; + float newpattern[4] = {0.0, 0.0, 0.0, ((float *)pattern)[3]}; + int i; + for (i = 0; i < 3; i++) { + if (src_image->fmt.image_channel_order == CL_sRGBA) { + newpattern[i] = RGB2sRGB(((float *)pattern)[i]); + } else + newpattern[2 - i] = RGB2sRGB(((float *)pattern)[i]); + } + cl_kernel_set_arg(ker, 1, sizeof(float) * 4, newpattern); + fmt.image_channel_order = CL_RGBA; + fmt.image_channel_data_type = CL_UNORM_INT8; + src_image->intel_fmt = cl_image_get_intel_format(&fmt); +#undef RGB2sRGB + } else + cl_kernel_set_arg(ker, 1, sizeof(float) * 4, pattern); + cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion[0]); + cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]); + cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]); + cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin[0]); + cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]); + cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]); + + ret = cl_command_queue_ND_range(queue, ker, e, 3, global_off, global_sz, local_sz); + src_image->intel_fmt = savedIntelFmt; + return ret; +} + +LOCAL cl_int +cl_image_copy_gen(cl_command_queue queue, cl_event event, cl_mem src, cl_mem dst, + const size_t *src_origin, const size_t *dst_origin, const size_t *region) +{ + cl_int ret; + cl_kernel ker = NULL; + size_t global_off[] = {0, 0, 0}; + size_t global_sz[] = {1, 1, 1}; + size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2}; + uint32_t fixupDataType; + uint32_t savedIntelFmt; + struct _cl_mem_image *src_image = cl_mem_image(src); + struct _cl_mem_image *dst_image = cl_mem_image(dst); + + if (region[1] == 1) + local_sz[1] = 1; + if (region[2] == 1) + local_sz[2] = 1; + global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; + global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; + global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; + + switch 
(src_image->fmt.image_channel_data_type) { + case CL_SNORM_INT8: + case CL_UNORM_INT8: + fixupDataType = CL_UNSIGNED_INT8; + break; + case CL_HALF_FLOAT: + case CL_SNORM_INT16: + case CL_UNORM_INT16: + fixupDataType = CL_UNSIGNED_INT16; + break; + case CL_FLOAT: + fixupDataType = CL_UNSIGNED_INT32; + break; + default: + fixupDataType = 0; + } + + if (fixupDataType) { + cl_image_format fmt; + if (src_image->fmt.image_channel_order != CL_BGRA && + src_image->fmt.image_channel_order != CL_sBGRA && + src_image->fmt.image_channel_order != CL_sRGBA) + fmt.image_channel_order = src_image->fmt.image_channel_order; + else + fmt.image_channel_order = CL_RGBA; + fmt.image_channel_data_type = fixupDataType; + savedIntelFmt = src_image->intel_fmt; + src_image->intel_fmt = cl_image_get_intel_format(&fmt); + dst_image->intel_fmt = src_image->intel_fmt; + } + + /* We use one kernel to copy the data. The kernel is lazily created. */ + assert(src_image->base.ctx == dst_image->base.ctx); + + /* setup the kernel and run. 
*/ + if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D) { + if (dst_image->image_type == CL_MEM_OBJECT_IMAGE1D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_1D_TO_1D); + } + } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D) { + if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D); + } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_2D_TO_3D); + } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D_ARRAY); + } + } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + if (dst_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_IMAGE_1D_ARRAY_TO_1D_ARRAY); + } + } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D_ARRAY); + } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D); + } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D); + } + } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE3D) { + if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D); + } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, 
CL_ENQUEUE_COPY_IMAGE_3D_TO_3D); + } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY); + } + } + + if (!ker) { + ret = CL_OUT_OF_RESOURCES; + goto fail; + } + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image); + cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_image); + cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion[0]); + cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]); + cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]); + cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_origin[0]); + cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]); + cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]); + cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_origin[0]); + cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]); + cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]); + + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); + +fail: + if (fixupDataType) { + src_image->intel_fmt = savedIntelFmt; + dst_image->intel_fmt = savedIntelFmt; + } + return ret; +} + +LOCAL cl_int +cl_mem_copy_image_to_buffer_gen(cl_command_queue queue, cl_event event, cl_mem the_image, cl_mem buffer, + const size_t *src_origin, const size_t dst_offset, const size_t *region) +{ + cl_int ret; + cl_kernel ker = NULL; + size_t global_off[] = {0, 0, 0}; + size_t global_sz[] = {1, 1, 1}; + size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2}; + uint32_t intel_fmt, bpp; + cl_image_format fmt; + size_t origin0, region0; + size_t kn_dst_offset; + int align16 = 0; + size_t align_size = 1; + size_t w_saved; + struct _cl_mem_image *image = cl_mem_image(the_image); + + if (region[1] == 1) + local_sz[1] = 1; + if (region[2] == 1) + local_sz[2] = 1; + global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; + global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; + global_sz[2] = 
((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; + + /* We use one kernel to copy the data. The kernel is lazily created. */ + assert(image->base.ctx == buffer->ctx); + + intel_fmt = image->intel_fmt; + bpp = image->bpp; + w_saved = image->w; + region0 = region[0] * bpp; + kn_dst_offset = dst_offset; + if ((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) && + ((src_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (dst_offset % 16 == 0)) { + fmt.image_channel_order = CL_RGBA; + fmt.image_channel_data_type = CL_UNSIGNED_INT32; + align16 = 1; + align_size = 16; + } else { + fmt.image_channel_order = CL_R; + fmt.image_channel_data_type = CL_UNSIGNED_INT8; + align_size = 1; + } + image->intel_fmt = cl_image_get_intel_format(&fmt); + image->w = (image->w * image->bpp) / align_size; + image->bpp = align_size; + region0 = (region[0] * bpp) / align_size; + origin0 = (src_origin[0] * bpp) / align_size; + kn_dst_offset /= align_size; + global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; + + /* setup the kernel and run. 
*/ + if (image->image_type == CL_MEM_OBJECT_IMAGE2D) { + if (align16) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16); + } else { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER); + } + } else if (image->image_type == CL_MEM_OBJECT_IMAGE3D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER); + } + + if (!ker) { + ret = CL_OUT_OF_RESOURCES; + goto fail; + } + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image); + cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer); + cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion0); + cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]); + cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]); + cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0); + cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]); + cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]); + cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset); + + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); + +fail: + + image->intel_fmt = intel_fmt; + image->bpp = bpp; + image->w = w_saved; + + return ret; +} + +LOCAL cl_int +cl_mem_copy_buffer_to_image_gen(cl_command_queue queue, cl_event event, cl_mem buffer, cl_mem the_image, + const size_t src_offset, const size_t *dst_origin, const size_t *region) +{ + cl_int ret; + cl_kernel ker = NULL; + size_t global_off[] = {0, 0, 0}; + size_t global_sz[] = {1, 1, 1}; + size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2}; + uint32_t intel_fmt, bpp; + cl_image_format fmt; + size_t origin0, region0; + size_t kn_src_offset; + int align16 = 0; + size_t align_size = 1; + size_t w_saved = 0; + struct _cl_mem_image *image = cl_mem_image(the_image); + + if (region[1] == 1) + local_sz[1] = 1; + if (region[2] == 1) + local_sz[2] = 1; + global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; 
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; + global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; + + /* We use one kernel to copy the data. The kernel is lazily created. */ + assert(image->base.ctx == buffer->ctx); + + intel_fmt = image->intel_fmt; + bpp = image->bpp; + w_saved = image->w; + region0 = region[0] * bpp; + kn_src_offset = src_offset; + if ((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) && + ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)) { + fmt.image_channel_order = CL_RGBA; + fmt.image_channel_data_type = CL_UNSIGNED_INT32; + align16 = 1; + align_size = 16; + } else { + fmt.image_channel_order = CL_R; + fmt.image_channel_data_type = CL_UNSIGNED_INT8; + align_size = 1; + } + image->intel_fmt = cl_image_get_intel_format(&fmt); + image->w = (image->w * image->bpp) / align_size; + image->bpp = align_size; + region0 = (region[0] * bpp) / align_size; + origin0 = (dst_origin[0] * bpp) / align_size; + kn_src_offset /= align_size; + global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; + + /* setup the kernel and run. 
*/ + if (image->image_type == CL_MEM_OBJECT_IMAGE2D) { + if (align16) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16); + } else { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D); + } + } else if (image->image_type == CL_MEM_OBJECT_IMAGE3D) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D); + } + if (!ker) + return CL_OUT_OF_RESOURCES; + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image); + cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer); + cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion0); + cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]); + cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]); + cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0); + cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]); + cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]); + cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset); + + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); + + image->intel_fmt = intel_fmt; + image->bpp = bpp; + image->w = w_saved; + + return ret; +} diff --git a/src/gen/cl_mem_gen.c b/src/gen/cl_mem_gen.c new file mode 100644 index 0000000..0d8c35c --- /dev/null +++ b/src/gen/cl_mem_gen.c @@ -0,0 +1,327 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "cl_gen.h" + +#define LOCAL_SZ_0 16 +#define LOCAL_SZ_1 4 +#define LOCAL_SZ_2 4 + +LOCAL cl_int +cl_mem_copy_gen(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf, + size_t src_offset, size_t dst_offset, size_t cb) +{ + cl_int ret = CL_SUCCESS; + cl_kernel ker = NULL; + size_t global_off[] = {0, 0, 0}; + size_t global_sz[] = {1, 1, 1}; + size_t local_sz[] = {1, 1, 1}; + const unsigned int masks[4] = {0xffffffff, 0x0ff, 0x0ffff, 0x0ffffff}; + int aligned = 0; + int dw_src_offset = src_offset / 4; + int dw_dst_offset = dst_offset / 4; + + if (!cb) + return ret; + + /* We use one kernel to copy the data. The kernel is lazily created. */ + assert(src_buf->ctx == dst_buf->ctx); + + /* All 16 bytes aligned, fast and easy one. */ + if ((cb % 16 == 0) && (src_offset % 16 == 0) && (dst_offset % 16 == 0)) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_ALIGN16); + cb = cb / 16; + aligned = 1; + } else if ((cb % 4 == 0) && (src_offset % 4 == 0) && (dst_offset % 4 == 0)) { /* all Dword aligned.*/ + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_ALIGN4); + cb = cb / 4; + aligned = 1; + } + + if (aligned) { + if (!ker) + return CL_OUT_OF_RESOURCES; + + if (cb < LOCAL_SZ_0) { + local_sz[0] = 1; + } else { + local_sz[0] = LOCAL_SZ_0; + } + global_sz[0] = ((cb + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0; + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); + cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); + cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); + cl_kernel_set_arg(ker, 4, sizeof(int), &cb); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; + } + + /* Now handle 
the unaligned cases. */ + int dw_num = ((dst_offset % 4 + cb) + 3) / 4; + unsigned int first_mask = dst_offset % 4 == 0 ? 0x0 : masks[dst_offset % 4]; + unsigned int last_mask = masks[(dst_offset + cb) % 4]; + /* handle the very small range copy. */ + if (cb < 4 && dw_num == 1) { + first_mask = first_mask | ~last_mask; + } + + if (cb < LOCAL_SZ_0) { + local_sz[0] = 1; + } else { + local_sz[0] = LOCAL_SZ_0; + } + global_sz[0] = ((dw_num + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0; + + if (src_offset % 4 == dst_offset % 4) { + /* Src and dst has the same unaligned offset, just handle the + header and tail. */ + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET); + + if (!ker) + return CL_OUT_OF_RESOURCES; + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); + cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); + cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); + cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num); + cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask); + cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; + } + + /* Dst's offset < Src's offset, so one dst dword need two sequential src dwords to fill it. 
*/ + if (dst_offset % 4 < src_offset % 4) { + int align_diff = src_offset % 4 - dst_offset % 4; + unsigned int dw_mask = masks[align_diff]; + int shift = align_diff * 8; + + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET); + if (!ker) + return CL_OUT_OF_RESOURCES; + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); + cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); + cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); + cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num); + cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask); + cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); + cl_kernel_set_arg(ker, 7, sizeof(int), &shift); + cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; + } + + /* Dst's offset > Src's offset, so one dst dword need two sequential src - and src to fill it. 
*/ + if (dst_offset % 4 > src_offset % 4) { + int align_diff = dst_offset % 4 - src_offset % 4; + unsigned int dw_mask = masks[4 - align_diff]; + int shift = align_diff * 8; + int src_less = !(src_offset % 4) && !((src_offset + cb) % 4); + + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET); + if (!ker) + return CL_OUT_OF_RESOURCES; + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); + cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); + cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); + cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num); + cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask); + cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); + cl_kernel_set_arg(ker, 7, sizeof(int), &shift); + cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask); + cl_kernel_set_arg(ker, 9, sizeof(int), &src_less); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; + } + + /* no case can hanldle? */ + assert(0); + + return ret; +} + +LOCAL cl_int +cl_mem_fill_gen(cl_command_queue queue, cl_event e, const void *pattern, size_t pattern_size, + cl_mem buffer, size_t offset, size_t size) +{ + cl_int ret = CL_SUCCESS; + cl_kernel ker = NULL; + size_t global_off[] = {0, 0, 0}; + size_t global_sz[] = {1, 1, 1}; + size_t local_sz[] = {1, 1, 1}; + char pattern_comb[4]; + int is_128 = 0; + const void *pattern1 = NULL; + + assert(offset % pattern_size == 0); + assert(size % pattern_size == 0); + + if (!size) + return ret; + + if (pattern_size == 128) { + /* 128 is according to pattern of double16, but double works not very + well on some platform. We use two float16 to handle this. 
*/ + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_ALIGN128); + is_128 = 1; + pattern_size = pattern_size / 2; + pattern1 = pattern + pattern_size; + size = size / 2; + } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */ + int order = ffs(pattern_size / 8) - 1; + + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order); + } else if (pattern_size == 4) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_ALIGN4); + } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) { + /* The unaligned case. But if copy size and offset are aligned to 4, we can fake + the pattern with the pattern duplication fill in. */ + assert(pattern_size == 1 || pattern_size == 2); + + if (pattern_size == 2) { + memcpy(pattern_comb, pattern, sizeof(char) * 2); + memcpy(pattern_comb + 2, pattern, sizeof(char) * 2); + } else { + pattern_comb[0] = pattern_comb[1] = pattern_comb[2] = pattern_comb[3] = *(char *)pattern; + } + + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_ALIGN4); + pattern_size = 4; + pattern = pattern_comb; + } + //TODO: Unaligned cases, we may need to optimize it as cl_mem_copy, using mask in kernel + //functions. This depend on the usage but now we just use aligned 1 and 2. 
+ else if (pattern_size == 2) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_ALIGN2); + } else if (pattern_size == 1) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_UNALIGN); + } else + assert(0); + + if (!ker) + return CL_OUT_OF_RESOURCES; + + size = size / pattern_size; + offset = offset / pattern_size; + + if (size < LOCAL_SZ_0) { + local_sz[0] = 1; + } else { + local_sz[0] = LOCAL_SZ_0; + } + global_sz[0] = ((size + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0; + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer); + cl_kernel_set_arg(ker, 1, pattern_size, pattern); + cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &offset); + cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &size); + if (is_128) + cl_kernel_set_arg(ker, 4, pattern_size, pattern1); + + ret = cl_command_queue_ND_range(queue, ker, e, 1, global_off, global_sz, local_sz); + return ret; +} + +LOCAL cl_int +cl_mem_copy_buffer_rect_gen(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch) +{ + cl_int ret; + cl_kernel ker; + size_t global_off[] = {0, 0, 0}; + size_t global_sz[] = {1, 1, 1}; + size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_1}; + // the src and dst mem rect is continuous, the copy is degraded to buf copy + if ((region[0] == dst_row_pitch) && (region[0] == src_row_pitch) && + (region[1] * src_row_pitch == src_slice_pitch) && + (region[1] * dst_row_pitch == dst_slice_pitch)) { + cl_int src_offset = src_origin[2] * src_slice_pitch + + src_origin[1] * src_row_pitch + src_origin[0]; + cl_int dst_offset = dst_origin[2] * dst_slice_pitch + + dst_origin[1] * dst_row_pitch + dst_origin[0]; + cl_int size = region[0] * region[1] * region[2]; + ret = cl_mem_copy_gen(queue, NULL, src_buf, dst_buf, src_offset, dst_offset, 
size); + return ret; + } + + if (region[1] == 1) + local_sz[1] = 1; + if (region[2] == 1) + local_sz[2] = 1; + global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; + global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; + global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; + cl_int src_offset = src_origin[2] * src_slice_pitch + src_origin[1] * src_row_pitch + src_origin[0]; + cl_int dst_offset = dst_origin[2] * dst_slice_pitch + dst_origin[1] * dst_row_pitch + dst_origin[0]; + + /* We use one kernel to copy the data. The kernel is lazily created. */ + assert(src_buf->ctx == dst_buf->ctx); + + /* setup the kernel and run. */ + size_t region0 = region[0]; + if ((src_offset % 4 == 0) && (dst_offset % 4 == 0) && + (src_row_pitch % 4 == 0) && (dst_row_pitch % 4 == 0) && + (src_slice_pitch % 4 == 0) && (dst_slice_pitch % 4 == 0) && (region0 % 4 == 0)) { + region0 /= 4; + src_offset /= 4; + dst_offset /= 4; + src_row_pitch /= 4; + dst_row_pitch /= 4; + src_slice_pitch /= 4; + dst_slice_pitch /= 4; + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4); + } else { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_RECT); + } + + if (!ker) + return CL_OUT_OF_RESOURCES; + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); + cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf); + cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion0); + cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]); + cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]); + cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset); + cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset); + cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch); + cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch); + cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch); + cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch); + + ret = 
cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; +} -- 2.7.4 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet