Some comments, On δΊ”, 2014-07-04 at 12:00 +0800, Lv Meng wrote: > Signed-off-by: Lv Meng <meng...@intel.com> > --- > src/CMakeLists.txt | 3 ++- > src/cl_context.h | 1 + > src/cl_mem.c | 27 > +++++++++++++++++++++++-- > src/kernels/cl_internal_copy_buf_rect_align4.cl | 15 ++++++++++++++ > 4 files changed, 43 insertions(+), 3 deletions(-) > create mode 100644 src/kernels/cl_internal_copy_buf_rect_align4.cl > > diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt > index 46426d9..dff8fdf 100644 > --- a/src/CMakeLists.txt > +++ b/src/CMakeLists.txt > @@ -41,7 +41,8 @@ set (KERNEL_STR_FILES) > set (KERNEL_NAMES cl_internal_copy_buf_align4 > cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset > cl_internal_copy_buf_unalign_dst_offset > cl_internal_copy_buf_unalign_src_offset > -cl_internal_copy_buf_rect cl_internal_copy_image_1d_to_1d > cl_internal_copy_image_2d_to_2d > +cl_internal_copy_buf_rect cl_internal_copy_buf_rect_align4 > +cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d > cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d > cl_internal_copy_image_3d_to_3d > cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer > cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d > diff --git a/src/cl_context.h b/src/cl_context.h > index 75afbf6..f8342d3 100644 > --- a/src/cl_context.h > +++ b/src/cl_context.h > @@ -47,6 +47,7 @@ enum _cl_internal_ker_type { > CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET, > CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET, > CL_ENQUEUE_COPY_BUFFER_RECT, > + CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4, > CL_ENQUEUE_COPY_IMAGE_1D_TO_1D, //copy image 1d to image 1d > CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, //copy image 2d to image 2d > CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, //copy image 3d to image 2d > diff --git a/src/cl_mem.c b/src/cl_mem.c > index 70bc3eb..b78258f 100644 > --- a/src/cl_mem.c > +++ b/src/cl_mem.c > @@ -1396,9 +1396,20 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem > src_buf, cl_mem dst_buf, > size_t dst_row_pitch, size_t dst_slice_pitch) { > cl_int ret; > cl_kernel ker; > + cl_int index; > size_t global_off[] = {0,0,0}; > size_t global_sz[] = {1,1,1}; > size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1}; > + // the src and dst mem rect is continuous, the copy is degraded to buf > copy > + if((region[0] == dst_row_pitch) && (region[0] == src_row_pitch) && > + (region[1] * src_row_pitch == src_slice_pitch) && (region[1] * > dst_row_pitch == dst_slice_pitch)){ > + cl_int src_offset = src_origin[2]*src_slice_pitch + > src_origin[1]*src_row_pitch + src_origin[0]; > + cl_int dst_offset = dst_origin[2]*dst_slice_pitch + > dst_origin[1]*dst_row_pitch + dst_origin[0]; > + cl_int size = region[0]*region[1]*region[2]; > + ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size); > + return ret; > + } > + > if(region[1] == 1) local_sz[1] = 1; > if(region[2] == 1) local_sz[2] = 1; > global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; > @@ -1413,8 +1424,20 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem > src_buf, cl_mem dst_buf, > /* setup the kernel and run. */ > extern char cl_internal_copy_buf_rect_str[]; > extern size_t cl_internal_copy_buf_rect_str_size; > - > - ker = cl_context_get_static_kernel_from_bin(queue->ctx, > CL_ENQUEUE_COPY_BUFFER_RECT, > + index = CL_ENQUEUE_COPY_BUFFER_RECT; > + if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch % 4== > 0) && (dst_row_pitch % 4== 0) > + && (src_slice_pitch % 4== 0) && (dst_slice_pitch % 4== 0) && (global_sz[0] > % 4 == 0) ){ > + global_sz[0] /= 4; > + src_offset /= 4; > + dst_offset /= 4; > + src_row_pitch /= 4; > + dst_row_pitch /= 4; > + src_slice_pitch /= 4; > + dst_slice_pitch /= 4; > + index = CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4; > + } > + > + ker = cl_context_get_static_kernel_from_bin(queue->ctx, index, > cl_internal_copy_buf_rect_str, > (size_t)cl_internal_copy_buf_rect_str_size, NULL);
I think here you use the wrong source string. For align 4 rect buffer, you should use cl_internal_copy_buf_rect_align4_str here, which I notice that already exists in the cl_internal_copy_buf_rect_align4_str.c I think you separate align and unalign cases as cl_mem_fill > > if (!ker) > diff --git a/src/kernels/cl_internal_copy_buf_rect_align4.cl > b/src/kernels/cl_internal_copy_buf_rect_align4.cl > new file mode 100644 > index 0000000..fbfe7b2 > --- /dev/null > +++ b/src/kernels/cl_internal_copy_buf_rect_align4.cl > @@ -0,0 +1,15 @@ > +kernel void __cl_copy_buffer_rect_align4 ( global int* src, global int* dst, > + unsigned int region0, unsigned int > region1, unsigned int region2, > + unsigned int src_offset, unsigned > int dst_offset, > + unsigned int src_row_pitch, > unsigned int src_slice_pitch, > + unsigned int dst_row_pitch, > unsigned int dst_slice_pitch) > +{ > + int i = get_global_id(0); > + int j = get_global_id(1); > + int k = get_global_id(2); > + if((i >= region0) || (j>= region1) || (k>=region2)) > + return; > + src_offset += k * src_slice_pitch + j * src_row_pitch + i; > + dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; > + dst[dst_offset] = src[src_offset]; > +} _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet