[Beignet] [patch v3 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case.
From: Luo Xionghu xionghu@intel.com We can change the image_channel_order to CL_RGBA and image_channel_data_type to CL_UNSIGNED_INT32 for some special case, thus 16 bytes can be read by one work item. Bandwidth is fully used. v2: merge patch 3 of initializing region0; remove k dimension in kernel for 2d image. Signed-off-by: Luo Xionghu xionghu@intel.com --- src/CMakeLists.txt | 2 +- src/cl_context.h | 1 + src/cl_mem.c | 44 ++ .../cl_internal_copy_buffer_to_image_2d_align16.cl | 18 + 4 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index da69532..4e67c71 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -51,7 +51,7 @@ cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_arra cl_internal_copy_image_2d_array_to_2d_array cl_internal_copy_image_2d_array_to_2d cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer -cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d +cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_2d_align16 cl_internal_copy_buffer_to_image_3d cl_internal_fill_buf_align8 cl_internal_fill_buf_align4 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign cl_internal_fill_buf_align128 cl_internal_fill_image_1d diff --git a/src/cl_context.h b/src/cl_context.h index fdbfd2a..249fed8 100644 --- a/src/cl_context.h +++ b/src/cl_context.h @@ -63,6 +63,7 @@ enum _cl_internal_ker_type { CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, //copy image 3d tobuffer CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, //copy buffer to image 2d + CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, //copy buffer to image 3d CL_ENQUEUE_FILL_BUFFER_UNALIGN, //fill buffer with 1 aligne pattern, 
pattern size=1 CL_ENQUEUE_FILL_BUFFER_ALIGN2, //fill buffer with 2 aligne pattern, pattern size=2 diff --git a/src/cl_mem.c b/src/cl_mem.c index b41ec14..0a2613d 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -1816,6 +1816,10 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me uint32_t intel_fmt, bpp; cl_image_format fmt; size_t origin0, region0; + size_t kn_src_offset; + int align16 = 0; + size_t align_size = 1; + size_t w_saved = 0; if(region[1] == 1) local_sz[1] = 1; if(region[2] == 1) local_sz[2] = 1; @@ -1826,24 +1830,48 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me /* We use one kernel to copy the data. The kernel is lazily created. */ assert(image-base.ctx == buffer-ctx); - fmt.image_channel_order = CL_R; - fmt.image_channel_data_type = CL_UNSIGNED_INT8; intel_fmt = image-intel_fmt; bpp = image-bpp; - image-intel_fmt = cl_image_get_intel_format(fmt); - image-w = image-w * image-bpp; - image-bpp = 1; + w_saved = image-w; region0 = region[0] * bpp; - origin0 = dst_origin[0] * bpp; + kn_src_offset = src_offset; + if((image-image_type == CL_MEM_OBJECT_IMAGE2D) ((image-w * image-bpp) % 16 == 0) + ((dst_origin[0] * bpp) % 16 == 0) (region0 % 16 == 0) (src_offset % 16 == 0)){ +fmt.image_channel_order = CL_RGBA; +fmt.image_channel_data_type = CL_UNSIGNED_INT32; +align16 = 1; +align_size = 16; + } + else{ +fmt.image_channel_order = CL_R; +fmt.image_channel_data_type = CL_UNSIGNED_INT8; +align_size = 1; + } + image-intel_fmt = cl_image_get_intel_format(fmt); + image-w = (image-w * image-bpp) / align_size; + image-bpp = align_size; + region0 = (region[0] * bpp) / align_size; + origin0 = (dst_origin[0] * bpp) / align_size; + kn_src_offset /= align_size; global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; /* setup the kernel and run. 
*/ if(image-image_type == CL_MEM_OBJECT_IMAGE2D) { +if(align16){ + extern char cl_internal_copy_buffer_to_image_2d_align16_str[]; + extern size_t cl_internal_copy_buffer_to_image_2d_align16_str_size; + + ker = cl_context_get_static_kernel_from_bin(queue-ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16, +cl_internal_copy_buffer_to_image_2d_align16_str, +(size_t)cl_internal_copy_buffer_to_image_2d_align16_str_size, NULL); +} +else{ extern char cl_internal_copy_buffer_to_image_2d_str[]; extern size_t cl_internal_copy_buffer_to_image_2d_str_size; ker = cl_context_get_static_kernel_from_bin(queue-ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL); +
Re: [Beignet] [PATCH v3 1/2] Add example to show v4l2 buffer sharing with extension clGetMemObjectFdIntel.
On 2015.04.08 14:50:47 +0800, Chuanbo Weng wrote: This example captures yuy2 frame directly to cl buffer object by the way of dma, processed by OpenCL kernel, then convert to nv12 format and shown by libva. One thing confusing here is that for current beignet's clGetMemObjectFdIntel(), you are actually exporting cl_mem instead of written as 'import_buf_fd' in your example. That you create CL mem and export that with dmabuf fd, v4l imports that buf fd and returns captured data in that buffer. Then libva imports that buf fd for VA surface. So for CL this is an example to show how to export cl_mem with dmabuf. What about design for import buffer possibly from EGL/libva? v2: Close cl buffer's fd by clCloseMemObjectFdIntel instead of close function. v3: Just use close function, no need of clCloseMemObjectFdIntel. Signed-off-by: Chuanbo Weng chuanbo.w...@intel.com --- CMakeLists.txt | 35 +- examples/CMakeLists.txt| 29 +- .../v4l2_buffer_sharing/v4l2_buffer_sharing.cpp| 590 + kernels/runtime_yuy2_processing.cl | 15 + 4 files changed, 645 insertions(+), 24 deletions(-) create mode 100644 examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp create mode 100644 kernels/runtime_yuy2_processing.cl diff --git a/CMakeLists.txt b/CMakeLists.txt index 5474447..4f627cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,23 +216,30 @@ IF(BUILD_EXAMPLES) IF(NOT X11_FOUND) MESSAGE(FATAL_ERROR XLib is necessary for examples - not found) ENDIF(NOT X11_FOUND) -# libva -pkg_check_modules(LIBVA REQUIRED libva=0.36.0) -IF(LIBVA_FOUND) +# libva libva-x11 +#pkg_check_modules(LIBVA REQUIRED libva=0.36.0) +pkg_check_modules(LIBVA REQUIRED libva) +pkg_check_modules(LIBVA-X11 REQUIRED libva-x11) +set(LIBVA_BUF_SH_DEP false) +set(V4L2_BUF_SH_DEP false) +IF(LIBVA_FOUND AND LIBVA-X11_FOUND) MESSAGE(STATUS Looking for LIBVA - found at ${LIBVA_PREFIX} ${LIBVA_VERSION}) - INCLUDE_DIRECTORIES(${LIBVA_INCLUDE_DIRS}) -ELSE(LIBVA_FOUND) - MESSAGE(STATUS Looking for LIBVA (= 0.36.0) - not found) 
-ENDIF(LIBVA_FOUND) - -# libva-x11 -pkg_check_modules(LIBVA-X11 REQUIRED libva-x11=0.36.0) -IF(LIBVA-X11_FOUND) MESSAGE(STATUS Looking for LIBVA-X11 - found at ${LIBVA-X11_PREFIX} ${LIBVA-X11_VERSION}) + INCLUDE_DIRECTORIES(${LIBVA_INCLUDE_DIRS}) INCLUDE_DIRECTORIES(${LIBVA-X11_INCLUDE_DIRS}) -ELSE(LIBVA-X11_FOUND) - MESSAGE(STATUS Looking for LIBVA-X11 (= 0.36.0) - not found) -ENDIF(LIBVA-X11_FOUND) + set(V4L2_BUF_SH_DEP true) + IF(LIBVA_VERSION VERSION_LESS 0.36.0 OR LIBVA-X11_VERSION VERSION_LESS 0.36.0) +IF(LIBVA_VERSION VERSION_LESS 0.36.0) + MESSAGE(STATUS Looking for LIBVA (= 0.36.0) - not found) +ENDIF(LIBVA_VERSION VERSION_LESS 0.36.0) +IF(LIBVA-X11_VERSION VERSION_LESS 0.36.0) + MESSAGE(STATUS Looking for LIBVA-X11 (= 0.36.0) - not found) +ENDIF(LIBVA-X11_VERSION VERSION_LESS 0.36.0) +MESSAGE(STATUS Example libva_buffer_sharing will not be built) + ELSE(LIBVA_VERSION VERSION_LESS 0.36.0 OR LIBVA-X11_VERSION VERSION_LESS 0.36.0) +set(LIBVA_BUF_SH_DEP true) + ENDIF(LIBVA_VERSION VERSION_LESS 0.36.0 OR LIBVA-X11_VERSION VERSION_LESS 0.36.0) +ENDIF(LIBVA_FOUND AND LIBVA-X11_FOUND) ENDIF(BUILD_EXAMPLES) ADD_SUBDIRECTORY(include) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 904f259..ab31fe7 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,3 +1,9 @@ +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} +${CMAKE_CURRENT_SOURCE_DIR}/../utests +${CMAKE_CURRENT_SOURCE_DIR}/../include +${X11_INCLUDE_DIR}) + +IF(LIBVA_BUF_SH_DEP OR V4L2_BUF_SH_DEP) EXEC_PROGRAM(ls ARGS ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva OUTPUT_VARIABLE LS_OUTPUT) IF(NOT LS_OUTPUT) EXEC_PROGRAM(git ${CMAKE_CURRENT_SOURCE_DIR}/.. ARGS submodule init) @@ -5,17 +11,13 @@ EXEC_PROGRAM(git ${CMAKE_CURRENT_SOURCE_DIR}/.. 
ARGS submodule update) EXEC_PROGRAM(git ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva ARGS checkout master) ENDIF(NOT LS_OUTPUT) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} -${CMAKE_CURRENT_SOURCE_DIR}/../utests -${CMAKE_CURRENT_SOURCE_DIR}/../include -${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/va -${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/test/common -${X11_INCLUDE_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/va +${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/test/common) link_directories (${LIBVA_LIBDIR} ${LIBVA-X11_LIBDIR}) -set (examples_sources +set (va_ocl_basic_sources ../utests/utest_error.c ../utests/utest_assert.cpp ../utests/utest_file_map.cpp @@ -23,13 +25,20 @@ set
Re: [Beignet] [PATCH] Fix a segmentation fault.
Some of these registers are requested as scalar ir::register and retyped to uniform GenRegister. These temp registers are all used as the send's address or other payload; the send payload needs contiguous registers, so I changed these registers to scalar registers. -Original Message- From: Zhigang Gong [mailto:zhigang.g...@linux.intel.com] Sent: Wednesday, April 8, 2015 13:47 To: Yang, Rong R Cc: beignet@lists.freedesktop.org Subject: Re: [Beignet] [PATCH] Fix a segmentation fault. On Tue, Mar 31, 2015 at 04:39:03PM +0800, Yang Rong wrote: There is a segmentation fault in function isSrcDstDiffSpan, when src's hstride is not GEN_HORIZONTAL_STRIDE_0 but dst's hstride is GEN_HORIZONTAL_STRIDE_0. This is a wrong state, and the LoadInstruction using GenRegister::udxgrf with a simd width of 1 will introduce this state when dst is scalar. Use sel.selReg instead of GenRegister::udxgrf. Nice catch, but the patch will always use non-uniform temporary registers. Could you refine it to only use non-uniform registers in this segfault case? For other normal cases, they can still use simd1 mode, which has better performance. Thanks, Zhigang Gong. 
Signed-off-by: Yang Rong rong.r.y...@intel.com --- backend/src/backend/gen_insn_selection.cpp | 18 -- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 7f9c95a..058d22b 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -3069,7 +3069,7 @@ namespace gbe GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F); // get dword based address - GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); + GenRegister addrDW = sel.selReg(sel.reg(FAMILY_DWORD), + ir::TYPE_U32); sel.push(); if (sel.isScalarReg(addr.reg())) { @@ -3116,9 +3116,9 @@ namespace gbe uint8_t bti) const { using namespace ir; -Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1); -GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); -GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg); +Register tmpReg = sel.reg(FAMILY_DWORD); +GenRegister tmpAddr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); +GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32); // Get dword aligned addr sel.push(); if (simdWidth == 1) { @@ -3154,8 +3154,6 @@ namespace gbe { using namespace ir; const uint32_t valueNum = insn.getValueNum(); - const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ? 
- 1 : sel.ctx.getSimdWidth(); RegisterFamily family = getFamily(insn.getValueType()); vectorGenRegister dst(valueNum); @@ -3170,7 +3168,7 @@ namespace gbe vectorRegister tmpReg(tmpRegNum); for(uint32_t i = 0; i tmpRegNum; i++) { tmpReg[i] = sel.reg(FAMILY_DWORD); -tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, tmpReg[i]); +tmp2[i] = tmp[i] = sel.selReg(tmpReg[i], ir::TYPE_U32); } readDWord(sel, tmp, tmp2, address, tmpRegNum, bti); @@ -3254,9 +3252,9 @@ namespace gbe vectorGenRegister tmp2(effectDataNum + 1); vectorGenRegister effectData(effectDataNum); for(uint32_t i = 0; i effectDataNum + 1; i++) - tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); + tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), + ir::TYPE_U32); -GenRegister alignedAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); +GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD), + ir::TYPE_U32); sel.push(); if (simdWidth == 1) sel.curr.noMask = 1; @@ -3465,7 +3463,7 @@ namespace gbe } else { const GenRegister value = sel.selReg(insn.getValue(0)); GBE_ASSERT(insn.getValueNum() == 1); -const GenRegister tmp = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); +const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD), + ir::TYPE_U32); if (elemSize == GEN_BYTE_SCATTER_WORD) { sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW)); } else if (elemSize == GEN_BYTE_SCATTER_BYTE) { -- 1.8.3.2 ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 1/2] CHV: Add cherryview support in the runtime.
OK, good idea, I will change the default configuration and add the warning. -Original Message- From: Mcgee, Jeff Sent: Saturday, April 4, 2015 03:46 To: Zhigang Gong Cc: Yang, Rong R; beignet@lists.freedesktop.org Subject: Re: [Beignet] [PATCH 1/2] CHV: Add cherryview support in the runtime. On Fri, Apr 03, 2015 at 10:11:54AM +0800, Zhigang Gong wrote: One minor comment is, just as you mentioned, CHV requires cl_driver_update_device_info() to get the corresponding information from the libdrm layer. So we may need to do the following things: Rework the cl_driver_update_device_info() interface; one solution is to add one parameter to indicate whether it is required to get the EU/SLICE information from libdrm. If the device is CHV, we need to set that parameter to true. cl_driver_update_device_info() will return a value to indicate whether it got the information from libdrm successfully. If the device is CHV, and we get a false return value from cl_driver_update_device_info(), we can simply return a NULL device and report the error. This is much better than to keep going with possibly incorrect EU/slice information, which may hang the GPU sometime later. Thanks, Zhigang Gong. Another option would be to set the CHV minimum configuration (2 subslice, 8 EU) as the default configuration before the call to cl_driver_update_device_info. So if the kernel query for these values fails or is not available, you can continue to run assuming the minimal config. But of course it would still be a good idea to make a loud warning that the true config could not be detected. -Jeff On Mon, Mar 30, 2015 at 11:23:55AM +0800, Yang Rong wrote: Cherryview's EU configuration is not decided by the PCI ID; it must be obtained from the kernel via libdrm. Thanks to Jeff for adding this support in the kernel and libdrm. 
Signed-off-by: Yang Rong rong.r.y...@intel.com --- backend/src/backend/gen/gen_mesa_disasm.c | 8 backend/src/backend/gen_program.cpp | 7 +++ backend/src/gbe_bin_generater.cpp | 4 src/cl_device_data.h | 12 +++- src/cl_device_id.c| 29 - src/intel/intel_gpgpu.c | 5 - 6 files changed, 58 insertions(+), 7 deletions(-) diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c index 711b943..f8d89e0 100644 --- a/backend/src/backend/gen/gen_mesa_disasm.c +++ b/backend/src/backend/gen/gen_mesa_disasm.c @@ -1136,13 +1136,13 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac { int err = 0; int space = 0; - if (IS_IVYBRIDGE(deviceID)) { + if (IS_GEN7(deviceID)) { gen_version = 70; - } else if (IS_HASWELL(deviceID)) { + } else if (IS_GEN75(deviceID)) { gen_version = 75; - } else if (IS_BROADWELL(deviceID)) { + } else if (IS_GEN8(deviceID)) { gen_version = 80; - } else if (IS_SKYLAKE(deviceID)) { + } else if (IS_GEN9(deviceID)) { gen_version = 90; } diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp index f4c74f8..f53d5fb 100644 --- a/backend/src/backend/gen_program.cpp +++ b/backend/src/backend/gen_program.cpp @@ -166,6 +166,8 @@ namespace gbe { ctx = GBE_NEW(Gen75Context, unit, name, deviceID, relaxMath); } else if (IS_BROADWELL(deviceID)) { ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath); +} else if (IS_CHERRYVIEW(deviceID)) { + ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath); } else if (IS_SKYLAKE(deviceID)) { ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath); } @@ -210,6 +212,7 @@ namespace gbe { (IS_BAYTRAIL_T(typeA) !strcmp(src_hw_info, BYT)) || \ (IS_HASWELL(typeA) !strcmp(src_hw_info, HSW)) || \ (IS_BROADWELL(typeA) !strcmp(src_hw_info, BDW)) || \ + (IS_CHERRYVIEW(typeA) + !strcmp(src_hw_info, CHV)) || \ (IS_SKYLAKE(typeA) !strcmp(src_hw_info, SKL)) ) static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char 
*binary, size_t size) { @@ -316,6 +319,10 @@ namespace gbe { src_hw_info[0]='B'; src_hw_info[1]='D'; src_hw_info[2]='W'; + }else if(IS_CHERRYVIEW(prog-deviceID)){ +src_hw_info[0]='C'; +src_hw_info[1]='H'; +src_hw_info[2]='V'; }else if(IS_SKYLAKE(prog-deviceID)){ src_hw_info[0]='S'; src_hw_info[1]='K'; diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp index
[Beignet] [patch v3 2/2] add benckmark for copy data from buffer to image.
From: Luo Xionghu xionghu@intel.com v2: use random input data; update comments. v3: change the image attribute to __write_only. Signed-off-by: Luo Xionghu xionghu@intel.com --- benchmark/CMakeLists.txt | 1 + benchmark/benchmark_copy_buffer_to_image.cpp | 66 ++ .../cl_internal_copy_buffer_to_image_2d_align16.cl | 2 +- 3 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 benchmark/benchmark_copy_buffer_to_image.cpp diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 7bd61ee..3e43a21 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -15,6 +15,7 @@ set (benchmark_sources benchmark_use_host_ptr_buffer.cpp benchmark_read_buffer.cpp benchmark_read_image.cpp + benchmark_copy_buffer_to_image.cpp benchmark_copy_image_to_buffer.cpp) diff --git a/benchmark/benchmark_copy_buffer_to_image.cpp b/benchmark/benchmark_copy_buffer_to_image.cpp new file mode 100644 index 000..2177cfe --- /dev/null +++ b/benchmark/benchmark_copy_buffer_to_image.cpp @@ -0,0 +1,66 @@ +#include string.h +#include utests/utest_helper.hpp +#include sys/time.h + +#define IMAGE_BPP 2 + +double benchmark_copy_buffer_to_image(void) +{ + struct timeval start,stop; + const size_t w = 960 * 4; + const size_t h = 540 * 4; + const size_t sz = IMAGE_BPP * w * h; + cl_image_format format; + cl_image_desc desc; + + memset(desc, 0x0, sizeof(cl_image_desc)); + memset(format, 0x0, sizeof(cl_image_format)); + + // Setup image and buffer + buf_data[0] = (unsigned short*) malloc(sz); + for (uint32_t i = 0; i w*h; ++i) { +((unsigned short*)buf_data[0])[i] = (rand() 0x); + } + + format.image_channel_order = CL_R; + format.image_channel_data_type = CL_UNSIGNED_INT16; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = w; + desc.image_height = h; + desc.image_row_pitch = 0; + OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, sz, buf_data[0]); + OCL_CREATE_IMAGE(buf[1], 0, format, desc, NULL); + + /*copy buffer to image*/ + size_t origin[3] = {0, 0, 0}; + size_t 
region[3] = {w, h, 1}; + + OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region, +0, NULL, NULL); + OCL_FINISH(); + OCL_MAP_BUFFER_GTT(1); + /*check result*/ + for (uint32_t j = 0; j h; ++j) +for (uint32_t i = 0; i w; i++) +{ + OCL_ASSERT(((unsigned short*)buf_data[0])[j * w + i] == ((unsigned short*)buf_data[1])[j * w + i]); +} + OCL_UNMAP_BUFFER_GTT(1); + gettimeofday(start,0); + + for (uint32_t i=0; i100; i++) { +OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region, +0, NULL, NULL); + } + OCL_FINISH(); + + gettimeofday(stop,0); + free(buf_data[0]); + buf_data[0] = NULL; + + double elapsed = time_subtract(stop, start, 0); + + return BANDWIDTH(sz * 100, elapsed); +} + +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buffer_to_image); diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl index e4cef73..5b32cd5 100644 --- a/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl +++ b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl @@ -1,4 +1,4 @@ -kernel void __cl_copy_buffer_to_image_2d_align16(__read_only image2d_t image, global uint4* buffer, +kernel void __cl_copy_buffer_to_image_2d_align16(__write_only image2d_t image, global uint4* buffer, unsigned int region0, unsigned int region1, unsigned int region2, unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2, unsigned int src_offset) -- 1.9.1 ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Fix a segmentation fault.
On Tue, Mar 31, 2015 at 04:39:03PM +0800, Yang Rong wrote: There is a segmentation fault in function isSrcDstDiffSpan, when src's hstrde is not GEN_HORIZONTAL_STRIDE_0 but dst's hstride is GEN_HORIZONTAL_STRIDE_0. This is wrong state, and the LoadInstruction using GenRegister::udxgrf with simd is 1, will introduce this state, when dst is scalar. Use sel.selReg instead of GenRegister::udxgrf. Nice catch, but the patch will always use non-uniform temporary register. Could you refine it to only use non-uniform registers on this segfault case. For other normal cases, they can still use simd1 mode which has better performance. Thanks, Zhigang Gong. Signed-off-by: Yang Rong rong.r.y...@intel.com --- backend/src/backend/gen_insn_selection.cpp | 18 -- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 7f9c95a..058d22b 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -3069,7 +3069,7 @@ namespace gbe GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F); // get dword based address - GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); + GenRegister addrDW = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); sel.push(); if (sel.isScalarReg(addr.reg())) { @@ -3116,9 +3116,9 @@ namespace gbe uint8_t bti) const { using namespace ir; -Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1); -GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); -GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg); +Register tmpReg = sel.reg(FAMILY_DWORD); +GenRegister tmpAddr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); +GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32); // Get dword aligned addr sel.push(); if (simdWidth == 1) { @@ -3154,8 +3154,6 @@ namespace gbe { using namespace ir; const uint32_t valueNum = insn.getValueNum(); - const 
uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ? - 1 : sel.ctx.getSimdWidth(); RegisterFamily family = getFamily(insn.getValueType()); vectorGenRegister dst(valueNum); @@ -3170,7 +3168,7 @@ namespace gbe vectorRegister tmpReg(tmpRegNum); for(uint32_t i = 0; i tmpRegNum; i++) { tmpReg[i] = sel.reg(FAMILY_DWORD); -tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, tmpReg[i]); +tmp2[i] = tmp[i] = sel.selReg(tmpReg[i], ir::TYPE_U32); } readDWord(sel, tmp, tmp2, address, tmpRegNum, bti); @@ -3254,9 +3252,9 @@ namespace gbe vectorGenRegister tmp2(effectDataNum + 1); vectorGenRegister effectData(effectDataNum); for(uint32_t i = 0; i effectDataNum + 1; i++) - tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); + tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); -GenRegister alignedAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); +GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); sel.push(); if (simdWidth == 1) sel.curr.noMask = 1; @@ -3465,7 +3463,7 @@ namespace gbe } else { const GenRegister value = sel.selReg(insn.getValue(0)); GBE_ASSERT(insn.getValueNum() == 1); -const GenRegister tmp = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); +const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); if (elemSize == GEN_BYTE_SCATTER_WORD) { sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW)); } else if (elemSize == GEN_BYTE_SCATTER_BYTE) { -- 1.8.3.2 ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
[Beignet] [PATCH v3 1/2] Add example to show v4l2 buffer sharing with extension clGetMemObjectFdIntel.
This example captures yuy2 frame directly to cl buffer object by the way of dma, processed by OpenCL kernel, then convert to nv12 format and shown by libva. v2: Close cl buffer's fd by clCloseMemObjectFdIntel instead of close function. v3: Just use close function, no need of clCloseMemObjectFdIntel. Signed-off-by: Chuanbo Weng chuanbo.w...@intel.com --- CMakeLists.txt | 35 +- examples/CMakeLists.txt| 29 +- .../v4l2_buffer_sharing/v4l2_buffer_sharing.cpp| 590 + kernels/runtime_yuy2_processing.cl | 15 + 4 files changed, 645 insertions(+), 24 deletions(-) create mode 100644 examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp create mode 100644 kernels/runtime_yuy2_processing.cl diff --git a/CMakeLists.txt b/CMakeLists.txt index 5474447..4f627cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,23 +216,30 @@ IF(BUILD_EXAMPLES) IF(NOT X11_FOUND) MESSAGE(FATAL_ERROR XLib is necessary for examples - not found) ENDIF(NOT X11_FOUND) -# libva -pkg_check_modules(LIBVA REQUIRED libva=0.36.0) -IF(LIBVA_FOUND) +# libva libva-x11 +#pkg_check_modules(LIBVA REQUIRED libva=0.36.0) +pkg_check_modules(LIBVA REQUIRED libva) +pkg_check_modules(LIBVA-X11 REQUIRED libva-x11) +set(LIBVA_BUF_SH_DEP false) +set(V4L2_BUF_SH_DEP false) +IF(LIBVA_FOUND AND LIBVA-X11_FOUND) MESSAGE(STATUS Looking for LIBVA - found at ${LIBVA_PREFIX} ${LIBVA_VERSION}) - INCLUDE_DIRECTORIES(${LIBVA_INCLUDE_DIRS}) -ELSE(LIBVA_FOUND) - MESSAGE(STATUS Looking for LIBVA (= 0.36.0) - not found) -ENDIF(LIBVA_FOUND) - -# libva-x11 -pkg_check_modules(LIBVA-X11 REQUIRED libva-x11=0.36.0) -IF(LIBVA-X11_FOUND) MESSAGE(STATUS Looking for LIBVA-X11 - found at ${LIBVA-X11_PREFIX} ${LIBVA-X11_VERSION}) + INCLUDE_DIRECTORIES(${LIBVA_INCLUDE_DIRS}) INCLUDE_DIRECTORIES(${LIBVA-X11_INCLUDE_DIRS}) -ELSE(LIBVA-X11_FOUND) - MESSAGE(STATUS Looking for LIBVA-X11 (= 0.36.0) - not found) -ENDIF(LIBVA-X11_FOUND) + set(V4L2_BUF_SH_DEP true) + IF(LIBVA_VERSION VERSION_LESS 0.36.0 OR LIBVA-X11_VERSION VERSION_LESS 0.36.0) 
+IF(LIBVA_VERSION VERSION_LESS 0.36.0) + MESSAGE(STATUS Looking for LIBVA (= 0.36.0) - not found) +ENDIF(LIBVA_VERSION VERSION_LESS 0.36.0) +IF(LIBVA-X11_VERSION VERSION_LESS 0.36.0) + MESSAGE(STATUS Looking for LIBVA-X11 (= 0.36.0) - not found) +ENDIF(LIBVA-X11_VERSION VERSION_LESS 0.36.0) +MESSAGE(STATUS Example libva_buffer_sharing will not be built) + ELSE(LIBVA_VERSION VERSION_LESS 0.36.0 OR LIBVA-X11_VERSION VERSION_LESS 0.36.0) +set(LIBVA_BUF_SH_DEP true) + ENDIF(LIBVA_VERSION VERSION_LESS 0.36.0 OR LIBVA-X11_VERSION VERSION_LESS 0.36.0) +ENDIF(LIBVA_FOUND AND LIBVA-X11_FOUND) ENDIF(BUILD_EXAMPLES) ADD_SUBDIRECTORY(include) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 904f259..ab31fe7 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,3 +1,9 @@ +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} +${CMAKE_CURRENT_SOURCE_DIR}/../utests +${CMAKE_CURRENT_SOURCE_DIR}/../include +${X11_INCLUDE_DIR}) + +IF(LIBVA_BUF_SH_DEP OR V4L2_BUF_SH_DEP) EXEC_PROGRAM(ls ARGS ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva OUTPUT_VARIABLE LS_OUTPUT) IF(NOT LS_OUTPUT) EXEC_PROGRAM(git ${CMAKE_CURRENT_SOURCE_DIR}/.. ARGS submodule init) @@ -5,17 +11,13 @@ EXEC_PROGRAM(git ${CMAKE_CURRENT_SOURCE_DIR}/.. 
ARGS submodule update) EXEC_PROGRAM(git ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva ARGS checkout master) ENDIF(NOT LS_OUTPUT) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} -${CMAKE_CURRENT_SOURCE_DIR}/../utests -${CMAKE_CURRENT_SOURCE_DIR}/../include -${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/va -${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/test/common -${X11_INCLUDE_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/va +${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/test/common) link_directories (${LIBVA_LIBDIR} ${LIBVA-X11_LIBDIR}) -set (examples_sources +set (va_ocl_basic_sources ../utests/utest_error.c ../utests/utest_assert.cpp ../utests/utest_file_map.cpp @@ -23,13 +25,20 @@ set (examples_sources ./thirdparty/libva/test/common/va_display.c ./thirdparty/libva/test/common/va_display_x11.c) - ADD_DEFINITIONS(-DHAVE_VA_X11) -ADD_DEFINITIONS(-DINPUT_NV12_DEFAULT=${CMAKE_CURRENT_SOURCE_DIR}/libva_buffer_sharing/256_128.nv12) -ADD_LIBRARY(va_ocl_basic SHARED ${examples_sources}) +ADD_LIBRARY(va_ocl_basic SHARED ${va_ocl_basic_sources}) TARGET_LINK_LIBRARIES(va_ocl_basic cl m va va-x11 ${X11_X11_LIB}) +IF(LIBVA_BUF_SH_DEP) +ADD_DEFINITIONS(-DINPUT_NV12_DEFAULT=${CMAKE_CURRENT_SOURCE_DIR}/libva_buffer_sharing/256_128.nv12) ADD_EXECUTABLE(example-libva_buffer_sharing
Re: [Beignet] Haswell-ULT Support?
with a 4.0.0-rc6 kernel ...which includes that fix as standard. Has anyone else tried beignet on Haswell-ULT, and if so, did it work? ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH v2 3/3] Add document to describe the detials of v4l2 buffer sharing.
In general I agree with you. We will clean up and refine these extensions later. I've sent out a new version of this patchset and asked Yuan Feng for review. Thanks, Chuanbo Weng -Original Message- From: Zhigang Gong [mailto:zhigang.g...@linux.intel.com] Sent: Friday, April 03, 2015 9:45 To: Weng, Chuanbo Cc: beignet@lists.freedesktop.org; Yuan, Feng Subject: Re: [Beignet] [PATCH v2 3/3] Add document to describe the detials of v4l2 buffer sharing. One major comment is that we should not introduce such an extension without considering the queue. The correct APIs should be something like clEnqueueAcquireMemObjectFdIntel/clEnqueueReleaseMemObjectFdIntel, and the corresponding queue must be one of the parameters. Without that, the user has no idea how to synchronize the external application with OpenCL. Another, more generic comment for all of these internal extensions which are not going to be submitted to the standard is as below: It seems better to add some Beignet-specific suffix after Intel which indicates this is a Beignet-only extension. Let us leave the INTEL suffix as an official name for those Intel-specific extensions which have been accepted by Khronos. And for the internal libva sharing extension, as there is already an officially accepted extension, it's time to just apply that official extension rather than the current unofficial one. Thanks, Zhigang Gong. On Sat, Mar 28, 2015 at 12:34:17AM +0800, Chuanbo Weng wrote: This document includes the steps of using DMABUF buffer sharing between v4l2 and Beignet. Also steps to run the corresponding example. 
Signed-off-by: Chuanbo Weng chuanbo.w...@intel.com --- docs/Beignet.mdwn | 1 + docs/howto/v4l2-buffer-sharing-howto.mdwn | 67 +++ 2 files changed, 68 insertions(+) create mode 100644 docs/howto/v4l2-buffer-sharing-howto.mdwn diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn index aacd7d2..31c0d9a 100644 --- a/docs/Beignet.mdwn +++ b/docs/Beignet.mdwn @@ -254,6 +254,7 @@ Documents for OpenCL application developers - [[Work with old system without c++11|Beignet/howto/oldgcc-howto]] - [[Kernel Optimization Guide|Beignet/optimization-guide]] - [[Libva Buffer Sharing|Beignet/howto/libva-buffer-sharing-howto]] +- [[V4l2 Buffer Sharing|Beignet/howto/v4l2-buffer-sharing-howto]] The wiki URL is as below: [http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedes ktop.org/wiki/Software/Beignet/) diff --git a/docs/howto/v4l2-buffer-sharing-howto.mdwn b/docs/howto/v4l2-buffer-sharing-howto.mdwn new file mode 100644 index 000..d5a9b56 --- /dev/null +++ b/docs/howto/v4l2-buffer-sharing-howto.mdwn @@ -0,0 +1,67 @@ +V4l2 Buffer Sharing HowTo += + +Beignet has extensions +(clGetMemObjectFdIntel/clCloseMemObjectFdIntel) to share gpu buffer +object with v4l2. So users can utilize OpenCL to do processing on input/ouput buffers of v4l2 device without buffer copy. + +Prerequisite + + +Linux kernel supports DMABUF buffer sharing for v4l2 from version +3.8. DMABUF buffer sharing runs well for V4L2_PIX_FMT_MJPEG format on +this version, but there is a bug for V4L2_PIX_FMT_YUYV format. Linux +kernel 3.19.0-rc1 fix this bug, so please use kernel version +3.19.0-rc1 at least if you want to utilize this feature for V4L2_PIX_FMT_YUYV format. 
+ +Steps +- + +The below official v4l2 document describes the details of sharing DMA +buffers between v4l devices and other devices using v4l2 as a DMABUF importer: +[http://linuxtv.org/downloads/v4l-dvb-apis/dmabuf.html](http://linuxt +v.org/downloads/v4l-dvb-apis/dmabuf.html) +Beignet has added +extensions(clGetMemObjectFdIntel/clCloseMemObjectFdIntel) to support +this mechanism. Please follow the steps as below to utilize DMABUF buffer sharing between v4l devices and Beignet: + +- Get the address of this extension by the function: + clGetExtensionFunctionAddress(clGetMemObjectFdIntel) + and clGetExtensionFunctionAddress(clCloseMemObjectFdIntel) + +- Create a number of cl buffer objects, invoke clGetMemObjectFdIntel +to get these buffer + objects' file descriptors. + +- Initiating streaming I/O with DMABUF buffer sharing by calling the VIDIOC_REQBUFS v4l2 ioctl. + +- Enqueue these buffers by calling the VIDIOC_QBUF, dequeue a buffer +by calling VIDIOC_DQBUF, + use OpenCL to do processing on this buffer and re-enqueue... + +- Close file descriptors of these buffers by clCloseMemObjectFdIntel +if your program doesn't + need DMABUF buffer sharing anymore. + +Sample code +--- + +We have developed an example showing how to share DMA buffers between +webcam and Beignet in examples/v4l2_buffer_sharing directory. The +webcam directly captures V4L2_PIX_FMT_YUYV frames into cl buffer +objects by the way of DMABUF buffer sharing, then frames are got mirror effect by OpenCL kernel,
Re: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case.
ok. As we discussed, you'd better send out a new version:) -Original Message- From: Luo, Xionghu Sent: Friday, April 03, 2015 11:17 To: Weng, Chuanbo; beignet@lists.freedesktop.org Subject: RE: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case. I don't agree with your description. Take your CopyImageToBuffer kernel as example: Actually, the unaligned kernel also reads 16 bytes into the color, but only the color.x is useful, so 1 byte is written to the buffer; And for aligned case, read 16 bytes then write 16 bytes. So the difference is write instead of read or 16 times per work item. kernel void __cl_copy_image_2d_to_buffer( __read_only image2d_t image, global uchar* buffer, unsigned int region0, unsigned int region1, unsigned int region2, unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, unsigned int dst_offset) { int i = get_global_id(0); int j = get_global_id(1); int k = get_global_id(2); uint4 color; const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; int2 src_coord; if((i >= region0) || (j>= region1) || (k>=region2)) return; src_coord.x = src_origin0 + i; src_coord.y = src_origin1 + j; color = read_imageui(image, sampler, src_coord); dst_offset += (k * region1 + j) * region0 + i; buffer[dst_offset] = color.x; } Luo Xionghu Best Regards -Original Message- From: Weng, Chuanbo Sent: Friday, April 3, 2015 10:57 AM To: Luo, Xionghu; beignet@lists.freedesktop.org Subject: RE: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case. The key point of this optimization is that we change image_channel_order and image_channel_data_type of image internally, so 16 bytes image data can be processed one time instead of being processed 16 times per work item. So we give patch description as below: (For CopyImageToBuffer kernel) thus 16 bytes can be read by one work item. (For CopyBufferToImage kernel) thus 16 bytes can be written by one work item. 
For __read_only issue, maybe our compiler should give a warning for this. This is another topic, I suggest you can send out another mail to discuss this issue:) -Original Message- From: Luo, Xionghu Sent: Thursday, April 02, 2015 11:16 To: Weng, Chuanbo; beignet@lists.freedesktop.org Subject: RE: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case. For CopyBufferToImage kernel, "thus 16 bytes can be read by one work item" is correct from the Buffer side, data is read from the buffer then written to the image. PS: a typo in the image attribute of this kernel, this image should be __write_only instead of __read_only as it is written to. Why doesn't this attribute work as expected: the image is set to __read_only but is still writable? Luo Xionghu Best Regards -Original Message- From: Weng, Chuanbo Sent: Wednesday, April 1, 2015 4:15 PM To: Luo, Xionghu; beignet@lists.freedesktop.org Cc: Luo, Xionghu Subject: RE: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case. One warning when running git am command: warning: 1 line adds whitespace errors. And the words of the patch description "thus 16 bytes can be read by one work item" should be changed to "thus 16 bytes can be written by one work item". I think Zhigang can help to do this minor modification before pushing this patch. Other part of this patch LGTM. -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of xionghu@intel.com Sent: Wednesday, April 01, 2015 13:11 To: beignet@lists.freedesktop.org Cc: Luo, Xionghu Subject: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case. From: Luo Xionghu xionghu@intel.com We can change the image_channel_order to CL_RGBA and image_channel_data_type to CL_UNSIGNED_INT32 for some special case, thus 16 bytes can be read by one work item. Bandwidth is fully used. v2: merge patch 3 of initializing region0; remove k dimension in kernel for 2d image. 
Signed-off-by: Luo Xionghu xionghu@intel.com --- src/CMakeLists.txt | 2 +- src/cl_context.h | 1 + src/cl_mem.c | 44 ++ .../cl_internal_copy_buffer_to_image_2d_align16.cl | 18 + 4 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index da69532..4e67c71 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -51,7 +51,7 @@ cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_arra cl_internal_copy_image_2d_array_to_2d_array
Re: [Beignet] Haswell-ULT Support?
Ah yes, echo 0 > /sys/module/i915/parameters/enable_cmd_parser fixed everything. Now all the tests pass and other OpenCL-using programs work great! If I get a chance I will spend the time to try and bisect which kernel version causes issues. On 04/08/2015 11:04 AM, Zhigang Gong wrote: We tested several HSW ULT machines, and they work fine. But the kernel version is not the 4.0.0-rc6. The symptom indicates the secure batch buffer feature is not disabled. CC to Mengmeng, could you help to try this issue on a HSW ULT machine with 4.0.0-rc6 kernel? I'm afraid there may be some latest changes in the kernel which make our secure batch disable patch broken now. Thanks. -Original Message- From: Rebecca N. Palmer [mailto:rebecca_pal...@zoho.com] Sent: Wednesday, April 8, 2015 4:53 PM To: Zhigang Gong; 'Julian Simioni' Cc: beignet@lists.freedesktop.org Subject: Re: [Beignet] Haswell-ULT Support? with a 4.0.0-rc6 kernel ...which includes that fix as standard. Has anyone else tried beignet on Haswell-ULT, and if so, did it work? ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet