On Tue, 2014-04-29 at 13:57 +0800, Zhigang Gong wrote: > Some minor comments as below: > > On Wed, Apr 23, 2014 at 04:35:25PM +0800, junyan...@inbox.com wrote: > > From: Junyan He <junyan...@linux.intel.com> > > > > We use the floatn's assigment to do the copy. > > 128 pattern size is according to double16, and because > > the double problem on our platform, we use to float16 > > to handle this. > > unaligned cases is not optimized now, just use the char > > assigment. > > > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > > --- > > src/cl_api.c | 78 ++++++++++++++++++++++++++++++++ > > src/cl_context.c | 133 > > ++++++++++++++++++++++++++++++++++++++++++------------- > > src/cl_context.h | 8 ++++ > > src/cl_enqueue.c | 1 + > > src/cl_enqueue.h | 1 + > > src/cl_event.c | 1 + > > src/cl_mem.c | 102 ++++++++++++++++++++++++++++++++++++++++++ > > src/cl_mem.h | 3 ++ > > 8 files changed, 295 insertions(+), 32 deletions(-) > > > > diff --git a/src/cl_api.c b/src/cl_api.c > > index 1543ff4..be94bcb 100644 > > --- a/src/cl_api.c > > +++ b/src/cl_api.c > > @@ -1592,6 +1592,84 @@ error: > > } > > > > cl_int > > +clEnqueueFillBuffer(cl_command_queue command_queue, > > + cl_mem buffer, > > + const void * pattern, > > + size_t pattern_size, > > + size_t offset, > > + size_t size, > > + cl_uint num_events_in_wait_list, > > + const cl_event * event_wait_list, > > + cl_event * event) > > +{ > > + cl_int err = CL_SUCCESS; > > + enqueue_data *data, no_wait_data = { 0 }; > > + static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128}; > > + int i = 0; > > + > > + CHECK_QUEUE(command_queue); > > + CHECK_MEM(buffer); > > + > > + if (command_queue->ctx != buffer->ctx) { > > + err = CL_INVALID_CONTEXT; > > + goto error; > > + } > > + > > + if (offset < 0 || offset + size > buffer->size) { > > + err = CL_INVALID_VALUE; > > + goto error; > > + } > > + > > + if (pattern == NULL) { > > + err = CL_INVALID_VALUE; > > + goto error; > > + } > > + > > + for (i = 0; i < sizeof(valid_sz)/sizeof(size_t); i++) { > coding style issue, we'd better to use "sizeof(valid_sz) / sizeof(size_t)" > rather than the above compact style. I noticed you mixed two styles in the > same patch, please fix it in the new version. OK, that needs to be refined.
> > > + if (valid_sz[i] == pattern_size) > > + break; > > + } > > + if (i == sizeof(valid_sz)/sizeof(size_t)) { > > + err = CL_INVALID_VALUE; > > + goto error; > > + } > > + > > + if (offset%pattern_size || size%pattern_size) { > > + err = CL_INVALID_VALUE; > > + goto error; > > + } > > + > > + err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, > > size); > > + if (err) { > > + goto error; > > + } > > + > > + TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, > > event, buffer->ctx); > > + > > + data = &no_wait_data; > > + data->type = EnqueueFillBuffer; > > + data->queue = command_queue; > > + > > + if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, > > + event, data, CL_COMMAND_FILL_BUFFER) == > > CL_ENQUEUE_EXECUTE_IMM) { > > + if (event && (*event)->type != CL_COMMAND_USER > > + && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { > > + cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); > > + } > > + > > + err = cl_command_queue_flush(command_queue); > > + } > > + > > + if(b_output_kernel_perf) > > + time_end(command_queue->ctx, "beignet internal kernel : > > cl_fill_buffer", command_queue); > > + > > + return 0; > > + > > + error: > > + return err; > > +} > > + > > +cl_int > > clEnqueueCopyBuffer(cl_command_queue command_queue, > > cl_mem src_buffer, > > cl_mem dst_buffer, > > diff --git a/src/cl_context.c b/src/cl_context.c > > index 8190e6a..e2dba65 100644 > > --- a/src/cl_context.c > > +++ b/src/cl_context.c > > @@ -1,4 +1,4 @@ > > -/* > > +/* > > * Copyright © 2012 Intel Corporation > > * > > * This library is free software; you can redistribute it and/or > > @@ -188,6 +188,7 @@ error: > > LOCAL void > > cl_context_delete(cl_context ctx) > > { > > + int i = 0; > > if (UNLIKELY(ctx == NULL)) > > return; > > > > @@ -195,6 +196,18 @@ cl_context_delete(cl_context ctx) > > if (atomic_dec(&ctx->ref_n) > 1) > > return; > > > > + /* delete the internal programs. */ > > + for (i = CL_ENQUEUE_COPY_BUFFER_ALIGN4; i < CL_INTERNAL_KERNEL_MAX; i++) > > { > Use i = 0 here may be better or define CL_INTERNAL_KERNEL_MIN to 0 and use > that macro > instead of using a specific enum number. Because the CL_ENQUEUE_COPY_BUFFER_ALIGN4 is not the first one, and we just handle these 4 cases, so it is hard to start from 0 or CL_INTERNAL_KERNEL_MIN > > + if (ctx->internel_kernels[i]) { > > + cl_kernel_delete(ctx->internel_kernels[i]); > > + ctx->internel_kernels[i] = NULL; > > + > > + assert(ctx->internal_prgs[i]); > > + cl_program_delete(ctx->internal_prgs[i]); > > + ctx->internal_prgs[i] = NULL; > > + } > > + } > > + > > /* All object lists should have been freed. Otherwise, the reference > > counter > > * of the context cannot be 0 > > */ > > @@ -252,47 +265,103 @@ cl_context_get_static_kernel(cl_context ctx, cl_int > > index, const char * str_kern > > { > > cl_int ret; > > if (!ctx->internal_prgs[index]) > > - { > > - size_t length = strlen(str_kernel) + 1; > > - ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, > > &str_kernel, &length, NULL); > > - > > - if (!ctx->internal_prgs[index]) > > - return NULL; > > - > > - ret = cl_program_build(ctx->internal_prgs[index], str_option); > > - if (ret != CL_SUCCESS) > > - return NULL; > > - > > - ctx->internal_prgs[index]->is_built = 1; > > - > > - ctx->internel_kernels[index] = > > cl_kernel_dup(ctx->internal_prgs[index]->ker[0]); > > - } > > + { > > + size_t length = strlen(str_kernel) + 1; > > + ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, > > &str_kernel, &length, NULL); > > + > > + if (!ctx->internal_prgs[index]) > > + return NULL; > > + > > + ret = cl_program_build(ctx->internal_prgs[index], str_option); > > + if (ret != CL_SUCCESS) > > + return NULL; > > + > > + ctx->internal_prgs[index]->is_built = 1; > > + > > + /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, > > different kernel. */ > > + if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= > > CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) { > > + int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8; > > + for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) { > > + if (index != i) { > > + assert(ctx->internal_prgs[i] == NULL); > > + assert(ctx->internel_kernels[i] == NULL); > > + cl_program_add_ref(ctx->internal_prgs[index]); > > + ctx->internal_prgs[i] = ctx->internal_prgs[index]; > > + } > > + > > + if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) { > > + ctx->internel_kernels[i] = > > cl_program_create_kernel(ctx->internal_prgs[index], > > + > > "__cl_fill_region_align8_2", NULL); > > + } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) { > > + ctx->internel_kernels[i] = > > cl_program_create_kernel(ctx->internal_prgs[index], > > + > > "__cl_fill_region_align8_4", NULL); > > + } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) { > > + ctx->internel_kernels[i] = > > cl_program_create_kernel(ctx->internal_prgs[index], > > + > > "__cl_fill_region_align8_8", NULL); > > + } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) { > > + ctx->internel_kernels[i] = > > cl_program_create_kernel(ctx->internal_prgs[index], > > + > > "__cl_fill_region_align8_16", NULL); > > + } else > > + assert(0); > > + } > > + } else { > > + ctx->internel_kernels[index] = > > cl_kernel_dup(ctx->internal_prgs[index]->ker[0]); > > + } > > + } > > > > return ctx->internel_kernels[index]; > > } > > > > cl_kernel > > cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index, > > - const char * str_kernel, size_t size, const char * > > str_option) > > + const char * str_kernel, size_t > > size, const char * str_option) > > { > > cl_int ret; > > cl_int binary_status = CL_SUCCESS; > > if (!ctx->internal_prgs[index]) > > - { > > - ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, > > &ctx->device, > > - &size, (const unsigned char **)&str_kernel, &binary_status, &ret); > > - > > - if (!ctx->internal_prgs[index]) > > - return NULL; > > - > > - ret = cl_program_build(ctx->internal_prgs[index], str_option); > > - if (ret != CL_SUCCESS) > > - return NULL; > > - > > - ctx->internal_prgs[index]->is_built = 1; > > - > > - ctx->internel_kernels[index] = > > cl_kernel_dup(ctx->internal_prgs[index]->ker[0]); > > - } > > + { > > + ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, > > &ctx->device, > > + &size, > > (const unsigned char **)&str_kernel, &binary_status, &ret); > > + > > + if (!ctx->internal_prgs[index]) > > + return NULL; > > + > > + ret = cl_program_build(ctx->internal_prgs[index], str_option); > > + if (ret != CL_SUCCESS) > > + return NULL; > > + > > + ctx->internal_prgs[index]->is_built = 1; > > + > > + /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, > > different kernel. */ > > + if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= > > CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) { > > + int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8; > > + for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) { > > + if (index != i) { > > + assert(ctx->internal_prgs[i] == NULL); > > + assert(ctx->internel_kernels[i] == NULL); > > + cl_program_add_ref(ctx->internal_prgs[index]); > > + ctx->internal_prgs[i] = ctx->internal_prgs[index]; > > + } > > + > > + if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) { > > + ctx->internel_kernels[i] = > > cl_program_create_kernel(ctx->internal_prgs[index], > > + > > "__cl_fill_region_align8_2", NULL); > > + } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) { > > + ctx->internel_kernels[i] = > > cl_program_create_kernel(ctx->internal_prgs[index], > > + > > "__cl_fill_region_align8_4", NULL); > > + } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) { > > + ctx->internel_kernels[i] = > > cl_program_create_kernel(ctx->internal_prgs[index], > > + > > "__cl_fill_region_align8_8", NULL); > > + } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) { > > + ctx->internel_kernels[i] = > > cl_program_create_kernel(ctx->internal_prgs[index], > > + > > "__cl_fill_region_align8_16", NULL); > > + } else > > + assert(0); > > + } > > + } else { > > + ctx->internel_kernels[index] = > > cl_kernel_dup(ctx->internal_prgs[index]->ker[0]); > > + } > > + } > > > > return ctx->internel_kernels[index]; > > } > > diff --git a/src/cl_context.h b/src/cl_context.h > > index 782a9af..6eb6c9e 100644 > > --- a/src/cl_context.h > > +++ b/src/cl_context.h > > @@ -54,6 +54,14 @@ enum _cl_internal_ker_type { > > CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1, //copy image 3d tobuffer > > CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0, //copy buffer to image 2d > > CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1, //copy buffer to image 3d > > + CL_ENQUEUE_FILL_BUFFER_UNALIGN, //fill buffer with 1 aligne > > pattern, pattern size=1 > > + CL_ENQUEUE_FILL_BUFFER_ALIGN2, //fill buffer with 2 aligne > > pattern, pattern size=2 > > + CL_ENQUEUE_FILL_BUFFER_ALIGN4, //fill buffer with 4 aligne > > pattern, pattern size=4 > > + CL_ENQUEUE_FILL_BUFFER_ALIGN8_8, //fill buffer with 8 aligne > > pattern, pattern size=8 > > + CL_ENQUEUE_FILL_BUFFER_ALIGN8_16, //fill buffer with 16 aligne > > pattern, pattern size=16 > > + CL_ENQUEUE_FILL_BUFFER_ALIGN8_32, //fill buffer with 16 aligne > > pattern, pattern size=32 > > + CL_ENQUEUE_FILL_BUFFER_ALIGN8_64, //fill buffer with 16 aligne > > pattern, pattern size=64 > > + CL_ENQUEUE_FILL_BUFFER_ALIGN128, //fill buffer with 128 aligne > > pattern, pattern size=128 > > CL_INTERNAL_KERNEL_MAX > > }; > > > > diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c > > index 330d230..d44fa66 100644 > > --- a/src/cl_enqueue.c > > +++ b/src/cl_enqueue.c > > @@ -412,6 +412,7 @@ cl_int cl_enqueue_handle(cl_event event, enqueue_data* > > data) > > case EnqueueCopyBufferToImage: > > case EnqueueCopyImageToBuffer: > > case EnqueueNDRangeKernel: > > + case EnqueueFillBuffer: > > cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr); > > return CL_SUCCESS; > > case EnqueueNativeKernel: > > diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h > > index 1d3ae5f..cb612eb 100644 > > --- a/src/cl_enqueue.h > > +++ b/src/cl_enqueue.h > > @@ -41,6 +41,7 @@ typedef enum { > > EnqueueNDRangeKernel, > > EnqueueNativeKernel, > > EnqueueMarker, > > + EnqueueFillBuffer, > > EnqueueInvalid > > } enqueue_type; > > > > diff --git a/src/cl_event.c b/src/cl_event.c > > index 727ee1f..ae5bfaf 100644 > > --- a/src/cl_event.c > > +++ b/src/cl_event.c > > @@ -33,6 +33,7 @@ cl_event_is_gpu_command_type(cl_command_type type) > > { > > switch(type) { > > case CL_COMMAND_COPY_BUFFER: > > + case CL_COMMAND_FILL_BUFFER: > > case CL_COMMAND_COPY_IMAGE: > > case CL_COMMAND_COPY_IMAGE_TO_BUFFER: > > case CL_COMMAND_COPY_BUFFER_TO_IMAGE: > > diff --git a/src/cl_mem.c b/src/cl_mem.c > > index 44482f7..0f7dc23 100644 > > --- a/src/cl_mem.c > > +++ b/src/cl_mem.c > > @@ -902,6 +902,108 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, > > cl_mem dst_buf, > > } > > > > LOCAL cl_int > > +cl_mem_fill(cl_command_queue queue, const void * pattern, size_t > > pattern_size, > > + cl_mem buffer, size_t offset, size_t size) > > +{ > > + cl_int ret = CL_SUCCESS; > > + cl_kernel ker = NULL; > > + size_t global_off[] = {0,0,0}; > > + size_t global_sz[] = {1,1,1}; > > + size_t local_sz[] = {1,1,1}; > > + char pattern_comb[4]; > > + int is_128 = 0; > > + const void * pattern1 = NULL; > > + > > + assert(offset%pattern_size == 0); > > + assert(size%pattern_size == 0); > > + > > + if (!size) > > + return ret; > > + > > + if (pattern_size == 128) { > > + /* 128 is according to pattern of double16, but double works not very > > + well on some platform. We use two float16 to handle this. */ > > + extern char cl_internal_fill_buf_align128_str[]; > > + extern int cl_internal_fill_buf_align128_str_size; > > + > > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > > CL_ENQUEUE_FILL_BUFFER_ALIGN128, > > + cl_internal_fill_buf_align128_str, > > (size_t)cl_internal_fill_buf_align128_str_size, NULL); > > + is_128 = 1; > > + pattern_size = pattern_size/2; > > + pattern1 = pattern + pattern_size; > > + size = size/2; > > + } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. > > */ > > + extern char cl_internal_fill_buf_align8_str[]; > > + extern int cl_internal_fill_buf_align8_str_size; > > + int order = ffs(pattern_size/8) - 1; > > + > > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > > CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order, > > + cl_internal_fill_buf_align8_str, > > (size_t)cl_internal_fill_buf_align8_str_size, NULL); > > + } else if (pattern_size == 4) { /* Handle the 8 16 32 64 cases here. */ > incorrect comments here? Really a mistake:( > > + extern char cl_internal_fill_buf_align4_str[]; > > + extern int cl_internal_fill_buf_align4_str_size; > > + > > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > > CL_ENQUEUE_FILL_BUFFER_ALIGN4, > > + cl_internal_fill_buf_align4_str, > > (size_t)cl_internal_fill_buf_align4_str_size, NULL); > Is it a typo for the function name? cl_context_get_static_kernel_from_bin ? > And IMO, the API design is better to change to just pass in a kernel name > string as key. And the binary > buffer and the binary size should be keep as privately internal data. > considering the following prototype. > > cl_context_get_static_kernel_from_bin(queue->ctx, > "cl_internal_fill_buf_align4_str", NULL); > > The index number and the cl_internal_fill_buf_align4_str and > cl_internal_fill_buf_align4_str_size > should be managed internally. > > The last comment is that we could leverage vector load/store as much as > possible here. > > No matter the pattern size is, for example, if the pattern size is 1 char, > and the size is multiple of 128, > and the offset is 64 byte align. Then we can easily promote it to use a int2 > load/store. If it is multiple > of 256, then we can use int4 load/store. > > What's your opinion? The function names really seems a bit strange, maybe cl_context_get_internal_kernel_from_bin? Maybe I can change the cl_internal_fill_buf_align4_str[] and cl_internal_fill_buf_align4_str_size ..., etc, into the cl_context_get_static_kernel_from_bin and just call this function with an enum parameter. I think this can make the code neat here. About the optimization, I think it is the next step. I will consider your purpose and I will even to fix the unaligned cases the same way as the copy buffer if needed. I prefer that you can push this patch suite after I correct the small problems. I think I should send another patch for cl_context_get_static_kernel_from_bin function and its implementation. Because many other places use it. > _______________________________________________ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet