LGTM, pushed with slight change. Thanks.
On Mon, Jun 23, 2014 at 11:09:33PM +0800, Yongjia Zhang wrote: > Now it treats kernels with same kernel name and different build > options separately. When OCL_OUTPUT_KERNEL_PERF==1, it outputs the > time summary as before, but if OCL_OUTPUT_KERNEL_PERF==2, it will > output the time details including the kernel build options and > kernels with same kernel name but different build options will > output separately. > > v2: use strncmp and strncpy instead of strcmp and strcpy. > > Signed-off-by: Yongjia Zhang <yongjia.zh...@intel.com> > --- > src/cl_api.c | 17 +++++--- > src/performance.c | 113 > ++++++++++++++++++++++++++++++++++++++---------------- > src/performance.h | 2 +- > 3 files changed, 92 insertions(+), 40 deletions(-) > > diff --git a/src/cl_api.c b/src/cl_api.c > index 6206e10..8856f5f 100644 > --- a/src/cl_api.c > +++ b/src/cl_api.c > @@ -1681,7 +1681,7 @@ clEnqueueCopyBuffer(cl_command_queue command_queue, > } > > if(b_output_kernel_perf) > - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", > command_queue); > + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", > "\0", command_queue); > > return 0; > > @@ -1785,7 +1785,7 @@ clEnqueueCopyBufferRect(cl_command_queue > command_queue, > } > > if(b_output_kernel_perf) > - time_end(command_queue->ctx, "beignet internal kernel : > cl_mem_copy_buffer_rect", command_queue); > + time_end(command_queue->ctx, "beignet internal kernel : > cl_mem_copy_buffer_rect", "\0", command_queue); > > error: > return err; > @@ -2027,7 +2027,7 @@ clEnqueueCopyImage(cl_command_queue command_queue, > } > > if(b_output_kernel_perf) > - time_end(command_queue->ctx, "beignet internal kernel : > cl_mem_kernel_copy_image", command_queue); > + time_end(command_queue->ctx, "beignet internal kernel : > cl_mem_kernel_copy_image", "\0", command_queue); > > error: > return err; > @@ -2091,7 +2091,7 @@ clEnqueueCopyImageToBuffer(cl_command_queue > command_queue, > } > > if(b_output_kernel_perf) > - time_end(command_queue->ctx, "beignet internal kernel : > cl_mem_copy_image_to_buffer", command_queue); > + time_end(command_queue->ctx, "beignet internal kernel : > cl_mem_copy_image_to_buffer", "\0", command_queue); > > error: > return err; > @@ -2155,7 +2155,7 @@ clEnqueueCopyBufferToImage(cl_command_queue > command_queue, > } > > if(b_output_kernel_perf) > - time_end(command_queue->ctx, "beignet internal kernel : > cl_mem_copy_buffer_to_image", command_queue); > + time_end(command_queue->ctx, "beignet internal kernel : > cl_mem_copy_buffer_to_image", "\0", command_queue); > > error: > return err; > @@ -2556,7 +2556,12 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, > } > > if(b_output_kernel_perf) > - time_end(command_queue->ctx, cl_kernel_get_name(kernel), command_queue); > + { > + if(kernel->program->build_opts != NULL) > + time_end(command_queue->ctx, cl_kernel_get_name(kernel), > kernel->program->build_opts, command_queue); > + else > + time_end(command_queue->ctx, cl_kernel_get_name(kernel), "\0", > command_queue); > + } > error: > return err; > } > diff --git a/src/performance.c b/src/performance.c > index a785460..85cd481 100644 > --- a/src/performance.c > +++ b/src/performance.c > @@ -8,11 +8,12 @@ > > #define MAX_KERNEL_NAME_LENGTH 100 > #define MAX_KERNEL_EXECUTION_COUNT 100000 > - > +#define MAX_KERNEL_BUILD_OPT 1000 > typedef struct kernel_storage_node > { > char kernel_name[MAX_KERNEL_NAME_LENGTH]; > float kernel_times[MAX_KERNEL_EXECUTION_COUNT]; > + char build_option[MAX_KERNEL_BUILD_OPT]; > int current_count; > float kernel_sum_time; > struct kernel_storage_node *next; > @@ -81,11 +82,12 @@ static context_storage_node * find_context(cl_context > context) > return pre; > } > > -static kernel_storage_node * find_kernel(context_storage_node *p_context, > const char *kernel_name) > +static kernel_storage_node * find_kernel(context_storage_node *p_context, > const char *kernel_name, const char *build_opt) > { > if(NULL != prev_kernel_pointer && NULL != prev_context_pointer && > p_context == prev_context_pointer && > - !strcmp(kernel_name, prev_kernel_pointer->kernel_name)) > + !strncmp(kernel_name, prev_kernel_pointer->kernel_name, > MAX_KERNEL_NAME_LENGTH) && > + !strncmp(build_opt, prev_kernel_pointer->build_option, > MAX_KERNEL_BUILD_OPT)) > return prev_kernel_pointer; > > if(NULL == p_context) > @@ -95,30 +97,38 @@ static kernel_storage_node * > find_kernel(context_storage_node *p_context, const > { > p_context->kernels_storage = (kernel_storage_node > *)malloc(sizeof(kernel_storage_node)); > p_context->kernel_count++; > - strcpy(p_context->kernels_storage->kernel_name,kernel_name); > + strncpy(p_context->kernels_storage->kernel_name,kernel_name, > MAX_KERNEL_NAME_LENGTH); > + p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = > '\0'; > + strncpy(p_context->kernels_storage->build_option, build_opt, > MAX_KERNEL_BUILD_OPT); > + p_context->kernels_storage->build_option[MAX_KERNEL_BUILD_OPT - 1] = > '\0'; > p_context->kernels_storage->current_count = 0; > p_context->kernels_storage->kernel_sum_time = 0.0f; > p_context->kernels_storage->next = NULL; > return p_context->kernels_storage; > } > + > kernel_storage_node *pre = p_context->kernels_storage; > kernel_storage_node *cur = p_context->kernels_storage; > - while(NULL != cur && strcmp(cur->kernel_name, kernel_name)) > + while(NULL != cur && > + (strncmp(cur->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH) || > + strncmp(cur->build_option, build_opt, MAX_KERNEL_BUILD_OPT))) > { > pre = cur; > cur = cur->next; > } > if(NULL != cur) > - { > return cur; > - } > + > p_context->kernel_count++; > pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node)); > pre = pre->next; > pre->current_count = 0; > pre->kernel_sum_time = 0.0f; > pre->next = NULL; > - strcpy(pre->kernel_name, kernel_name); > + strncpy(pre->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH); > + pre->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0'; > + strncpy(pre->build_option, build_opt, MAX_KERNEL_BUILD_OPT); > + pre->build_option[MAX_KERNEL_NAME_LENGTH - 1] = '\0'; > return pre; > } > > @@ -146,6 +156,8 @@ typedef struct time_element > float kernel_sum_time; > int kernel_execute_count; > double dev; > + float kernel_times[MAX_KERNEL_EXECUTION_COUNT]; > + uint32_t time_index; > } time_element; > > static int cmp(const void *a, const void *b) > @@ -172,40 +184,62 @@ static void print_time_info() > { > printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++); > printf(" ->>>> KERNELS TIME SUMMARY <<<<-\n"); > + > kernel_storage_node *p_kernel = p_context->kernels_storage; > kernel_storage_node *p_tmp_kernel = p_kernel; > time_element *te = (time_element > *)malloc(sizeof(time_element)*p_context->kernel_count); > - int i = 0, j = 0; > + memset(te, 0, sizeof(time_element)*p_context->kernel_count); > + int i = -1, j = 0, k = 0; > while(NULL != p_tmp_kernel) > { > - te[i].kernel_execute_count = p_tmp_kernel->current_count; > - strcpy(te[i].kernel_name, p_tmp_kernel->kernel_name); > - te[i].kernel_sum_time = p_tmp_kernel->kernel_sum_time; > - float average = p_tmp_kernel->kernel_sum_time / > p_tmp_kernel->current_count; > - float sumsquare = 0.0f; > + for(k=0; k<=i; k++) > + { > + if(!strncmp(te[k].kernel_name, p_tmp_kernel->kernel_name, > MAX_KERNEL_NAME_LENGTH)) > + break; > + } > + if(k == i+1) > + { > + i++; > + k = i; > + } > + te[k].kernel_execute_count += p_tmp_kernel->current_count; > + strncpy(te[k].kernel_name, p_tmp_kernel->kernel_name, > MAX_KERNEL_NAME_LENGTH); > + te[k].kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0'; > + te[k].kernel_sum_time += p_tmp_kernel->kernel_sum_time; > for(j=0; j != p_tmp_kernel->current_count; ++j) > - sumsquare += pow((p_tmp_kernel->kernel_times[j] - average), 2.0 ); > - te[i++].dev = sqrt(sumsquare/p_tmp_kernel->current_count); > + te[k].kernel_times[te[k].time_index++] = > p_tmp_kernel->kernel_times[j]; > p_tmp_kernel = p_tmp_kernel->next; > } > + > + for(k=0; k<=i; k++) > + { > + float average = te[k].kernel_sum_time / te[k].kernel_execute_count; > + double sumsquare = 0.0; > + for(j=0; j<te[k].time_index; ++j) > + sumsquare += pow((te[k].kernel_times[j] - average), 2.0); > + te[k].dev = sqrt(sumsquare / te[k].kernel_execute_count); > + } > + > float sum_time = 0.0f; > qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp); > - for(i=0; i<p_context->kernel_count; ++i) > - sum_time += te[i].kernel_sum_time; > - for(i=0; i<p_context->kernel_count; ++i) > + for(j=0; j<=i; ++j) > + sum_time += te[j].kernel_sum_time; > + > + for(j=0; j<=i; ++j) > { > - printf(" [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f Count: %-7d > Ave(ms): %7.2f Dev: %.1lf%%] \n", > - te[i].kernel_name, > - te[i].kernel_sum_time / sum_time * 100, > - te[i].kernel_sum_time, > - te[i].kernel_execute_count, > - te[i].kernel_sum_time / te[i].kernel_execute_count, > - te[i].dev / te[i].kernel_sum_time * te[i].kernel_execute_count > * 100); > + printf(" [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f Count: %-7d > Ave(ms): %7.2f Dev: %.1lf%%]\n", > + te[j].kernel_name, > + te[j].kernel_sum_time / sum_time * 100, > + te[j].kernel_sum_time, > + te[j].kernel_execute_count, > + te[j].kernel_sum_time / te[j].kernel_execute_count, > + te[j].dev / te[j].kernel_sum_time * te[j].kernel_execute_count > * 100); > } > free(te); > printf(" Total : %.2f\n", sum_time); > if(2 != b_output_kernel_perf) > { > + printf("[------------ CONTEXT ENDS------------]\n\n"); > p_context = p_context->next; > continue; > } > @@ -214,6 +248,18 @@ static void print_time_info() > while(NULL != p_kernel) > { > printf(" [Kernel Name : %30s Time(ms): %.2f]\n", > p_kernel->kernel_name, p_kernel->kernel_sum_time); > + if(*p_kernel->build_option != '\0') > + { > + int count = 0; > + printf(" ->Build Options : "); > + while(p_kernel->build_option[count] != '\0' ) > + { > + printf("%c", p_kernel->build_option[count++]); > + if(count % 100 == 0) > + printf("\n "); > + } > + printf("\n"); > + } > for(i=0; i!=p_kernel->current_count; ++i) > printf(" Execution Round%5d : %.2f (ms)\n", i+1, > p_kernel->kernel_times[i]); > p_kernel = p_kernel->next; > @@ -225,7 +271,7 @@ static void print_time_info() > } > > > -static void insert(cl_context context, const char *kernel_name, float time) > +static void insert(cl_context context, const char *kernel_name, const char > *build_opt, float time) > { > if(!atexit_registered) > { > @@ -233,7 +279,7 @@ static void insert(cl_context context, const char > *kernel_name, float time) > atexit(print_time_info); > } > context_storage_node *p_context = find_context(context); > - kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name); > + kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name, > build_opt); > prev_context_pointer = p_context; > prev_kernel_pointer = p_kernel; > p_kernel->kernel_times[p_kernel->current_count++] = time; > @@ -241,7 +287,8 @@ static void insert(cl_context context, const char > *kernel_name, float time) > if(p_kernel->kernel_sum_time > p_context->kernel_max_time) > { > p_context->kernel_max_time = p_kernel->kernel_sum_time; > - strcpy(p_context->max_time_kernel_name, kernel_name); > + strncpy(p_context->max_time_kernel_name, kernel_name, > MAX_KERNEL_NAME_LENGTH); > + p_context->max_time_kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0'; > } > } > > @@ -253,9 +300,9 @@ static struct timeval start, end; > void initialize_env_var() > { > char *env = getenv("OCL_OUTPUT_KERNEL_PERF"); > - if(NULL == env || !strcmp(env,"0")) > + if(NULL == env || !strncmp(env,"0", 1)) > b_output_kernel_perf = 0; > - else if(!strcmp(env,"1")) > + else if(!strncmp(env,"1", 1)) > b_output_kernel_perf = 1; > else > b_output_kernel_perf = 2; > @@ -267,11 +314,11 @@ void time_start(cl_context context, const char * > kernel_name, cl_command_queue c > gettimeofday(&start, NULL); > } > > -void time_end(cl_context context, const char * kernel_name, cl_command_queue > cq) > +void time_end(cl_context context, const char * kernel_name, const char * > build_opt, cl_command_queue cq) > { > clFinish(cq); > gettimeofday(&end, NULL); > float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - > start.tv_usec)/1000.0f; > - insert(context, kernel_name, t); > + insert(context, kernel_name, build_opt, t); > pthread_mutex_unlock(&mutex); > } > diff --git a/src/performance.h b/src/performance.h > index c747743..1e75054 100644 > --- a/src/performance.h > +++ b/src/performance.h > @@ -5,7 +5,7 @@ > > extern int b_output_kernel_perf; > void time_start(cl_context context, const char * kernel_name, > cl_command_queue cq); > -void time_end(cl_context context, const char * kernel_name, cl_command_queue > cq); > +void time_end(cl_context context, const char * kernel_name, const char * > build_opt, cl_command_queue cq); > void initialize_env_var(); > > > -- > 1.8.3.2 > > > _______________________________________________ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet