Re: [Beignet] [PATCH 1/3] Runtime: fix a recurrent release context error.
This patchset is OK for me. On Tue, Jun 20, 2017 at 07:07:45PM +0800, Yang Rong wrote: > Date: Tue, 20 Jun 2017 19:07:45 +0800 > From: Yang Rong> To: beignet@lists.freedesktop.org > Cc: Yang Rong > Subject: [Beignet] [PATCH 1/3] Runtime: fix a recurrent release context > error. > X-Mailer: git-send-email 2.1.4 > > Before release internal resources, must set them to null, otherwize, > when delete these resources, will call release context again. > The ctx->built_in_prgs should be release by application. > > Signed-off-by: Yang Rong > --- > src/cl_context.c | 18 -- > 1 file changed, 8 insertions(+), 10 deletions(-) > > diff --git a/src/cl_context.c b/src/cl_context.c > index c5f3678..f3dd421 100644 > --- a/src/cl_context.c > +++ b/src/cl_context.c > @@ -366,9 +366,6 @@ cl_context_delete(cl_context ctx) >++internal_ctx_refs; >} > > - if (ctx->built_in_prgs) > -++internal_ctx_refs; > - >if (ctx->image_queue) > ++internal_ctx_refs; > > @@ -382,30 +379,31 @@ cl_context_delete(cl_context ctx) >CL_OBJECT_INC_REF(ctx); > >if (ctx->image_queue) { > -clReleaseCommandQueue(ctx->image_queue); > +cl_command_queue q = ctx->image_queue; > ctx->image_queue = NULL; > +clReleaseCommandQueue(q); >} > >/* delete the internal programs. 
*/ >for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) { > if (ctx->internal_kernels[i]) { > - cl_kernel_delete(ctx->internal_kernels[i]); > + cl_kernel k = ctx->internal_kernels[i]; >ctx->internal_kernels[i] = NULL; > + cl_kernel_delete(k); > >assert(ctx->internal_prgs[i]); > - cl_program_delete(ctx->internal_prgs[i]); > + cl_program p = ctx->internal_prgs[i]; >ctx->internal_prgs[i] = NULL; > + cl_program_delete(p); > } > > if (ctx->built_in_kernels[i]) { > - cl_kernel_delete(ctx->built_in_kernels[i]); > + cl_kernel k = ctx->built_in_kernels[i]; >ctx->built_in_kernels[i] = NULL; > + cl_kernel_delete(k); > } >} > > - cl_program_delete(ctx->built_in_prgs); > - ctx->built_in_prgs = NULL; > - >CL_OBJECT_DEC_REF(ctx); > >cl_free(ctx->prop_user); > -- > 2.1.4 > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH newRT] Wrap all memory allocate functions.
I have already tried glibc's hsearch and tsearch. tsearch uses a binary tree, but you can only have one search tree within one program, which is unacceptable. hsearch uses a hash table, but you can only insert elements and cannot delete them. For our case, the pointer address is the key and there are hundreds of thousands of them, so this is also unacceptable. On Thu, Mar 30, 2017 at 07:35:20AM +, Yang, Rong R wrote: > Date: Thu, 30 Mar 2017 07:35:20 + > From: "Yang, Rong R" <rong.r.y...@intel.com> > To: "junyan...@inbox.com" <junyan...@inbox.com>, > "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org> > Cc: "He, Junyan" <junyan...@intel.com> > Subject: Re: [Beignet] [PATCH newRT] Wrap all memory allocate functions. > > Actually, you implement a hash table with insert/delete operations, does > linux has these apis? > > > -Original Message- > > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of > > junyan...@inbox.com > > Sent: Thursday, March 23, 2017 15:46 > > To: beignet@lists.freedesktop.org > > Cc: He, Junyan <junyan...@intel.com> > > Subject: [Beignet] [PATCH newRT] Wrap all memory allocate functions. > > > > From: Junyan He <junyan...@intel.com> > > > > We modify all memory allocated functions in cl_alloc file, make it > > easy to debug all the memory leak point. 
> > > > Signed-off-by: Junyan He <junyan...@intel.com> > > --- > > src/cl_accelerator_intel.c | 4 +- > > src/cl_alloc.c | 197 > > ++-- > > - > > src/cl_alloc.h | 43 +++-- > > src/cl_api.c | 3 +- > > src/cl_api_context.c | 4 +- > > src/cl_api_kernel.c| 12 +-- > > src/cl_command_queue.c | 12 +-- > > src/cl_command_queue_enqueue.c | 6 +- > > src/cl_command_queue_gen7.c| 2 +- > > src/cl_context.c | 14 +-- > > src/cl_device_enqueue.c| 2 +- > > src/cl_enqueue.c | 6 +- > > src/cl_event.c | 20 ++--- > > src/cl_kernel.c| 30 +++ > > src/cl_mem.c | 28 +++--- > > src/cl_program.c | 54 +-- > > src/cl_sampler.c | 4 +- > > src/cl_utils.h | 3 - > > src/gen/cl_command_queue_gen.c | 12 +-- > > src/gen/cl_kernel_gen.c| 28 +++--- > > src/gen/cl_program_gen.c | 12 +-- > > src/intel/intel_batchbuffer.c | 4 +- > > src/intel/intel_driver.c | 8 +- > > src/intel/intel_gpgpu.c| 18 ++-- > > src/x11/dricommon.c| 6 +- > > 25 files changed, 342 insertions(+), 190 deletions(-) > > > > diff --git a/src/cl_accelerator_intel.c b/src/cl_accelerator_intel.c > > index ae08184..62700b2 100644 > > --- a/src/cl_accelerator_intel.c > > +++ b/src/cl_accelerator_intel.c > > @@ -18,7 +18,7 @@ cl_accelerator_intel_new(cl_context ctx, > >cl_int err = CL_SUCCESS; > > > >/* Allocate and inialize the structure itself */ > > - TRY_ALLOC(accel, CALLOC(struct _cl_accelerator_intel)); > > + TRY_ALLOC(accel, CL_CALLOC(1, sizeof(struct _cl_accelerator_intel))); > >CL_OBJECT_INIT_BASE(accel, CL_OBJECT_ACCELERATOR_INTEL_MAGIC); > > > >if (accel_type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL) { > > @@ -81,5 +81,5 @@ cl_accelerator_intel_delete(cl_accelerator_intel accel) > > > >cl_context_delete(accel->ctx); > >CL_OBJECT_DESTROY_BASE(accel); > > - cl_free(accel); > > + CL_FREE(accel); > > } > > diff --git a/src/cl_alloc.c b/src/cl_alloc.c > > index e532569..b9ac853 100644 > > --- a/src/cl_alloc.c > > +++ b/src/cl_alloc.c > > @@ -1,4 +1,4 @@ > > -/* > > +/* > > * Copyright © 2012 Intel Corporation > > * > > * This 
library is free software; you can redistribute it and/or > > @@ -14,75 +14,204 @@ > > * You should have received a copy of the GNU Lesser General Public > > * License along with this library. If not, see > > <http://www.gnu.org/licenses/>. > > * > > - * Author: Benjamin Segovia <benjamin.sego...@intel.com> > > */ > > - > > #include "cl_alloc.h" > > #include "cl_utils.h" > > - > > +#include "cl_device_id.h" > > #include > > #include > > #include > > +#include > > +#include > > + > > +#ifdef CL_ALLO
Re: [Beignet] [PATCH] Typo in error message
Thanks for fixing it. On Mon, Jan 30, 2017 at 03:18:09PM +0100, Giuseppe Bilotta wrote: > Date: Mon, 30 Jan 2017 15:18:09 +0100 > From: Giuseppe Bilotta> To: Beignet ML > Cc: Giuseppe Bilotta > Subject: [Beignet] [PATCH] Typo in error message > X-Mailer: git-send-email 2.11.0.745.g0978fb64a4 > > --- > src/cl_event.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/src/cl_event.c b/src/cl_event.c > index 3e1dc224..a2b16be4 100644 > --- a/src/cl_event.c > +++ b/src/cl_event.c > @@ -579,7 +579,7 @@ cl_event_exec(cl_event event, cl_int exec_to_status, > cl_bool ignore_depends) > > if (ret != CL_SUCCESS) { >assert(ret < 0); > - DEBUGP(DL_WARNING, "Exec event %p error, type is %d, error staus is > %d", > + DEBUGP(DL_WARNING, "Exec event %p error, type is %d, error status is > %d", > event, event->event_type, ret); >ret = cl_event_set_status(event, ret); >assert(ret == CL_SUCCESS); > -- > 2.11.0.745.g0978fb64a4 > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Runtime: return CL_INVALID_EVENT_WAIT_LIST if not event in the wait list.
Thanks for catching that bug. On Wed, Dec 28, 2016 at 06:47:01PM +0800, Yang Rong wrote: > Date: Wed, 28 Dec 2016 18:47:01 +0800 > From: Yang Rong> To: beignet@lists.freedesktop.org > Cc: Meng Mengmeng , Yang Rong > > Subject: [Beignet] [PATCH] Runtime: return CL_INVALID_EVENT_WAIT_LIST if > not event in the wait list. > X-Mailer: git-send-email 2.1.4 > > From: Meng Mengmeng > > Signed-off-by: Yang Rong > --- > src/cl_event.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/src/cl_event.c b/src/cl_event.c > index 8173578..644a21f 100644 > --- a/src/cl_event.c > +++ b/src/cl_event.c > @@ -546,7 +546,7 @@ cl_event_check_waitlist(cl_uint num_events_in_wait_list, > const cl_event *event_w > /* check the event and context */ > for (i = 0; i < num_events_in_wait_list; i++) { >if (!CL_OBJECT_IS_EVENT(event_wait_list[i])) { > -err = CL_INVALID_EVENT; > +err = CL_INVALID_EVENT_WAIT_LIST; > break; >} > > -- > 2.1.4 > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Refine mem.h and improve the related macro defination.
According to my understanding, PIPE is also a mem object. clGetMemObjectInfo can be used to query all kinds of mem objects. clGetPipeInfo just queries additional PIPE info. According to the CL spec, querying CL_MEM_HOST_PTR returns the host ptr if the image/buffer was created with CL_MEM_USE_HOST_PTR; otherwise, it returns NULL. So, for a PIPE, I think it just returns NULL. Is that correct? On Tue, Dec 20, 2016 at 09:05:41AM +0100, Simon Richter wrote: > Date: Tue, 20 Dec 2016 09:05:41 +0100 > From: Simon Richter <simon.rich...@hogyros.de> > To: beignet@lists.freedesktop.org > Subject: Re: [Beignet] [PATCH] Refine mem.h and improve the related macro > defination. > > Hi, > > On 20.12.2016 04:23, He Junyan wrote: > > >>> +if (!CL_OBJECT_IS_BUFFER(memobj)) { > > >> That would match pipes as well, is that intended? > > > Already redefine CL_OBJECT_IS_BUFFER to just match buffer and subbuffer. > > > +#define CL_OBJECT_IS_BUFFER(mem) ((mem && > >\ > > + ((cl_base_object)mem)->magic == > > CL_OBJECT_MEM_MAGIC && \ > > + CL_OBJECT_GET_REF(mem) >= 1 && > >\ > > + ((cl_mem)mem)->type <= > > CL_MEM_SUBBUFFER_TYPE)) > > Exactly my point. A pipe object would not match CL_OBJECT_IS_BUFFER(), > so the negated test would return true, and the code would look at > memobj->host_ptr in a pipe object, which is wrong. > >Simon > > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH V4] Add profiling feature based on new event implementation.
I notice this should be caused by the event timestamp recording sequence. We submit the NDRange and then record the queued timestamp, which is wrong. I have already sent another patch, "Improve event execute function", to fix this. You can try it by applying that patch on top of this one. Thanks. On Thu, Dec 22, 2016 at 06:41:49AM +, Pan, Xiuli wrote: > Date: Thu, 22 Dec 2016 06:41:49 + > From: "Pan, Xiuli" <xiuli@intel.com> > To: "junyan...@inbox.com" <junyan...@inbox.com>, > "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org> > Subject: Re: [Beignet] [PATCH V4] Add profiling feature based on new event > implementation. > > It seems still have bugs. > Here are some logs I got, the gen timestamps is print in the function > cl_event_update_timestamp_gen, and the final result is print last as > timestamp. > > gen timestamp[0] is d88bddb30 > gen timestamp[1] is d88bde2b0 > run for 8 times > gen timestamp[2] is d88bddae0 // It is smaller than timestamp[0] we get some > negative value > gen timestamp[3] is d8f002390 > timestamp[2] is ffaf > timestamp[3] is 642485f > gen timestamp[0] is d8f03fab0 > gen timestamp[1] is d8f0400f0 > run for 9 times > gen timestamp[2] is d8f03fd30 > gen timestamp[3] is d954687d0 > timestamp[2] is 27f > timestamp[3] is 6428d1f > gen timestamp[0] is d954a9d20 > gen timestamp[1] is d954aa450 > run for 10 times > gen timestamp[2] is d954a9d20 //It is the same as timestamp[0] we get -1 > gen timestamp[3] is d9b8df420 > timestamp[2] is > > > The overflow handler seems to have some problems. > > -Original Message- > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of > junyan...@inbox.com > Sent: Monday, December 19, 2016 7:24 PM > To: beignet@lists.freedesktop.org > Subject: [Beignet] [PATCH V4] Add profiling feature based on new event > implementation. > > From: Junyan He <junyan...@intel.com> > > TODO: > In opencl 2.0, a new profiling item called CL_PROFILING_COMMAND_COMPLETE is > imported. 
It means that we need to record the time stamp of all the child > events created by the "Kernel enqueing kernels" feature finish. > This should be done after the "Kernel enqueing kernels" feature enabled. > > V2: > Update event time stamp before inserting to queue thread, avoid MT issue. > > V3: > Fixup overflow problem. > > V4: > Fixup overflow to 0xf problem. > Just take ownership and release event lock when call the update timestamp > function. The update timestamp function may have block system call can should > not hold the lock to call it. > > Signed-off-by: Junyan He <junyan...@intel.com> > --- > src/cl_api.c | 51 > src/cl_api_event.c | 41 + > src/cl_api_mem.c | 9 +++ > src/cl_base_object.c | 29 ++--- > src/cl_base_object.h | 10 ++-- > src/cl_command_queue_enqueue.c | 2 + > src/cl_driver.h| 4 +- > src/cl_enqueue.c | 9 --- > src/cl_event.c | 132 > - > src/cl_event.h | 10 ++-- > src/intel/intel_gpgpu.c| 16 +++-- > 11 files changed, 195 insertions(+), 118 deletions(-) > > diff --git a/src/cl_api.c b/src/cl_api.c index d7b5434..6a4f4ec 100644 > --- a/src/cl_api.c > +++ b/src/cl_api.c > @@ -1312,57 +1312,6 @@ error: >return err; > } > > - > -cl_int > -clGetEventProfilingInfo(cl_event event, > -cl_profiling_infoparam_name, > -size_t param_value_size, > -void * param_value, > -size_t * param_value_size_ret) > -{ > - cl_int err = CL_SUCCESS; > - cl_ulong ret_val; > - > - CHECK_EVENT(event); > - //cl_event_update_status(event, 0); > - > - if (event->event_type == CL_COMMAND_USER || > - !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) || > - event->status != CL_COMPLETE) { > -err = CL_PROFILING_INFO_NOT_AVAILABLE; > -goto error; > - } > - > - if (param_value && param_value_size < sizeof(cl_ulong)) { > -err = CL_INVALID_VALUE; > -goto error; > - } > - > - if (param_name == CL_PROFILING_COMMAND_QUEUED) { > -ret_val = event->queued_timestamp; > - } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) { > -ret_val= event->queued_timestamp + > cl_event_get_timestam
Re: [Beignet] [PATCH] Refine mem.h and improve the related macro defination.
On Mon, Dec 19, 2016 at 06:25:26PM +0100, Simon Richter wrote: > Date: Mon, 19 Dec 2016 18:25:26 +0100 > From: Simon Richter> To: beignet@lists.freedesktop.org > Subject: Re: [Beignet] [PATCH] Refine mem.h and improve the related macro > defination. > > Hi, > > On 19.12.2016 10:21, junyan...@inbox.com wrote: > > > --- a/src/cl_api_mem.c > > +++ b/src/cl_api_mem.c > > @@ -71,54 +71,54 @@ clGetMemObjectInfo(cl_mem memobj, > > break; > >case CL_MEM_HOST_PTR: { > > ptr = 0; > > -if (memobj->type == CL_MEM_IMAGE_TYPE) { > > +if (!CL_OBJECT_IS_BUFFER(memobj)) { > >ptr = (size_t)memobj->host_ptr; > > } else { > > That would match pipes as well, is that intended? > >Simon > Already redefine CL_OBJECT_IS_BUFFER to just match buffer and subbuffer. +#define CL_OBJECT_IS_BUFFER(mem) ((mem && \ + ((cl_base_object)mem)->magic == CL_OBJECT_MEM_MAGIC && \ + CL_OBJECT_GET_REF(mem) >= 1 && \ + ((cl_mem)mem)->type <= CL_MEM_SUBBUFFER_TYPE)) > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 01/11] Runtime: Add CL base object for all cl objects.
The whole patch set should be pushed to dev/runtime first. When it is stable, we can merge it back. On Tue, Jul 19, 2016 at 07:25:47PM +0800, junyan...@inbox.com wrote: > Date: Tue, 19 Jul 2016 19:25:47 +0800 > From: junyan...@inbox.com > To: beignet@lists.freedesktop.org > Subject: [Beignet] [PATCH 01/11] Runtime: Add CL base object for all cl > objects. > X-Mailer: git-send-email 1.7.9.5 > > From: Junyan He <junyan...@intel.com> > > The runtime code is a little verbose in CL object handle. > Every CL objects should have a reference, a lock to protect itself > and an ICD dispatcher. We can organize them to a struct and place > it at the beginning of each CL object. > This base object is also used to protect the CL objects MT safe. > CL_OBJECT_LOCK/CL_OBJECT_UNLOCK macro will lock/unlock objects, > but we should use them within one function call, and the critical > region should be short. > We add CL_OBJECT_TAKE_OWNERSHIP/CL_OBJECT_RELEASE_OWNERSHIP macro > to own the object for a long time. CL_OBJECT_TAKE_OWNERSHIP will > not hold the lock and so will not cause deadlock problems. > For example, when we call NDRange on some memobj, we should take > the ownship of the memobj. If another thread call NDRange on the > same memobj, we should return some error like CL_OUT_OF_RESOURCE > to users and protect the memobj from accessing simultaneously. 
> > Signed-off-by: Junyan He <junyan...@intel.com> > --- > src/CMakeLists.txt |1 + > src/cl_base_object.c | 102 > ++ > src/cl_base_object.h | 77 + > 3 files changed, 180 insertions(+) > create mode 100644 src/cl_base_object.c > create mode 100644 src/cl_base_object.h > > diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt > index a002865..cec7cfc 100644 > --- a/src/CMakeLists.txt > +++ b/src/CMakeLists.txt > @@ -65,6 +65,7 @@ MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" > "${BUILT_IN_NAME}") > > set(OPENCL_SRC > ${KERNEL_STR_FILES} > +cl_base_object.c > cl_api.c > cl_alloc.c > cl_kernel.c > diff --git a/src/cl_base_object.c b/src/cl_base_object.c > new file mode 100644 > index 000..4661977 > --- /dev/null > +++ b/src/cl_base_object.c > @@ -0,0 +1,102 @@ > +/* > + * Copyright © 2012 Intel Corporation > + * > + * This library is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * This library is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this library. If not, see > <http://www.gnu.org/licenses/>. 
> + * > + */ > +#include > +#include "cl_base_object.h" > + > +static pthread_t invalid_thread_id = -1; > + > +LOCAL void > +cl_object_init_base(cl_base_object obj, cl_ulong magic) > +{ > + obj->magic = magic; > + obj->ref = 1; > + SET_ICD(obj->dispatch); > + pthread_mutex_init(>mutex, NULL); > + pthread_cond_init(>cond, NULL); > + obj->owner = invalid_thread_id; > +} > + > +LOCAL void > +cl_object_destroy_base(cl_base_object obj) > +{ > + int ref = CL_OBJECT_GET_REF(obj); > + if (ref != 0) { > +DEBUGP(DL_ERROR, "CL object %p, call destroy with a reference %d", obj, > + ref); > +assert(0); > + } > + > + if (!CL_OBJECT_IS_VALID(obj)) { > +DEBUGP(DL_ERROR, > + "CL object %p, call destroy while it is already a dead object", > obj); > +assert(0); > + } > + > + if (obj->owner != invalid_thread_id) { > +DEBUGP(DL_ERROR, "CL object %p, call destroy while still has a owener > %d", > + obj, (int)obj->owner); > +assert(0); > + } > + > + obj->magic = CL_OBJECT_INVALID_MAGIC; > + pthread_mutex_destroy(>mutex); > + pthread_cond_destroy(>cond); > +} > + > +LOCAL cl_int > +cl_object_take_ownership(cl_base_object obj, cl_int wait) > +{ > + pthread_t self; > + > + assert(CL_OBJECT_IS_VALID(obj)); > + > + self = pthread_self(); > + > + pthread_mutex_lock(>mutex); > + if (pthread_equal(obj->owner, invalid_thread_id)) { > +obj->owner = self; > +pthread_mutex_
Re: [Beignet] [PATCH] Runtime: Add CL base object for all cl objects.
On Fri, Jul 15, 2016 at 11:50:06AM +0200, Simon Richter wrote: > Date: Fri, 15 Jul 2016 11:50:06 +0200 > From: Simon Richter> To: beignet@lists.freedesktop.org > Subject: Re: [Beignet] [PATCH] Runtime: Add CL base object for all cl > objects. > > Hi, > > On 14.07.2016 10:15, junyan...@inbox.com wrote: > > > The runtime code is a little verbose in CL object handle. > > Every CL objects should have a reference, a lock to protect itself > > and an ICD dispatcher. We can organize them to a struct and place > > it at the beginning of each CL object. > > Does that mean that only a single call to DEFINE_ICD() and SET_ICD() > remains? If so, can/should these be inlined? Really it is, it's useless to define a Macro. Thanks. > >Simon > > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/beignet GET FREE 5GB EMAIL - Check out spam free email with many cool features! Visit http://www.inbox.com/email to find out more! ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 0/5] Add support for kernel debugging
Hi Mircea, Thanks a lot for your contribution. I have several questions here. According to my understanding, your debug feature strongly depends on the libigfxdbgxchg64.so lib, which I think is a binary and I can not find it in your patchset. I notice you get a SIP of some system routine and allocate a buffer object to copy that system routine. How the user can control the execution of the GPU program is unclear from this patchset. So is it possible to open the source code of libigfxdbgxchg64.so? Or can you at least provide that lib and a document about how to use it? Thanks On Fri, Jul 08, 2016 at 02:39:34PM +0200, Mircea Gherzan wrote: > Date: Fri, 8 Jul 2016 14:39:34 +0200 > From: Mircea Gherzan> To: beignet@lists.freedesktop.org > Cc: Mircea Gherzan , fabian.schn...@intel.com > Subject: [Beignet] [PATCH 0/5] Add support for kernel debugging > X-Mailer: git-send-email 1.7.0.7 > > This patch series enables debugging OpenCL Beignet shaders with the GDB port > for Intel(R) GPUs. > > Enabling debugging in the Beignet codebase involves: > * checking if the debugger is present > * setting a breakpoint on the first instruction of a kernel, > * getting the "debug system routine" that dumps the content of the EU > registers > to a "debug surface" once a breakpoint is encountered in the shader, > * setting up the BOs for the system routine and for the debug surface, > * writing the right MMIO registers (via batch buffer commands) in order > to enable the shader debug in the hardware. > * notifying the debugger infrastructure that a certain kernel is under debug, > in order to prevent the "Debug Companion Driver" from auto-resuming > the kernel. > > The interaction with the debugger is done via the debugger interchange > library. > This library as well as other debugger open-source components (the kernel > driver, > libraries, GDB) are delivered in the Intel(R) Parallel Studio XE. > > Tested on HSW, BDW and SKL. 
> > Mircea Gherzan (5): > backend: add support for kernel debugging > runtime: add support for the interchange library of the debugger > runtime: use the "-debug" build option if the debugger is active > runtime: set the kernel name in the cl_gpgpu_kernel structure > runtime: support for the debug system routine, surface and MMIO > registers > > backend/src/backend/context.cpp | 4 +- > backend/src/backend/context.hpp | 4 +- > backend/src/backend/gen_context.cpp | 12 ++- > backend/src/backend/gen_context.hpp | 2 +- > backend/src/backend/gen_program.cpp | 7 +- > backend/src/backend/gen_program.hpp | 8 +- > backend/src/backend/program.cpp | 5 +- > src/CMakeLists.txt | 1 + > src/cl_command_queue_gen7.c | 6 +- > src/cl_context.c| 4 + > src/cl_context.h| 2 + > src/cl_driver.h | 4 + > src/cl_driver_defs.c| 2 +- > src/cl_program.c| 40 > src/intel/intel_debugger.c | 158 + > src/intel/intel_debugger.h | 70 + > src/intel/intel_defines.h | 11 +++ > src/intel/intel_gpgpu.c | 192 > ++-- > src/intel/intel_gpgpu.h | 7 ++ > 19 files changed, 516 insertions(+), 23 deletions(-) > create mode 100644 src/intel/intel_debugger.c > create mode 100644 src/intel/intel_debugger.h > > -- > 1.8.3.1 > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/beignet Can't remember your password? Do you need a strong and secure password? Use Password manager! It stores your passwords & protects your account. Check it out at http://mysecurelogon.com/manager ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [Printf v2][PATCH 07/12] Add the implementation of printf ir instruction.
patch of 06 and 07 have the same title? I think it is a typo here. Please correct it. All the other things are OK, just rename this one and the whole patchset can be pushed later. Also can push my patch about printf test cases together. On Mon, Feb 01, 2016 at 03:42:16PM +0800, yan.w...@linux.intel.com wrote: > Date: Mon, 1 Feb 2016 15:42:16 +0800 > From: yan.w...@linux.intel.com > To: beignet@lists.freedesktop.org > Cc: Yan Wang <yan.w...@linux.intel.com> > Subject: [Beignet] [Printf v2][PATCH 07/12] Add the implementation of > printf ir instruction. > X-Mailer: git-send-email 2.5.0 > > From: Yan Wang <yan.w...@linux.intel.com> > > Contributor: Junyan He <junyan...@linux.intel.com> > Signed-off-by: Yan Wang <yan.w...@linux.intel.com> > --- > backend/src/llvm/llvm_gen_backend.cpp | 95 > +-- > 1 file changed, 80 insertions(+), 15 deletions(-) > > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index dba9dba..4870285 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -486,6 +486,9 @@ namespace gbe > typedef map>::iterator PtrOrigMapIter; > // map pointer source to bti > map BtiMap; > +// map printf pointer source to bti > +int printfBti; > +uint32_t printfNum; > // map ptr to its bti register > map BtiValueMap; > // map ptr to it's base > @@ -520,6 +523,8 @@ namespace gbe > unit(unit), > ctx(unit), > regTranslator(ctx), > +printfBti(-1), > +printfNum(0), > LI(0), > TheModule(0), > btiBase(BTI_RESERVED_NUM), > @@ -586,6 +591,7 @@ namespace gbe >addrStoreInst.clear(); >// Reset for next function >btiBase = BTI_RESERVED_NUM; > + printfBti = -1; >return false; > } > /*! Given a possible pointer value, find out the interested escape like > @@ -594,7 +600,7 @@ namespace gbe > /*! 
For all possible pointers, GlobalVariable, function pointer argument, > alloca instruction, find their pointer escape points */ > void analyzePointerOrigin(Function ); > -unsigned getNewBti(Value *origin, bool isImage); > +unsigned getNewBti(Value *origin, bool force); > void assignBti(Function ); > bool isSingleBti(Value *Val); > Value *getBtiRegister(Value *v); > @@ -717,11 +723,10 @@ namespace gbe > // handle load of dword/qword with unaligned address > void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, > ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, > bool fixedBTI); > void visitInstruction(Instruction ) {NOT_SUPPORTED;} > -void* getPrintfInfo(CallInst* inst) > -{ > - if ([inst]) > -return (void*)[inst]; > - return NULL; > +ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) { > + if (unit.printfs.find(inst) == unit.printfs.end()) > +return NULL; > + return [inst]; > } > private: >void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug > infomation in context for subsequently passing to Gen insn > @@ -1127,21 +1132,15 @@ namespace gbe > } >} > > - unsigned GenWriter::getNewBti(Value *origin, bool isImage) { > + unsigned GenWriter::getNewBti(Value *origin, bool force) { > unsigned new_bti = 0; > -if (isImage) { > +if (force) { >new_bti = btiBase; >incBtiBase(); >return new_bti; > } > > -if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) { > - new_bti = btiBase; > - incBtiBase(); > -} else if > (origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) { > - new_bti = btiBase; > - incBtiBase(); > -} else if > (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) { > +if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) { >new_bti = btiBase; >incBtiBase(); > } > @@ -3716,6 +3715,16 @@ namespace gbe > this->newRegister(); > break; >case GEN_OCL_PRINTF: > +this->newRegister(); // fall through > + case GEN_OCL_PUTS: > + { > + // We need a new BTI as printf 
output. > + if (printfBti < 0) { > + printfBti = this->getNewBti(, true); > + ctx.getFunction().getPrintfSet()->setBufBTI(printfBti); > + } > + break; > + } >case GEN_OCL_CALC_TIMESTAMP: >
Re: [Beignet] [Printf][PATCH 06/11] Implement emision of printf instruction.
On Thu, Jan 21, 2016 at 11:30:21AM +0800, Yan Wang wrote: > Date: Thu, 21 Jan 2016 11:30:21 +0800 > From: Yan Wang <yan.w...@linux.intel.com> > To: beignet@lists.freedesktop.org > Cc: Yan Wang <yan.w...@linux.intel.com> > Subject: [Beignet] [Printf][PATCH 06/11] Implement emision of printf > instruction. > X-Mailer: git-send-email 2.5.0 > > Contributor: Junyan He <junyan...@linux.intel.com> > Signed-off-by: Yan Wang <yan.w...@linux.intel.com> > --- > backend/src/ir/context.hpp| 5 ++ > backend/src/llvm/llvm_gen_backend.cpp | 89 > --- > 2 files changed, 78 insertions(+), 16 deletions(-) > I think it is better to write another patch to type TUPLE logic > diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp > index b95741f..877d639 100644 > --- a/backend/src/ir/context.hpp > +++ b/backend/src/ir/context.hpp > @@ -149,6 +149,11 @@ namespace ir { >GBE_ASSERTM(fn != NULL, "No function currently defined"); >return fn->file.appendArrayTuple(reg, regNum); > } > +/*! Make a tuple from an array of types */ > +INLINE Tuple arrayTypeTuple(const ir::Type *type, uint32_t num) { > + GBE_ASSERTM(fn != NULL, "No function currently defined"); > + return fn->file.appendArrayTypeTuple((uint8_t*)type, num); > +} > /*! 
We just use variadic templates to forward instruction functions */ > #define DECL_INSN(NAME, FAMILY) \ > template INLINE void NAME(Args...args); > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index dba9dba..cc736d7 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -486,6 +486,9 @@ namespace gbe > typedef map>::iterator PtrOrigMapIter; > // map pointer source to bti > map BtiMap; > +// map printf pointer source to bti > +int printfBti; > +uint32_t printfNum; > // map ptr to its bti register > map BtiValueMap; > // map ptr to it's base > @@ -520,6 +523,8 @@ namespace gbe > unit(unit), > ctx(unit), > regTranslator(ctx), > +printfBti(-1), Also need to reset printfBti for each runOnFunction. > +printfNum(0), > LI(0), > TheModule(0), > btiBase(BTI_RESERVED_NUM), > @@ -594,7 +599,7 @@ namespace gbe > /*! For all possible pointers, GlobalVariable, function pointer argument, > alloca instruction, find their pointer escape points */ > void analyzePointerOrigin(Function ); > -unsigned getNewBti(Value *origin, bool isImage); > +unsigned getNewBti(Value *origin, bool force); > void assignBti(Function ); > bool isSingleBti(Value *Val); > Value *getBtiRegister(Value *v); > @@ -717,12 +722,7 @@ namespace gbe > // handle load of dword/qword with unaligned address > void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, > ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, > bool fixedBTI); > void visitInstruction(Instruction ) {NOT_SUPPORTED;} > -void* getPrintfInfo(CallInst* inst) > -{ > - if ([inst]) > -return (void*)[inst]; > - return NULL; > -} > +ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) { return > [inst]; } I think ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) { if (unit.printfs.find(inst) == unit.printfs.end()) return NULL; return [inst]; } would be better > private: >void setDebugInfo_CTX(llvm::Instruction * 
insn); // store the debug > infomation in context for subsequently passing to Gen insn >ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t > index = 0u); > @@ -1127,21 +1127,15 @@ namespace gbe > } >} > > - unsigned GenWriter::getNewBti(Value *origin, bool isImage) { > + unsigned GenWriter::getNewBti(Value *origin, bool force) { > unsigned new_bti = 0; > -if (isImage) { > +if (force) { >new_bti = btiBase; >incBtiBase(); >return new_bti; > } > > -if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) { > - new_bti = btiBase; > - incBtiBase(); > -} else if > (origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) { > - new_bti = btiBase; > - incBtiBase(); > -} else if > (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) { > +if (origin->getName().equals(StringRef("__gen
Re: [Beignet] [PATCH 00/18] Enable profiling by line number.
Ping for review On Thu, Dec 24, 2015 at 07:01:52PM +0800, junyan...@inbox.com wrote: > Date: Thu, 24 Dec 2015 19:01:52 +0800 > From: junyan...@inbox.com > To: beignet@lists.freedesktop.org > Subject: [PATCH 00/18] Enable profiling by line number. > X-Mailer: git-send-email 1.7.9.5 > > From: Junyan He <junyan...@linux.intel.com> > > This patch set will let the user to specify the line numbers in the source > code to insert the profiling watch points. > As the first step, we just use the env var OCL_PROFILING_LINES to control > the kernel name and line numbers. The format is: > KERNEL_NAME:PROFILING_MODE:LINE_NUMBER0,LINE_NUMBER1,LINE_NUMBER2,... > for example: > export OCL_PROFILING_LINES="builtin_atanpi_float8:2:2,6,7,8,15" > will insert watch points at 2 6 7 8 15 lines in the kernel named > builtin_atanpi_float8. > We have 3 PROFILING_MODE, > level 1: just brief timestamp with line number. >Total log number is 6 > Line2:Timestamp: 190 Thread Exec:6 > Line6:Timestamp: 1174 Thread Exec:6 > Line7:Timestamp: 3092 Thread Exec:6 > Line8:Timestamp: 3105 Thread Exec:6 > Line 15:Timestamp: 3241 Thread Exec:6 > >level 2: timestamp with source, plus: >Format: Average Timestamp Exec number Source > | __kernel void > builtin_atanpi_float8(__global float *dst, __global float *src1, __global > int *vector) { >TS: 190 Num: 6 > |int i = get_global_id(0); > |float8 x1 = (float8) (src1[i * (*vector) > + 0],src1[i * (*vector) + 1],src1[i * (*vector) + 2],src1[i * (*vector) + > 3],src1[i * (*vector) + 4],src1[i * (*vector) + 5],src1[i * (*vector) + > 6],src1[i * (*vector) + 7]); > | > |float8 ret; >TS: 1174 Num: 6 > |ret = atanpi(x1); >TS: 3092 Num: 6 > |dst[i * (*vector) + 0] = ret[0]; >TS: 3105 Num: 6 > |dst[i * (*vector) + 1] = ret[1]; > |dst[i * (*vector) + 2] = ret[2]; > |dst[i * (*vector) + 3] = ret[3]; > |dst[i * (*vector) + 4] = ret[4]; > |dst[i * (*vector) + 5] = ret[5]; > |dst[i * (*vector) + 6] = ret[6]; > |dst[i * (*vector) + 7] = ret[7]; >TS: 3241 Num: 6 > | }; > > >level 
3: output the detail logs, add all logs as: > Log 0 --- >| fix functions id: 7 simd: 16 kernel id:0 | >| thread id:0 EU id: 8 sub slice id: 0 slice id 0 | >| dispatch Mask: 1 prolog: 6860 epilog: 19548 | >| globalX: 3~ 3 globalY: 0~ 0 globalZ: 0~ 0 | >| ts0 : 201 | ts1 : 1180 | ts2 : 12417 | >| ts3 : 12430 | ts4 : 12637 | ts5 : 0 | >| ts6 : 0 | ts7 : 0 | ts8 : 0 | >| ts9 : 0 | ts10: 0 | ts11: 0 | >| ts12: 0 | ts13: 0 | ts14: 0 | >| ts15: 0 | ts16: 0 | ts17: 0 | >| ts18: 0 | ts19: 0 | | > Log 1 --- >| fix functions id: 7 simd: 16 kernel id:0 | >| thread id:0 EU id: 8 sub slice id: 1 slice id 0 | >| dispatch Mask: 1 prolog: 6877 epilog: 19569 | >| globalX: 4~ 4 globalY: 0~ 0 globalZ: 0~ 0 | >| ts0 : 209 | ts1 : 1190 | ts2 : 12423 | >| ts3 : 12436 | ts4 : 12643 | ts5 : 0 | >| ts6 : 0 | ts7 : 0 | ts8 : 0 | >| ts9 : 0 | ts10: 0 | ts11: 0 | >| ts12: 0 | ts13: 0 | ts14: 0 | >| ts15: 0 | ts16: 0 | ts17: 0 | >| ts18: 0 | ts19: 0 | | >. >. > > > > Some problems: > 1. On BDW, the timestamp sometimes gives invalid huge value. >It may be a HW issue or feature, we need to check it further. > 2. Sometimes the line number of instruction is different from the >source code. This is caused by optimization and we can notice >and analyse it by Gen IR or ASM. I will send a patch to set >optimization level later. > 3. Some line numbers are missing
Re: [Beignet] [Printf][PATCH 04/11] Add the implementation of printf ir instruction.
On Thu, Jan 21, 2016 at 11:29:41AM +0800, Yan Wang wrote: > Date: Thu, 21 Jan 2016 11:29:41 +0800 > From: Yan Wang <yan.w...@linux.intel.com> > To: beignet@lists.freedesktop.org > Cc: Yan Wang <yan.w...@linux.intel.com> > Subject: [Beignet] [Printf][PATCH 04/11] Add the implementation of printf > ir instruction. > X-Mailer: git-send-email 2.5.0 > > Contributor: Junyan He <junyan...@linux.intel.com> > Signed-off-by: Yan Wang <yan.w...@linux.intel.com> > --- > backend/src/ir/function.hpp| 8 ++ > backend/src/ir/instruction.cpp | 57 > +- > backend/src/ir/instruction.hpp | 13 ++ > backend/src/ir/instruction.hxx | 1 + > backend/src/ir/register.cpp| 8 ++ > backend/src/ir/register.hpp| 21 > 6 files changed, 107 insertions(+), 1 deletion(-) > > diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp > index 78250cf..5785bee 100644 > --- a/backend/src/ir/function.hpp > +++ b/backend/src/ir/function.hpp > @@ -341,6 +341,14 @@ namespace ir { > INLINE void setRegister(Tuple ID, uint32_t which, Register reg) { >file.set(ID, which, reg); > } I think need to extract this logic to the type TUPLE patch. > +/*! Get the type from the tuple vector */ > +INLINE uint8_t getType(Tuple ID, uint32_t which) const { > + return file.getType(ID, which); > +} > +/*! Set the type into the tuple vector */ > +INLINE void setType(Tuple ID, uint32_t which, uint8_t type) { > + file.setType(ID, which, type); > +} > /*! Get the register file */ > INLINE const RegisterFile (void) const { return file; } > /*! 
Get the given value ie immediate from the function */ > diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp > index bb5aac5..652c1fb 100644 > --- a/backend/src/ir/instruction.cpp > +++ b/backend/src/ir/instruction.cpp > @@ -994,6 +994,40 @@ namespace ir { > Register dst[1]; > }; > > +class ALIGNED_INSTRUCTION PrintfInstruction : > + public BasePolicy, > + public TupleSrcPolicy, > + public NDstPolicy<PrintfInstruction, 1> > +{ > + public: > +INLINE PrintfInstruction(Register dst, Tuple srcTuple, Tuple > typeTuple, > + uint8_t srcNum, uint8_t bti, uint16_t num) { > + this->opcode = OP_PRINTF; > + this->dst[0] = dst; > + this->src = srcTuple; > + this->type = typeTuple; > + this->srcNum = srcNum; > + this->bti = bti; > + this->num = num; > +} > +INLINE bool wellFormed(const Function , std::string ) > const; > +INLINE void out(std::ostream , const Function ) const; > + > +uint32_t getNum(void) const { return this->num; } > +uint32_t getBti(void) const { return this->bti; } > +Type getType(const Function& fn, uint32_t ID) const { > + GBE_ASSERTM(ID < this->srcNum, "Out-of-bound types"); > + return (Type)fn.getType(type, ID); > +} > + > +uint32_t srcNum:8;//!< Source Number > +uint32_t bti:8; //!< The BTI > +uint32_t num:16; //!< The printf statement number of one kernel. 
> +Tuple src; > +Tuple type; > +Register dst[1]; > +}; > + > #undef ALIGNED_INSTRUCTION > > / > @@ -1473,6 +1507,10 @@ namespace ir { >return true; > } > > +INLINE bool PrintfInstruction::wellFormed(const Function , > std::string ) const { > + return true; > +} > + > #undef CHECK_TYPE > > / > @@ -1702,6 +1740,11 @@ namespace ir { > >out << "TheadID Map at SLM: " << this->slmAddr; > } > + > +INLINE void PrintfInstruction::out(std::ostream , const Function > ) const { > + this->outOpcode(out); > +} > + >} /* namespace internal */ > >std::ostream << (std::ostream , AddressSpace addrSpace) { > @@ -1862,6 +1905,10 @@ START_INTROSPECTION(WorkGroupInstruction) > #include "ir/instruction.hxx" > END_INTROSPECTION(WorkGroupInstruction) > > +START_INTROSPECTION(PrintfInstruction) > +#include "ir/instruction.hxx" > +END_INTROSPECTION(PrintfInstruction) > + > #undef END_INTROSPECTION > #undef START_INTROSPECTION > #undef DECL_INSN > @@ -2008,7 +2055,8 @@ END_FU
Re: [Beignet] [Printf][PATCH 03/11] Reconstruct printf parser.
On Thu, Jan 21, 2016 at 11:29:24AM +0800, Yan Wang wrote: > Date: Thu, 21 Jan 2016 11:29:24 +0800 > From: Yan Wang <yan.w...@linux.intel.com> > To: beignet@lists.freedesktop.org > Cc: Yan Wang <yan.w...@linux.intel.com> > Subject: [Beignet] [Printf][PATCH 03/11] Reconstruct printf parser. > X-Mailer: git-send-email 2.5.0 > > Contributor: Junyan He <junyan...@linux.intel.com> > Signed-off-by: Yan Wang <yan.w...@linux.intel.com> > --- > backend/src/ir/unit.cpp | 1 - > backend/src/ir/unit.hpp | 2 +- > backend/src/llvm/llvm_gen_backend.cpp | 4 +- > backend/src/llvm/llvm_printf_parser.cpp | 112 > ++-- > 4 files changed, 53 insertions(+), 66 deletions(-) > > diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp > index a350c60..5604244 100644 > --- a/backend/src/ir/unit.cpp > +++ b/backend/src/ir/unit.cpp > @@ -34,7 +34,6 @@ namespace ir { >Unit::~Unit(void) { > for (const auto : functions) GBE_DELETE(pair.second); > delete profilingInfo; > -for (const auto : printfs) GBE_DELETE(pair.second); >} >Function *Unit::getFunction(const std::string ) const { > auto it = functions.find(name); > diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp > index 10a1af6..9b9e41f 100644 > --- a/backend/src/ir/unit.hpp > +++ b/backend/src/ir/unit.hpp > @@ -47,7 +47,7 @@ namespace ir { >public: > typedef map<std::string, Function*> FunctionSet; > /*! Moved from printf pass */ > -map<llvm::CallInst*, PrintfSet::PrintfFmt*> printfs; > +map<llvm::CallInst*, PrintfSet::PrintfFmt> printfs; > /*! Create an empty unit */ > Unit(PointerSize pointerSize = POINTER_32_BITS); > /*! 
Release everything (*including* the function pointers) */ > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index dec023c..dba9dba 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -719,8 +719,8 @@ namespace gbe > void visitInstruction(Instruction ) {NOT_SUPPORTED;} > void* getPrintfInfo(CallInst* inst) > { > - if (unit.printfs[inst]) > -return (void*)unit.printfs[inst]; > + if ([inst]) > +return (void*)[inst]; >return NULL; > } > private: > diff --git a/backend/src/llvm/llvm_printf_parser.cpp > b/backend/src/llvm/llvm_printf_parser.cpp > index 1c88981..13ce099 100644 > --- a/backend/src/llvm/llvm_printf_parser.cpp > +++ b/backend/src/llvm/llvm_printf_parser.cpp > @@ -293,41 +293,21 @@ error: >public: > static char ID; > typedef std::pair<Instruction*, bool> PrintfInst; > -std::vector deadprintfs; > Module* module; > IRBuilder<>* builder; > Type* intTy; > -Value* pbuf_ptr; > -Value* index_buf_ptr; > -Value* g1Xg2Xg3; > -Value* wg_offset; > -int out_buf_sizeof_offset; > ir::Unit > -int printf_num; > -int totalSizeofSize; > - > -struct PrintfParserInfo { > - llvm::CallInst* call; > - PrintfSet::PrintfFmt* printf_fmt; > -}; > > PrintfParser(ir::Unit ) : FunctionPass(ID), > -unit(unit) > + unit(unit) > { >module = NULL; >builder = NULL; >intTy = NULL; > - out_buf_sizeof_offset = 0; > - pbuf_ptr = NULL; > - index_buf_ptr = NULL; > - g1Xg2Xg3 = NULL; > - wg_offset = NULL; > - printf_num = 0; > - totalSizeofSize = 0; > } > > -bool parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, > int& sizeof_size); > -bool generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& > dst_type, int& sizeof_size); > +bool parseOnePrintfInstruction(CallInst * call); > +bool generateOneParameterInst(PrintfSlot& slot, Value* arg, Value*& > new_arg); > > virtual const char *getPassName() const > { > @@ -337,7 +317,7 @@ error: > virtual bool runOnFunction(llvm::Function ); 
>}; > > - bool PrintfParser::parseOnePrintfInstruction(CallInst * call, > PrintfParserInfo& info, int& sizeof_size) > + bool PrintfParser::parseOnePrintfInstruction(CallInst * call) >{ > CallSite CS(call); > CallSite::arg_iterator CI_FMT = CS.arg_begin(); > @@ -359,16 +339,44 @@ error: > PrintfSet::PrintfFmt* printf_fmt = NULL; Maybe we can check whether the printf string is just "" here. if (fmt.size() == 0) { return false; // A null string, do nothing.
Re: [Beignet] [Printf][PATCH 06/11] Implement emision of printf instruction.
After applied the printf patch set, I find the last test still failed, please help to check. On Thu, Jan 28, 2016 at 12:33:05PM +0800, He Junyan wrote: > Date: Thu, 28 Jan 2016 12:33:05 +0800 > From: He Junyan <junyan...@inbox.com> > To: beignet@lists.freedesktop.org > Subject: Re: [Beignet] [Printf][PATCH 06/11] Implement emision of printf > instruction. > > On Thu, Jan 21, 2016 at 11:30:21AM +0800, Yan Wang wrote: > > Date: Thu, 21 Jan 2016 11:30:21 +0800 > > From: Yan Wang <yan.w...@linux.intel.com> > > To: beignet@lists.freedesktop.org > > Cc: Yan Wang <yan.w...@linux.intel.com> > > Subject: [Beignet] [Printf][PATCH 06/11] Implement emision of printf > > instruction. > > X-Mailer: git-send-email 2.5.0 > > > > Contributor: Junyan He <junyan...@linux.intel.com> > > Signed-off-by: Yan Wang <yan.w...@linux.intel.com> > > --- > > backend/src/ir/context.hpp| 5 ++ > > backend/src/llvm/llvm_gen_backend.cpp | 89 > > --- > > 2 files changed, 78 insertions(+), 16 deletions(-) > > > I think it is better to write another patch to type TUPLE logic > > diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp > > index b95741f..877d639 100644 > > --- a/backend/src/ir/context.hpp > > +++ b/backend/src/ir/context.hpp > > @@ -149,6 +149,11 @@ namespace ir { > >GBE_ASSERTM(fn != NULL, "No function currently defined"); > >return fn->file.appendArrayTuple(reg, regNum); > > } > > +/*! Make a tuple from an array of types */ > > +INLINE Tuple arrayTypeTuple(const ir::Type *type, uint32_t num) { > > + GBE_ASSERTM(fn != NULL, "No function currently defined"); > > + return fn->file.appendArrayTypeTuple((uint8_t*)type, num); > > +} > > /*! 
We just use variadic templates to forward instruction functions */ > > #define DECL_INSN(NAME, FAMILY) \ > > template INLINE void NAME(Args...args); > > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > > b/backend/src/llvm/llvm_gen_backend.cpp > > index dba9dba..cc736d7 100644 > > --- a/backend/src/llvm/llvm_gen_backend.cpp > > +++ b/backend/src/llvm/llvm_gen_backend.cpp > > @@ -486,6 +486,9 @@ namespace gbe > > typedef map>::iterator PtrOrigMapIter; > > // map pointer source to bti > > map BtiMap; > > +// map printf pointer source to bti > > +int printfBti; > > +uint32_t printfNum; > > // map ptr to its bti register > > map BtiValueMap; > > // map ptr to it's base > > @@ -520,6 +523,8 @@ namespace gbe > > unit(unit), > > ctx(unit), > > regTranslator(ctx), > > +printfBti(-1), > Also need to reset printfBti for each runOnFunction. > > > +printfNum(0), > > LI(0), > > TheModule(0), > > btiBase(BTI_RESERVED_NUM), > > @@ -594,7 +599,7 @@ namespace gbe > > /*! For all possible pointers, GlobalVariable, function pointer > > argument, > > alloca instruction, find their pointer escape points */ > > void analyzePointerOrigin(Function ); > > -unsigned getNewBti(Value *origin, bool isImage); > > +unsigned getNewBti(Value *origin, bool force); > > void assignBti(Function ); > > bool isSingleBti(Value *Val); > > Value *getBtiRegister(Value *v); > > @@ -717,12 +722,7 @@ namespace gbe > > // handle load of dword/qword with unaligned address > > void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, > > ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, > > bool fixedBTI); > > void visitInstruction(Instruction ) {NOT_SUPPORTED;} > > -void* getPrintfInfo(CallInst* inst) > > -{ > > - if ([inst]) > > -return (void*)[inst]; > > - return NULL; > > -} > > +ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) { return > > [inst]; } > > I think > ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) { > if (unit.printfs.find(inst) == 
unit.printfs.end()) > return NULL; > >return [inst]; >} > > would be better > > > private: > >void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug > > infomation in context for subsequently passing to Gen insn > >ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, > > int32_t index = 0u)
Re: [Beignet] beignet git with llvm 3.7.1 + haswell (gen 7.5) failing
I think the correct steps to use FP64 should be: 1. Query the device using clGetDeviceInfo with CL_DEVICE_EXTENSIONS. 2. Check whether the extension string contains "cl_khr_fp64". 3. If so, then you can enable DOUBLE support with #pragma OPENCL EXTENSION cl_khr_fp64 : enable You can refer to the cl_check_double helper function in utests. Haswell really could use double before, but it was not a full-featured implementation. Because of the hardware's limitations, the precision of double is even lower than that of float on PRE-HASWELL platforms. And double division is also unavailable on HASWELL. So we decided to totally disable double support on PRE-BDW platforms. A better user experience is in our plan. We really should give a more useful error message and handle compile errors more gracefully, rather than with a simple ASSERT. On Thu, Jan 21, 2016 at 06:50:36AM +, Song, Ruiling wrote: > Date: Thu, 21 Jan 2016 06:50:36 + > From: "Song, Ruiling" <ruiling.s...@intel.com> > To: Paulo Dias <paulo.miguel.d...@gmail.com>, "He, Junyan" > <junyan...@intel.com> > Cc: "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org> > Subject: Re: [Beignet] beignet git with llvm 3.7.1 + haswell (gen 7.5) > failing > > Yes, I agree with you. But at least as I know haswell hardware does not > support double well. > > Hi Junyan, > > Can we handle it graciously? Do you have any idea? > > Thanks! > Ruiling > > From: Paulo Dias [mailto:paulo.miguel.d...@gmail.com] > Sent: Wednesday, January 20, 2016 8:37 PM > To: Song, Ruiling <ruiling.s...@intel.com> > Cc: beignet@lists.freedesktop.org > Subject: Re: [Beignet] beignet git with llvm 3.7.1 + haswell (gen 7.5) failing > > it does, but beignet should fail graciously with an error message then, not > segfault. and it used to work even with haswell. 
> > groo@hydra:~/devel/opencl/tools-master$ ./cl-demo 1 10 > Choose platform: > [0] Intel > [1] Mesa > Enter choice: 0 > Choose device: > [0] Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile > Enter choice: 0 > - > NAME: Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile > VENDOR: Intel > PROFILE: FULL_PROFILE > VERSION: OpenCL 1.2 beignet 1.2 > EXTENSIONS: cl_khr_global_int32_base_atomics > cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics > cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store > cl_khr_image2d_from_buffer cl_khr_spir cl_khr_icd cl_intel_accelerator > cl_intel_motion_estimation > DRIVER_VERSION: 1.2 > > Type: GPU > EXECUTION_CAPABILITIES: Kernel Native > GLOBAL_MEM_CACHE_TYPE: Read-Write (2) > CL_DEVICE_LOCAL_MEM_TYPE: Global (2) > SINGLE_FP_CONFIG: 0x6 > QUEUE_PROPERTIES: 0x2 > > VENDOR_ID: 32902 > MAX_COMPUTE_UNITS: 20 > MAX_WORK_ITEM_DIMENSIONS: 3 > MAX_WORK_GROUP_SIZE: 512 > PREFERRED_VECTOR_WIDTH_CHAR: 16 > PREFERRED_VECTOR_WIDTH_SHORT: 8 > PREFERRED_VECTOR_WIDTH_INT: 4 > PREFERRED_VECTOR_WIDTH_LONG: 2 > PREFERRED_VECTOR_WIDTH_FLOAT: 4 > PREFERRED_VECTOR_WIDTH_DOUBLE: 0 > MAX_CLOCK_FREQUENCY: 1000 > ADDRESS_BITS: 32 > MAX_MEM_ALLOC_SIZE: 1610612736 > IMAGE_SUPPORT: 1 > MAX_READ_IMAGE_ARGS: 128 > MAX_WRITE_IMAGE_ARGS: 8 > IMAGE2D_MAX_WIDTH: 8192 > IMAGE2D_MAX_HEIGHT: 8192 > IMAGE3D_MAX_WIDTH: 8192 > IMAGE3D_MAX_HEIGHT: 8192 > IMAGE3D_MAX_DEPTH: 2048 > MAX_SAMPLERS: 16 > MAX_PARAMETER_SIZE: 1024 > MEM_BASE_ADDR_ALIGN: 1024 > MIN_DATA_TYPE_ALIGN_SIZE: 128 > GLOBAL_MEM_CACHELINE_SIZE: 64 > GLOBAL_MEM_CACHE_SIZE: 8192 > GLOBAL_MEM_SIZE: 2147483648 > MAX_CONSTANT_BUFFER_SIZE: 134217728 > MAX_CONSTANT_ARGS: 8 > LOCAL_MEM_SIZE: 65536 > ERROR_CORRECTION_SUPPORT: 0 > PROFILING_TIMER_RESOLUTION: 80 > ENDIAN_LITTLE: 1 > AVAILABLE: 1 > COMPILER_AVAILABLE: 1 > MAX_WORK_GROUP_SIZES: 512 512 512 > - > ASSERTION FAILED: 0 > at file > 
/build/beignet-4N2m2_/beignet-1.2.0~git201601200931.13f504c~padoka0/backend/src/backend/gen_encoder.cpp, > function virtual void gbe::GenEncoder::handleDouble(gbe::GenEncoder*, > uint32_t, gbe::GenRegister, gbe::GenRegister, gbe::GenRegister), line 634 > Trace/breakpoint trap (core dumped) > > | Paulo Dias > | paulo.miguel.d...@gmail.com<mailto:paulo.miguel.d...@gmail.com> > > Tempora mutantur, nos et mutamur in illis. > > On Mon, Jan 18, 2016 at 12:10 AM, Song, Ruiling > <ruiling.s...@intel.com<mailto:ruiling.s...@intel.com>> wrote: > > Haswell does not support double data type, and Beignet does not expose the > extension. Looks like cl-demo use double data type? > > > > Thanks! >
Re: [Beignet] [PATCH V2] Fix the bug of crash when we pass -I path with spaces.
V2 just fixes some typos; please ignore the previous one. This patch should be merged to master and release1.1. On Wed, Jan 20, 2016 at 05:57:20PM +0800, junyan...@inbox.com wrote: > Date: Wed, 20 Jan 2016 17:57:20 +0800 > From: junyan...@inbox.com > To: beignet@lists.freedesktop.org > Subject: [Beignet] [PATCH V2] Fix the bug of crash when we pass -I path > with spaces. > X-Mailer: git-send-email 1.7.9.5 > > From: Junyan He <junyan...@linux.intel.com> > > We failed to handle -I "/XX X/YY YY/" like path passed > from the build option. We need to consider the spaces > here and pass it correctly to Clang. > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > --- > backend/src/backend/program.cpp | 51 > + > 1 file changed, 47 insertions(+), 4 deletions(-) > > diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp > index f886d03..c8bc688 100644 > --- a/backend/src/backend/program.cpp > +++ b/backend/src/backend/program.cpp > @@ -770,17 +770,60 @@ namespace gbe { > bool useDefaultCLCVersion = true; > > if (options) { > - char *str = (char *)malloc(sizeof(char) * (strlen(options) + 1)); > - memcpy(str, options, strlen(options) + 1); > - std::string optionStr(str); > + char *c_str = (char *)malloc(sizeof(char) * (strlen(options) + 1)); > + memcpy(c_str, options, strlen(options) + 1); > + std::string optionStr(c_str); >const std::string unsupportedOptions("-cl-denorms-are-zero, > -cl-strict-aliasing, -cl-opt-disable," > "-cl-no-signed-zeros, > -cl-fp32-correctly-rounded-divide-sqrt"); > >const std::string uncompatiblePCHOptions = > ("-cl-single-precision-constant, -cl-fast-relaxed-math, -cl-std=CL1.1, > -cl-finite-math-only"); >const std::string fastMathOption = ("-cl-fast-relaxed-math"); >while (end != std::string::npos) { > +/* need to handle -I"/XX X/X XX" with spaces first. 
*/ > +if (optionStr[start] == '-' && optionStr[start + 1] == 'I') { > + end = start + 2; > + while(end < optionStr.size() && optionStr[end] == ' ') // Ignore > the spaces > +end++; > + > + if (end == optionStr.size()) { //reach the end and no content, > ignore > +free(c_str); > +return true; > + } > + > + if (optionStr[end] != '"') { // just a normal path without " " > +clOpt.push_back("-I"); > +start = end; > +continue; > + } > + > + end++; > + start = end; > + clOpt.push_back("-I"); > + > + /* find the second " */ > + while (end < optionStr.size() && optionStr[end] != '"') > +end++; > + > + if (optionStr[end] != '"') { > +free(c_str); > +return false; > + } > + > + if (end == start + 1) { // the case of "", ignore > +start = end + 1; > +continue; > + } > + > + std::string IPath = optionStr.substr(start, end - start); > + clOpt.push_back(IPath.c_str()); > + start = end + 1; > + continue; > +} > + > + > end = optionStr.find(' ', start); > std::string str = optionStr.substr(start, end - start); > + > start = end + 1; > if(str.size() == 0) >continue; > @@ -822,7 +865,7 @@ namespace gbe { > > clOpt.push_back(str); >} > - free(str); > + free(c_str); > } > > if (useDefaultCLCVersion) { > -- > 1.9.1 > > > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Workgroup reduce add optimization
I think the basic idea is OK and the result is also OK. Please pay attention to the code format; we prefer spaces rather than TABs to begin the line. And some comments below. On Wed, Dec 23, 2015 at 05:32:19PM +0200, Grigore Lupescu wrote: > Date: Wed, 23 Dec 2015 17:32:19 +0200 > From: Grigore Lupescu> To: beignet@lists.freedesktop.org > Subject: [Beignet] [PATCH] Workgroup reduce add optimization > X-Mailer: git-send-email 2.1.4 > > Signed-off-by: Grigore Lupescu > --- > backend/src/backend/gen_context.cpp | 48 > - > 1 file changed, 32 insertions(+), 16 deletions(-) > > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index a2e11a4..52e988e 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -2943,21 +2943,38 @@ namespace gbe >} > } >} > -} else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD) { > - GBE_ASSERT(tmp.type == theVal.type); > - GenRegister v = GenRegister::toUniform(tmp, theVal.type); > - for (uint32_t i = 0; i < simd; i++) { > -p->ADD(threadData, threadData, v); > -v.subnr += typeSize(theVal.type); > -if (v.subnr == 32) { > - v.subnr = 0; > - v.nr++; > -} > - } > -} > - > -p->pop(); > - } > +} else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD){ > + > + tmp.hstride = GEN_HORIZONTAL_STRIDE_1; > + tmp.vstride = GEN_VERTICAL_STRIDE_4; > + tmp.width = GEN_WIDTH_4; > + > + GBE_ASSERT(tmp.type == theVal.type); > + GenRegister partialSum = tmp; > + > + /* adjust offset, compute add with ADD4/ADD */ > + for (uint32_t i = 1; i < simd/4; i++){ > + tmp = tmp.suboffset(tmp, 4); > + GenNativeInstruction* insnQ1 = p->next(GEN_OPCODE_ADD); > + p->setHeader(insnQ1); > + p->setDst(insnQ1, partialSum); > + p->setSrc0(insnQ1, partialSum); > + p->setSrc1(insnQ1, tmp); > + insnQ1->header.execution_size = GEN_WIDTH_4; > + } I think it is not good to generate the instructions directly here. Maybe you can set simd=4 and call p->ADD. 
We want to keep all instruction generation in gen_encoder.cpp. > + > + partialSum = GenRegister::toUniform(partialSum, theVal.type); > + for (uint32_t i = 0; i < 4; i++){ > + p->ADD(threadData, threadData, partialSum); > + partialSum.subnr += typeSize(theVal.type); > + if (partialSum.subnr == 32) { > + partialSum.subnr = 0; > + partialSum.nr++; > + } I think you can also use suboffset here. > + } > + } > + p->pop(); > +} > > #define SEND_RESULT_MSG() \ > do { \ > @@ -3123,7 +3140,6 @@ do { \ > p->curr.predicate = GEN_PREDICATE_NONE; > p->WAIT(2); > p->patchJMPI(jip, (p->n_instruction() - jip), 0); > - > /* Do something when get the msg. */ > p->curr.execWidth = simd; > p->MOV(dst, msgData); > -- > 2.1.4 > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] add Broxton support
OK, that's good for me. Thanks for enabling BXT. On Fri, Dec 04, 2015 at 03:22:20AM +0800, Guo Yejun wrote: > Date: Fri, 4 Dec 2015 03:22:20 +0800 > From: Guo Yejun> To: beignet@lists.freedesktop.org > Cc: Guo Yejun > Subject: [Beignet] [PATCH] add Broxton support > X-Mailer: git-send-email 1.9.1 > > special versions of linux kernel and libdrm are needed. > utest and conformance test PASSED. > > Signed-off-by: Guo Yejun > --- > GetGenID.sh| 2 +- > backend/src/backend/gen8_context.cpp | 2 +- > backend/src/backend/gen8_context.hpp | 2 + > backend/src/backend/gen9_context.cpp | 110 > + > backend/src/backend/gen9_context.hpp | 22 ++ > backend/src/backend/gen_insn_selection.cpp | 11 +++ > backend/src/backend/gen_insn_selection.hpp | 7 ++ > backend/src/backend/gen_program.cpp| 17 - > backend/src/gbe_bin_generater.cpp | 4 ++ > src/cl_device_data.h | 9 ++- > src/cl_device_id.c | 34 +++-- > src/intel/intel_gpgpu.c| 5 +- > 12 files changed, 213 insertions(+), 12 deletions(-) > > diff --git a/GetGenID.sh b/GetGenID.sh > index 7acf9bd..30296da 100755 > --- a/GetGenID.sh > +++ b/GetGenID.sh > @@ -1,5 +1,5 @@ > #!/bin/bash > -genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a > 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 > 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26) > +genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a > 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 > 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26 > 5a84) > pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F > ] '{print $1}')) > n=${#pciid[*]} > i=0 > diff --git a/backend/src/backend/gen8_context.cpp > b/backend/src/backend/gen8_context.cpp > index 71d900f..7455bfc 100644 > --- a/backend/src/backend/gen8_context.cpp > +++ b/backend/src/backend/gen8_context.cpp > @@ -417,7 +417,7 @@ namespace gbe > 
GBE_ASSERT(0); >} > > - 
static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0) > + GenRegister Gen8Context::unpacked_ud(GenRegister reg, uint32_t offset) >{ > if(reg.hstride == GEN_HORIZONTAL_STRIDE_0) { >if(offset == 0) > diff --git a/backend/src/backend/gen8_context.hpp > b/backend/src/backend/gen8_context.hpp > index 537aef5..cc415c6 100644 > --- a/backend/src/backend/gen8_context.hpp > +++ b/backend/src/backend/gen8_context.hpp > @@ -76,6 +76,8 @@ namespace gbe > > virtual void emitF64DIVInstruction(const SelectionInstruction ); > > +static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0); > + >protected: > virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, > int sz = 0); > virtual void subTimestamps(GenRegister& t0, GenRegister& t1, > GenRegister& tmp); > diff --git a/backend/src/backend/gen9_context.cpp > b/backend/src/backend/gen9_context.cpp > index c35293a..47b1496 100644 > --- a/backend/src/backend/gen9_context.cpp > +++ b/backend/src/backend/gen9_context.cpp > @@ -55,4 +55,114 @@ namespace gbe >p->WAIT(); > p->pop(); >} > + > + void BxtContext::newSelection(void) { > +this->sel = GBE_NEW(SelectionBxt, *this); > + } > + > + void BxtContext::calculateFullU64MUL(GenRegister src0, GenRegister src1, > GenRegister dst_h, > + GenRegister dst_l, GenRegister > s0l_s1h, GenRegister s0h_s1l) > + { > +src0.type = src1.type = GEN_TYPE_UD; > +dst_h.type = dst_l.type = GEN_TYPE_UL; > +s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL; > + > +//GenRegister tmp; > + > +GenRegister s0l = unpacked_ud(src0); > +GenRegister s1l = unpacked_ud(src1); > +GenRegister s0h = unpacked_ud(s0l_s1h); //s0h only used before s0l_s1h, > reuse s0l_s1h > +GenRegister s1h = unpacked_ud(dst_l); //s1h only used before dst_l, > reuse dst_l > + > +p->MOV(s0h, GenRegister::offset(s0l, 0, 4)); > +p->MOV(s1h, GenRegister::offset(s1l, 0, 4)); > + > +/* High 32 bits X High 32 bits. */ > +p->MUL(dst_h, s0h, s1h); > +/* High 32 bits X low 32 bits. 
*/ > +p->MUL(s0h_s1l, s0h, s1l); > +/* Low 32 bits X high 32 bits. */ > +p->MUL(s0l_s1h, s0l, s1h); > +/* Low 32 bits X low 32 bits. */ > +p->MUL(dst_l, s0l, s1l); > + > +/* Because the max product of s0l*s1h is (2^N - 1) * (2^N - 1) = 2^2N + > 1 - 2^(N+1), here N = 32 > +The max of addding 2 32bits integer to it is > +2^2N + 1 - 2^(N+1) + 2*(2^N - 1) = 2^2N - 1 > +which means the product s0h_s1l adds dst_l's high 32 bits and then > adds s0l_s1h's low 32 bits will not > +overflow and have no
Re: [Beignet] [PATCH 07/13] Backend: Add WORKGROUP_OP instruction selection.
On Wed, Dec 09, 2015 at 08:18:29AM +, Yang, Rong R wrote: > Date: Wed, 9 Dec 2015 08:18:29 + > From: "Yang, Rong R" <rong.r.y...@intel.com> > To: "junyan...@inbox.com" <junyan...@inbox.com>, > "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org> > Subject: Re: [Beignet] [PATCH 07/13] Backend: Add WORKGROUP_OP instruction > selection. > > > > > -Original Message- > > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of > > junyan...@inbox.com > > Sent: Tuesday, December 1, 2015 16:11 > > To: beignet@lists.freedesktop.org > > Subject: [Beignet] [PATCH 07/13] Backend: Add WORKGROUP_OP instruction > > selection. > > > > From: Junyan He <junyan...@linux.intel.com> > > > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > > --- > > backend/src/backend/gen_context.cpp|3 ++ > > backend/src/backend/gen_context.hpp|1 + > > .../src/backend/gen_insn_gen7_schedule_info.hxx|1 + > > backend/src/backend/gen_insn_selection.cpp | 34 > > > > backend/src/backend/gen_insn_selection.hpp |1 + > > backend/src/backend/gen_insn_selection.hxx |1 + > > 6 files changed, 41 insertions(+) > > > > diff --git a/backend/src/backend/gen_context.cpp > > b/backend/src/backend/gen_context.cpp > > index 43fa7fa..5c819b7 100644 > > --- a/backend/src/backend/gen_context.cpp > > +++ b/backend/src/backend/gen_context.cpp > > @@ -2844,6 +2844,9 @@ namespace gbe > > } p->pop(); > >} > > > > + void GenContext::emitWorkGroupOpInstruction(const > > + SelectionInstruction ) { } > > + > >void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, > > int sz) { > > if (sz == 0) > >sz = 8; > > diff --git a/backend/src/backend/gen_context.hpp > > b/backend/src/backend/gen_context.hpp > > index da9bbbe..22ec0ea 100644 > > --- a/backend/src/backend/gen_context.hpp > > +++ b/backend/src/backend/gen_context.hpp > > @@ -179,6 +179,7 @@ namespace gbe > > virtual void emitF64DIVInstruction(const SelectionInstruction ); > > void emitCalcTimestampInstruction(const 
SelectionInstruction ); > > void emitStoreProfilingInstruction(const SelectionInstruction ); > > +void emitWorkGroupOpInstruction(const SelectionInstruction ); > > void scratchWrite(const GenRegister header, uint32_t offset, uint32_t > > reg_num, uint32_t reg_type, uint32_t channel_mode); > > void scratchRead(const GenRegister dst, const GenRegister header, > > uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t > > channel_mode); > > unsigned beforeMessage(const SelectionInstruction , GenRegister > > bti, GenRegister flagTemp, GenRegister btiTmp, unsigned desc); diff --git > > a/backend/src/backend/gen_insn_gen7_schedule_info.hxx > > b/backend/src/backend/gen_insn_gen7_schedule_info.hxx > > index 739cc04..8ef422f 100644 > > --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx > > +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx > > @@ -47,3 +47,4 @@ DECL_GEN7_SCHEDULE(I64SATSUB, 20,40, > > 20) > > DECL_GEN7_SCHEDULE(F64DIV, 20,40, 20) > > DECL_GEN7_SCHEDULE(CalcTimestamp, 80,1,1) > > DECL_GEN7_SCHEDULE(StoreProfiling, 80,1,1) > > +DECL_GEN7_SCHEDULE(WorkGroupOp,80, 1, > > 1) > > diff --git a/backend/src/backend/gen_insn_selection.cpp > > b/backend/src/backend/gen_insn_selection.cpp > > index 5b08958..536d347 100644 > > --- a/backend/src/backend/gen_insn_selection.cpp > > +++ b/backend/src/backend/gen_insn_selection.cpp > > @@ -680,6 +680,9 @@ namespace gbe > > void I64REM(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int > > tmp_int); > > /*! double division */ > > void F64DIV(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmpNum); > > +/*! Work Group Operations */ > > +void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, > > GenRegister nextThreadID, > > + GenRegister threadID, GenRegister threadn, > > + GenRegister tmp); > > /* common functions for both binary instruction and sel_cmp and > > compare instruction. 
> > It will handle the IMM or normal register assignment, and will try > > to avoid > > LOADI > > as much as possible. */ > > @@ -1897,6 +190
Re: [Beignet] [PATCH] Add benchmark for workgroup functions
Hi Grigore, I notice that you just reuse the kernel from the utest as the benchmark kernel. In this kernel, we call the workgroup function only once, while the time diff calculated by your benchmark here includes the whole process of executing a kernel on the GPU. The OCL_NDRANGE itself and the LOAD and STORE in the kernel may take more time than the workgroup function. So I think it is hard for us to judge the performance based on this time diff. I think maybe you can rewrite the kernel to call the workgroup function, for example _add, more than 100 times within one kernel, and then the time diff may be more meaningful. On Fri, Dec 04, 2015 at 03:37:28PM +0200, Grigore Lupescu wrote: > Date: Fri, 4 Dec 2015 15:37:28 +0200 > From: Grigore Lupescu> To: beignet@lists.freedesktop.org > Subject: [Beignet] [PATCH] Add benchmark for workgroup functions > X-Mailer: git-send-email 2.1.4 > > Signed-off-by: Grigore Lupescu > --- > benchmark/CMakeLists.txt| 3 +- > benchmark/benchmark_workgroup_functions.cpp | 176 > > 2 files changed, 178 insertions(+), 1 deletion(-) > create mode 100644 benchmark/benchmark_workgroup_functions.cpp > > diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt > index dd33829..fd7fd7d 100644 > --- a/benchmark/CMakeLists.txt > +++ b/benchmark/CMakeLists.txt > @@ -18,7 +18,8 @@ set (benchmark_sources >benchmark_copy_buffer_to_image.cpp >benchmark_copy_image_to_buffer.cpp >benchmark_copy_buffer.cpp > - benchmark_copy_image.cpp) > + benchmark_copy_image.cpp > + benchmark_workgroup_functions.cpp) > > > SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}") > diff --git a/benchmark/benchmark_workgroup_functions.cpp > b/benchmark/benchmark_workgroup_functions.cpp > new file mode 100644 > index 000..81403a0 > --- /dev/null > +++ b/benchmark/benchmark_workgroup_functions.cpp > @@ -0,0 +1,176 @@ > +#include > +#include > +#include > +#include > +#include "utest_helper.hpp" > +#include > + > +double benchmark_workgroup_add_uint(void) > +{ > + cl_int ret; > + 
struct timeval start,stop; > + const size_t set_size = 256; > + const size_t set_num = set_size * set_size; > + size_t set_num_work = set_num; > + uint32_t* src = NULL; /* input set will be generated */ > + > + cl_mem sub_buf_in; > + cl_mem sub_buf_out; > + cl_buffer_region buf_region_in; > + cl_buffer_region buf_region_out; > + > + buf_region_in.size = set_size * sizeof(uint32_t); > + buf_region_in.origin = 0; > + buf_region_out.size = set_size * sizeof(uint32_t); > + buf_region_out.origin = 0; > + > + /* Each set is of the form (1, 0, 0, ..0) */ > + src = (uint32_t*)calloc(sizeof(uint32_t), set_num * set_size); > + OCL_ASSERT(src != NULL); > + for(uint32_t i = 0; i < set_num * set_size; i++) > + if((i % set_size) == 0) > + src[i] = 1; > + > + /* Setup kernel and buffers */ > + OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce", > + "compiler_workgroup_reduce_add_uint"); > + OCL_CREATE_BUFFER(buf[0], 0, (set_num * set_size) * sizeof(uint32_t), > NULL); > + OCL_CREATE_BUFFER(buf[1], 0, (set_num * set_size) * sizeof(uint32_t), > NULL); > + > + OCL_MAP_BUFFER(0); > + memcpy(buf_data[0], src, set_num* set_size * sizeof(uint32_t)); > + OCL_UNMAP_BUFFER(0); > + > + globals[0] = set_size; > + locals[0] = set_size; > + > + /* Measure performance */ > + gettimeofday(,0); > + while(set_num_work > 0){ > + /* Perform reductions, subBuffers with offsets */ > + for(uint32_t i = 0; i < set_num; i++){ > + sub_buf_in = clCreateSubBuffer(buf[0], 0, > + CL_BUFFER_CREATE_TYPE_REGION, > _region_in, ); > + OCL_ASSERT(ret == 0); > + sub_buf_out = clCreateSubBuffer(buf[1], 0, > + CL_BUFFER_CREATE_TYPE_REGION, > _region_out, ); > + OCL_ASSERT(ret == 0); > + > + OCL_SET_ARG(0, sizeof(cl_mem), _buf_in); > + OCL_SET_ARG(1, sizeof(cl_mem), _buf_out); > + OCL_NDRANGE(1); > + > + buf_region_in.origin += set_size * sizeof(uint32_t); > + buf_region_out.origin += set_size * sizeof(uint32_t); > + } > + /* Prepare memory for next set of reductions */ > + OCL_MAP_BUFFER(0); > + 
OCL_MAP_BUFFER(1); > + for (uint32_t i = 0; i < set_num_work; i++) { > + ((uint32_t *)buf_data[0])[i] = > + ((uint32_t *)buf_data[1])[i * set_size]; > + } > + OCL_UNMAP_BUFFER(0); > + OCL_UNMAP_BUFFER(1); > + > + set_num_work /= set_size; > +
Re: [Beignet] [PATCH 01/13] Backend: Add sr0 reg helper function.
This is V2. V2: Just rebase to master and ping for review. On Tue, Dec 01, 2015 at 04:10:28PM +0800, junyan...@inbox.com wrote: > Date: Tue, 1 Dec 2015 16:10:28 +0800 > From: junyan...@inbox.com > To: beignet@lists.freedesktop.org > Subject: [Beignet] [PATCH 01/13] Backend: Add sr0 reg helper function. > X-Mailer: git-send-email 1.7.9.5 > > From: Junyan He <junyan...@linux.intel.com> > > sr0 is used to specify the state reigster where we can get the > state of each EU thread. > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > --- > backend/src/backend/gen75_context.cpp |8 +--- > backend/src/backend/gen_register.hpp | 10 ++ > 2 files changed, 11 insertions(+), 7 deletions(-) > > diff --git a/backend/src/backend/gen75_context.cpp > b/backend/src/backend/gen75_context.cpp > index 7d407c3..fa8b029 100644 > --- a/backend/src/backend/gen75_context.cpp > +++ b/backend/src/backend/gen75_context.cpp > @@ -44,13 +44,7 @@ namespace gbe > p->push(); >p->curr.execWidth = 1; >p->curr.predicate = GEN_PREDICATE_NONE; > - GenRegister sr0 = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, > -GEN_ARF_STATE, > -1, > -GEN_TYPE_UD, > -GEN_VERTICAL_STRIDE_8, > -GEN_WIDTH_8, > -GEN_HORIZONTAL_STRIDE_1); > + GenRegister sr0 = GenRegister::sr(0, 1); >p->SHR(sr0, slm_index, GenRegister::immud(16)); > p->pop(); >} > diff --git a/backend/src/backend/gen_register.hpp > b/backend/src/backend/gen_register.hpp > index 5c813be..aa0744b 100644 > --- a/backend/src/backend/gen_register.hpp > +++ b/backend/src/backend/gen_register.hpp > @@ -828,6 +828,16 @@ namespace gbe > GEN_HORIZONTAL_STRIDE_0); > } > > +static INLINE GenRegister sr(uint32_t nr, uint32_t subnr = 0) { > + return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, > + GEN_ARF_STATE | nr, > + subnr, > + GEN_TYPE_UD, > + GEN_VERTICAL_STRIDE_8, > + GEN_WIDTH_8, > + GEN_HORIZONTAL_STRIDE_1); > +} > + > static INLINE GenRegister notification0(uint32_t subnr) { >return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, > GEN_ARF_NOTIFICATION_COUNT, > 
-- > 1.7.9.5 > > > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 21/21 V3] CMake: Add -lrt to the link command of libcl.so
Sorry, this patch does not belong to this patch set; please ignore it. On Tue, Nov 17, 2015 at 07:40:23AM +0800, junyan...@inbox.com wrote: > Date: Tue, 17 Nov 2015 07:40:23 +0800 > From: junyan...@inbox.com > To: beignet@lists.freedesktop.org > Subject: [Beignet] [PATCH 21/21 V3] CMake: Add -lrt to the link command of > libcl.so > X-Mailer: git-send-email 1.7.9.5 > > From: Junyan He <junyan...@linux.intel.com> > > The clock_gettime will cause the linkage error on some > version of GCC, we need to add -lrt at the end of the > link command line. > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > --- > src/CMakeLists.txt |1 + > 1 file changed, 1 insertion(+) > > diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt > index c917e76..4c5112c 100644 > --- a/src/CMakeLists.txt > +++ b/src/CMakeLists.txt > @@ -144,6 +144,7 @@ add_library(cl SHARED ${OPENCL_SRC}) > ADD_DEPENDENCIES(cl ${GIT_SHA1}) > target_link_libraries( >cl > + rt >${X11_LIBRARIES} >${XEXT_LIBRARIES} >${XFIXES_LIBRARIES} > -- > 1.7.9.5 > > > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 1/7 V2] Backend: Delete the useless MOV_DF instruction.
V2: Fix uniform bug in conversion. Delete verbose printf in utests. Fix a bug for BSW when convert half to double. On Thu, Nov 05, 2015 at 04:15:41PM +0800, junyan...@inbox.com wrote: > Date: Thu, 5 Nov 2015 16:15:41 +0800 > From: junyan...@inbox.com > To: beignet@lists.freedesktop.org > Subject: [Beignet] [PATCH 1/7 V2] Backend: Delete the useless MOV_DF > instruction. > X-Mailer: git-send-email 1.7.9.5 > > From: Junyan He <junyan...@linux.intel.com> > > Because just platform after BDW will support double, > the special instruction for double MOV is not needed > anymore. > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > --- > backend/src/backend/gen75_encoder.cpp | 36 - > backend/src/backend/gen75_encoder.hpp | 1 - > backend/src/backend/gen8_encoder.cpp | 36 - > backend/src/backend/gen8_encoder.hpp | 1 - > backend/src/backend/gen_context.cpp| 3 --- > backend/src/backend/gen_encoder.cpp| 43 > -- > backend/src/backend/gen_encoder.hpp| 2 -- > backend/src/backend/gen_insn_selection.cpp | 23 +--- > backend/src/backend/gen_insn_selection.hxx | 1 - > 9 files changed, 1 insertion(+), 145 deletions(-) > > diff --git a/backend/src/backend/gen75_encoder.cpp > b/backend/src/backend/gen75_encoder.cpp > index 135be02..5d1a964 100644 > --- a/backend/src/backend/gen75_encoder.cpp > +++ b/backend/src/backend/gen75_encoder.cpp > @@ -251,42 +251,6 @@ namespace gbe > pop(); >} > > - void Gen75Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister > tmp) { > -GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && > dest.type == GEN_TYPE_F)); > -GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F); > -int w = curr.execWidth; > -GenRegister r0; > -r0 = GenRegister::h2(r); > -push(); > -curr.execWidth = 4; > -curr.predicate = GEN_PREDICATE_NONE; > -curr.noMask = 1; > -MOV(r0, src0); > -MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4)); > -curr.noMask = 0; > -curr.quarterControl = 0; > -curr.nibControl = 0; > -MOV(dest, r0); > 
-curr.nibControl = 1; > -MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r0, 4)); > -pop(); > -if (w == 16) { > - push(); > - curr.execWidth = 4; > - curr.predicate = GEN_PREDICATE_NONE; > - curr.noMask = 1; > - MOV(r0, GenRegister::suboffset(src0, 8)); > - MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12)); > - curr.noMask = 0; > - curr.quarterControl = 1; > - curr.nibControl = 0; > - MOV(GenRegister::suboffset(dest, 8), r0); > - curr.nibControl = 1; > - MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r0, 4)); > - pop(); > -} > - } > - >void Gen75Encoder::JMPI(GenRegister src, bool longjmp) { > alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src); >} > diff --git a/backend/src/backend/gen75_encoder.hpp > b/backend/src/backend/gen75_encoder.hpp > index e494f29..f5044c0 100644 > --- a/backend/src/backend/gen75_encoder.hpp > +++ b/backend/src/backend/gen75_encoder.hpp > @@ -42,7 +42,6 @@ namespace gbe > virtual void JMPI(GenRegister src, bool longjmp = false); > /*! 
Patch JMPI/BRC/BRD (located at index insnID) with the given jump > distance */ > virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip); > -virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp > = GenRegister::null()); > virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double > value); > virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, > GenRegister bti, uint32_t srcNum); > virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister > bti, uint32_t elemNum); > diff --git a/backend/src/backend/gen8_encoder.cpp > b/backend/src/backend/gen8_encoder.cpp > index 55fc3fb..98c3917 100644 > --- a/backend/src/backend/gen8_encoder.cpp > +++ b/backend/src/backend/gen8_encoder.cpp > @@ -260,42 +260,6 @@ namespace gbe > MOV(dest, value); >} > > - void Gen8Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister > tmp) { > -GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && > dest.type == GEN_TYPE_F)); > -GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F); > -int w = curr.execWidth; > -GenRegister r0; > -r0 = GenRegister::h2(r); > -push(); > -curr.e
Re: [Beignet] [V2 PATCH 0/8] Implement double division on BDW
Ping for review. On Fri, Sep 18, 2015 at 05:58:11PM +0800, junyan...@inbox.com wrote: > Date: Fri, 18 Sep 2015 17:58:11 +0800 > From: junyan...@inbox.com > To: beignet@lists.freedesktop.org > Subject: [Beignet] [V2 PATCH 0/8] Implement double division on BDW > X-Mailer: git-send-email 1.7.9.5 > > From: Junyan He <junyan...@linux.intel.com> > > We use the macro: > r0 = 0, r6 = a, r7 = b, r1 = 1 > > math.eo.f0.0 (4) r8.acc2 r6.noacc r7.noacc 0xE > (-f0.0) if > madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2 // Step(1), q0=a*y0 > madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2 // Step(2), e0=(1-b*y0) > madm (4) r11.acc5 r6.noacc -r7.noacc r9.acc3 // Step(3), r0=a-b*q0 > madm (4) r12.acc6 r8.acc2 r10.acc4 r8.acc2 // Step(4), y1=y0+e0*y0 > madm (4) r13.acc7 r1.noacc -r7.noacc r12.acc6// Step(5), e1=(1-b*y1) > madm (4) r8.acc8 r8.acc2 r10.acc4 r12.acc6 // Step(6), y2=y0+e0*y1 > madm (4) r9.acc9 r9.acc3 r11.acc5 r12.acc6 // Step(7), q1=q0+r0*y1 > madm (4) r12.acc2 r12.acc6 r8.acc8 r13.acc7 // Step(8), y3=y1+e1*y2 > madm (4) r11.acc3 r6.noacc -r7.noacc r9.acc9 // Step(9), r1=a-b*q1 > > madm (4) r8.noacc r9.acc9 r11.acc3 r12.acc2 // Step(10), q=q1+r1*y3 > endif > > to implement hi precision double division on BDW. > > > V2: > 1. Correct the spelling slips. > 2. Fix some bugs for double registers format. > 3. Redefine the handle double logic and delete the double support on pre-gen7 > 4. Declare fp64 extension support on BDW. > 5. Consider the uniform case for F64DIV. > > With this patch set, the +-*/ is basically OK on BDW platform. > All pre-gen7 platforms will not support double any more. > Conversion and bitcast between double and other types are not OK now. 
> > Signed-off-by: Junyan He <junyan...@linux.intel.com> > --- > backend/src/backend/gen/gen_mesa_disasm.c | 134 > > backend/src/backend/gen75_encoder.hpp | 4 - > backend/src/backend/gen7_encoder.hpp| 4 - > backend/src/backend/gen8_context.cpp| 145 > ++ > backend/src/backend/gen8_context.hpp| 2 + > backend/src/backend/gen8_encoder.cpp| 164 > +- > backend/src/backend/gen8_encoder.hpp| 12 ++- > backend/src/backend/gen8_instruction.hpp| 86 > backend/src/backend/gen_context.cpp | 4 + > backend/src/backend/gen_context.hpp | 1 + > backend/src/backend/gen_defs.hpp| 13 +++ > backend/src/backend/gen_encoder.cpp | 52 ++ > backend/src/backend/gen_encoder.hpp | 3 +- > backend/src/backend/gen_insn_gen7_schedule_info.hxx | 1 + > backend/src/backend/gen_insn_selection.cpp | 54 +- > backend/src/backend/gen_insn_selection.hxx | 1 + > backend/src/backend/gen_register.hpp| 6 +- > kernels/compiler_double_4.cl| 5 - > kernels/compiler_double_div.cl | 11 ++ > src/cl_device_id.c | 3 + > src/cl_extensions.c | 21 > src/cl_extensions.h | 2 + > utests/CMakeLists.txt | 2 + > utests/compiler_double.cpp | 5 +- > utests/compiler_double_4.cpp| 40 > utests/compiler_double_div.cpp | 80 +++ > utests/utest_helper.cpp | 19 > utests/utest_helper.hpp | 3 + > > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Backend: Refine ConvertInstruction logic in insn_selection
Ping for review. On Tue, Sep 22, 2015 at 06:29:23PM +0800, junyan...@inbox.com wrote: > Date: Tue, 22 Sep 2015 18:29:23 +0800 > From: junyan...@inbox.com > To: beignet@lists.freedesktop.org > Subject: [Beignet] [PATCH] Backend: Refine ConvertInstruction logic in > insn_selection > X-Mailer: git-send-email 1.7.9.5 > > From: Junyan He <junyan...@linux.intel.com> > > The ConvertInstruction now need to handle a lot of special > cases instead of simple MOV. The judgement of native long > support, half support and reg restriction of long type and > the situation very complicated. The current code logic is > too verbose and hard to read. We now use sub routine functions > to make it clear and readable. > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > --- > backend/src/backend/gen_insn_selection.cpp | 780 > +--- > 1 file changed, 475 insertions(+), 305 deletions(-) > > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index ab00269..4800f7f 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -4124,148 +4124,132 @@ namespace gbe >return false; > } > > -INLINE bool emitOne(Selection::Opaque , const ir::ConvertInstruction > , bool ) const > +INLINE void convertBetweenHalfFloat(Selection::Opaque , const > ir::ConvertInstruction , bool ) const > { >using namespace ir; >const Type dstType = insn.getDstType(); >const Type srcType = insn.getSrcType(); > - const RegisterFamily dstFamily = getFamily(dstType); > - const RegisterFamily srcFamily = getFamily(srcType); >const GenRegister dst = sel.selReg(insn.getDst(0), dstType); >const GenRegister src = sel.selReg(insn.getSrc(0), srcType); >const Opcode opcode = insn.getOpcode(); > - sel.push(); > -if (sel.isScalarReg(insn.getDst(0)) == true) { > - sel.curr.execWidth = 1; > - sel.curr.predicate = GEN_PREDICATE_NONE; > - sel.curr.noMask = 1; > -} > - if(opcode == ir::OP_SAT_CVT) > -sel.curr.saturate = 1; > > - 
// We need two instructions to make the conversion >if (opcode == OP_F16TO32) { > sel.F16TO32(dst, src); >} else if (opcode == OP_F32TO16) { > +// We need two instructions to make the conversion > GenRegister unpacked; > unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, > sel.isScalarReg(insn.getSrc(0; > sel.push(); > - if (sel.isScalarReg(insn.getSrc(0))) { > -sel.curr.execWidth = 1; > -sel.curr.predicate = GEN_PREDICATE_NONE; > -sel.curr.noMask = 1; > - } > - sel.F32TO16(unpacked, src); > +if (sel.isScalarReg(insn.getSrc(0))) { > + sel.curr.execWidth = 1; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > +} > +sel.F32TO16(unpacked, src); > sel.pop(); > sel.MOV(dst, unpacked); > - } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && > srcFamily == FAMILY_DWORD) {//convert i32 to small int and half > -GenRegister unpacked; > -if (dstFamily == FAMILY_WORD) { > - uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W; > - > - /* The special case, when dst is half, float->word->half will lose > accuracy. */ > - if (dstType == TYPE_HALF) { > -GBE_ASSERT(sel.hasHalfType()); > -type = GEN_TYPE_HF; > - } > + } else { > +GBE_ASSERT("Not conversion between float and half\n"); > + } > +} > > - if (!sel.isScalarReg(dst.reg())) { > -unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, > sel.isScalarReg(insn.getSrc(0; > -unpacked = GenRegister::retype(unpacked, type); > - } else > -unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type); > -} else { > - const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : > GEN_TYPE_B; > - if (!sel.isScalarReg(dst.reg())) { > -unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, > sel.isScalarReg(insn.getSrc(0; > -unpacked = GenRegister::retype(unpacked, type); > - } else > -unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type); > -} > +INLINE void convert32bitsToSmall(Selection::Opaque , const > ir::ConvertInstruction , bool ) const > +{ >
Re: [Beignet] [PATCH] fix __kernel function called in __kernel issue.
LGTM, thanks On Sat, Oct 10, 2015 at 06:55:45AM -0400, xionghu@intel.com wrote: > Date: Sat, 10 Oct 2015 06:55:45 -0400 > From: xionghu@intel.com > To: beignet@lists.freedesktop.org > Cc: xionghu@intel.com > Subject: [Beignet] [PATCH] fix __kernel function called in __kernel issue. > X-Mailer: git-send-email 1.9.1 > > From: Luo Xionghu> > the printfPaser variables g1Xg2Xg3 and wg_offset should be reinit after > the builder is deleted, or else the variables will be freed and caused > memory leak; > query the Constants related to the globallist by name instead: the > GenWriter pass will be called by the number of __kernel functions in the > module, since the globallist is always the same, constant index is > not simply increased in different kernel function. > > this patch could fix fdo bug: > https://bugs.freedesktop.org/show_bug.cgi?id=90472. > > Signed-off-by: Luo Xionghu > --- > backend/src/llvm/llvm_gen_backend.cpp | 4 +--- > backend/src/llvm/llvm_printf_parser.cpp | 2 ++ > 2 files changed, 3 insertions(+), 3 deletions(-) > > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index 4905415..1a65ee0 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -2443,7 +2443,6 @@ namespace gbe >{ > // Allocate a address register for each global variable > const Module::GlobalListType = TheModule->getGlobalList(); > -size_t j = 0; > for(auto i = globalList.begin(); i != globalList.end(); i ++) { >const GlobalVariable = *i; >if(!v.isConstantUsed()) continue; > @@ -2475,8 +2474,7 @@ namespace gbe > GBE_ASSERT(v.hasInitializer()); > this->newRegister(const_cast ()); > ir::Register reg = > regTranslator.getScalar(const_cast (), 0); > -ir::Constant = unit.getConstantSet().getConstant(j ++); > -GBE_ASSERT(con.getName() == v.getName()); > +ir::Constant = unit.getConstantSet().getConstant(v.getName()); > ctx.LOADI(ir::TYPE_S32, reg, > ctx.newIntegerImmediate(con.getOffset(), 
ir::TYPE_S32)); >} else { > if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) { > diff --git a/backend/src/llvm/llvm_printf_parser.cpp > b/backend/src/llvm/llvm_printf_parser.cpp > index 3d84457..7ebda65 100644 > --- a/backend/src/llvm/llvm_printf_parser.cpp > +++ b/backend/src/llvm/llvm_printf_parser.cpp > @@ -755,6 +755,8 @@ error: > > deadprintfs.clear(); > delete builder; > +g1Xg2Xg3 = NULL; > +wg_offset = NULL; > > return changed; >} > -- > 1.9.1 > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 1/7] add debugloc for inserted llvm instructions
This patch set basically works, but has some problems. Common issues: 1. Please pay attention to the code format: lines should not begin with a TAB and should not have trailing spaces. 2. Some temporary comments should be deleted, and there is no need for author names in the code. 3. I think the patches should be reorganised by backend stage, one patch for each stage. Details are as follows: On Fri, Sep 18, 2015 at 05:01:51PM +0800, Bai Yannan wrote: > Date: Fri, 18 Sep 2015 17:01:51 +0800 > From: Bai Yannan> To: beignet@lists.freedesktop.org > Cc: Bai Yannan > Subject: [Beignet] [PATCH 1/7] add debugloc for inserted llvm instructions > X-Mailer: git-send-email 1.9.1 > > add debugloc when llvm instuctions inserted, the debugloc is inherited > from the contiguous one. > > Signed-off-by: Bai Yannan > --- > backend/src/backend/program.cpp | 7 + > backend/src/llvm/llvm_gen_backend.cpp| 25 ++ > backend/src/llvm/llvm_loadstore_optimization.cpp | 18 ++ > backend/src/llvm/llvm_printf_parser.cpp | 20 ++ > backend/src/llvm/llvm_sampler_fix.cpp| 17 ++ > backend/src/llvm/llvm_scalarize.cpp | 18 ++ > backend/src/llvm/llvm_timestamp.cpp | 337 > +++ > backend/src/llvm/llvm_to_gen.cpp | 10 +- > 8 files changed, 451 insertions(+), 1 deletion(-) > create mode 100644 backend/src/llvm/llvm_timestamp.cpp > > diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp > index f5865c2..af817de 100644 > --- a/backend/src/backend/program.cpp > +++ b/backend/src/backend/program.cpp > @@ -49,6 +49,7 @@ > #include > #include > #include > +#include > > #ifdef GBE_COMPILER_AVAILABLE > /* Not defined for LLVM 3.0 */ > @@ -554,6 +555,12 @@ namespace gbe { > args.push_back("stringInput.cl"); > args.push_back("-ffp-contract=off"); > > + if(getenv("OCL_PROFILING")) { > + char * isProfiling = getenv("OCL_PROFILING"); > + if(*isProfiling == '1') > + args.push_back("-g"); > + } I think we need to use the BVAR or IVAR auxiliary functions here instead of calling the system getenv directly. 
> + > // The compiler invocation needs a DiagnosticsEngine so it can report > problems > std::string ErrorString; > llvm::raw_string_ostream ErrorInfo(ErrorString); > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index 4905415..238370a 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -108,6 +108,8 @@ > > #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5 > #include "llvm/IR/Mangler.h" > +#include "llvm/IR/DebugLoc.h" > +#include "llvm/IR/DebugInfo.h" > #else > #include "llvm/Target/Mangler.h" > #endif > @@ -178,6 +180,20 @@ > > using namespace llvm; > > +#define OCL_PROFILING (bool)(getenv("OCL_PROFILING")[0]-48) > +#define SETDEBUGLOCATION(BUILDER, INSN) \ > + if(OCL_PROFILING) { > \ > + llvm::BasicBlock *bb = INSN->getParent(); \ > + llvm::BasicBlock::iterator iter =bb->begin(); \ > + while(!(iter++)->isIdenticalTo(INSN)) ; \ > + llvm::MDNode *N = iter->getMetadata("dbg"); \ > + llvm::DebugLoc dg = iter->getDebugLoc();\ > + while(!N) N = (++iter)->getMetadata("dbg"); \ > + BUILDER.SetCurrentDebugLocation(dg);\ > + } > +// end define SETDEBUGLOCATION I notice that all the SETDEBUGLOCATION macros have almost the same logic. I would prefer to rewrite it as a function and put it in a common place. > + > + > namespace gbe > { >/*! 
Gen IR manipulates only scalar types */ > @@ -977,6 +993,7 @@ namespace gbe >Value *trueVal = getPointerBase((*iter).second[0]); >Value *falseVal = getPointerBase((*iter).second[1]); >Builder.SetInsertPoint(si); > + SETDEBUGLOCATION(Builder, si); >Value *base = Builder.CreateSelect(si->getCondition(), trueVal, > falseVal); >pointerBaseMap.insert(std::make_pair(ptr, base)); > return base; > @@ -984,6 +1001,7 @@ namespace gbe >PHINode *phi = dyn_cast(ptr); >IRBuilder<> Builder(phi->getParent()); >Builder.SetInsertPoint(phi); > + SETDEBUGLOCATION(Builder, phi); > >PHINode *basePhi = Builder.CreatePHI(ptr->getType(), > phi->getNumIncomingValues()); >unsigned srcNum = pointers.size(); > @@ -997,7 +1015,10 @@ namespace gbe > IRBuilder<> Builder2(phi->getIncomingBlock(x)); > BasicBlock *predBB = phi->getIncomingBlock(x); > if (predBB->getTerminator()) > +
Re: [Beignet] [PATCH 4/7] fix bug that LOADI cannot inherits debug info
On Fri, Sep 18, 2015 at 05:01:54PM +0800, Bai Yannan wrote: > Date: Fri, 18 Sep 2015 17:01:54 +0800 > From: Bai Yannan> To: beignet@lists.freedesktop.org > Cc: Bai Yannan > Subject: [Beignet] [PATCH 4/7] fix bug that LOADI cannot inherits debug info > X-Mailer: git-send-email 1.9.1 > > Signed-off-by: Bai Yannan > --- > backend/src/backend/gen_context.cpp| 4 +--- > backend/src/backend/gen_insn_selection.cpp | 12 +++- > backend/src/backend/program.cpp| 4 +++- > backend/src/ir/function.cpp| 7 +++ > backend/src/ir/function.hpp| 1 + > backend/src/llvm/llvm_gen_backend.cpp | 31 > -- > 6 files changed, 44 insertions(+), 15 deletions(-) > > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index 411336e..9264cd9 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -254,6 +254,7 @@ namespace gbe >void GenContext::emitLabelInstruction(const SelectionInstruction ) { > const ir::LabelIndex label(insn.index); > this->labelPos.insert(std::make_pair(label, p->store.size())); > + SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitUnaryInstruction(const SelectionInstruction ) { > @@ -631,9 +632,6 @@ namespace gbe > const GenRegister dst = ra->genReg(insn.dst(0)); > const GenRegister src0 = ra->genReg(insn.src(0)); > const GenRegister src1 = ra->genReg(insn.src(1)); > - // debug > - if(insn.dbginfo.hasdbginfo) > - std::cout<<"*** "< > switch (insn.opcode) { >case SEL_OP_SEL: p->SEL(dst, src0, src1); break; > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index e861b7c..5ad665d 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -1104,26 +1104,19 @@ namespace gbe > { > SelectionInstruction = *it; > if(!selinsn.dbginfo.hasdbginfo) > - //SET_SELINSN_DBGINFO(selinsn) > - { selinsn.dbginfo.line = line; > - selinsn.dbginfo.col = col; > - selinsn.dbginfo.hasdbginfo = true;} > - 
//else break; > + SET_SELINSN_DBGINFO(selinsn) > } > else > for(auto it = this->blockList.rbegin(); it!= > this->blockList.rend(); it++) > { > SelectionBlock = *it; > - for(auto jt = block.insnList.rbegin(); jt!= > block.insnList.rend(); jt++) > + for(auto jt = block.insnList.begin(); jt!= > block.insnList.end(); jt++) > { > SelectionInstruction = *jt; > if(!selinsn.dbginfo.hasdbginfo) > SET_SELINSN_DBGINFO(selinsn) > - else goto OVER; > } > } > - OVER: > - ; >} > #undef SET_SELINSN_DBGINFO > > @@ -4768,6 +4761,7 @@ namespace gbe >const uint32_t simdWidth = sel.ctx.getSimdWidth(); >GBE_ASSERTM(label < sel.ctx.getMaxLabel(), "We reached the maximum > label number which is reserved for barrier handling"); >sel.LABEL(label); > + SET_SELOPAQUE_DBGINFO(insn); > >if(!insn.getParent()->needIf) > return true; > diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp > index af817de..e317230 100644 > --- a/backend/src/backend/program.cpp > +++ b/backend/src/backend/program.cpp > @@ -557,8 +557,10 @@ namespace gbe { > > if(getenv("OCL_PROFILING")) { > char * isProfiling = getenv("OCL_PROFILING"); > - if(*isProfiling == '1') > + if(*isProfiling == '1'){ > + args.push_back("-o0"); > args.push_back("-g"); > + } > } > > // The compiler invocation needs a DiagnosticsEngine so it can report > problems > diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp > index f87f23a..439b34c 100644 > --- a/backend/src/ir/function.cpp > +++ b/backend/src/ir/function.cpp > @@ -389,6 +389,13 @@ namespace ir { > return const_cast (); >} > > + Instruction *BasicBlock::getSuccessorInstruction(Instruction *pos) { > + for(auto it = this->begin();it != this->end();it++) > + if(it == pos) > + return &(*(++it)); > + return &(*(--this->end())); > + } > + >LabelIndex BasicBlock::getLabelIndex(void) const { > const Instruction *first =
Re: [Beignet] [PATCH 7/7] refine pass debug info from llvm ir to gen insn
On Fri, Sep 18, 2015 at 05:01:57PM +0800, Bai Yannan wrote: > Date: Fri, 18 Sep 2015 17:01:57 +0800 > From: Bai Yannan> To: beignet@lists.freedesktop.org > Cc: Bai Yannan , Lv Meng > Subject: [Beignet] [PATCH 7/7] refine pass debug info from llvm ir to gen > insn > X-Mailer: git-send-email 1.9.1 > > Add line and col to ctx to pass debug infomation > > Signed-off-by: Bai Yannan > Signed-off-by: Lv Meng > --- > backend/src/backend/gen_context.cpp | 93 +++--- > backend/src/backend/gen_encoder.cpp | 24 +++--- > backend/src/backend/gen_encoder.hpp | 7 +- > backend/src/ir/context.cpp| 2 + > backend/src/ir/context.hpp| 3 + > backend/src/llvm/llvm_gen_backend.cpp | 142 > ++ > backend/src/llvm/llvm_to_gen.cpp | 4 +- > 7 files changed, 93 insertions(+), 182 deletions(-) > > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index 43c0b25..18a02a0 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -92,11 +92,6 @@ namespace gbe >} > > #define OCL_PROFILING (bool)(getenv("OCL_PROFILING")[0]-48) > -#define SET_GENINSN_DBGINFO(INSN)\ > - if(INSN.dbginfo.hasdbginfo)\ > - p->setDbginfo(INSN.dbginfo.line,INSN.dbginfo.col);\ > - else p->setDbginfo(0,0) > - >void GenContext::emitInstructionStream(void) { > // Emit Gen ISA > for (auto : *sel->blockList) > @@ -106,6 +101,9 @@ namespace gbe >// no more virtual register here in that part of the code generation >GBE_ASSERT(insn.state.physicalFlag); >p->curr = insn.state; > + //meng > + p->line = insn.dbginfo.line; > + p->col = insn.dbginfo.col; >switch (opcode) { > #define DECL_SELECTION_IR(OPCODE, FAMILY) \ >case SEL_OP_##OPCODE: this->emit##FAMILY(insn); break; > @@ -119,7 +117,6 @@ namespace gbe > instruction prefetcher prefetch into an invalide page */ > for(int i = 0; i < 8; i++) > p->NOP(); > - p->setDbginfo(0,0); >} > >bool GenContext::patchBranches(void) { > @@ -255,7 +252,7 @@ namespace gbe >void GenContext::emitLabelInstruction(const 
SelectionInstruction ) { > const ir::LabelIndex label(insn.index); > this->labelPos.insert(std::make_pair(label, p->store.size())); > - SET_GENINSN_DBGINFO(insn); > + //SET_GENINSN_DBGINFO(insn); it seems to be a tmp comment, and should not be in the patches. >} > >void GenContext::emitUnaryInstruction(const SelectionInstruction ) { > @@ -330,7 +327,7 @@ namespace gbe > break; >default: NOT_IMPLEMENTED; > } > - SET_GENINSN_DBGINFO(insn); > + //SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitUnaryWithTempInstruction(const SelectionInstruction > ) { > @@ -489,7 +486,7 @@ namespace gbe >default: > NOT_IMPLEMENTED; > } > - SET_GENINSN_DBGINFO(insn); > + //SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitBinaryWithTempInstruction(const SelectionInstruction > ) { > @@ -590,7 +587,7 @@ namespace gbe >default: > NOT_IMPLEMENTED; > } > - SET_GENINSN_DBGINFO(insn); > + //SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitSimdShuffleInstruction(const SelectionInstruction > ) { > @@ -682,7 +679,7 @@ namespace gbe > break; >default: NOT_IMPLEMENTED; > } > - SET_GENINSN_DBGINFO(insn); > + //SET_GENINSN_DBGINFO(insn); >} > >void GenContext::collectShifter(GenRegister dest, GenRegister src) { > @@ -777,7 +774,7 @@ namespace gbe > } > storeTopHalf(dest, e); > storeBottomHalf(dest, f); > - SET_GENINSN_DBGINFO(insn); > + //SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitI64MADSATInstruction(const SelectionInstruction > ) { > @@ -909,7 +906,7 @@ namespace gbe > } > storeTopHalf(dest, g); > storeBottomHalf(dest, h); > - SET_GENINSN_DBGINFO(insn); > + //SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitI64HADDInstruction(const SelectionInstruction ) { > @@ -937,7 +934,7 @@ namespace gbe > p->OR(c, c, d); > storeBottomHalf(dest, a); > storeTopHalf(dest, c); > - SET_GENINSN_DBGINFO(insn); > + //SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitI64RHADDInstruction(const SelectionInstruction ) > { > @@ -968,7 +965,7 @@ namespace gbe > p->OR(c, c, d); > 
storeBottomHalf(dest, a); > storeTopHalf(dest, c); > - SET_GENINSN_DBGINFO(insn); > + //SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitI64ShiftInstruction(const SelectionInstruction ) > { > @@ -1075,7 +1072,7 @@ namespace gbe >default: > NOT_IMPLEMENTED; > } > - SET_GENINSN_DBGINFO(insn); > +
Re: [Beignet] [PATCH 3/7] pass dbginfo from gen ir to geninsn
On Fri, Sep 18, 2015 at 05:01:53PM +0800, Bai Yannan wrote: > Date: Fri, 18 Sep 2015 17:01:53 +0800 > From: Bai Yannan> To: beignet@lists.freedesktop.org > Cc: Bai Yannan > Subject: [Beignet] [PATCH 3/7] pass dbginfo from gen ir to geninsn > X-Mailer: git-send-email 1.9.1 > > 1, pass debug infomation first from gen ir to selection ir; > 2, pass debug infomation from selection ir to gen instruction; > 3, print line and column binded with ASM into a log file. > > Signed-off-by: Bai Yannan > --- > backend/src/backend/gen_context.cpp| 54 +++ > backend/src/backend/gen_defs.hpp | 4 ++ > backend/src/backend/gen_encoder.cpp| 15 ++ > backend/src/backend/gen_encoder.hpp| 4 ++ > backend/src/backend/gen_insn_selection.cpp | 84 > +- > backend/src/backend/gen_insn_selection.hpp | 7 +++ > 6 files changed, 167 insertions(+), 1 deletion(-) > > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index 25fdf08..411336e 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -91,6 +91,11 @@ namespace gbe > return i; >} > > +#define SET_GENINSN_DBGINFO(INSN)\ > + if(INSN.dbginfo.hasdbginfo)\ > + p->setDbginfo(INSN.dbginfo.line,INSN.dbginfo.col);\ > + else p->setDbginfo(0,0) > + >void GenContext::emitInstructionStream(void) { > // Emit Gen ISA > for (auto : *sel->blockList) > @@ -106,12 +111,14 @@ namespace gbe > #include "backend/gen_insn_selection.hxx" > #undef DECL_INSN >} > + //p->setDbginfo(insn.dbginfo.line,insn.dbginfo.col); >p->pop(); > } > /* per spec, pad the instruction stream with 8 nop to avoid > instruction prefetcher prefetch into an invalide page */ > for(int i = 0; i < 8; i++) > p->NOP(); > + p->setDbginfo(0,0); >} > >bool GenContext::patchBranches(void) { > @@ -241,6 +248,7 @@ namespace gbe >p->curr.execWidth = this->simdWidth; >p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0)); > p->pop(); > + //SET_GENINSN_DBGINFO(0); >} > >void GenContext::emitLabelInstruction(const 
SelectionInstruction ) { > @@ -320,6 +328,7 @@ namespace gbe > break; >default: NOT_IMPLEMENTED; > } > + SET_GENINSN_DBGINFO(insn); I think that SET_GENINSN_DBGINFO should be add to a common place like in instruction.cpp. >} > >void GenContext::emitUnaryWithTempInstruction(const SelectionInstruction > ) { > @@ -478,6 +487,7 @@ namespace gbe >default: > NOT_IMPLEMENTED; > } > + SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitBinaryWithTempInstruction(const SelectionInstruction > ) { > @@ -578,6 +588,7 @@ namespace gbe >default: > NOT_IMPLEMENTED; > } > + SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitSimdShuffleInstruction(const SelectionInstruction > ) { > @@ -620,6 +631,10 @@ namespace gbe > const GenRegister dst = ra->genReg(insn.dst(0)); > const GenRegister src0 = ra->genReg(insn.src(0)); > const GenRegister src1 = ra->genReg(insn.src(1)); > + // debug > + if(insn.dbginfo.hasdbginfo) > + std::cout<<"*** "< + > switch (insn.opcode) { >case SEL_OP_SEL: p->SEL(dst, src0, src1); break; >case SEL_OP_SEL_INT64: > @@ -668,6 +683,7 @@ namespace gbe > break; >default: NOT_IMPLEMENTED; > } > + SET_GENINSN_DBGINFO(insn); >} > >void GenContext::collectShifter(GenRegister dest, GenRegister src) { > @@ -762,6 +778,7 @@ namespace gbe > } > storeTopHalf(dest, e); > storeBottomHalf(dest, f); > + SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitI64MADSATInstruction(const SelectionInstruction > ) { > @@ -893,6 +910,7 @@ namespace gbe > } > storeTopHalf(dest, g); > storeBottomHalf(dest, h); > + SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitI64HADDInstruction(const SelectionInstruction ) { > @@ -920,6 +938,7 @@ namespace gbe > p->OR(c, c, d); > storeBottomHalf(dest, a); > storeTopHalf(dest, c); > + SET_GENINSN_DBGINFO(insn); >} > >void GenContext::emitI64RHADDInstruction(const SelectionInstruction ) > { > @@ -950,6 +969,7 @@ namespace gbe > p->OR(c, c, d); > storeBottomHalf(dest, a); > storeTopHalf(dest, c); > + SET_GENINSN_DBGINFO(insn); >} > >void 
GenContext::emitI64ShiftInstruction(const SelectionInstruction ) > { > @@ -1056,6 +1076,7 @@ namespace gbe >default: > NOT_IMPLEMENTED; > } > + SET_GENINSN_DBGINFO(insn); >} >void GenContext::setFlag(GenRegister flagReg, GenRegister src) { > p->push(); > @@ -1211,6 +1232,7 @@
Re: [Beignet] [PATCH 6/8] Backend: Implement FDIV64 on BDW.
On Tue, Sep 15, 2015 at 06:00:57AM -0700, Matt Turner wrote: > Date: Tue, 15 Sep 2015 06:00:57 -0700 > From: Matt Turner <matts...@gmail.com> > To: "junyan.he" <junyan...@inbox.com> > Cc: "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org> > Subject: Re: [Beignet] [PATCH 6/8] Backend: Implement FDIV64 on BDW. > > On Tue, Sep 15, 2015 at 4:15 AM, <junyan...@inbox.com> wrote: > > From: Junyan He <junyan...@linux.intel.com> > > > > According to the document, we use a set of instructions > > to implement double type division. > > > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > > --- > > backend/src/backend/gen8_context.cpp | 68 > > > > backend/src/backend/gen8_context.hpp | 2 ++ > > 2 files changed, 70 insertions(+) > > > > diff --git a/backend/src/backend/gen8_context.cpp > > b/backend/src/backend/gen8_context.cpp > > index b497ee5..f465832 100644 > > --- a/backend/src/backend/gen8_context.cpp > > +++ b/backend/src/backend/gen8_context.cpp > > @@ -924,6 +924,74 @@ namespace gbe > > this->unpackLongVec(src, dst, p->curr.execWidth); > >} > > > > + void Gen8Context::emitF64DIVInstruction(const SelectionInstruction > > ) { > > +/* Macro for Double Precision IEEE Compliant fdiv > > + > > + Set Rounding Mode in CR to RNE > > + GRF are initialized: r0 = 0, r6 = a, r7 = b, r1 = 1 > > + The default data type for the macro is :df > > + > > + math.eo.f0.0 (4) r8.acc2 r6.noacc r7.noacc 0xE > > + (-f0.0) if > > + madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2 // Step(1), q0=a*y0 > > + madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2 // Step(2), > > e0=(1-b*y0) > > + madm (4) r11.acc5 r6.noacc -r7.noacc r9.acc3 // Step(3), > > r0=a-b*q0 > > + madm (4) r12.acc6 r8.acc2 r10.acc4 r8.acc2 // Step(4), > > y1=y0+e0*y0 > > + madm (4) r13.acc7 r1.noacc -r7.noacc r12.acc6// Step(5), > > e1=(1-b*y1) > > + madm (4) r8.acc8 r8.acc2 r10.acc4 r12.acc6 // Step(6), > > y2=y0+e0*y1 > > + madm (4) r9.acc9 r9.acc3 r11.acc5 r12.acc6 // Step(7), > > q1=q0+r0*y1 > > + madm (4) r12.acc2 
r12.acc6 r8.acc8 r13.acc7 // Step(8), > > y3=y1+e1*y2 > > + madm (4) r11.acc3 r6.noacc -r7.noacc r9.acc9 // Step(9), > > r1=a-b*q1 > > + > > + Change Rounding Mode in CR if required > > + Implicit Accumulator for destination is NULL > > + > > + madm (4) r8.noacc r9.acc9 r11.acc3 r12.acc2 // Step(10), > > q=q1+r1*y3 > > + endif */ > > I don't see an IF or an ENDIF instruction emitted in the code below. > Is that intentional, or am I misreading the code? > Here, we use f0.1 as the predication for all the instructions, like: (-f0.1) madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2 (-f0.1) madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2 . I avoid using IF-Endif here, because we need to calculate the instruction number within IF clause, and it is not convenient. > > +GenRegister r6 = GenRegister::retype(ra->genReg(insn.src(0)), > > GEN_TYPE_DF); > > +GenRegister r7 = GenRegister::retype(ra->genReg(insn.src(1)), > > GEN_TYPE_DF); > > +GenRegister r8 = GenRegister::retype(ra->genReg(insn.dst(0)), > > GEN_TYPE_DF); > > +const GenRegister r0 = GenRegister::retype(ra->genReg(insn.dst(1)), > > GEN_TYPE_DF); > > +const GenRegister r1 = GenRegister::retype(ra->genReg(insn.dst(2)), > > GEN_TYPE_DF); > > +const GenRegister r9 = GenRegister::retype(ra->genReg(insn.dst(3)), > > GEN_TYPE_DF); > > +const GenRegister r10 = GenRegister::retype(ra->genReg(insn.dst(4)), > > GEN_TYPE_DF); > > +const GenRegister r11 = GenRegister::retype(ra->genReg(insn.dst(5)), > > GEN_TYPE_DF); > > +const GenRegister r12 = GenRegister::retype(ra->genReg(insn.dst(6)), > > GEN_TYPE_DF); > > +const GenRegister r13 = GenRegister::retype(ra->genReg(insn.dst(7)), > > GEN_TYPE_DF); > > +Gen8Encoder *p8 = reinterpret_cast(p); > > +p->push(); { > > + p->curr.execWidth = 4; > > + p->curr.predicate = GEN_PREDICATE_NONE; > > + p->curr.noMask= 1; > > + p->MOV(r1, GenRegister::immdf(1.0d)); > > + p->MOV(r0, GenRegister::immdf(0.0d)); > > + > > + for (int i = 0; i < (simdWidth == 16 ? 4 : 2); i++) { > > +p->curr.predicate
Re: [Beignet] [PATCH 5/8] Backend: Add the MADM function to gen8 encoder.
On Tue, Sep 15, 2015 at 05:57:13AM -0700, Matt Turner wrote: > Date: Tue, 15 Sep 2015 05:57:13 -0700 > From: Matt Turner <matts...@gmail.com> > To: "junyan.he" <junyan...@inbox.com> > Cc: "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org> > Subject: Re: [Beignet] [PATCH 5/8] Backend: Add the MADM function to gen8 > encoder. > > On Tue, Sep 15, 2015 at 4:15 AM, <junyan...@inbox.com> wrote: > > From: Junyan He <junyan...@linux.intel.com> > > > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > > --- > > backend/src/backend/gen8_encoder.cpp | 56 > > > > backend/src/backend/gen8_encoder.hpp | 2 ++ > > backend/src/backend/gen_defs.hpp | 2 ++ > > 3 files changed, 60 insertions(+) > > > > diff --git a/backend/src/backend/gen8_encoder.cpp > > b/backend/src/backend/gen8_encoder.cpp > > index 0af27a3..002a8b5 100644 > > --- a/backend/src/backend/gen8_encoder.cpp > > +++ b/backend/src/backend/gen8_encoder.cpp > > @@ -591,4 +591,60 @@ namespace gbe > > this->setSrc0WithAcc(insn, src0, src0Acc); > > this->setSrc1WithAcc(insn, src1, src1Acc); > >} > > + > > + void Gen8Encoder::MADM(GenRegister dst, GenRegister src0, GenRegister > > src1, GenRegister src2, > > + uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc, uint32_t > > src2Acc) > > + { > > +GenNativeInstruction *insn = this->next(GEN_OPCODE_MADM); > > +Gen8NativeInstruction *gen8_insn = >gen8_insn; > > +assert(dst.file == GEN_GENERAL_REGISTER_FILE); > > +assert(src0.file == GEN_GENERAL_REGISTER_FILE); > > +assert(src1.file == GEN_GENERAL_REGISTER_FILE); > > +assert(src2.file == GEN_GENERAL_REGISTER_FILE); > > +assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == > > GEN_HORIZONTAL_STRIDE_0); > > +assert(src0.type == GEN_TYPE_DF || src0.type == GEN_TYPE_F); > > +assert(src0.type == dst.type); > > +assert(src0.type == src1.type); > > +assert(src0.type == src2.type); > > +int32_t dataType = src0.type == GEN_TYPE_DF ? 
3 : 0; > > + > > +this->setHeader(insn); > > +gen8_insn->bits1.da3srcacc.dest_reg_nr = dst.nr; > > +gen8_insn->bits1.da3srcacc.dest_subreg_nr = dst.subnr / 16; > > +gen8_insn->bits1.da3srcacc.dst_specal_acc = dstAcc; > > +gen8_insn->bits1.da3srcacc.src_type = dataType; > > +gen8_insn->bits1.da3srcacc.dest_type = dataType; > > +gen8_insn->header.access_mode = GEN_ALIGN_16; > > + > > +assert(src0.file == GEN_GENERAL_REGISTER_FILE); > > +assert(src0.address_mode == GEN_ADDRESS_DIRECT); > > +assert(src0.nr < 128); > > +gen8_insn->bits2.da3srcacc.src0_specal_acc = src0Acc; > > +gen8_insn->bits2.da3srcacc.src0_subreg_nr = src0.subnr / 4 ; > > +gen8_insn->bits2.da3srcacc.src0_reg_nr = src0.nr; > > +gen8_insn->bits1.da3srcacc.src0_abs = src0.absolute; > > +gen8_insn->bits1.da3srcacc.src0_negate = src0.negation; > > +gen8_insn->bits2.da3srcacc.src0_rep_ctrl = src0.vstride == > > GEN_VERTICAL_STRIDE_0; > > + > > +assert(src1.file == GEN_GENERAL_REGISTER_FILE); > > +assert(src1.address_mode == GEN_ADDRESS_DIRECT); > > +assert(src1.nr < 128); > > +gen8_insn->bits2.da3srcacc.src1_specal_acc = src1Acc; > > +gen8_insn->bits2.da3srcacc.src1_subreg_nr_low = (src1.subnr / 4) & 0x3; > > +gen8_insn->bits3.da3srcacc.src1_subreg_nr_high = (src1.subnr / 4) >> 2; > > +gen8_insn->bits2.da3srcacc.src1_rep_ctrl = src1.vstride == > > GEN_VERTICAL_STRIDE_0; > > +gen8_insn->bits3.da3srcacc.src1_reg_nr = src1.nr; > > +gen8_insn->bits1.da3srcacc.src1_abs = src1.absolute; > > +gen8_insn->bits1.da3srcacc.src1_negate = src1.negation; > > + > > +assert(src2.file == GEN_GENERAL_REGISTER_FILE); > > +assert(src2.address_mode == GEN_ADDRESS_DIRECT); > > +assert(src2.nr < 128); > > +gen8_insn->bits3.da3srcacc.src2_specal_acc = src2Acc; > > +gen8_insn->bits3.da3srcacc.src2_subreg_nr = src2.subnr / 4; > > +gen8_insn->bits3.da3srcacc.src2_rep_ctrl = src2.vstride == > > GEN_VERTICAL_STRIDE_0; > > +gen8_insn->bits3.da3srcacc.src2_reg_nr = src2.nr; > > +gen8_insn->bits1.da3srcacc.src2_abs = src2.absolute; > > 
+gen8_insn->bits1.da3srcacc.src2_negate = src2.negation; > > + } > > } /* End of the name space. */ > > diff --git a/b
Re: [Beignet] [PATCH 00/19 V2] Add Profiling support in beignet.
I think it would be better to do this after we integrate the binary_to_source feature. Then it will be easy and clear to describe how to use the profiling feature. On Thu, Sep 10, 2015 at 05:56:17AM +, Zou, Nanhai wrote: > Date: Thu, 10 Sep 2015 05:56:17 + > From: "Zou, Nanhai"> To: "junyan...@inbox.com" , > "beignet@lists.freedesktop.org" > Subject: Re: [Beignet] [PATCH 00/19 V2] Add Profiling support in beignet. > > It will be nice if you can add a simple how-to-profile-your-kernel document > in docs/howto > > Thanks > Zou Nanhai > > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet > ___ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 19/19] runtime: Add fp16 extension to BDW later platform.
The half float can work for BSW, I will send a standalone patch to enable it later. On 2015年06月19日 15:18, Yang, Rong R wrote: One concern: Does cherryview support half? The other part of the patchset LGTM. -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of junyan...@inbox.com Sent: Thursday, June 11, 2015 19:26 To: beignet@lists.freedesktop.org Cc: Junyan He Subject: [Beignet] [PATCH 19/19] runtime: Add fp16 extension to BDW later platform. From: Junyan He junyan...@linux.intel.com Signed-off-by: Junyan He junyan...@linux.intel.com --- src/cl_device_id.c | 123 ++--- -- src/cl_device_id.h | 1 + src/cl_extensions.c | 29 ++-- src/cl_extensions.h | 2 + src/cl_gt_device.h | 1 + src/cl_platform_id.c | 2 +- 6 files changed, 102 insertions(+), 56 deletions(-) diff --git a/src/cl_device_id.c b/src/cl_device_id.c index 215f7f2..09171f8 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -26,6 +26,7 @@ #include cl_khr_icd.h #include cl_thread.h #include CL/cl.h +#include CL/cl_ext.h #include cl_gbe_loader.h #include cl_alloc.h @@ -398,6 +399,8 @@ baytrail_t_device_break: case PCI_CHIP_BROADWLL_U_GT1: DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, Intel(R) HD Graphics BroadWell ULX GT1); brw_gt1_break: + /* For Gen8 and later, half float is suppported and we will enable cl_khr_fp16. 
*/ + cl_intel_platform_enable_fp16_extension(intel_platform); intel_brw_gt1_device.vendor_id = device_id; intel_brw_gt1_device.platform = intel_platform; ret = intel_brw_gt1_device; @@ -414,6 +417,7 @@ brw_gt1_break: case PCI_CHIP_BROADWLL_U_GT2: DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, Intel(R) HD Graphics BroadWell ULX GT2); brw_gt2_break: + cl_intel_platform_enable_fp16_extension(intel_platform); intel_brw_gt2_device.vendor_id = device_id; intel_brw_gt2_device.platform = intel_platform; ret = intel_brw_gt2_device; @@ -430,6 +434,7 @@ brw_gt2_break: case PCI_CHIP_BROADWLL_U_GT3: DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, Intel(R) HD Graphics BroadWell ULX GT2); brw_gt3_break: + cl_intel_platform_enable_fp16_extension(intel_platform); intel_brw_gt3_device.vendor_id = device_id; intel_brw_gt3_device.platform = intel_platform; ret = intel_brw_gt3_device; @@ -447,61 +452,65 @@ chv_break: break; - case PCI_CHIP_SKYLAKE_ULT_GT1: - DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R) HD Graphics Skylake ULT GT1); - case PCI_CHIP_SKYLAKE_ULX_GT1: - DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R) HD Graphics Skylake ULX GT1); - case PCI_CHIP_SKYLAKE_DT_GT1: - DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R) HD Graphics Skylake Desktop GT1); - case PCI_CHIP_SKYLAKE_HALO_GT1: - DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R) HD Graphics Skylake Halo GT1); - case PCI_CHIP_SKYLAKE_SRV_GT1: - DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R) HD Graphics Skylake Server GT1); +case PCI_CHIP_SKYLAKE_ULT_GT1: + DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R) HD Graphics Skylake ULT GT1); +case PCI_CHIP_SKYLAKE_ULX_GT1: + DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R) HD Graphics Skylake ULX GT1); +case PCI_CHIP_SKYLAKE_DT_GT1: + DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R) HD Graphics 
Skylake Desktop GT1); +case PCI_CHIP_SKYLAKE_HALO_GT1: + DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R) HD Graphics Skylake Halo GT1); +case PCI_CHIP_SKYLAKE_SRV_GT1: + DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, + Intel(R) HD Graphics Skylake Server GT1); skl_gt1_break: - intel_skl_gt1_device.vendor_id = device_id; - intel_skl_gt1_device.platform = intel_platform; - ret = intel_skl_gt1_device; - break; - - case PCI_CHIP_SKYLAKE_ULT_GT2: - DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, Intel(R) HD Graphics Skylake ULT GT2); - case PCI_CHIP_SKYLAKE_ULT_GT2F: - DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, Intel(R) HD Graphics Skylake ULT GT2F); - case PCI_CHIP_SKYLAKE_ULX_GT2: - DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, Intel(R) HD Graphics Skylake ULX GT2); - case PCI_CHIP_SKYLAKE_DT_GT2: - DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, Intel(R) HD Graphics Skylake Desktop GT2); - case PCI_CHIP_SKYLAKE_HALO_GT2: - DECL_INFO_STRING(skl_gt2_break
Re: [Beignet] thread safety and OpenMP
As far as we know, Beignet is thread safe. Every thread has its own command buffer and has no relationship with the others. Do you use sub-buffers to divide the image? If you can provide some source code or a test case, it would help a lot. -Original Message- From: Song, Ruiling Sent: Wednesday, July 01, 2015 10:40 AM To: Gerald Baier; beignet@lists.freedesktop.org Cc: He, Junyan Subject: RE: [Beignet] thread safety and OpenMP -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Gerald Baier Sent: Sunday, June 28, 2015 9:56 PM To: beignet@lists.freedesktop.org Subject: [Beignet] thread safety and OpenMP I'm using Beignet for image processing, where basically the image is subdivided into tiles which are then processed by several threads using OpenMP tasks. I noticed that some of the tiles are occasionally messed up. If Using OpenMP or OpenCL? Per OpenCL spec, opencl driver should be thread-safe, and Junyan implemented the multi-thread support in beignet. That is to say Beignet is thread-safe. I am not sure whether Junyan has more comments on how to debug the issue. By the way, if you can provide a test case to reproduce the issue, it would be much helpful. Thanks! Ruiling I use only one thread everything works fine, also the same program runs as expected on nvidia GPUs with multiple threads. Hence the question whether Beignet is thread safe and how I could debug my program? Here's my configuration: device name: Intel(R) HD Graphics IvyBridge M GT2 device version: OpenCL 1.2 beignet 1.0.3 (git-9e0ca6f) Best regards, Gerald ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 1/8] Backend: Add half float as a new type.
After some research, I found that the F16C feature is only supported on IVB and later platforms and needs at least GCC 4.6. This will cause some compatibility issues. The beignet project may be cross-compiled on an old platform using an old version of GCC, and someone may use another compiler to build it. Because of this, I still prefer to use the software emulation for half float. It's verbose, but it will not have any side effect on runtime performance. On 2015年05月22日 14:28, He Junyan wrote: Thanks for your information. I will do some research for it. On 2015年05月22日 05:51, Matt Turner wrote: On Thu, May 21, 2015 at 1:25 AM, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Because the CPU of X86 does not support half float instructions, there is no support for half float operations. So we introduce the half class to handle the operations for half float using llvm's APFloat utility. Ivybridge and newer have the F16C instruction set (http://en.wikipedia.org/wiki/F16C) which offers instructions to convert half-precision <-> single-precision floats. I don't know if it's valuable to use it, but it's there. ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 1/2] [opencl-2.0] enable create image 2d from buffer in clCreateImage.
Some comments, On 2015年04月03日 13:39, xionghu@intel.com wrote: From: Luo Xionghu xionghu@intel.com this patch allows create 2d image with a cl buffer. Signed-off-by: Luo Xionghu xionghu@intel.com --- src/cl_api.c | 3 ++- src/cl_mem.c | 67 +++- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/src/cl_api.c b/src/cl_api.c index cd4020e..25e621a 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -549,8 +549,9 @@ clCreateImage(cl_context context, goto error; } /* buffer refers to a valid buffer memory object if image_type is - CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */ + CL_MEM_OBJECT_IMAGE1D_BUFFER or CL_MEM_OBJECT_IMAGE2D. Otherwise it must be NULL. */ if (image_desc-image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER + image_desc-image_type != CL_MEM_OBJECT_IMAGE2D image_desc-buffer) { err = CL_INVALID_IMAGE_DESCRIPTOR; goto error; diff --git a/src/cl_mem.c b/src/cl_mem.c index b41ec14..3c5667e 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -971,26 +971,47 @@ _cl_mem_new_image_from_buffer(cl_context ctx, if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, bpp)) != CL_SUCCESS)) goto error; - // Per bspec, a image should has a at least 2 line vertical alignment, - // thus we can't simply attach a buffer to a 1d image surface which has the same size. - // We have to create a new image, and copy the buffer data to this new image. - // And replace all the buffer object's reference to this image. - image = _cl_mem_new_image(ctx, flags, image_format, image_desc-image_type, + if(image_desc-image_type == CL_MEM_OBJECT_IMAGE2D) { Spec says: The restrictions are: all the values specified in image_desc except for mem_object must match the image descriptor information associated with mem_object. the channel data type specified in image_format must match the channel data type associated with mem_object. So I think here we may need to add some check. 
+image = _cl_mem_new_image(ctx, flags, image_format, image_desc-image_type, + image_desc-image_width, image_desc-image_height, image_desc-image_depth, + image_desc-image_row_pitch, image_desc-image_slice_pitch, + image_desc-buffer, errcode_ret); ~~~ here, why image_desc-buffer? + } else if (image_desc-image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) { +// Per bspec, a image should has a at least 2 line vertical alignment, +// thus we can't simply attach a buffer to a 1d image surface which has the same size. +// We have to create a new image, and copy the buffer data to this new image. +// And replace all the buffer object's reference to this image. +image = _cl_mem_new_image(ctx, flags, image_format, image_desc-image_type, mem_buffer-base.size / bpp, 0, 0, 0, 0, NULL, errcode_ret); + } + else +assert(0); + if (image == NULL) return NULL; - void *src = cl_mem_map(buffer, 0); - void *dst = cl_mem_map(image, 1); - // - // FIXME, we could use copy buffer to image to do this on GPU latter. - // currently the copy buffer to image function doesn't support 1D image. - // - // There is a potential risk that this buffer was mapped and the caller - // still hold the pointer and want to access it again. This scenario is - // not explicitly forbidden in the spec, although it should not be permitted. - memcpy(dst, src, mem_buffer-base.size); - cl_mem_unmap(buffer); - cl_mem_unmap(image); + + if(image_desc-image_type == CL_MEM_OBJECT_IMAGE2D) + { +size_t origin[] = {0,0,0}; +size_t region[] = {image_desc-image_width, image_desc-image_height, 1}; +clEnqueueCopyBufferToImage(ctx-queues, buffer, image, 0, origin, region, 0, NULL, NULL); + } + else if (image_desc-image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) + { +// FIXME, we could use copy buffer to image to do this on GPU latter. +// currently the copy buffer to image function doesn't support 1D image. +// +// There is a potential risk that this buffer was mapped and the caller +// still hold the pointer and want to access it again. 
This scenario is +// not explicitly forbidden in the spec, although it should not be permitted. +void *src = cl_mem_map(buffer, 0); +void *dst = cl_mem_map(image, 1); +memcpy(dst, src, mem_buffer-base.size); +cl_mem_unmap(image); +cl_mem_unmap(buffer); + } + else +assert(0); if (err != 0) goto error; @@ -1025,12 +1046,20 @@ cl_mem_new_image(cl_context context, { switch (image_desc-image_type) { case CL_MEM_OBJECT_IMAGE1D: - case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE3D: return _cl_mem_new_image(context, flags, image_format, image_desc-image_type, image_desc-image_width,
Re: [Beignet] [PATCH 1/8] Backend: Add half float as a new type.
Thanks for the information. I will do some research on it. On 2015年05月22日 05:51, Matt Turner wrote: On Thu, May 21, 2015 at 1:25 AM, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Because the CPU of X86 does not support half float instructions, there is no support for half float operations. So we introduce the half class to handle the operations for half float using llvm's APFloat utility. Ivybridge and newer have the F16C instruction set (http://en.wikipedia.org/wiki/F16C) which offers instructions to convert half-precision <-> single-precision floats. I don't know if it's valuable to use it, but it's there. ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH OpenCL 2.0] Backend: Update the workgroup instructions for llvm backend to gen.
I have modified it in the new patch set. Just ignore this one. Thanks. On 2015年04月30日 13:49, Zhigang Gong wrote: Junyan, I haven't found any new response addressing this comment from Rong and me. Did you miss this comment, or did I miss your new patch? Thanks, Zhigang Gong. On Thu, Apr 02, 2015 at 12:53:30PM +0800, Zhigang Gong wrote: Right, especially for those builtin functions which don't care about the sign. Junyan, could you refine your patch accordingly? Thanks. On Tue, Mar 24, 2015 at 07:39:03AM +0000, Yang, Rong R wrote: Zhigang has added the function OCLIntrinsicMap.find to handle overloaded function names, so only one DECL_LLVM_GEN_FUNCTION is needed for each group of overloaded functions, and GenWriter::emitCallInst gets the corresponding argument type. It reduces the number of DECL_LLVM_GEN_FUNCTION entries significantly. Can you also use this method? -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of junyan...@inbox.com Sent: Tuesday, March 24, 2015 14:40 To: beignet@lists.freedesktop.org Cc: Junyan He Subject: [Beignet] [PATCH OpenCL 2.0] Backend: Update the workgroup instructions for llvm backend to gen.
From: Junyan He junyan...@linux.intel.com Signed-off-by: Junyan He junyan...@linux.intel.com --- backend/src/llvm/llvm_gen_ocl_function.hxx | 87 1 file changed, 87 insertions(+) diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 9536a3c..947fadc 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -160,3 +160,90 @@ DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region) // printf function DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf) + +// work group function +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_1D, +_Z30__gen_ocl_work_group_broadcastij) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_2D, +_Z30__gen_ocl_work_group_broadcastijj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_3D, +_Z30__gen_ocl_work_group_broadcastijjj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_1D, +_Z30__gen_ocl_work_group_broadcastjj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_2D, +_Z30__gen_ocl_work_group_broadcastjjj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_3D, +_Z30__gen_ocl_work_group_broadcast) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_LONG_1D, +_Z30__gen_ocl_work_group_broadcastlj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_LONG_2D, +_Z30__gen_ocl_work_group_broadcastljj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_lONG_3D, +_Z30__gen_ocl_work_group_broadcastljjj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_1D, +_Z30__gen_ocl_work_group_broadcastmj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_2D, +_Z30__gen_ocl_work_group_broadcastmjj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_3D, +_Z30__gen_ocl_work_group_broadcastmjjj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_1D, +_Z30__gen_ocl_work_group_broadcastfj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_2D, +_Z30__gen_ocl_work_group_broadcastfjj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_3D, +_Z30__gen_ocl_work_group_broadcastfjjj) 
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_1D, +_Z30__gen_ocl_work_group_broadcastdj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_2D, +_Z30__gen_ocl_work_group_broadcastdjj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_3D, +_Z30__gen_ocl_work_group_broadcastdjjj) + +// work group reduce +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_DOUBLE, +_Z31__gen_ocl_work_group_reduce_addd) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_FLOAT, +_Z31__gen_ocl_work_group_reduce_addf) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_INT, +_Z31__gen_ocl_work_group_reduce_addi) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_UINT, +_Z31__gen_ocl_work_group_reduce_addj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_LONG, +_Z31__gen_ocl_work_group_reduce_addl) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_ULONG, +_Z31__gen_ocl_work_group_reduce_addm) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXD, +_Z31__gen_ocl_work_group_reduce_maxd) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXF, +_Z31__gen_ocl_work_group_reduce_maxf) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXI, +_Z31__gen_ocl_work_group_reduce_maxi) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXJ, +_Z31__gen_ocl_work_group_reduce_maxj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXL, +_Z31__gen_ocl_work_group_reduce_maxl) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXM, +_Z31__gen_ocl_work_group_reduce_maxm) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MIND, +_Z31__gen_ocl_work_group_reduce_mind) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINF, +_Z31__gen_ocl_work_group_reduce_minf) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINI, +_Z31__gen_ocl_work_group_reduce_mini) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINJ, +_Z31__gen_ocl_work_group_reduce_minj) +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINL
Re: [Beignet] [PATCH] BDW: Refine I64HADD and I64RHADD.
OK, it's a better way to avoid the usage of addc. I think tmp_dst can also be avoided here to save one tmp register. On 2015年03月23日 15:44, Song, Ruiling wrote: Good idea, the patch LGTM. -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Yang Rong Sent: Monday, March 23, 2015 2:26 PM To: beignet@lists.freedesktop.org Cc: Yang, Rong R Subject: [Beignet] [PATCH] BDW: Refine I64HADD and I64RHADD. HADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1)), and RHADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1)). Signed-off-by: Yang Rong rong.r.y...@intel.com --- backend/src/backend/gen8_context.cpp | 114 - backend/src/backend/gen_insn_selection.cpp | 8 +- 2 files changed, 20 insertions(+), 102 deletions(-) diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp index 3f57cf6..b136902 100644 --- a/backend/src/backend/gen8_context.cpp +++ b/backend/src/backend/gen8_context.cpp @@ -651,58 +651,21 @@ namespace gbe GenRegister tmp0 = ra-genReg(insn.dst(1)); GenRegister tmp1 = ra-genReg(insn.dst(2)); GenRegister tmp_dst = ra-genReg(insn.dst(3)); -int execWidth = p-curr.execWidth; /* Src0 and Src1 are always unsigned long type.*/ GBE_ASSERT(src0.type == GEN_TYPE_UL src1.type == GEN_TYPE_UL); dst.type = src0.type; -tmp0.type = tmp1.type = GEN_TYPE_UD; +tmp0.type = tmp1.type = GEN_TYPE_UL; tmp_dst.type = GEN_TYPE_UL; GBE_ASSERT(tmp_dst.subnr == 0); -GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(tmp_dst, GEN_TYPE_UD) : - GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr, tmp_dst.subnr), GEN_TYPE_UD); -GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ? - GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4), GEN_TYPE_UD) : - GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth / 8, tmp_dst.subnr), GEN_TYPE_UD); -GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
- GenRegister::retype(src0, GEN_TYPE_UD) : GenRegister::unpacked_ud(src0.nr, src0.subnr); -GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ? - GenRegister::retype(GenRegister::offset(src0, 0, 4), GEN_TYPE_UD) : - GenRegister::unpacked_ud(src0.nr, src0.subnr + 1); -GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ? - GenRegister::retype(src1, GEN_TYPE_UD) : GenRegister::unpacked_ud(src1.nr, src1.subnr); -GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ? - GenRegister::retype(GenRegister::offset(src1, 0, 4), GEN_TYPE_UD) : - GenRegister::unpacked_ud(src1.nr, src1.subnr + 1); - -GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D); -p-push(); -p-curr.execWidth = 8; -p-ADDC(dl, s0l, s1l); -p-MOV(tmp0, acc0); -p-ADDC(dh, s0h, s1h); -p-MOV(tmp1, acc0); -p-ADDC(dh, dh, tmp0); -p-MOV(tmp0, acc0); -p-ADD(tmp1, tmp0, tmp1); - -if (execWidth == 16) { - p-curr.quarterControl = 1; - p-ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1), GenRegister::Qn(s1l, 1)); - p-MOV(GenRegister::Qn(tmp0, 1), acc0); - p-ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1), GenRegister::Qn(s1h, 1)); - p-MOV(GenRegister::Qn(tmp1, 1), acc0); - p-ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1), GenRegister::Qn(tmp0, 1)); - p-MOV(GenRegister::Qn(tmp0, 1), acc0); - p-ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp1, 1)); -} -p-pop(); - -packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD), GenRegister::retype(dst, GEN_TYPE_UD), execWidth); - -p-SHR(dst, dst, GenRegister::immud(1)); -p-SHL(tmp_dst, tmp1, GenRegister::immud(63)); +//hadd = (src01) + (src11) + ((src00x1) (src10x1)) +p-AND(tmp0, src0, GenRegister::immud(1)); +p-AND(tmp1, src1, GenRegister::immud(1)); +p-AND(tmp_dst, tmp0, tmp1); +p-SHR(tmp0, src0, GenRegister::immud(1)); +p-SHR(tmp1, src1, GenRegister::immud(1)); +p-ADD(dst, tmp0, tmp1); p-ADD(dst, dst, tmp_dst); } @@ -714,66 +677,21 @@ namespace gbe GenRegister tmp0 = ra-genReg(insn.dst(1)); GenRegister 
tmp1 = ra-genReg(insn.dst(2)); GenRegister tmp_dst = ra-genReg(insn.dst(3)); -int execWidth = p-curr.execWidth; /* Src0 and Src1 are always unsigned long type.*/ GBE_ASSERT(src0.type == GEN_TYPE_UL src1.type == GEN_TYPE_UL); dst.type = src0.type; -tmp0.type = tmp1.type = GEN_TYPE_UD; +tmp0.type = tmp1.type = GEN_TYPE_UL; tmp_dst.type = GEN_TYPE_UL; GBE_ASSERT(tmp_dst.subnr == 0); -GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(tmp_dst, GEN_TYPE_UD) : - GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr, tmp_dst.subnr), GEN_TYPE_UD); -GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ? -
Re: [Beignet] [V2 PATCH 5/7] Backend: Handle the bswap using indirect mode access.
On 2015年03月09日 09:11, Zhigang Gong wrote: On Fri, Mar 06, 2015 at 03:24:00PM +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com The swap for short will be like: mov(1) a01:UD0xe600e61UD{ align1 WE_all }; mov(1) a0.11:UD 0xe620e63UD{ align1 WE_all }; mov(1) a0.21:UD 0xe640e65UD{ align1 WE_all }; mov(1) a0.31:UD 0xe660e67UD{ align1 WE_all }; mov(8) g1141:UB g[a0]VxH,1,0:UB { align1 WE_all 1Q }; mov(8) g114.81:UBg[a0 8]VxH,1,0:UB{ align1 WE_all 1Q }; mov(8) g114.161:UB g[a0 16]VxH,1,0:UB { align1 WE_all 1Q }; mov(8) g114.241:UB g[a0 24]VxH,1,0:UB { align1 WE_all 1Q }; mov(16) g1131:UW g1148,8,1:UW { align1 WE_normal 1H }; Signed-off-by: Junyan He junyan...@linux.intel.com --- backend/src/backend/gen_context.cpp| 112 backend/src/backend/gen_insn_selection.cpp |9 +++ backend/src/backend/gen_insn_selection.hxx |1 + 3 files changed, 122 insertions(+) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 6856510..46b4a06 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -297,6 +297,118 @@ namespace gbe p-MOV(dst.top_half(this-simdWidth), GenRegister::immud(0)); break; } + case SEL_OP_BSWAP: { +uint32_t simd = p-curr.execWidth; +GBE_ASSERT(simd == 8 || simd == 16 || simd == 1); +uint16_t new_a0[16]; +memset(new_a0, 0, sizeof(new_a0)); + +GBE_ASSERT(src.type == dst.type); +uint32_t start_addr = src.nr*32 + src.subnr; + +if (simd == 1) { + GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0 + dst.hstride == GEN_HORIZONTAL_STRIDE_0); + if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) { +GBE_ASSERT(start_addr = 0); +new_a0[0] = start_addr + 3; +new_a0[1] = start_addr + 2; +new_a0[2] = start_addr + 1; +new_a0[3] = start_addr; +this-setA0Content(new_a0, 0, 4); + +p-push(); +p-curr.execWidth = 4; +p-curr.predicate = GEN_PREDICATE_NONE; +p-curr.noMask = 1; +GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), +a0[0], new_a0[0] - a0[0]); 
+GenRegister dst_ = dst; +dst_.type = GEN_TYPE_UB; +dst_.hstride = GEN_HORIZONTAL_STRIDE_1; +dst_.width = GEN_WIDTH_4; +dst_.vstride = GEN_VERTICAL_STRIDE_4; +p-MOV(dst_, ind_src); +p-pop(); + } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) { +p-MOV(GenRegister::retype(dst, GEN_TYPE_UB), +GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB)); +p-MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB), +GenRegister::retype(src, GEN_TYPE_UB)); + } else { +GBE_ASSERT(0); + } +} else { + if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) { +GBE_ASSERT(src.subnr == 0); The above assertion is not correct. Because a valid simd8 or simd16 BSWAP instruction may have a uniform source register. We can't assume the source register must not be uniform value. I think the uniform case will be handled in if (simd == 1) case just above. I find if src is uniform, the dst seems always to be uniform and the simd will be 1 here. +GBE_ASSERT(dst.subnr == 0); +GBE_ASSERT(tmp.subnr == 0); +GBE_ASSERT(start_addr = 0); +new_a0[0] = start_addr + 3; +new_a0[1] = start_addr + 2; +new_a0[2] = start_addr + 1; +new_a0[3] = start_addr; +new_a0[4] = start_addr + 7; +new_a0[5] = start_addr + 6; +new_a0[6] = start_addr + 5; +new_a0[7] = start_addr + 4; +this-setA0Content(new_a0, 56); + +p-push(); +p-curr.execWidth = 8; +p-curr.predicate = GEN_PREDICATE_NONE; +p-curr.noMask = 1; +GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), +a0[0], new_a0[0] - a0[0]); +p-MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src); +for (int i = 1; i 4; i++) { + ind_src.addr_imm += 8; + p-MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src); +} +if (simd == 16) { + for (int i = 0; i 4; i++) { +ind_src.addr_imm += 8; +p-MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 8*i), ind_src); + } +} +p-pop(); + +p-MOV(dst, tmp
Re: [Beignet] compiler_fill_image_1d_array intermittent failure
It's really a bug. Because of a HW limitation, the vertical stride must be aligned to at least 2. For a 1D array image, the data is therefore not contiguous (there are gaps in it). The size calculated for the memset is right, but the actual image size is twice as big. Using clEnqueueWriteImage is safe, and I will fix it later. On 2015年02月04日 07:11, Rebecca N. Palmer wrote: Both [3.18 kernel] and the 3.16 kernel have a different intermittent failure I have yet to investigate: compiler_fill_image_1d_array()[FAILED] Error: dst[j*w + i] == 0 at file /home/rnpalmer/Debian/builds/stackbuild/beignet/utests/compiler_fill_image_1d_array.cpp, function compiler_fill_image_1d_array, line 63 The kernel itself (fill j<7, i<32 with 0x03020100) is working correctly; the problem is that the initial memset() clear (line 30) sometimes only clears half the array (exactly half, i.e. j<4 is filled with 0, but the rest is left at whatever it was before). ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 1/8] SKL: Add skl pci ids and device.
Hi, It seems that gen9_context.hpp and gen9_context.cpp are lost and can not compile. On 2015年01月29日 16:16, Yang Rong wrote: SKL add the new GT4 type device. Signed-off-by: Yang Rong rong.r.y...@intel.com --- src/cl_device_data.h | 45 +++ src/cl_device_id.c | 122 +-- 2 files changed, 164 insertions(+), 3 deletions(-) diff --git a/src/cl_device_data.h b/src/cl_device_data.h index 0d25ca4..d6f8209 100644 --- a/src/cl_device_data.h +++ b/src/cl_device_data.h @@ -230,5 +230,50 @@ #define IS_BROADWELL(devid) (IS_BRW_GT1(devid) || IS_BRW_GT2(devid) || IS_BRW_GT3(devid)) #define IS_GEN8(devid) IS_BROADWELL(devid) +/* SKL */ +#define PCI_CHIP_SKYLAKE_ULT_GT1 0x1906 /* Intel(R) Skylake ULT - GT1 */ +#define PCI_CHIP_SKYLAKE_ULT_GT2 0x1916 /* Intel(R) Skylake ULT - GT2 */ +#define PCI_CHIP_SKYLAKE_ULT_GT3 0x1926 /* Intel(R) Skylake ULT - GT3 */ +#define PCI_CHIP_SKYLAKE_ULT_GT2F 0x1921 /* Intel(R) Skylake ULT - GT2F */ +#define PCI_CHIP_SKYLAKE_ULX_GT1 0x190E /* Intel(R) Skylake ULX - GT1 */ +#define PCI_CHIP_SKYLAKE_ULX_GT2 0x191E /* Intel(R) Skylake ULX - GT2 */ +#define PCI_CHIP_SKYLAKE_DT_GT10x1902 /* Intel(R) Skylake Desktop - GT1 */ +#define PCI_CHIP_SKYLAKE_DT_GT20x1912 /* Intel(R) Skylake Desktop - GT2 */ +#define PCI_CHIP_SKYLAKE_HALO_GT1 0x190B /* Intel(R) Skylake HALO - GT1 */ +#define PCI_CHIP_SKYLAKE_HALO_GT2 0x191B /* Intel(R) Skylake HALO - GT2 */ +#define PCI_CHIP_SKYLAKE_HALO_GT3 0x192B /* Intel(R) Skylake HALO - GT3 */ +#define PCI_CHIP_SKYLAKE_HALO_GT4 0x193B /* Intel(R) Skylake HALO - GT4 */ +#define PCI_CHIP_SKYLAKE_SRV_GT1 0x190A /* Intel(R) Skylake Server - GT1 */ +#define PCI_CHIP_SKYLAKE_SRV_GT2 0x191A /* Intel(R) Skylake Server - GT2 */ +#define PCI_CHIP_SKYLAKE_SRV_GT3 0x192A /* Intel(R) Skylake Server - GT3 */ +#define PCI_CHIP_SKYLAKE_SRV_GT4 0x193A /* Intel(R) Skylake Server - GT4 */ + +#define IS_SKL_GT1(devid) \ + (devid == PCI_CHIP_SKYLAKE_ULT_GT1 || \ + devid == PCI_CHIP_SKYLAKE_ULX_GT1 || \ + devid == PCI_CHIP_SKYLAKE_DT_GT1 || \ + devid == 
PCI_CHIP_SKYLAKE_HALO_GT1 || \ + devid == PCI_CHIP_SKYLAKE_SRV_GT1) + +#define IS_SKL_GT2(devid) \ + (devid == PCI_CHIP_SKYLAKE_ULT_GT2 || \ + devid == PCI_CHIP_SKYLAKE_ULT_GT2F || \ + devid == PCI_CHIP_SKYLAKE_ULX_GT2 || \ + devid == PCI_CHIP_SKYLAKE_DT_GT2 || \ + devid == PCI_CHIP_SKYLAKE_HALO_GT2 || \ + devid == PCI_CHIP_SKYLAKE_SRV_GT2) + +#define IS_SKL_GT3(devid) \ + (devid == PCI_CHIP_SKYLAKE_ULT_GT3 || \ + devid == PCI_CHIP_SKYLAKE_HALO_GT3 || \ + devid == PCI_CHIP_SKYLAKE_SRV_GT3) + +#define IS_SKL_GT4(devid) \ + (devid == PCI_CHIP_SKYLAKE_HALO_GT4 || \ + devid == PCI_CHIP_SKYLAKE_SRV_GT4) + +#define IS_SKYLAKE(devid) (IS_SKL_GT1(devid) || IS_SKL_GT2(devid) || IS_SKL_GT3(devid) || IS_SKL_GT4(devid)) +#define IS_GEN9(devid) IS_SKYLAKE(devid) + #endif /* __CL_DEVICE_DATA_H__ */ diff --git a/src/cl_device_id.c b/src/cl_device_id.c index 3032a38..9d83ab2 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -140,6 +140,51 @@ static struct _cl_device_id intel_brw_gt3_device = { #include cl_gen75_device.h }; +/* XXX we clone brw now */ +static struct _cl_device_id intel_skl_gt1_device = { + INIT_ICD(dispatch) + .max_compute_unit = 6, + .max_thread_per_unit = 7, + .sub_slice_count = 2, + .max_work_item_sizes = {512, 512, 512}, + .max_work_group_size = 512, + .max_clock_frequency = 1000, +#include cl_gen75_device.h +}; + +static struct _cl_device_id intel_skl_gt2_device = { + INIT_ICD(dispatch) + .max_compute_unit = 24, + .max_thread_per_unit = 7, + .sub_slice_count = 3, + .max_work_item_sizes = {512, 512, 512}, + .max_work_group_size = 512, + .max_clock_frequency = 1000, +#include cl_gen75_device.h +}; + +static struct _cl_device_id intel_skl_gt3_device = { + INIT_ICD(dispatch) + .max_compute_unit = 48, + .max_thread_per_unit = 7, + .sub_slice_count = 6, + .max_work_item_sizes = {512, 512, 512}, + .max_work_group_size = 512, + .max_clock_frequency = 1000, +#include cl_gen75_device.h +}; + +static struct _cl_device_id intel_skl_gt4_device = { + 
INIT_ICD(dispatch) + .max_compute_unit = 72, + .max_thread_per_unit = 7, + .sub_slice_count = 9, + .max_work_item_sizes = {512, 512, 512}, + .max_work_group_size = 512, + .max_clock_frequency = 1000, +#include cl_gen75_device.h +}; + LOCAL cl_device_id cl_get_gt_device(void) @@ -378,6 +423,62 @@ brw_gt3_break: ret = intel_brw_gt3_device; break; + case PCI_CHIP_SKYLAKE_ULT_GT1: + DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R) HD Graphics Skylake ULT GT1); + case PCI_CHIP_SKYLAKE_ULX_GT1: + DECL_INFO_STRING(skl_gt1_break,
Re: [Beignet] [Patch V2 8/8] SKL: fix skl LD fail.
Except some format problem, this patchset LGTM and can pass all the utest cases on my platform. On 2015年01月30日 10:59, Yang Rong wrote: Skl's LD message payload order is changed from u, lod, v, w to u, v, lod, w. Add the Gen9Context and Selection9 to handle it. Skl Still use Gen8Encoder. Signed-off-by: Yang Rong rong.r.y...@intel.com --- backend/src/CMakeLists.txt | 2 + backend/src/backend/gen9_context.cpp | 31 ++ backend/src/backend/gen9_context.hpp | 50 ++ backend/src/backend/gen_insn_selection.cpp | 67 -- backend/src/backend/gen_insn_selection.hpp | 7 backend/src/backend/gen_program.cpp| 3 +- 6 files changed, 147 insertions(+), 13 deletions(-) create mode 100644 backend/src/backend/gen9_context.cpp create mode 100644 backend/src/backend/gen9_context.hpp diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt index ce83c62..951 100644 --- a/backend/src/CMakeLists.txt +++ b/backend/src/CMakeLists.txt @@ -103,6 +103,8 @@ set (GBE_SRC backend/gen75_context.cpp backend/gen8_context.hpp backend/gen8_context.cpp +backend/gen9_context.hpp +backend/gen9_context.cpp backend/gen_program.cpp backend/gen_program.hpp backend/gen_program.h diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp new file mode 100644 index 000..79ca275 --- /dev/null +++ b/backend/src/backend/gen9_context.cpp @@ -0,0 +1,31 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see http://www.gnu.org/licenses/. + * + */ + +/** + * \file gen9_context.cpp + */ + +#include backend/gen9_context.hpp +#include backend/gen_insn_selection.hpp + +namespace gbe +{ + void Gen9Context::newSelection(void) { +this-sel = GBE_NEW(Selection9, *this); + } +} diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp new file mode 100644 index 000..672b4fc --- /dev/null +++ b/backend/src/backend/gen9_context.hpp @@ -0,0 +1,50 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see http://www.gnu.org/licenses/. + * + */ + +/** + * \file gen9_context.hpp + */ +#ifndef __GBE_gen9_CONTEXT_HPP__ +#define __GBE_gen9_CONTEXT_HPP__ + +#include backend/gen8_context.hpp +#include backend/gen8_encoder.hpp + +namespace gbe +{ + /* This class is used to implement the HSW + specific logic for context. 
*/ + class Gen9Context : public Gen8Context + { + public: +virtual ~Gen9Context(void) { }; +Gen9Context(const ir::Unit unit, const std::string name, uint32_t deviceID, bool relaxMath = false) +: Gen8Context(unit, name, deviceID, relaxMath) { +}; + + protected: + virtual GenEncoder* generateEncoder(void) { + return GBE_NEW(Gen8Encoder, this-simdWidth, 9, deviceID); + } + + private: +virtual void newSelection(void); + }; +} +#endif /* __GBE_GEN9_CONTEXT_HPP__ */ + diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 65842ff..4d0b979 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -249,6 +249,9 @@ namespace gbe this-vectorList.push_back(vec); } +#define LD_MSG_ORDER_IVB 7 +#define LD_MSG_ORDER_SKL 9 + /// // Maximal munch selection on DAG /// @@ -358,6 +361,8 @@ namespace gbe void setHas32X32Mul(bool b) { bHas32X32Mul = b; } bool hasLongType() const { return
Re: [Beignet] [PATCH 16/27] Modify the convert logic in gen selection.
On 四, 2015-01-08 at 13:14 +0800, Zhigang Gong wrote: On Tue, Jan 06, 2015 at 06:01:54PM +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com The conversion logic is too complicated. We split it more clearly for each case. Notice: For I64 to I8, the conversion can not be completed within one step because of the hardware hstride restriction. So we need to convert it to i32 and than convert it to i8. typo here, should be then. Signed-off-by: Junyan He junyan...@linux.intel.com --- backend/src/backend/gen8_context.cpp | 8 +- backend/src/backend/gen_insn_selection.cpp | 195 - 2 files changed, 168 insertions(+), 35 deletions(-) diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp index cffb10d..18a3425 100644 --- a/backend/src/backend/gen8_context.cpp +++ b/backend/src/backend/gen8_context.cpp @@ -55,7 +55,9 @@ namespace gbe { switch (insn.opcode) { case SEL_OP_CONVI64_TO_I: - +/* Should never come to here, just use the common OPCODE. */ +GBE_ASSERT(0); +break; default: GenContext::emitUnaryInstruction(insn); } @@ -65,7 +67,9 @@ namespace gbe { switch (insn.opcode) { case SEL_OP_CONVI_TO_I64: - +/* Should never come to here, just use the common OPCODE. 
*/ +GBE_ASSERT(0); +break; default: GenContext::emitUnaryWithTempInstruction(insn); } diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index b6a13bf..60f45f7 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -349,9 +349,17 @@ namespace gbe const ir::RegisterData regData = getRegisterData(reg); return regData.isUniform(); } +INLINE bool isLongReg(const ir::Register reg) const { + const ir::RegisterData regData = getRegisterData(reg); + return regData.family == ir::FAMILY_QWORD; +} + +INLINE GenRegister unpacked_ud(const ir::Register reg) const { + return GenRegister::unpacked_ud(reg, isScalarReg(reg)); +} INLINE GenRegister unpacked_uw(const ir::Register reg) const { - return GenRegister::unpacked_uw(reg, isScalarReg(reg)); + return GenRegister::unpacked_uw(reg, isScalarReg(reg), isLongReg(reg)); } INLINE GenRegister unpacked_ub(const ir::Register reg) const { @@ -3658,7 +3666,7 @@ namespace gbe sel.F32TO16(unpacked, src); sel.pop(); sel.MOV(dst, unpacked); - } else if (dstFamily != FAMILY_DWORD dstFamily != FAMILY_QWORD (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) { + } else if (dstFamily != FAMILY_DWORD dstFamily != FAMILY_QWORD srcFamily == FAMILY_DWORD) {//convert i32 to small int GenRegister unpacked; if (dstFamily == FAMILY_WORD) { const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W; @@ -3675,27 +3683,115 @@ namespace gbe } else unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type); } -if(srcFamily == FAMILY_QWORD) { + +sel.push(); +if (sel.isScalarReg(insn.getSrc(0))) { + sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; +} +sel.MOV(unpacked, src); +sel.pop(); + +if (unpacked.reg() != dst.reg()) + sel.MOV(dst, unpacked); + } else if (dstFamily == FAMILY_WORD srcFamily == FAMILY_QWORD) { //convert i64 to i16 +const uint32_t type = dstType == TYPE_U16 ? 
GEN_TYPE_UW : GEN_TYPE_W; +GenRegister unpacked; +if (!sel.isScalarReg(dst.reg())) { + if (sel.hasLongType()) { +unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0; + } else { +unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0; + } + unpacked = GenRegister::retype(unpacked, type); +} else { + unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type); +} + +if(!sel.hasLongType()) { You already remove (|| srcFamily == FAMILY_QWORD at Line 3658, why still do the following code which is to convert I64 source operand to I32? It looks incorrect for me. The following else branch should be put here unconditional. I think here we are converting 64bits to 16bits, we first mov 64 bits to 32bits and then mov it to 16bits. I do not modify the origin logic here, but really we can optimize it. GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD
Re: [Beignet] [PATCH 2/6] Add long type support for disasm.
The Print imm logic is added in the [PATCH 10/27] of the new patchset. On 二, 2015-01-06 at 00:50 +, Yang, Rong R wrote: Also need add long/ulong imm print. -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of junyan...@inbox.com Sent: Wednesday, December 24, 2014 00:13 To: beignet@lists.freedesktop.org Cc: Junyan He Subject: [Beignet] [PATCH 2/6] Add long type support for disasm. From: Junyan He junyan...@linux.intel.com Signed-off-by: Junyan He junyan...@linux.intel.com --- backend/src/backend/gen/gen_mesa_disasm.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c index 162d459..2ebbc98 100644 --- a/backend/src/backend/gen/gen_mesa_disasm.c +++ b/backend/src/backend/gen/gen_mesa_disasm.c @@ -265,7 +265,7 @@ static const char *access_mode[2] = { [1] = align16, }; -static const char *reg_encoding[8] = { +static const char *reg_encoding[10] = { [0] = :UD, [1] = :D, [2] = :UW, @@ -273,10 +273,12 @@ static const char *reg_encoding[8] = { [4] = :UB, [5] = :B, [6] = :DF, - [7] = :F + [7] = :F, + [8] = :Q, + [9] = :UQ }; -int reg_type_size[8] = { +int reg_type_size[10] = { [0] = 4, [1] = 4, [2] = 2, @@ -284,7 +286,9 @@ int reg_type_size[8] = { [4] = 1, [5] = 1, [6] = 8, - [7] = 4 + [7] = 4, + [8] = 8, + [9] = 8 }; static const char *reg_file[4] = { @@ -983,6 +987,7 @@ static int imm(FILE *file, uint32_t type, const void* inst) break; case GEN_TYPE_F: format(file, %-gF, GEN_BITS_FIELD(inst, bits3.f)); + break; } return 0; } -- 1.7.9.5 ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Fix PrintfState copying.
On 二, 2014-12-09 at 12:41 +0800, Yan Wang wrote: PrintfState includes std::string object and shouldn't be copied by malloc/memcpy. Signed-off-by: Yan Wang yan.w...@linux.intel.com --- backend/src/ir/printf.hpp | 23 +++ 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp index b9f7619..8ea5976 100644 --- a/backend/src/ir/printf.hpp +++ b/backend/src/ir/printf.hpp @@ -75,6 +75,23 @@ namespace gbe char conversion_specifier; int out_buf_sizeof_offset; // Should *global_total_size to get the full offset. std::string str;//if %s, the string store here. + + PrintfState(void) { + } I think if we consider the PrintfState as a object and use constructor to init it, here we should better to init all the elements to default value. + + PrintfState(const PrintfState other) { +left_justified = other.left_justified; +sign_symbol = other.sign_symbol; +alter_form = other.alter_form; +zero_padding = other.zero_padding; +vector_n = other.vector_n; +min_width = other.min_width; +precision = other.precision; +length_modifier = other.length_modifier; +conversion_specifier = other.conversion_specifier; +out_buf_sizeof_offset = other.out_buf_sizeof_offset; +str = other.str; + } }; enum { @@ -106,8 +123,7 @@ namespace gbe PrintfSlot(PrintfState * st) { type = PRINTF_SLOT_TYPE_STATE; -state = (PrintfState *)malloc(sizeof(PrintfState)); -memcpy(state, st, sizeof(PrintfState)); +state = new PrintfState(*st); } PrintfSlot(const PrintfSlot other) { @@ -119,8 +135,7 @@ namespace gbe type = PRINTF_SLOT_TYPE_STRING; } else if (other.type == PRINTF_SLOT_TYPE_STATE) { type = PRINTF_SLOT_TYPE_STATE; - state = (PrintfState *)malloc(sizeof(PrintfState)); - memcpy(state, other.state, sizeof(PrintfState)); + state = new PrintfState(*other.state); } else { type = PRINTF_SLOT_TYPE_NONE; ptr = NULL; ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] runtime: fix one bug in BDW image.
It looks good to me. On 三, 2014-11-12 at 14:12 +0800, Zhigang Gong wrote: As we still have the image 1d array workaround, we need to fix it for BDW as well. Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- src/intel/intel_gpgpu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index c6ea17f..b6e19db 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -1028,8 +1028,10 @@ intel_get_surface_type(cl_mem_object_type type) static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_type type) { uint32_t surface_type; - if (((IS_IVYBRIDGE(gpgpu->drv->device_id) || IS_HASWELL(gpgpu->drv->device_id))) && - index >= 128 + BTI_RESERVED_NUM && + if (((IS_IVYBRIDGE(gpgpu->drv->device_id) || +IS_HASWELL(gpgpu->drv->device_id) || +IS_BROADWELL(gpgpu->drv->device_id))) && + index >= BTI_MAX_IMAGE_NUM + BTI_RESERVED_NUM && type == CL_MEM_OBJECT_IMAGE1D_ARRAY) surface_type = I965_SURFACE_2D; else ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 6/6] BDW: Add function intel_gpgpu_bind_buf for gen8.
This patchset LGTM On 一, 2014-09-29 at 13:37 +0800, Yang Rong wrote: From: Junyan He junyan...@linux.intel.com Must call cl_bind_buf instead of intel_gpgpu_bind_buf directly in intel_gpgpu. Signed-off-by: Junyan He junyan...@linux.intel.com --- src/intel/intel_gpgpu.c | 36 +++- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 6b8fa38..eedfe31 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -818,13 +818,13 @@ intel_gpgpu_setup_bti_gen8(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, ss0-ss8_9.surface_base_addr_lo = (buf-offset64 + internal_offset) 0x; ss0-ss8_9.surface_base_addr_hi = ((buf-offset64 + internal_offset) 32) 0x; dri_bo_emit_reloc(gpgpu-aux_buf.bo, - I915_GEM_DOMAIN_RENDER, - I915_GEM_DOMAIN_RENDER, - internal_offset, - gpgpu-aux_offset.surface_heap_offset + - heap-binding_table[index] + - offsetof(gen8_surface_state_t, ss1), - buf); +I915_GEM_DOMAIN_RENDER, +I915_GEM_DOMAIN_RENDER, +internal_offset, +gpgpu-aux_offset.surface_heap_offset + +heap-binding_table[index] + +offsetof(gen8_surface_state_t, ss1), +buf); } static int @@ -981,6 +981,18 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset, intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti); } +static void +intel_gpgpu_bind_buf_gen8(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset, + uint32_t internal_offset, uint32_t size, uint8_t bti) +{ + assert(gpgpu-binded_n max_buf_n); + gpgpu-binded_buf[gpgpu-binded_n] = buf; + gpgpu-target_buf_offset[gpgpu-binded_n] = internal_offset; + gpgpu-binded_offset[gpgpu-binded_n] = offset; + gpgpu-binded_n++; + intel_gpgpu_setup_bti_gen8(gpgpu, buf, internal_offset, size, bti); +} + static int intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size) { @@ -1011,7 +1023,7 @@ intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint drm_intel_bufmgr *bufmgr = gpgpu-drv-bufmgr; gpgpu-stack_b.bo = 
drm_intel_bo_alloc(bufmgr, STACK, size, 64); - intel_gpgpu_bind_buf(gpgpu, gpgpu-stack_b.bo, offset, 0, size, bti); + cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)gpgpu-stack_b.bo, offset, 0, size, bti); } static void @@ -1427,7 +1439,7 @@ intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint } memset(bo-virtual, 0, size); drm_intel_bo_unmap(bo); - intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, size, bti); + cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)bo, offset, 0, size, bti); return 0; } @@ -1526,6 +1538,12 @@ intel_set_gpgpu_callbacks(int device_id) cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info; cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info; + if (IS_BROADWELL(device_id)) { +cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *)intel_gpgpu_bind_buf_gen8; +cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen8; +return; + } + if (IS_HASWELL(device_id)) { cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75; cl_gpgpu_alloc_constant_buffer = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer_gen75; ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] Problems with recent beignet
Yes, we have found this problem. New version beignet requires libdrm at least 2.4.52 Old version of libdrm will cause the build fail. We have already add the libdrm version check On 四, 2014-10-16 at 21:53 -0700, Andi Kleen wrote: I tried the current beignet master on my OpenSUSE 13.1 HSW and ran into the following new problems (older versions worked): - With my DRM version I have the build always fails with: src/intel/intel_gpgpu.c: In function ‘intel_gpgpu_setup_bti_gen8’: src/intel/intel_gpgpu.c:922:39: error: ‘drm_intel_bo’ has no member named ‘offset64’ ss0-ss8.surface_base_addr_lo = (buf-offset64 + internal_offset) 0x; ^ src/intel/intel_gpgpu.c:923:40: error: ‘drm_intel_bo’ has no member named ‘offset64’ ss0-ss9.surface_base_addr_hi = ((buf-offset64 + internal_offset) 32) 0x; ^ src//beignet/src/intel/intel_gpgpu.c: In function ‘intel_gpgpu_bind_image_gen8’: [ 25%] src/intel/intel_gpgpu.c:1112:40: error: ‘drm_intel_bo’ has no member named ‘offset64’ ss-ss8.surface_base_addr_lo = obj_bo-offset64 0x; ^ /home/ak/src/beignet/src/intel/intel_gpgpu.c:1113:41: error: ‘drm_intel_bo’ has no member named ‘offset64’ ss-ss9.surface_base_addr_hi = (obj_bo-offset64 32) 0x; I just commented out these lines because they seem to be only used on GEN8. Probably would be good to have a cmake test that tests for these fields and disables gen8 or falls back to plain offset ? - With that fixed the first utest always bails out with: builtin_acos_float()utest_run: /home/ak/src/beignet/src/intel/intel_gpgpu.c:703: intel_gpgpu_check_binded_buf_address: Assertion `gpgpu-binded_buf[i]-offset != 0' failed. Interrupt signal (SIGABRT) received. summary: -- total: 684 run: 1 pass: 0 fail: 1 The assert mentions the same name as above, but I believe it's a different field. I tried to bisect that and ended up with the following commit. Not sure if that is correct? Unfortunately it doesn't cleanly revert from master for testing. 
commit 8c1ed91f0af6ab8284fe06b4c582b55c7d925816 Author: Zhigang Gong zhigang.g...@intel.com Date: Fri Sep 12 13:45:40 2014 +0800 GBE: fix multiple files compilation bugs. If we want to link multiple files together, and one kernel function need refer other kernel functions in other files, we must not set those functions as linked once attribute. -Andi ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] BDW: Also need set Shader Channel Select for constant buffer in BDW.
OK, LGTM On 四, 2014-10-16 at 15:11 +0800, Yang Rong wrote: Signed-off-by: Yang Rong rong.r.y...@intel.com --- src/intel/intel_gpgpu.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 259882a..167d8d9 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -907,6 +907,12 @@ intel_gpgpu_setup_bti_gen8(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t int memset(ss0, 0, sizeof(gen8_surface_state_t)); ss0->ss0.surface_type = I965_SURFACE_BUFFER; ss0->ss0.surface_format = format; + if(format != I965_SURFACEFORMAT_RAW) { +ss0->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED; +ss0->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN; +ss0->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE; +ss0->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA; + } ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */ assert(ss0->ss2.width & 0x03); ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */ ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 8/8] BDW: Correct scratch buffer of BDW.
This patch set will cause displacement_map_element case hang every time. But no regression found on previous platforms. We can find the bug later and fix it. On 一, 2014-09-29 at 13:38 +0800, Yang Rong wrote: BDW's scratch buffer change to power 2 alignment from 1024. Signed-off-by: Yang Rong rong.r.y...@intel.com --- backend/src/backend/gen8_context.cpp | 2 +- src/intel/intel_gpgpu.c | 22 ++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp index a8bed64..f7484ca 100644 --- a/backend/src/backend/gen8_context.cpp +++ b/backend/src/backend/gen8_context.cpp @@ -46,7 +46,7 @@ namespace gbe uint32_t Gen8Context::alignScratchSize(uint32_t size){ if(size == 0) return 0; -uint32_t i = 2048; +uint32_t i = 1024; while(i size) i *= 2; return i; } diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index fa7333e..d65b1a2 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -342,16 +342,28 @@ uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) { } uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) { +//align in backend, if non pow2, must align when alloc scratch bo. +assert((size (size - 1)) == 0); size = size 11; uint32_t index = 0; while((size = 1) 0) index++; //get leading one -//non pow 2 size -if(size (size - 1)) index++; return index; } +uint32_t intel_gpgpu_get_scratch_index_gen8(uint32_t size) { +//align in backend, if non pow2, must align when alloc scratch bo. 
+assert((size (size - 1)) == 0); +size = size 10; +uint32_t index = 0; +while((size = 1) 0) + index++; //get leading one + +return index; +} + + static cl_int intel_gpgpu_get_max_curbe_size(uint32_t device_id) { @@ -1142,7 +1154,9 @@ intel_gpgpu_build_idrt_gen8(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) /* group_threads_num should not be set to 0 even if the barrier is disabled per bspec */ desc-desc6.group_threads_num = kernel-thread_n; desc-desc6.barrier_enable = kernel-use_slm; - if (slm_sz = 4*KB) + if (slm_sz == 0) +slm_sz = 0; + else if (slm_sz = 4*KB) slm_sz = 4*KB; else if (slm_sz = 8*KB) slm_sz = 8*KB; @@ -1666,7 +1680,7 @@ intel_set_gpgpu_callbacks(int device_id) cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75; intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8; cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen8; -intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75; +intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8; intel_gpgpu_post_action = intel_gpgpu_post_action_gen75; intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen8; ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 5/5] BDW: Add class Gen8Context.
This patchset is OK and will not cause regression on previous platform. In this patch set, the GenEncoder will be a pure virtual class and all platform encoders will derive from it. But the GenContext still represents the Gen7 context. I think it is better to follow the same way as the encoder to make the architecture clearer. On 一, 2014-09-29 at 13:37 +0800, Yang Rong wrote: Now Gen8Context is almost same as Gen75Context, but still derive Gen8Context from GenContext for clearly. Signed-off-by: Yang Rong rong.r.y...@intel.com --- backend/src/CMakeLists.txt | 2 + backend/src/backend/gen8_context.cpp | 113 +++ backend/src/backend/gen8_context.hpp | 63 +++ backend/src/backend/gen_program.cpp | 3 + 4 files changed, 181 insertions(+) create mode 100644 backend/src/backend/gen8_context.cpp create mode 100644 backend/src/backend/gen8_context.hpp diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt index 2daa630..c5d388e 100644 --- a/backend/src/CMakeLists.txt +++ b/backend/src/CMakeLists.txt @@ -96,6 +96,8 @@ set (GBE_SRC backend/gen_context.cpp backend/gen75_context.hpp backend/gen75_context.cpp +backend/gen8_context.hpp +backend/gen8_context.cpp backend/gen_program.cpp backend/gen_program.hpp backend/gen_program.h diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp new file mode 100644 index 000..a9914f6 --- /dev/null +++ b/backend/src/backend/gen8_context.cpp @@ -0,0 +1,113 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see http://www.gnu.org/licenses/. + * + */ + +/** + * \file gen8_context.cpp + */ + +#include backend/gen8_context.hpp +#include backend/gen8_encoder.hpp +#include backend/gen_program.hpp +#include backend/gen_defs.hpp +#include backend/gen_encoder.hpp +#include backend/gen_insn_selection.hpp +#include backend/gen_insn_scheduling.hpp +#include backend/gen_reg_allocation.hpp +#include sys/cvar.hpp +#include ir/function.hpp +#include ir/value.hpp +#include cstring + +namespace gbe +{ + void Gen8Context::emitSLMOffset(void) { +if(kernel-getUseSLM() == false) + return; + +const GenRegister slm_offset = ra-genReg(GenRegister::ud1grf(ir::ocl::slmoffset)); +const GenRegister slm_index = GenRegister::ud1grf(0, 0); +//the slm index is hold in r0.0 24-27 bit, in 4K unit, shift left 12 to get byte unit +p-push(); + p-curr.execWidth = 1; + p-curr.predicate = GEN_PREDICATE_NONE; + p-SHR(slm_offset, slm_index, GenRegister::immud(12)); +p-pop(); + } + + void Gen8Context::allocSLMOffsetCurbe(void) { +if(fn.getUseSLM()) + allocCurbeReg(ir::ocl::slmoffset, GBE_CURBE_SLM_OFFSET); + } + + uint32_t Gen8Context::alignScratchSize(uint32_t size){ +if(size == 0) + return 0; +uint32_t i = 2048; +while(i size) i *= 2; +return i; + } + + void Gen8Context::emitStackPointer(void) { +using namespace ir; + +// Only emit stack pointer computation if we use a stack +if (kernel-getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) = 0) + return; + +// Check that everything is consistent in the kernel code +const uint32_t perLaneSize = kernel-getStackSize(); +const uint32_t perThreadSize = perLaneSize * this-simdWidth; +GBE_ASSERT(perLaneSize 0); +GBE_ASSERT(isPowerOf2(perLaneSize) == true); +GBE_ASSERT(isPowerOf2(perThreadSize) == true); + +// Use shifts rather than muls which are limited to 32x16 bit sources +const uint32_t 
perLaneShift = logi2(perLaneSize); +const uint32_t perThreadShift = logi2(perThreadSize); +const GenRegister selStatckPtr = this-simdWidth == 8 ? + GenRegister::ud8grf(ir::ocl::stackptr) : + GenRegister::ud16grf(ir::ocl::stackptr); +const GenRegister stackptr = ra-genReg(selStatckPtr); +const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer); +const GenRegister bufferptr = ra-genReg(selStackBuffer); + +// We compute the per-lane stack pointer here +p-push(); + p-curr.execWidth = 1; + p-curr.predicate = GEN_PREDICATE_NONE; + //p-AND(GenRegister::ud1grf(126,0),
Re: [Beignet] [PATCH] Add long support for printf
Sorry, this is V2. V2: Replace all the long and ulong with int64_t On 四, 2014-09-18 at 12:39 +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Signed-off-by: Junyan He junyan...@linux.intel.com --- backend/src/ir/printf.cpp | 25 - backend/src/llvm/llvm_printf_parser.cpp | 22 +++--- kernels/test_printf.cl |3 +++ 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp index 9d60402..e99aad5 100644 --- a/backend/src/ir/printf.cpp +++ b/backend/src/ir/printf.cpp @@ -149,20 +149,35 @@ namespace gbe switch (slot.state->conversion_specifier) { case PRINTF_CONVERSION_D: case PRINTF_CONVERSION_I: -PRINT_SOMETHING(int, d); +if (slot.state->length_modifier == PRINTF_LM_L) + PRINT_SOMETHING(uint64_t, d); +else + PRINT_SOMETHING(int, d); break; case PRINTF_CONVERSION_O: -PRINT_SOMETHING(int, o); +if (slot.state->length_modifier == PRINTF_LM_L) + PRINT_SOMETHING(uint64_t, o); +else + PRINT_SOMETHING(int, o); break; case PRINTF_CONVERSION_U: -PRINT_SOMETHING(int, u); +if (slot.state->length_modifier == PRINTF_LM_L) + PRINT_SOMETHING(uint64_t, u); +else + PRINT_SOMETHING(int, u); break; case PRINTF_CONVERSION_X: -PRINT_SOMETHING(int, X); +if (slot.state->length_modifier == PRINTF_LM_L) + PRINT_SOMETHING(uint64_t, X); +else + PRINT_SOMETHING(int, X); break; case PRINTF_CONVERSION_x: -PRINT_SOMETHING(int, x); +if (slot.state->length_modifier == PRINTF_LM_L) + PRINT_SOMETHING(uint64_t, x); +else + PRINT_SOMETHING(int, x); break; case PRINTF_CONVERSION_C: diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp index 00e1ef8..29684ba 100644 --- a/backend/src/llvm/llvm_printf_parser.cpp +++ b/backend/src/llvm/llvm_printf_parser.cpp @@ -640,14 +640,22 @@ error: case PRINTF_CONVERSION_U: case PRINTF_CONVERSION_x: case PRINTF_CONVERSION_X: -/* If the bits change, we need to consider the signed. 
*/ -if (arg->getType() != Type::getInt32Ty(module->getContext())) { - arg = builder->CreateIntCast(arg, Type::getInt32Ty(module->getContext()), sign); -} +if (slot.state->length_modifier == PRINTF_LM_L) { /* we would rather print long. */ + if (arg->getType() != Type::getInt64Ty(module->getContext())) { +arg = builder->CreateIntCast(arg, Type::getInt64Ty(module->getContext()), sign); + } + dst_type = Type::getInt64PtrTy(module->getContext(), 1); + sizeof_size = sizeof(int64_t); +} else { + /* If the bits change, we need to consider the signed. */ + if (arg->getType() != Type::getInt32Ty(module->getContext())) { +arg = builder->CreateIntCast(arg, Type::getInt32Ty(module->getContext()), sign); + } -/* Int to Int, just store. */ -dst_type = Type::getInt32PtrTy(module->getContext(), 1); -sizeof_size = sizeof(int); + /* Int to Int, just store. */ + dst_type = Type::getInt32PtrTy(module->getContext(), 1); + sizeof_size = sizeof(int); +} return true; case PRINTF_CONVERSION_C: diff --git a/kernels/test_printf.cl b/kernels/test_printf.cl index 84bb478..c2844f4 100644 --- a/kernels/test_printf.cl +++ b/kernels/test_printf.cl @@ -7,6 +7,7 @@ test_printf(void) uint a = 'x'; float f = 5.0f; int3 vec; + ulong cc = 1004294967296; vec.x = x; vec.y = y; vec.z = z; @@ -15,6 +16,8 @@ test_printf(void) printf("--- Welcome to the printf test of %s ---\n", "Intel Beignet"); printf("### output a char is %c\n", a); + +printf("@@@ A long value is %ld\n", cc); } if (x % 15 == 0) ___ Beignet mailing list Beignet@lists.freedesktop.org http
Re: [Beignet] [PATCH] GBE/libocl: fix build dependency issue.
LGTM On 四, 2014-09-18 at 08:35 +0800, Zhigang Gong wrote: Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- backend/src/libocl/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt index f015eec..b0074b3 100644 --- a/backend/src/libocl/CMakeLists.txt +++ b/backend/src/libocl/CMakeLists.txt @@ -72,7 +72,7 @@ MACRO(GENERATE_HEADER_PY _mod) COMMAND ${PYTHON_EXECUTABLE} ${LIBOCL_SOURCE_DIR}/script/gen_vector.py ${def_name} ${output_name} 1 #COMMAND echo echo \\#endif ${output_name} COMMAND echo \\#endif ${output_name} - DEPENDS ${tmpl_name} + DEPENDS ${tmpl_name} ${def_name} ${LIBOCL_SOURCE_DIR}/script/gen_vector.py COMMENT Generate the header by python: ${output_name} ) ENDMACRO(GENERATE_HEADER_PY) @@ -85,7 +85,7 @@ MACRO(GENERATE_SOURCE_PY _mod) COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/src/ COMMAND cat ${tmpl_name} ${output_name} COMMAND ${PYTHON_EXECUTABLE} ${LIBOCL_SOURCE_DIR}/script/gen_vector.py ${def_name} ${output_name} 0 - DEPENDS ${tmpl_name} + DEPENDS ${tmpl_name} ${def_name} ${LIBOCL_SOURCE_DIR}/script/gen_vector.py COMMENT Generate the source by python: ${output_name} ) ENDMACRO(GENERATE_SOURCE_PY) ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] GBE: Output linkModules's error message.
LGTM, thanks On 三, 2014-09-17 at 11:33 +0800, Ruiling Song wrote: Signed-off-by: Ruiling Song ruiling.s...@intel.com --- backend/src/llvm/llvm_bitcode_link.cpp |5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp index d845479..1365b32 100644 --- a/backend/src/llvm/llvm_bitcode_link.cpp +++ b/backend/src/llvm/llvm_bitcode_link.cpp @@ -204,9 +204,10 @@ namespace gbe /* We use beignet's bitcode as dst because it will have a lot of lazy functions which will not be loaded. */ -if(Linker::LinkModules(clonedLib, mod, Linker::DestroySource, NULL)) { +std::string errorMsg; +if(Linker::LinkModules(clonedLib, mod, Linker::DestroySource, &errorMsg)) { delete clonedLib; - printf("Fatal Error: link the bitcode error\n"); + printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg.c_str()); return NULL; } ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] GBE/libocl: fix a regression after libocl change.
LGTM, thanks On 二, 2014-09-16 at 09:57 +0800, Zhigang Gong wrote: Ping for review. On Fri, Sep 12, 2014 at 05:38:06PM +0800, Zhigang Gong wrote: Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- backend/src/libocl/tmpl/ocl_math.tmpl.cl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl index c397ca2..f61d107 100644 --- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl @@ -3204,9 +3204,6 @@ OVERLOADABLE float pown(float x, int n) { } OVERLOADABLE float rootn(float x, int n) { - if (__ocl_math_fastpath_flag) -return __gen_ocl_internal_fastpath_rootn(x, n); - float ax,re; int sign = 0; if( n == 0 )return NAN; @@ -3233,7 +3230,10 @@ OVERLOADABLE float rootn(float x, int n) { ax = __gen_ocl_fabs(x); if(x < 0.0f && (n&1)) sign = 1; - re = __gen_ocl_internal_pow(ax,1.f/n); + if (__ocl_math_fastpath_flag) +re = __gen_ocl_pow(ax, 1.f/n); + else +re = __gen_ocl_internal_pow(ax,1.f/n); if(sign) re = -re; return re; -- 1.8.3.2 ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] GBE/libocl: add missing vector builtin definition for fma.
LGTM, thanks On 二, 2014-09-16 at 09:57 +0800, Zhigang Gong wrote: Ping for review. On Fri, Sep 12, 2014 at 05:18:16PM +0800, Zhigang Gong wrote: Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- backend/src/libocl/script/ocl_math.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/libocl/script/ocl_math.def b/backend/src/libocl/script/ocl_math.def index ff1d5d6..4baded4 100644 --- a/backend/src/libocl/script/ocl_math.def +++ b/backend/src/libocl/script/ocl_math.def @@ -26,7 +26,7 @@ gentype fabs (gentype) gentype fdim (gentype x, gentype y) gentype floor (gentype) # XXX we use madd for fma -#gentype fma (gentype a, gentype b, gentype c) +gentype fma (gentype a, gentype b, gentype c) gentype fmax (gentype x, gentype y) gentypef fmax (gentypef x, float y) gentyped fmax (gentyped x, double y) -- 1.8.3.2 ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 1/3] GBE: fix multiple files compilation bugs.
The patchset is OK On 五, 2014-09-12 at 14:33 +0800, Zhigang Gong wrote: If we want to link multiple files together, and one kernel function need refer other kernel functions in other files, we must not set those functions as linked once attribute. Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- backend/src/backend/gen_program.cpp | 4 +++- backend/src/llvm/llvm_to_gen.cpp| 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp index 3e16fd6..5324587 100644 --- a/backend/src/backend/gen_program.cpp +++ b/backend/src/backend/gen_program.cpp @@ -56,6 +56,7 @@ #include backend/gen_reg_allocation.hpp #include ir/unit.hpp #include llvm/llvm_to_gen.hpp +#include llvm/llvm_gen_backend.hpp #include clang/CodeGen/CodeGenAction.h @@ -371,9 +372,10 @@ namespace gbe { } for (llvm::Module::iterator I = src-begin(), E = src-end(); I != E; ++I) { +llvm::Function *F = llvm::dyn_castllvm::Function(I); +if (F isKernelFunction(*F)) continue; I-setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage); } - llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)-module; llvm::Linker::LinkModules( dst, src, diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp index 755793e..3cb0e5b 100644 --- a/backend/src/llvm/llvm_to_gen.cpp +++ b/backend/src/llvm/llvm_to_gen.cpp @@ -86,10 +86,11 @@ namespace gbe FPM.add(new DataLayout(DL)); #endif +// XXX remove the verifier pass to workaround a non-fatal error. #if LLVM_VERSION_MAJOR == 3 LLVM_VERSION_MINOR =5 -FPM.add(createVerifierPass(true)); +//FPM.add(createVerifierPass(true)); #else -FPM.add(createVerifierPass()); +//FPM.add(createVerifierPass()); #endif FPM.add(new TargetLibraryInfo(*libraryInfo)); FPM.add(createTypeBasedAliasAnalysisPass()); ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] Another build failure with current master
I suggest that you delete the PCH file in the system install dir and try again. On 五, 2014-09-12 at 07:52 -0400, Yichao Yu wrote: On Fri, Sep 12, 2014 at 7:32 AM, Yichao Yu yyc1...@gmail.com wrote: Hi, I've got an error when generating src/kernels/cl_internal_*_str.c's. The error message reads error: OpenCL version was in PCH file but is currently �� build the file /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/src/kernels//cl_internal_built_in_kernel.cl failed Note the non-ascii (non-utf8) characters even when I set locale to C. The command line executed was cd /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/src OCL_BITCODE_BIN=/home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libocl/lib/beignet.bc OCL_HEADER_DIR=/home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libocl/include/ OCL_PCH_OBJECT=/home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libocl/lib/beignet.pch LD_LIBRARY_PATH=/home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/gbe_bin_generater -s /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/src/kernels//cl_internal_built_in_kernel.cl -o/home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/src/kernels//cl_internal_built_in_kernel_str.c Doesn't work with either CLANG or GCC (probably unrelated). llvm version 3.5.0. Ooops, I forgot to save my PKGBUILD before recompile the package. With -DCOMPILER=CLANG the gbe_bin_generater (btw I think it should spell as generator...) 
actually segfault with the following backtrace (and although the gcc version doesn't segfault, the error message does look like some sort of memory corruption) #0 0x77285335 in clang::Diagnostic::FormatDiagnostic(char const*, char const*, llvm::SmallVectorImplchar) const () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #1 0x772860f3 in clang::Diagnostic::FormatDiagnostic(char const*, char const*, llvm::SmallVectorImplchar) const () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #2 0x7694a77a in clang::TextDiagnosticPrinter::HandleDiagnostic(clang::DiagnosticsEngine::Level, clang::Diagnostic const) () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #3 0x77287b50 in clang::DiagnosticIDs::EmitDiag(clang::DiagnosticsEngine, clang::DiagnosticIDs::Level) const () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #4 0x77287cc6 in clang::DiagnosticIDs::ProcessDiag(clang::DiagnosticsEngine) const () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #5 0x77280c4c in clang::DiagnosticsEngine::EmitCurrentDiagnostic(bool) () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #6 0x7698af9c in clang::DiagnosticBuilder::Emit() [clone .part.41] () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #7 0x7698eecd in checkLanguageOptions(clang::LangOptions const, clang::LangOptions const, clang::DiagnosticsEngine*) () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #8 0x769a4b90 in clang::ASTReader::ParseLanguageOptions(llvm::SmallVectorunsigned long, 64u const, bool, clang::ASTReaderListener) () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #9 0x769a7409 in 
clang::ASTReader::ReadControlBlock(clang::serialization::ModuleFile, llvm::SmallVectorImplclang::ASTReader::ImportedModule, clang::serialization::ModuleFile const*, unsigned int) () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #10 0x769a7e8a in clang::ASTReader::ReadASTCore(llvm::StringRef, clang::serialization::ModuleKind, clang::SourceLocation, clang::serialization::ModuleFile*, llvm::SmallVectorImplclang::ASTReader::ImportedModule, long, long, unsigned int) () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #11 0x769c19c3 in clang::ASTReader::ReadAST(std::string const, clang::serialization::ModuleKind, clang::SourceLocation, unsigned int) () from /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so #12 0x768f152d in clang::CompilerInstance::createPCHExternalASTSource(llvm::StringRef, std::string const, bool, bool, clang::Preprocessor, clang::ASTContext, void*, bool, bool, bool) () from
Re: [Beignet] [PATCH] GBE/libocl: fix the wrong prototype of scalar native_powr.
It was my typo; thanks for fixing it. On 三, 2014-09-10 at 16:23 +0800, Zhigang Gong wrote: Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- backend/src/libocl/tmpl/ocl_math.tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h b/backend/src/libocl/tmpl/ocl_math.tmpl.h index 7a7d12f..1d860b7 100644 --- a/backend/src/libocl/tmpl/ocl_math.tmpl.h +++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h @@ -109,7 +109,7 @@ OVERLOADABLE float native_exp10(float x); OVERLOADABLE float native_log(float x); OVERLOADABLE float native_log2(float x); OVERLOADABLE float native_log10(float x); -OVERLOADABLE float native_powr(float x); +OVERLOADABLE float native_powr(float x, float y); OVERLOADABLE float native_recip(float x); OVERLOADABLE float native_rsqrt(float x); OVERLOADABLE float native_sin(float x); ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH v2] GBE: remove the user defined macro cl_khr_fp64.
LGTM, thanks. On 四, 2014-09-04 at 13:59 +0800, Zhigang Gong wrote: This is not a predefined macro according to the spec. Let's not define it by default. This patch also disable the fp64 when enter user kernels. v2: Some internal .cl files require cl_khr_fp64 enabled. Fixed that issue by move the enable macro to ocl_types.h. Signed-off-by: Zhigang Gong zhigang.g...@intel.com Reviewed-by: Junyan He junyan...@linux.intel.com --- backend/src/backend/program.cpp| 2 -- backend/src/libocl/CMakeLists.txt | 2 +- backend/src/libocl/include/ocl.h | 1 + backend/src/libocl/include/ocl_types.h | 3 --- backend/src/libocl/src/ocl_async.cl| 1 + backend/src/libocl/src/ocl_image.cl| 26 +- backend/src/libocl/src/ocl_vload.cl| 1 + 7 files changed, 17 insertions(+), 19 deletions(-) diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp index 42cd989..98e8a34 100644 --- a/backend/src/backend/program.cpp +++ b/backend/src/backend/program.cpp @@ -516,8 +516,6 @@ namespace gbe { } args.push_back(-cl-kernel-arg-info); -args.push_back(-Dcl_khr_fp64); - args.push_back(-mllvm); args.push_back(-inline-threshold=20); #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt index d4e3a53..fb93da1 100644 --- a/backend/src/libocl/CMakeLists.txt +++ b/backend/src/libocl/CMakeLists.txt @@ -129,7 +129,7 @@ FOREACH(M ${OCL_BASH_GENERATED_MODULES}) ENDFOREACH(M) -SET (CLANG_OCL_FLAGS -fno-builtin -Dcl_khr_fp64 -ffp-contract=off -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND) +SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND) MACRO(ADD_CL_TO_BC_TARGET _file) # CMake seems can not add pattern rule, use MACRO to replace. 
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h index a7d03e6..d4a8805 100644 --- a/backend/src/libocl/include/ocl.h +++ b/backend/src/libocl/include/ocl.h @@ -19,5 +19,6 @@ #include ocl_sync.h #include ocl_vload.h #include ocl_workitem.h +#pragma OPENCL EXTENSION cl_khr_fp64 : disable #endif diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h index 05a2dae..87e9bf5 100644 --- a/backend/src/libocl/include/ocl_types.h +++ b/backend/src/libocl/include/ocl_types.h @@ -1,10 +1,7 @@ #ifndef __OCL_TYPES_H__ #define __OCL_TYPES_H__ -#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -#endif - #include ocl_defines.h #define NULL ((void*)0) diff --git a/backend/src/libocl/src/ocl_async.cl b/backend/src/libocl/src/ocl_async.cl index 57d6859..e6f9a36 100644 --- a/backend/src/libocl/src/ocl_async.cl +++ b/backend/src/libocl/src/ocl_async.cl @@ -1,3 +1,4 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable #include ocl_async.h #include ocl_sync.h #include ocl_workitem.h diff --git a/backend/src/libocl/src/ocl_image.cl b/backend/src/libocl/src/ocl_image.cl index 00c3e8f..7202802 100644 --- a/backend/src/libocl/src/ocl_image.cl +++ b/backend/src/libocl/src/ocl_image.cl @@ -188,7 +188,7 @@ OVERLOADABLE int __gen_compute_array_index(int index, image2d_array_t image) #define FIXUP_FLOAT_COORD(tmpCoord)\ {\ if (tmpCoord 0 tmpCoord -0x1p-20f) \ - tmpCoord += -0x1p-9; \ + tmpCoord += -0x1p-9f; \ } DECL_IMAGE(GEN_FIX_1, image1d_t, int4, i) @@ -229,7 +229,7 @@ DECL_IMAGE_INFO_COMMON(image1d_buffer_t) #define FIXUP_FLOAT_COORD(tmpCoord)\ {\ if (tmpCoord.s0 0 tmpCoord.s0 -0x1p-20f)\ - tmpCoord.s0 += -0x1p-9; \ + tmpCoord.s0 += -0x1p-9f; \ if (tmpCoord.s1 0 tmpCoord.s1 -0x1p-20f)\ tmpCoord.s1 += -0x1p-9f; \ } @@ -258,7 +258,7 @@ DECL_IMAGE(0, image2d_t, float4, f, 2) #define FIXUP_FLOAT_COORD(tmpCoord)\ {\ if (tmpCoord.s0 0 tmpCoord.s0 -0x1p-20f)\ - tmpCoord.s0 += -0x1p-9; \ + tmpCoord.s0 += -0x1p-9f; \ } 
DECL_IMAGE(GEN_FIX_1, image1d_array_t, int4, i, 2) @@ -306,12 +306,12 @@ OVERLOADABLE size_t get_image_array_size(image1d_array_t image) #define FIXUP_FLOAT_COORD(tmpCoord) \ { \ -if (tmpCoord.s0 0
Re: [Beignet] [PATCH] Fix a bug for runtime_barrier_list.cpp, local var not inited.
That's better than the memset; it's OK. On 一, 2014-09-01 at 09:27 +0800, Zhigang Gong wrote: I just checked the test case. This may not be the best fix. The issue should be the first access to all the events, which also includes the uninitialized events 3, 4, 5. The following patch should be better. - for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) { + for (cl_uint i = 0; i < 3; ++i) { clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL); OCL_ASSERT(status >= CL_SUBMITTED); } Any thoughts? On Mon, Sep 01, 2014 at 10:04:46AM +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Signed-off-by: Junyan He junyan...@linux.intel.com --- utests/runtime_barrier_list.cpp |3 +++ 1 file changed, 3 insertions(+) diff --git a/utests/runtime_barrier_list.cpp b/utests/runtime_barrier_list.cpp index 6987d5e..e176771 100644 --- a/utests/runtime_barrier_list.cpp +++ b/utests/runtime_barrier_list.cpp @@ -1,3 +1,4 @@ +#include <string.h> #include "utest_helper.hpp" #define BUFFERSIZE 32*1024 @@ -10,6 +11,8 @@ void runtime_barrier_list(void) cl_int status = 0; cl_int value = 34; + memset(ev, 0, sizeof(cl_event)*5); + // Setup kernel and buffers OCL_CREATE_KERNEL(compiler_event); OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL); -- 1.7.9.5 ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] utests: fix two utest bugs.
That's OK. On 二, 2014-09-02 at 10:36 +0800, Zhigang Gong wrote: Similar to the bug found by Junyan, some events are accessed before being assigned. Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- utests/runtime_event.cpp | 2 +- utests/runtime_marker_list.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp index b974f6a..f8170a3 100644 --- a/utests/runtime_event.cpp +++ b/utests/runtime_event.cpp @@ -28,7 +28,7 @@ void runtime_event(void) locals[0] = 32; clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]); - for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) { + for (cl_uint i = 0; i < 3; ++i) { clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL); OCL_ASSERT(status >= CL_SUBMITTED); } diff --git a/utests/runtime_marker_list.cpp b/utests/runtime_marker_list.cpp index fc77156..f64b1d1 100644 --- a/utests/runtime_marker_list.cpp +++ b/utests/runtime_marker_list.cpp @@ -34,7 +34,7 @@ void runtime_marker_list(void) clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]); - for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) { + for (cl_uint i = 0; i < 3; ++i) { clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL); OCL_ASSERT(status >= CL_SUBMITTED); } ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 00/18] Using bit code ocl lib to replace the huge header file.
The dependencies among these files are complex and may cause some inconvenience because CMake lacks flexibility. After discussing with Zhigang, I realize that it is important to keep the build consistent with the rest of the project, so I will rewrite all the build files using CMake. On 三, 2014-08-27 at 23:17 -0400, Yichao Yu wrote: On Wed, Aug 27, 2014 at 10:50 PM, Song, Ruiling ruiling.s...@intel.com wrote: Out-of-source build (mkdir build; cmake ../; make) does not work after apply your patch. It simply print No targets specified and no makefile found. Stop. and stop building. Do you mean `mkdir build; cd build; cmake ../; make` ? It feels like no makefile found error cannot be caused by changes in the cmake files without configure time error. After apply your patch, utest compiler_copy_image1 failed on my machine. And seems that you miss Copyright header in the new files you added. -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of junyan...@inbox.com Sent: Tuesday, August 12, 2014 3:31 PM To: beignet@lists.freedesktop.org Cc: Junyan He Subject: [Beignet] [PATCH 00/18] Using bit code ocl lib to replace the huge header file. From: Junyan He junyan...@linux.intel.com The PCH file is growing too big. It contains too many defines and macros, which need a lot of time to parse and may cause some conflicts with the user defined macros. This patch set will extract the function protocols from its definition into the header files and compile all the functions' definition into a llvm bitcode file as a cl library. This manner is also compatible with libclc and we can switch to libclc if libclc is stable and extensive enough. This patch set may cause the compiling time slower than the PCH version, because the header file's parsing and bitcode's linking are not optimized yet, and we will continue to improve it. TODO: Math functions' fast and standard switch in the lib. Header file parsing optimization, may use PCH or PTH.
Linking optimization. Signed-off-by: Junyan He junyan...@linux.intel.com --- backend/CMakeLists.txt| 3 + backend/src/CMakeLists.txt| 141 +- backend/src/GBEConfig.h.in| 2 + backend/src/backend/program.cpp | 239 +- backend/src/builtin_vector_proto.def | 295 - backend/src/gen_as.sh | 101 - backend/src/gen_builtin_vector.py | 414 - backend/src/gen_convert.sh| 553 - backend/src/libocl/Makefile.in|81 + backend/src/libocl/include/ocl.h |23 + backend/src/libocl/include/ocl_async.h|49 + backend/src/libocl/include/ocl_atom.h |84 + backend/src/libocl/include/ocl_common.inh |21 + backend/src/libocl/include/ocl_defines.inh|23 + backend/src/libocl/include/ocl_float.h|79 + backend/src/libocl/include/ocl_geometric.h|39 + backend/src/libocl/include/ocl_image.h| 161 + backend/src/libocl/include/ocl_integer.inh| 160 + backend/src/libocl/include/ocl_math.inh | 103 + backend/src/libocl/include/ocl_misc.h | 122 + backend/src/libocl/include/ocl_printf.h |15 + backend/src/libocl/include/ocl_relational.inh |78 + backend/src/libocl/include/ocl_sync.h |18 + backend/src/libocl/include/ocl_types.h| 104 + backend/src/libocl/include/ocl_vload.h| 143 + backend/src/libocl/include/ocl_workitem.h |15 + backend/src/libocl/lib/ocl_async.cl |69 + backend/src/libocl/lib/ocl_atom.cl| 122 + backend/src/libocl/lib/ocl_barrier.ll |39 + backend/src/libocl/lib/ocl_common.inc |49 + backend/src/libocl/lib/ocl_geometric.cl |96 + backend/src/libocl/lib/ocl_image.cl | 412 + backend/src/libocl/lib/ocl_integer.inc| 352 + backend/src/libocl/lib/ocl_math.inc | 3316 + backend/src/libocl/lib/ocl_memcpy.ll | 336 + backend/src/libocl/lib/ocl_memset.ll | 127 + backend/src/libocl/lib/ocl_misc.cl| 201 + backend/src/libocl/lib/ocl_relational.inc | 151 + backend/src/libocl/lib/ocl_sync.cl|14 + backend/src/libocl/lib/ocl_vload.cl | 257 + backend/src/libocl/lib/ocl_workitem.cl|40 + backend/src/libocl/script/gen_as.sh | 124 + backend/src/libocl/script/gen_common.inc |11 + backend/src/libocl/script/gen_convert.sh | 653 + 
backend/src/libocl/script/gen_vector.py | 382 + backend/src/libocl/script/ocl_common.def |22 + backend/src/libocl/script/ocl_integer.def |31 + backend/src/libocl/script
Re: [Beignet] [PATCH] GBE: clear deadprintfs when current function is done.
OK, thanks for finding this bug. On 二, 2014-08-26 at 15:39 +0800, Ruiling Song wrote: It should be cleared, to prevent invalid pointers staying there when processing the next Function. Signed-off-by: Ruiling Song ruiling.s...@intel.com --- backend/src/llvm/llvm_printf_parser.cpp |1 + 1 file changed, 1 insertion(+) diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp index e02f5aa..00e1ef8 100644 --- a/backend/src/llvm/llvm_printf_parser.cpp +++ b/backend/src/llvm/llvm_printf_parser.cpp @@ -616,6 +616,7 @@ error: prf.first->eraseFromParent(); } +deadprintfs.clear(); delete builder; return changed; ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect performance in some cases
Some comments, On 五, 2014-07-04 at 12:00 +0800, Lv Meng wrote: Signed-off-by: Lv Meng meng...@intel.com --- src/CMakeLists.txt | 3 ++- src/cl_context.h| 1 + src/cl_mem.c| 27 +++-- src/kernels/cl_internal_copy_buf_rect_align4.cl | 15 ++ 4 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 src/kernels/cl_internal_copy_buf_rect_align4.cl diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 46426d9..dff8fdf 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,7 +41,8 @@ set (KERNEL_STR_FILES) set (KERNEL_NAMES cl_internal_copy_buf_align4 cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset -cl_internal_copy_buf_rect cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d +cl_internal_copy_buf_rect cl_internal_copy_buf_rect_align4 +cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d diff --git a/src/cl_context.h b/src/cl_context.h index 75afbf6..f8342d3 100644 --- a/src/cl_context.h +++ b/src/cl_context.h @@ -47,6 +47,7 @@ enum _cl_internal_ker_type { CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET, CL_ENQUEUE_COPY_BUFFER_RECT, + CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4, CL_ENQUEUE_COPY_IMAGE_1D_TO_1D, //copy image 1d to image 1d CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, //copy image 2d to image 2d CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, //copy image 3d to image 2d diff --git a/src/cl_mem.c b/src/cl_mem.c index 70bc3eb..b78258f 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -1396,9 +1396,20 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, size_t dst_row_pitch, size_t dst_slice_pitch) { cl_int ret; cl_kernel ker; + cl_int index; size_t 
global_off[] = {0,0,0}; size_t global_sz[] = {1,1,1}; size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1}; + // the src and dst mem rect is continuous, the copy is degraded to buf copy + if((region[0] == dst_row_pitch) (region[0] == src_row_pitch) + (region[1] * src_row_pitch == src_slice_pitch) (region[1] * dst_row_pitch == dst_slice_pitch)){ +cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0]; +cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0]; +cl_int size = region[0]*region[1]*region[2]; +ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size); +return ret; + } + if(region[1] == 1) local_sz[1] = 1; if(region[2] == 1) local_sz[2] = 1; global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; @@ -1413,8 +1424,20 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, /* setup the kernel and run. */ extern char cl_internal_copy_buf_rect_str[]; extern size_t cl_internal_copy_buf_rect_str_size; - - ker = cl_context_get_static_kernel_from_bin(queue-ctx, CL_ENQUEUE_COPY_BUFFER_RECT, + index = CL_ENQUEUE_COPY_BUFFER_RECT; + if( (src_offset % 4== 0) (dst_offset % 4== 0) (src_row_pitch % 4== 0) (dst_row_pitch % 4== 0) + (src_slice_pitch % 4== 0) (dst_slice_pitch % 4== 0) (global_sz[0] % 4 == 0) ){ +global_sz[0] /= 4; +src_offset /= 4; +dst_offset /= 4; +src_row_pitch /= 4; +dst_row_pitch /= 4; +src_slice_pitch /= 4; +dst_slice_pitch /= 4; +index = CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4; + } + + ker = cl_context_get_static_kernel_from_bin(queue-ctx, index, cl_internal_copy_buf_rect_str, (size_t)cl_internal_copy_buf_rect_str_size, NULL); I think here you use the wrong source string. 
For the align-4 rect buffer, you should use cl_internal_copy_buf_rect_align4_str here, which I notice already exists in cl_internal_copy_buf_rect_align4_str.c. I think you should separate the aligned and unaligned cases, as cl_mem_fill does. if (!ker) diff --git a/src/kernels/cl_internal_copy_buf_rect_align4.cl b/src/kernels/cl_internal_copy_buf_rect_align4.cl new file mode 100644 index 000..fbfe7b2 --- /dev/null +++ b/src/kernels/cl_internal_copy_buf_rect_align4.cl @@ -0,0 +1,15 @@ +kernel void __cl_copy_buffer_rect_align4 ( global int* src, global int* dst, + unsigned int region0, unsigned int region1, unsigned int region2, + unsigned int src_offset, unsigned int dst_offset, +
Re: [Beignet] [PATCH v2] runtime: fix a gpgpu event and thread local gpgpu handling bug.
OK, that's LGTM On 四, 2014-07-03 at 14:14 +0800, Zhigang Gong wrote: When pending a command queue, we need to record the whole gpgpu structure not just the batch buffer. For the following reason: 1. We need to keep those private buffer, for example those printf buffers. 2. We need to make sure this gpgpu will not be reused by other enqueuement. v2: Don't try to flush all user event attached to the queue. Just need to flush the current event when doing command queue flush. Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- src/cl_api.c | 3 +- src/cl_command_queue.c| 14 +++-- src/cl_command_queue.h| 4 +++ src/cl_driver.h | 8 ++ src/cl_driver_defs.c | 4 +-- src/cl_enqueue.c | 2 +- src/cl_event.c| 26 ++--- src/cl_event.h| 7 +++-- src/cl_thread.c | 20 + src/cl_thread.h | 3 ++ src/intel/intel_batchbuffer.c | 13 - src/intel/intel_batchbuffer.h | 1 - src/intel/intel_gpgpu.c | 66 +-- 13 files changed, 97 insertions(+), 74 deletions(-) diff --git a/src/cl_api.c b/src/cl_api.c index d54ada6..8759027 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -69,7 +69,7 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list, cl_event* event, enqueue_data* data, cl_command_type type) { cl_int status = cl_event_wait_events(num, wait_list, queue); - cl_event e; + cl_event e = NULL; if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) { e = cl_event_new(queue-ctx, queue, type, event!=NULL); @@ -85,6 +85,7 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list, cl_event_new_enqueue_callback(e, data, num, wait_list); } } + queue-current_event = e; return status; } diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 8426c4e..cd268aa 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -28,6 +28,7 @@ #include cl_alloc.h #include cl_driver.h #include cl_khr_icd.h +#include cl_event.h #include performance.h #include assert.h @@ -421,10 +422,9 @@ error: return err; } -LOCAL cl_int 
-cl_command_queue_flush(cl_command_queue queue) +LOCAL void +cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu) { - GET_QUEUE_THREAD_GPGPU(queue); size_t global_wk_sz[3]; void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz); @@ -447,7 +447,15 @@ cl_command_queue_flush(cl_command_queue queue) global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0; cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz); } +} +LOCAL cl_int +cl_command_queue_flush(cl_command_queue queue) +{ + GET_QUEUE_THREAD_GPGPU(queue); + cl_command_queue_flush_gpgpu(queue, gpgpu); + if (queue-current_event) +cl_event_flush(queue-current_event); cl_invalid_thread_gpgpu(queue); return CL_SUCCESS; } diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h index b79d63a..bd70f25 100644 --- a/src/cl_command_queue.h +++ b/src/cl_command_queue.h @@ -41,6 +41,7 @@ struct _cl_command_queue { cl_intwait_events_num; /* Number of Non-complete user events */ cl_intwait_events_size; /* The size of array that wait_events point to */ cl_event last_event;/* The last event in the queue, for enqueue mark used */ + cl_event current_event; /* Current event. 
*/ cl_command_queue_properties props; /* Queue properties */ cl_command_queue prev, next; /* We chain the command queues together */ void *thread_data; /* Used to store thread context data */ @@ -82,6 +83,9 @@ cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem); /* Flush for the command queue */ extern cl_int cl_command_queue_flush(cl_command_queue); +/* Flush for the specified gpgpu */ +extern void cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu); + /* Wait for the completion of the command queue */ extern cl_int cl_command_queue_finish(cl_command_queue); diff --git a/src/cl_driver.h b/src/cl_driver.h index 2999eb7..3d1d8d8 100644 --- a/src/cl_driver.h +++ b/src/cl_driver.h @@ -197,13 +197,9 @@ extern cl_gpgpu_event_new_cb *cl_gpgpu_event_new; typedef int (cl_gpgpu_event_update_status_cb)(cl_gpgpu_event, int); extern cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status; -/* pending flush the batch buffer of this event */ -typedef void (cl_gpgpu_event_pending_cb)(cl_gpgpu, cl_gpgpu_event); -extern cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending; - /* flush the batch buffer of this event */ -typedef void (cl_gpgpu_event_resume_cb)(cl_gpgpu_event); -extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume; +typedef void
Re: [Beignet] [PATCH] runtime: recover the maximum read image args to 128.
That's OK On 四, 2014-07-03 at 12:53 +0800, Zhigang Gong wrote: To comply with the full profile. Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- src/cl_gt_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h index 97ba7e2..63c9047 100644 --- a/src/cl_gt_device.h +++ b/src/cl_gt_device.h @@ -39,7 +39,7 @@ .address_bits = 32, .max_mem_alloc_size = 256 * 1024 * 1024, .image_support = CL_TRUE, -.max_read_image_args = 16, +.max_read_image_args = 128, .max_write_image_args = 8, .image_max_array_size = 2048, .image2d_max_width = 8192, ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 3/5] Add %f and %c support for printf.
On 五, 2014-06-20 at 07:18 +, Yang, Rong R wrote: Two comments. -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of junyan...@inbox.com Sent: Wednesday, June 18, 2014 2:42 PM To: beignet@lists.freedesktop.org Cc: Junyan He Subject: [Beignet] [PATCH 3/5] Add %f and %c support for printf. From: Junyan He junyan...@linux.intel.com Add the %c and %f support for printf. Also add the int to float and int to char conversion. Some minor errors such as wrong index flags have been fixed. Signed-off-by: Junyan He junyan...@linux.intel.com --- backend/src/ir/printf.cpp | 69 +++ backend/src/ir/printf.hpp | 4 ++ backend/src/llvm/llvm_printf_parser.cpp | 72 + 3 files changed, 93 insertions(+), 52 deletions(-) diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp index 0a943ac..4bd7f2d 100644 --- a/backend/src/ir/printf.cpp +++ b/backend/src/ir/printf.cpp @@ -17,18 +17,18 @@ */ /** - * \file sampler.cpp + * \file printf.cpp * */ #include stdarg.h #include printf.hpp -#include ocl_common_defines.h namespace gbe { namespace ir { + pthread_mutex_t PrintfSet::lock = PTHREAD_MUTEX_INITIALIZER; uint32_t PrintfSet::append(PrintfFmt* fmt, Unit unit) @@ -43,35 +43,21 @@ namespace gbe } /* Update the total size of size. */ - sizeOfSize = slots.back()-state-out_buf_sizeof_offset - + getPrintfBufferElementSize(slots.size() - 1); + if (slots.size() 0) +sizeOfSize = slots.back()-state-out_buf_sizeof_offset + + getPrintfBufferElementSize(slots.size() - 1); return (uint32_t)fmts.size(); } -/* ugly here. We can not build the va_list dynamically:( - And I have tried - va_list arg; arg = some_ptr; - This works very OK on 32bits platform but can not even - pass the compiling in the 64bits platform. - sizeof(arg) = 4 in 32bits platform but - sizeof(arg) = 24 in 64bits platform. - We can not assume the platform here. */ -void vfprintf_wrap(std::string fmt, vectorint contents) -{ - int* ptr = NULL; - size_t num = contents.size() 32 ? 
contents.size() : 32; - ptr = (int *)calloc(32, sizeof(int)); //should be enough - for (size_t i = 0; i num; i++) { -ptr[i] = contents[i]; - } - - printf(fmt.c_str(), ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7], - ptr[8], ptr[9], ptr[10], ptr[11], ptr[12], ptr[13], ptr[14], ptr[15], ptr[16], - ptr[17], ptr[18], ptr[19], ptr[20], ptr[21], ptr[22], ptr[23], ptr[24], ptr[25], - ptr[26], ptr[27], ptr[28], ptr[29], ptr[30], ptr[31]); - free(ptr); -} +#define PRINT_SOMETHING(target_ty, conv) do { \ + pf_str = pf_str + std::string(#conv); \ + printf(pf_str.c_str(),\ + ((target_ty *)((char *)buf_addr + slot.state-out_buf_sizeof_offset * \ +global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \ + [k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i]);\ + pf_str = ; \ +} while (0) void PrintfSet::outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0, size_t global_wk_sz1, size_t global_wk_sz2) @@ -79,15 +65,15 @@ namespace gbe LockOutput lock; size_t i, j, k; std::string pf_str; - vectorint* contents = NULL; + int stmt = 0; + for (auto pf : fmts) { for (i = 0; i global_wk_sz0; i++) { for (j = 0; j global_wk_sz1; j++) { for (k = 0; k global_wk_sz2; k++) { - int flag = ((int *)index_addr)[k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i]; + int flag = ((int + *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2 + + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i]; if (flag) { pf_str = ; -contents = new vectorint(); for (auto slot : pf) { if (slot.type == PRINTF_SLOT_TYPE_STRING) { pf_str = pf_str + std::string(slot.str); @@ -98,23 +84,34 @@ namespace gbe switch (slot.state-conversion_specifier) { case PRINTF_CONVERSION_D: case PRINTF_CONVERSION_I: - contents-push_back(((int *)((char *)buf_addr + slot.state-out_buf_sizeof_offset - * global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) - [k
Re: [Beignet] [PATCH 2/2] runtime: fix image1d buffer allocation.
Spec says: For a 1D image buffer object, the image pixels are taken from the buffer object’s data store. When the contents of a buffer object’s data store are modified, those changes are reflected in the contents of the 1D image buffer object and vice-versa at corresponding synchronization points. NOTE: Concurrent reading from, writing to and copying between both a buffer object and 1D image buffer object associated with the buffer object is undefined. Only reading from both a buffer object and 1D image buffer object associated with the buffer object is defined. So the corresponding synchronization points seem very important. If the user holds the mapped buffer address, this may cause some problems. On 五, 2014-06-20 at 15:47 +0800, Zhigang Gong wrote: Per bspec, a image should has a at least 2 line vertical alignment, thus we can't simply attach a buffer to a 1d image surface which has the same size. We have to create a new image, and copy the buffer data to this new image. And replace all the buffer object's reference to this image.
Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- src/cl_mem.c | 73 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/src/cl_mem.c b/src/cl_mem.c index a1d3b25..b27e64a 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -480,6 +480,23 @@ error: goto exit; } +void cl_mem_replace_buffer(cl_mem buffer, cl_buffer new_bo) +{ + cl_buffer_unreference(buffer-bo); + buffer-bo = new_bo; + cl_buffer_reference(new_bo); + if (buffer-type != CL_MEM_SUBBUFFER_TYPE) +return; + + struct _cl_mem_buffer *it = ((struct _cl_mem_buffer*)buffer)-sub_next; + for( ; it != (struct _cl_mem_buffer*)buffer; it = it-sub_next) + { +cl_buffer_unreference(it-base.bo); +it-base.bo = new_bo; +cl_buffer_reference(new_bo); + } +} + void cl_mem_copy_image_region(const size_t *origin, const size_t *region, void *dst, size_t dst_row_pitch, size_t dst_slice_pitch, @@ -598,10 +615,12 @@ _cl_mem_new_image(cl_context ctx, if (UNLIKELY(w == 0)) DO_IMAGE_ERROR; if (UNLIKELY(h == 0 (image_type != CL_MEM_OBJECT_IMAGE1D - image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY))) + image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY + image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER))) DO_IMAGE_ERROR; - if (image_type == CL_MEM_OBJECT_IMAGE1D) { + if (image_type == CL_MEM_OBJECT_IMAGE1D || + image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) { size_t min_pitch = bpp * w; if (data pitch == 0) pitch = min_pitch; @@ -809,27 +828,43 @@ _cl_mem_new_image_from_buffer(cl_context ctx, merged_flags = ~(CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS); merged_flags |= flags (CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS); } - - /* Because the buffer is NO_TILING, the image should be no tiling. */ - image = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, merged_flags, CL_FALSE, err); - if (image == NULL || err != CL_SUCCESS) -goto error; - - cl_buffer_reference(buffer-bo); - image-bo = buffer-bo; - image-size = buffer-size; - /* If it is a sub buffer, we need to start from the sub offset. 
*/ + struct _cl_mem_buffer *mem_buffer = (struct _cl_mem_buffer*)buffer; if (buffer-type == CL_MEM_SUBBUFFER_TYPE) { offset = ((struct _cl_mem_buffer *)buffer)-sub_offset; +mem_buffer = mem_buffer-parent; } - if (image-flags CL_MEM_USE_HOST_PTR) { -/* Now point to the right offset if buffer is a SUB_BUFFER. */ -image-host_ptr = buffer-host_ptr + offset; - } + /* Get the size of each pixel */ + if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, bpp)) != CL_SUCCESS)) +goto error; - cl_mem_image_init(cl_mem_image(image), image_desc-image_width, 1, image_desc-image_type, -1, *image_format, intel_fmt, bpp, image_desc-image_width*bpp, 0, CL_NO_TILE, -0, 0, offset); + // Per bspec, a image should has a at least 2 line vertical alignment, + // thus we can't simply attach a buffer to a 1d image surface which has the same size. + // We have to create a new image, and copy the buffer data to this new image. + // And replace all the buffer object's reference to this image. + image = _cl_mem_new_image(ctx, flags, image_format, image_desc-image_type, +mem_buffer-base.size / bpp, 0, 0, 0, 0, NULL, errcode_ret); + if (image == NULL) +return NULL; + void *src = cl_mem_map(buffer); + void *dst = cl_mem_map(image); + // + // FIXME, we could use copy buffer to image to do this on GPU latter. + // currently the copy buffer to image function doesn't support 1D image. + memcpy(dst, src, mem_buffer-base.size); + cl_mem_unmap(buffer); + cl_mem_unmap(image); + + if (err != 0) +goto error; + + // Now
Re: [Beignet] [PATCH] driver: fix a potential Null reference.
There really is a risk here. Thanks for fixing it. On Tue, 2014-06-17 at 11:18 +0800, Zhigang Gong wrote: cl_gpgpu_flush may be called when the batch buffer has been released. We need to check whether there is a valid buffer before we really take the following actions. Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- src/intel/intel_gpgpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 1da6400..6af6e40 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -555,6 +555,8 @@ intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu) static void intel_gpgpu_flush(intel_gpgpu_t *gpgpu) { + if (!gpgpu->batch || !gpgpu->batch->buffer) +return; intel_batchbuffer_emit_mi_flush(gpgpu->batch); intel_batchbuffer_flush(gpgpu->batch); intel_gpgpu_check_binded_buf_address(gpgpu); ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 2/2] runtime: fix some image array related bugs.
Thanks for finding the problem. I think the key point is that an array image always uses the slice_pitch rather than the image_row_pitch. This patch set looks good to me. As you mentioned, I will improve my utest case later, and you can push my first 2 patches first to make your patch set work. On Wed, 2014-06-18 at 10:25 +0800, Zhigang Gong wrote: Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- src/cl_api.c | 5 - src/cl_device_id.c | 1 + src/cl_device_id.h | 1 + src/cl_gt_device.h | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/cl_api.c b/src/cl_api.c index 327f02b..d91 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -674,7 +674,10 @@ clGetSupportedImageFormats(cl_context ctx, err = CL_INVALID_VALUE; goto error; } - if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE2D && + if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE1D && + image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY && + image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY && + image_type != CL_MEM_OBJECT_IMAGE2D && image_type != CL_MEM_OBJECT_IMAGE3D)) { err = CL_INVALID_VALUE; goto error; diff --git a/src/cl_device_id.c b/src/cl_device_id.c index af8e90c..578b548 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -363,6 +363,7 @@ cl_get_device_info(cl_device_id device, DECL_FIELD(IMAGE_SUPPORT, image_support) DECL_FIELD(MAX_READ_IMAGE_ARGS, max_read_image_args) DECL_FIELD(MAX_WRITE_IMAGE_ARGS, max_write_image_args) +DECL_FIELD(IMAGE_MAX_ARRAY_SIZE, image_max_array_size) DECL_FIELD(IMAGE2D_MAX_WIDTH, image2d_max_width) DECL_FIELD(IMAGE2D_MAX_HEIGHT, image2d_max_height) DECL_FIELD(IMAGE3D_MAX_WIDTH, image3d_max_width) diff --git a/src/cl_device_id.h b/src/cl_device_id.h index a5449a7..769bfd2 100644 --- a/src/cl_device_id.h +++ b/src/cl_device_id.h @@ -51,6 +51,7 @@ struct _cl_device_id { cl_uint max_read_image_args; cl_uint max_write_image_args; size_t image2d_max_width; + size_t image_max_array_size; size_t image2d_max_height; size_t image3d_max_width; size_t image3d_max_height; diff --git
a/src/cl_gt_device.h b/src/cl_gt_device.h index b8bda5e..6d03123 100644 --- a/src/cl_gt_device.h +++ b/src/cl_gt_device.h @@ -41,6 +41,7 @@ .image_support = CL_TRUE, .max_read_image_args = 128, .max_write_image_args = 8, +.image_max_array_size = 2048, .image2d_max_width = 8192, .image2d_max_height = 8192, .image3d_max_width = 8192, ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 3/3] Add the test cases for 1D Image Array
For this patch set, the compare_image_2d_and_1d_array test prints the following output: ## x y is (30, 15), color1 is (2 1 30 15), color2 is (2 1 30 30) ## x y is (31, 0), color1 is (2 1 31 0), color2 is (2 1 31 0) ## x y is (31, 1), color1 is (2 1 31 1), color2 is (2 1 31 2) ## x y is (31, 2), color1 is (2 1 31 2), color2 is (2 1 31 4) ## x y is (31, 3), color1 is (2 1 31 3), color2 is (2 1 31 6) ## x y is (31, 4), color1 is (2 1 31 4), color2 is (2 1 31 8) ## x y is (31, 5), color1 is (2 1 31 5), color2 is (2 1 31 10) ## x y is (31, 6), color1 is (2 1 31 6), color2 is (2 1 31 12) ## x y is (31, 7), color1 is (2 1 31 7), color2 is (2 1 31 14) ## x y is (31, 8), color1 is (2 1 31 8), color2 is (2 1 31 16) ## x y is (31, 9), color1 is (2 1 31 9), color2 is (2 1 31 18) ## x y is (31, 10), color1 is (2 1 31 10), color2 is (2 1 31 20) ## x y is (31, 11), color1 is (2 1 31 11), color2 is (2 1 31 22) ## x y is (31, 12), color1 is (2 1 31 12), color2 is (2 1 31 24) ## x y is (31, 13), color1 is (2 1 31 13), color2 is (2 1 31 26) ## x y is (31, 14), color1 is (2 1 31 14), color2 is (2 1 31 28) ## x y is (31, 15), color1 is (2 1 31 15), color2 is (2 1 31 30) color1 is the result read from the image2d_t and color2 is the result read from the image1d_array_t. The h value of the image1d_array_t result always seems to be twice that of the image2d_t result (the last color component is 2*y instead of y). I cannot find the cause of the problem so far; any ideas?
On Tue, 2014-06-17 at 12:07 +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Signed-off-by: Junyan He junyan...@linux.intel.com --- kernels/compare_image_2d_and_1d_array.cl | 12 + kernels/test_get_image_info_array.cl | 25 ++ utests/CMakeLists.txt| 2 + utests/compare_image_2d_and_1d_array.cpp | 78 utests/compiler_get_image_info_array.cpp | 64 ++ 5 files changed, 181 insertions(+) create mode 100644 kernels/compare_image_2d_and_1d_array.cl create mode 100644 kernels/test_get_image_info_array.cl create mode 100644 utests/compare_image_2d_and_1d_array.cpp create mode 100644 utests/compiler_get_image_info_array.cpp diff --git a/kernels/compare_image_2d_and_1d_array.cl b/kernels/compare_image_2d_and_1d_array.cl new file mode 100644 index 000..ff25834 --- /dev/null +++ b/kernels/compare_image_2d_and_1d_array.cl @@ -0,0 +1,12 @@ +__kernel void +compare_image_2d_and_1d_array(image2d_t a1, image1d_array_t a2, sampler_t sampler) +{ + int2 coord; + int4 color1; + int4 color2; + coord.x = get_global_id(0); + coord.y = get_global_id(1); + color1 = read_imagei(a1, sampler, coord); + color2 = read_imagei(a2, sampler, coord); + printf(## x y is (%d, %d), color1 is (%d %d %d %d), color2 is (%d %d %d %d)\n, coord.x, coord.y, color1.x, color1.y, color1.z, color1.w, color2.x, color2.y, color2.z, color2.w); +} diff --git a/kernels/test_get_image_info_array.cl b/kernels/test_get_image_info_array.cl new file mode 100644 index 000..333da77 --- /dev/null +++ b/kernels/test_get_image_info_array.cl @@ -0,0 +1,25 @@ +__kernel void +test_get_image_info_array(__write_only image1d_array_t a1, __write_only image2d_array_t a2, __global int *result) +{ + int w, h, array_sz; + + w = get_image_width(a1); + array_sz = (int)get_image_array_size(a1); + int channel_data_type = get_image_channel_data_type(a1); + int channel_order = get_image_channel_order(a1); + result[0] = w; + result[1] = array_sz; + result[2] = channel_data_type; + result[3] = channel_order; + + w = 
get_image_width(a2); + h = get_image_height(a2); + array_sz = (int)get_image_array_size(a2); + channel_data_type = get_image_channel_data_type(a2); + channel_order = get_image_channel_order(a2); + result[4] = w; + result[5] = h; + result[6] = array_sz; + result[7] = channel_data_type; + result[8] = channel_order; +} diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index f0e62e2..641a73b 100644 --- a/utests/CMakeLists.txt +++ b/utests/CMakeLists.txt @@ -122,6 +122,7 @@ set (utests_sources compiler_volatile.cpp compiler_copy_image1.cpp compiler_get_image_info.cpp + compiler_get_image_info_array.cpp compiler_vect_compare.cpp compiler_vector_load_store.cpp compiler_vector_inc.cpp @@ -182,6 +183,7 @@ set (utests_sources enqueue_fill_buf.cpp enqueue_built_in_kernels.cpp image_1D_buffer.cpp + compare_image_2d_and_1d_array.cpp utest_assert.cpp utest.cpp utest_file_map.cpp diff --git a/utests/compare_image_2d_and_1d_array.cpp b/utests/compare_image_2d_and_1d_array.cpp new file mode 100644 index 000..f989049 --- /dev/null +++ b/utests/compare_image_2d_and_1d_array.cpp @@ -0,0 +1,78 @@ +#include string.h +#include utest_helper.hpp + +static void compare_image_2d_and_1d_array(void) +{ + const int w = 64
Re: [Beignet] [PATCH] HSW: Fix potential issue of GT3 when calc stack address.
Tested on my HSW platform, no obvious regression found. On Thu, 2014-06-12 at 19:42 +0800, Yang Rong wrote: GT3 have 4 half slice, so should shift left 2 bits, and also should enlarge the stack buffer size, otherwize, if thread generate is non-balance, may out of bound. Per bspec, scratch size need set 2X of desired. Signed-off-by: Yang Rong rong.r.y...@intel.com --- backend/src/backend/gen75_context.cpp | 4 ++-- src/cl_command_queue_gen7.c | 6 ++ src/intel/intel_gpgpu.c | 3 +++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp index aedd4d3..da0db85 100644 --- a/backend/src/backend/gen75_context.cpp +++ b/backend/src/backend/gen75_context.cpp @@ -92,12 +92,12 @@ namespace gbe p-curr.predicate = GEN_PREDICATE_NONE; //p-AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff)); p-AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f)); - p-AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x80)); + p-AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180)); p-SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7)); p-curr.execWidth = this-simdWidth; p-SHL(stackptr, stackptr, GenRegister::immud(perLaneShift)); p-curr.execWidth = 1; - p-SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(1)); + p-SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2)); p-ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4)); p-SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift)); p-curr.execWidth = this-simdWidth; diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 9680535..af3030c 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -244,6 +244,12 @@ cl_bind_stack(cl_gpgpu gpgpu, 
cl_kernel ker) assert(offset = 0); stack_sz *= gbe_kernel_get_simd_width(ker-opaque); stack_sz *= device-max_compute_unit; + /* Because HSW calc stack offset per thread is relative with half slice, when + thread schedule in half slice is not balance, would out of bound. Because + the max half slice is 4 in GT4, multiply stack size with 4 for safe. + */ + if(cl_driver_get_ver(ctx-drv) == 75) +stack_sz *= 4; cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl()); } diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 5093583..cae843b 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -833,6 +833,9 @@ intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size) drm_intel_bufmgr *bufmgr = gpgpu-drv-bufmgr; drm_intel_bo* old = gpgpu-scratch_b.bo; uint32_t total = per_thread_size * gpgpu-max_threads; + /* Per Bspec, scratch should 2X the desired size, otherwise luxmark may hang */ + if (IS_HASWELL(gpgpu-drv-device_id)) + total *= 2; gpgpu-per_thread_scratch = per_thread_size; ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Fix the 3D failed problem because the un-inited parameters
Sorry, this patch is for opencl-1.2 On Thu, 2014-06-12 at 14:55 +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Signed-off-by: Junyan He junyan...@linux.intel.com --- utests/compiler_fill_image_3d.cpp | 4 utests/compiler_fill_image_3d_2.cpp | 4 2 files changed, 8 insertions(+) diff --git a/utests/compiler_fill_image_3d.cpp b/utests/compiler_fill_image_3d.cpp index ac0b7e0..ec96e80 100644 --- a/utests/compiler_fill_image_3d.cpp +++ b/utests/compiler_fill_image_3d.cpp @@ -1,3 +1,4 @@ +#include string.h #include utest_helper.hpp static void compiler_fill_image_3d(void) @@ -9,6 +10,9 @@ static void compiler_fill_image_3d(void) cl_image_format format; cl_image_desc desc; + memset(desc, 0x0, sizeof(cl_image_desc)); + memset(format, 0x0, sizeof(cl_image_format)); + format.image_channel_order = CL_RGBA; format.image_channel_data_type = CL_UNSIGNED_INT8; desc.image_type = CL_MEM_OBJECT_IMAGE3D; diff --git a/utests/compiler_fill_image_3d_2.cpp b/utests/compiler_fill_image_3d_2.cpp index 4c56036..410ace8 100644 --- a/utests/compiler_fill_image_3d_2.cpp +++ b/utests/compiler_fill_image_3d_2.cpp @@ -1,3 +1,4 @@ +#include string.h #include utest_helper.hpp static void compiler_fill_image_3d_2(void) @@ -8,6 +9,9 @@ static void compiler_fill_image_3d_2(void) cl_image_format format; cl_image_desc desc; + memset(desc, 0x0, sizeof(cl_image_desc)); + memset(format, 0x0, sizeof(cl_image_format)); + format.image_channel_order = CL_RGBA; format.image_channel_data_type = CL_UNSIGNED_INT8; desc.image_type = CL_MEM_OBJECT_IMAGE3D; ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [[OpenCL-1.2]] GBE: Enable some implemented Opencl 1.2 functions in icd table.
OK, if it is mandatory, I agree. LGTM. On Tue, 2014-06-10 at 14:07 +0800, Zhigang Gong wrote: Just as we discussed, for the mandatory APIs, we 'd better to just keep it as NULL if we haven't implemented it. And if we want to set some stub function, we should not set the stub function here. We should implement a dummy function in the cl_api.c and just put a NOT_SUPPORT there. And then remove the CL_1_2_NOTYET here. In one word, we don't need to add a fake stub function here. Any further comments? -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of He Junyan Sent: Tuesday, June 10, 2014 12:33 PM To: Zhigang Gong Cc: beignet@lists.freedesktop.org Subject: Re: [Beignet] [[OpenCL-1.2]] GBE: Enable some implemented Opencl 1.2 functions in icd table. hi, I want to add a fake stub function here, printf the warning of not implement, and return CL_SOME_ERRORXX, the null function always cause the program crash. On Tue, 2014-06-10 at 09:01 +0800, Zhigang Gong wrote: Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- src/cl_khr_icd.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c index d601134..3a7dec2 100644 --- a/src/cl_khr_icd.c +++ b/src/cl_khr_icd.c @@ -142,19 +142,19 @@ struct _cl_icd_dispatch const cl_khr_icd_dispatch = { #ifdef CL_VERSION_1_2 (void *) NULL, CL_1_2_NOTYET(clCreateSubDevices), - CL_1_2_NOTYET(clRetainDevice), - CL_1_2_NOTYET(clReleaseDevice), - CL_1_2_NOTYET(clCreateImage), - CL_1_2_NOTYET(clCreateProgramWithBuiltInKernels), + clRetainDevice, + clReleaseDevice, + clCreateImage, + clCreateProgramWithBuiltInKernels, CL_1_2_NOTYET(clCompileProgram), CL_1_2_NOTYET(clLinkProgram), CL_1_2_NOTYET(clUnloadPlatformCompiler), CL_1_2_NOTYET(clGetKernelArgInfo), - CL_1_2_NOTYET(clEnqueueFillBuffer), + clEnqueueFillBuffer, CL_1_2_NOTYET(clEnqueueFillImage), CL_1_2_NOTYET(clEnqueueMigrateMemObjects), - CL_1_2_NOTYET(clEnqueueMarkerWithWaitList), - 
CL_1_2_NOTYET(clEnqueueBarrierWithWaitList), + clEnqueueMarkerWithWaitList, + clEnqueueBarrierWithWaitList, CL_1_2_NOTYET(clGetExtensionFunctionAddressForPlatform), CL_GL_INTEROP(clCreateFromGLTexture), (void *) NULL, ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Handle the difference timestamp count, got from drm_intel_reg_read, between 32bit system and 64bit system.
I verified this patch on both my 32bits and 64bits IVB platforms. Both results are OK and pass the profiling_exec test case. On Tue, 2014-06-10 at 16:17 +0800, Yang Rong wrote: In x86_64 system, the low 32bits of timestamp count are stored in the high 32 bits of result which got from drm_intel_reg_read, and 32-35 bits are lost; but in i386 system, the timestamp count match bspec. It seems the kernel readq bug. So shift 32 bit in x86_64, and only remain 32 bits data in i386. Signed-off-by: Yang Rong rong.r.y...@intel.com --- src/intel/intel_gpgpu.c | 31 --- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index a1bd672..7aa5563 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -1170,12 +1170,16 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts) drm_intel_bufmgr *bufmgr = gpgpu-drv-bufmgr; drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, result); - if (IS_HASWELL(gpgpu-drv-device_id)) { -result = result 0x000F; - } else { -result = result 0xF000; -result = result 28; - } + /* In x86_64 system, the low 32bits of timestamp count are stored in the high 32 bits of + result which got from drm_intel_reg_read, and 32-35 bits are lost; but match bspec in + i386 system. It seems the kernel readq bug. So shift 32 bit in x86_64, and only remain + 32 bits data in i386. + */ +#ifdef __i386__ + result = result 0x0; +#else + result = result 32; +#endif /* __i386__ */ result *= 80; *ret_ts = result; @@ -1195,15 +1199,12 @@ intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event, uint64_t* ptr = event-ts_buf-virtual; result = ptr[index]; - if (IS_HASWELL(gpgpu-drv-device_id)) -result = (result 0xF) * 80; //convert to nanoseconds - else -/* According to BSpec, the timestamp counter should be 36 bits, - but comparing to the timestamp counter from IO control reading, - we find the first 4 bits seems to be fake. 
In order to keep the - timestamp counter conformable, we just skip the first 4 bits. - */ -result = ((result 0x0) 4) * 80; //convert to nanoseconds + /* According to BSpec, the timestamp counter should be 36 bits, + but comparing to the timestamp counter from IO control reading, + we find the first 4 bits seems to be fake. In order to keep the + timestamp counter conformable, we just skip the first 4 bits. + */ + result = (result 0x0) * 80; //convert to nanoseconds *ret_ts = result; drm_intel_gem_bo_unmap_gtt(event-ts_buf); ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [[OpenCL-1.2]] GBE: Enable some implemented Opencl 1.2 functions in icd table.
Hi, I want to add a fake stub function here that prints a warning saying the function is not implemented and returns CL_SOME_ERRORXX; a NULL function pointer always causes the program to crash. On Tue, 2014-06-10 at 09:01 +0800, Zhigang Gong wrote: Signed-off-by: Zhigang Gong zhigang.g...@intel.com --- src/cl_khr_icd.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c index d601134..3a7dec2 100644 --- a/src/cl_khr_icd.c +++ b/src/cl_khr_icd.c @@ -142,19 +142,19 @@ struct _cl_icd_dispatch const cl_khr_icd_dispatch = { #ifdef CL_VERSION_1_2 (void *) NULL, CL_1_2_NOTYET(clCreateSubDevices), - CL_1_2_NOTYET(clRetainDevice), - CL_1_2_NOTYET(clReleaseDevice), - CL_1_2_NOTYET(clCreateImage), - CL_1_2_NOTYET(clCreateProgramWithBuiltInKernels), + clRetainDevice, + clReleaseDevice, + clCreateImage, + clCreateProgramWithBuiltInKernels, CL_1_2_NOTYET(clCompileProgram), CL_1_2_NOTYET(clLinkProgram), CL_1_2_NOTYET(clUnloadPlatformCompiler), CL_1_2_NOTYET(clGetKernelArgInfo), - CL_1_2_NOTYET(clEnqueueFillBuffer), + clEnqueueFillBuffer, CL_1_2_NOTYET(clEnqueueFillImage), CL_1_2_NOTYET(clEnqueueMigrateMemObjects), - CL_1_2_NOTYET(clEnqueueMarkerWithWaitList), - CL_1_2_NOTYET(clEnqueueBarrierWithWaitList), + clEnqueueMarkerWithWaitList, + clEnqueueBarrierWithWaitList, CL_1_2_NOTYET(clGetExtensionFunctionAddressForPlatform), CL_GL_INTEROP(clCreateFromGLTexture), (void *) NULL, ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH V2] add [opencl-1.2] API clCreateSubDevice.
That's OK On Fri, 2014-06-06 at 05:07 +0800, xionghu@intel.com wrote: From: Luo xionghu@intel.com creates an array of sub-devices that each reference a non-intersecting set of compute units within in_device, according to a partition scheme given by properties. --- src/cl_api.c | 10 -- src/cl_device_id.c | 6 ++ src/cl_device_id.h | 7 +++ src/cl_gt_device.h | 7 ++- 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/cl_api.c b/src/cl_api.c index 8598088..8264970 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -242,8 +242,14 @@ clCreateSubDevices(cl_device_id in_device, cl_device_id * out_devices, cl_uint *num_devices_ret) { - NOT_IMPLEMENTED; - return 0; + /* Check parameter consistency */ + if (UNLIKELY(out_devices == NULL num_devices_ret == NULL)) +return CL_INVALID_VALUE; + if (UNLIKELY(in_device == NULL properties == NULL)) +return CL_INVALID_VALUE; + + *num_devices_ret = 0; + return CL_INVALID_DEVICE_PARTITION_COUNT; } cl_int diff --git a/src/cl_device_id.c b/src/cl_device_id.c index 8ec7741..df37519 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -393,6 +393,12 @@ cl_get_device_info(cl_device_id device, DECL_STRING_FIELD(OPENCL_C_VERSION, opencl_c_version) DECL_STRING_FIELD(EXTENSIONS, extensions); DECL_STRING_FIELD(BUILT_IN_KERNELS, built_in_kernels) +DECL_FIELD(PARENT_DEVICE, parent_device) +DECL_FIELD(PARTITION_MAX_SUB_DEVICES, partition_max_sub_device) +DECL_FIELD(PARTITION_PROPERTIES, partition_property) +DECL_FIELD(PARTITION_AFFINITY_DOMAIN, affinity_domain) +DECL_FIELD(PARTITION_TYPE, partition_type) +DECL_FIELD(REFERENCE_COUNT, device_reference_count) case CL_DRIVER_VERSION: if (param_value_size_ret) { diff --git a/src/cl_device_id.h b/src/cl_device_id.h index 2bbe98e..a5449a7 100644 --- a/src/cl_device_id.h +++ b/src/cl_device_id.h @@ -98,6 +98,13 @@ struct _cl_device_id { /* Kernel specific info that we're assigning statically */ size_t wg_sz; size_t preferred_wg_sz_mul; + /* SubDevice specific info */ + cl_device_id 
parent_device; + cl_uint partition_max_sub_device; + cl_device_partition_property partition_property[3]; + cl_device_affinity_domainaffinity_domain; + cl_device_partition_property partition_type[3]; + cl_uint device_reference_count; }; /* Get a device from the given platform */ diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h index cab2c58..b8bda5e 100644 --- a/src/cl_gt_device.h +++ b/src/cl_gt_device.h @@ -101,5 +101,10 @@ DECL_INFO_STRING(built_in_kernels, __cl_copy_region_align4; DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING) #undef DECL_INFO_STRING - +.parent_device = NULL, +.partition_max_sub_device = 1, +.partition_property = {0}, +.affinity_domain = 0, +.partition_type = {0}, +.device_reference_count = 1, ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 5/8] HSW: Use the drm flag I915_EXEC_ENABLE_SLM to set L3 control config.
Hi I find the drm_intel_gem_context_create, which will call DRM_IOCTL_I915_GEM_CONTEXT_CREATE IOCtrl of kernel. This is implemented after kernel 3.10 version. So if the kernel version is before 3.10, assert(driver-ctx); will happen. So our beignet will not support old kernel version now, I think we should update the README. On Mon, 2014-05-12 at 23:12 +0800, Yang Rong wrote: Because LRI commands will be converted to NOOP, add the I915_EXEC_ENABLE_SLM flag to the drm kernal driver, to enable SLM in the L3. Set the flag when application use slm. Still keep the L3 config in the batch buffer for fulsim. Also create and use the openCL own context when exec, to avoid affect the other context. Signed-off-by: Yang Rong rong.r.y...@intel.com --- src/intel/intel_batchbuffer.c | 10 +- src/intel/intel_batchbuffer.h | 3 +++ src/intel/intel_driver.c | 19 ++ src/intel/intel_driver.h | 1 + src/intel/intel_gpgpu.c | 46 +++ 5 files changed, 74 insertions(+), 5 deletions(-) diff --git a/src/intel/intel_batchbuffer.c b/src/intel/intel_batchbuffer.c index 62eedd0..19dc901 100644 --- a/src/intel/intel_batchbuffer.c +++ b/src/intel/intel_batchbuffer.c @@ -74,6 +74,7 @@ intel_batchbuffer_reset(intel_batchbuffer_t *batch, size_t sz) batch-ptr = batch-map; batch-atomic = 0; batch-last_bo = batch-buffer; + batch-enable_slm = 0; } LOCAL void @@ -119,7 +120,14 @@ intel_batchbuffer_flush(intel_batchbuffer_t *batch) if (!is_locked) intel_driver_lock_hardware(batch-intel); - dri_bo_exec(batch-buffer, used, 0, 0, 0); + int flag = I915_EXEC_RENDER; + if(batch-enable_slm) { +/* use the hard code here temp, must change to + * I915_EXEC_ENABLE_SLM when it drm accept the patch */ +flag |= (113); + } + drm_intel_gem_bo_context_exec(batch-buffer, batch-intel-ctx, used, flag); + if (!is_locked) intel_driver_unlock_hardware(batch-intel); diff --git a/src/intel/intel_batchbuffer.h b/src/intel/intel_batchbuffer.h index 74f1790..0c3bc13 100644 --- a/src/intel/intel_batchbuffer.h +++ 
b/src/intel/intel_batchbuffer.h @@ -83,6 +83,9 @@ typedef struct intel_batchbuffer uint32_t size; uint8_t *map; uint8_t *ptr; + /** HSW: can't set LRI in batch buffer, set I915_EXEC_ENABLE_SLM + * flag when call exec. */ + uint8_t enable_slm; int atomic; } intel_batchbuffer_t; diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index ef97835..08d6bc0 100644 --- a/src/intel/intel_driver.c +++ b/src/intel/intel_driver.c @@ -106,6 +106,7 @@ intel_driver_delete(intel_driver_t *driver) { if (driver == NULL) return; + if (driver-bufmgr) drm_intel_bufmgr_destroy(driver-bufmgr); cl_free(driver); @@ -139,6 +140,21 @@ intel_driver_memman_init(intel_driver_t *driver) drm_intel_bufmgr_gem_enable_reuse(driver-bufmgr); } +static void +intel_driver_context_init(intel_driver_t *driver) +{ + driver-ctx = drm_intel_gem_context_create(driver-bufmgr); + assert(driver-ctx); +} + +static void +intel_driver_context_destroy(intel_driver_t *driver) +{ + if(driver-ctx) +drm_intel_gem_context_destroy(driver-ctx); + driver-ctx = NULL; +} + static void intel_driver_init(intel_driver_t *driver, int dev_fd) { @@ -151,6 +167,7 @@ intel_driver_init(intel_driver_t *driver, int dev_fd) intel_driver_get_param(driver, I915_PARAM_CHIPSET_ID, driver-device_id); assert(res); intel_driver_memman_init(driver); + intel_driver_context_init(driver); #if EMULATE_GEN driver-gen_ver = EMULATE_GEN; @@ -364,6 +381,7 @@ intel_get_device_id(void) assert(driver != NULL); intel_driver_open(driver, NULL); intel_device_id = driver-device_id; + intel_driver_context_destroy(driver); intel_driver_close(driver); intel_driver_terminate(driver); intel_driver_delete(driver); @@ -376,6 +394,7 @@ cl_intel_driver_delete(intel_driver_t *driver) { if (driver == NULL) return; + intel_driver_context_destroy(driver); intel_driver_close(driver); intel_driver_terminate(driver); intel_driver_delete(driver); diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h index a01d881..34efbbb 100644 --- 
a/src/intel/intel_driver.h +++ b/src/intel/intel_driver.h @@ -78,6 +78,7 @@ typedef struct _XDisplay Display; typedef struct intel_driver { dri_bufmgr *bufmgr; + drm_intel_context *ctx; int fd; int device_id; int gen_ver; diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 603a075..103a4b2 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -118,6 +118,8 @@ struct intel_gpgpu typedef struct intel_gpgpu intel_gpgpu_t; +typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm); +intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL; static void
Re: [Beignet] [PATCH] Fix cl_event_get_timestamp for submit and queued
So, you apply the same logic in the function intel_gpgpu_event_get_exec_timestamp here. BSpec really have some mistakes. But for most of the IVB platform, -result = result 0xF000; -result = result 28; works very well. So I think if you want to correct this, you should add the PCIID check like HSW. IS_(gpgpu-drv-device_id) should be added here. If you do not know how to do it, please notify the PCIID of your Baytrail-I E3827 On Wed, 2014-06-04 at 16:15 +, michael.j.fergu...@l-3com.com wrote: commit a9ab94503348068579e8e816e80eb62598fd7f5f Author: Michael Ferguson michael.j.fergu...@l-3com.com Date: Fri May 30 11:32:36 2014 -0600 Fix cl_event_get_timestamp for submit and queued The cl_gpgpu_event_get_gpu_cur_timestamp function did not apply the same logic as the cl_gpgpu_event_get_exec_timestamp regarding the timestamp counter on the Baytrail, which resulted in a bogus GPU current timestamp. Tests on the Baytrail-I E3827 indicated the following clock values in the profiling_exec test before this patch: queued = 1920 submit = 1920 start = 2762442307840 end= 2762442351360 Obviously these values were not correct for the queued and submit counters. After applying this patch the values in the profiling_exec test indicated: queued = 320306542080 submit = 320306617600 start = 320308817920 end= 320308857600 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index bde9bd5..22e04f5 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -1138,8 +1138,12 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts) if (IS_HASWELL(gpgpu-drv-device_id)) { result = result 0x000F; } else { -result = result 0xF000; -result = result 28; +/* According to BSpec, the timestamp counter should be 36 bits, + but comparing to the timestamp counter from IO control reading, + we find the first 4 bits seems to be fake. In order to keep the + timestamp counter conformable, we just skip the first 4 bits. 
+ */ +result = (result 0x0) 4; } result *= 80; ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Fix timestamp on HASWELL
OK, This patch LGTM On Mon, 2014-05-26 at 19:25 +0800, Li Peng wrote: The GPU timestamp should be lower 36 bit on HASWELL Signed-off-by: Li Peng peng...@intel.com --- src/cl_driver.h | 2 +- src/cl_event.c | 4 ++-- src/intel/intel_gpgpu.c | 26 +- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/cl_driver.h b/src/cl_driver.h index 9dc2330..3e01c92 100644 --- a/src/cl_driver.h +++ b/src/cl_driver.h @@ -193,7 +193,7 @@ typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event); extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete; /* Get a event time stamp */ -typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu_event, int, uint64_t*); +typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, cl_gpgpu_event, int, uint64_t*); extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp; /* Get current GPU time stamp */ diff --git a/src/cl_event.c b/src/cl_event.c index 727ee1f..30e0e06 100644 --- a/src/cl_event.c +++ b/src/cl_event.c @@ -514,11 +514,11 @@ cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name) event-timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; return CL_SUCCESS; } else if(param_name == CL_PROFILING_COMMAND_START) { -cl_gpgpu_event_get_exec_timestamp(event-gpgpu_event, 0, ret_val); +cl_gpgpu_event_get_exec_timestamp(gpgpu, event-gpgpu_event, 0, ret_val); event-timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; return CL_SUCCESS; } else if (param_name == CL_PROFILING_COMMAND_END) { -cl_gpgpu_event_get_exec_timestamp(event-gpgpu_event, 1, ret_val); +cl_gpgpu_event_get_exec_timestamp(gpgpu, event-gpgpu_event, 1, ret_val); event-timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; return CL_SUCCESS; } diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index b7b712f..2ab2bb7 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -1110,8 +1110,12 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* 
ret_ts) drm_intel_bufmgr *bufmgr = gpgpu-drv-bufmgr; drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, result); - result = result 0xF000; - result = result 28; + if (IS_HASWELL(gpgpu-drv-device_id)) { +result = result 0x000F; + } else { +result = result 0xF000; +result = result 28; + } result *= 80; *ret_ts = result; @@ -1120,8 +1124,8 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts) /* Get the GPU execute time. */ static void -intel_gpgpu_event_get_exec_timestamp(intel_event_t *event, -int index, uint64_t* ret_ts) +intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event, + int index, uint64_t* ret_ts) { uint64_t result = 0; @@ -1131,11 +1135,15 @@ intel_gpgpu_event_get_exec_timestamp(intel_event_t *event, uint64_t* ptr = event-ts_buf-virtual; result = ptr[index]; - /* According to BSpec, the timestamp counter should be 36 bits, - but comparing to the timestamp counter from IO control reading, - we find the first 4 bits seems to be fake. In order to keep the - timestamp counter conformable, we just skip the first 4 bits. */ - result = ((result 0x0) 4) * 80; //convert to nanoseconds + if (IS_HASWELL(gpgpu-drv-device_id)) +result = (result 0xF) * 80; //convert to nanoseconds + else +/* According to BSpec, the timestamp counter should be 36 bits, + but comparing to the timestamp counter from IO control reading, + we find the first 4 bits seems to be fake. In order to keep the + timestamp counter conformable, we just skip the first 4 bits. + */ +result = ((result 0x0) 4) * 80; //convert to nanoseconds *ret_ts = result; drm_intel_gem_bo_unmap_gtt(event-ts_buf); ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 4/5] HSW: enable the surface's cache in HSW.
I tested the whole patch suite on my HSW platform and didn't find obvious regression. On Fri, 2014-05-30 at 09:22 +0800, Zhigang Gong wrote: On Fri, May 30, 2014 at 12:37:33AM +0800, Yang Rong wrote: HSW's surface cache control is changed, correct it. And also disable exec flag for slm. When kernel parse cmd finish, need remove it totally Signed-off-by: Yang Rong rong.r.y...@intel.com --- src/cl_command_queue.c | 4 +-- src/cl_command_queue_gen7.c | 4 +-- src/cl_device_id.c | 2 +- src/cl_driver.h | 19 +- src/cl_driver_defs.c| 1 + src/intel/intel_gpgpu.c | 61 - 6 files changed, 62 insertions(+), 29 deletions(-) LOCAL cl_int diff --git a/src/cl_device_id.c b/src/cl_device_id.c index 018da95..538c88a 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -86,7 +86,7 @@ static struct _cl_device_id intel_hsw_gt2_device = { .max_compute_unit = 140, .max_thread_per_unit = 7, .max_work_item_sizes = {512, 512, 512}, - .max_work_group_size = 512, + .max_work_group_size = 1024, Why change max work group size in this patch? static void intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu) { - const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */ + const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */ BEGIN_BATCH(gpgpu-batch, 10); OUT_BATCH(gpgpu-batch, CMD_STATE_BASE_ADDRESS | 8); /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */ @@ -233,12 +245,12 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu) ADVANCE_BATCH(gpgpu-batch); } -uint32_t get_scratch_index_gen7(uint32_t size) { +uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) { return size / 1024 - 1; } -uint32_t get_scratch_index_gen75(uint32_t size) { -size = size 12; +uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) { +size = size 11; So this patch also fix the scratch configuration? right? If it is expected, I think you may need to add related info into the commit log. 
@@ -411,25 +421,29 @@ static void intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm) { /* still set L3 in batch buffer for fulsim. */ - BEGIN_BATCH(gpgpu-batch, 6); + BEGIN_BATCH(gpgpu-batch, 9); + OUT_BATCH(gpgpu-batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ + OUT_BATCH(gpgpu-batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET); + OUT_BATCH(gpgpu-batch, 0x0061); + OUT_BATCH(gpgpu-batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu-batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET); + if (use_slm) -OUT_BATCH(gpgpu-batch, gpgpu_l3_config_reg1[8]); +OUT_BATCH(gpgpu-batch, gpgpu_l3_config_reg1[12]); can we change to use a specific value here rather than to pick a value from magic array? Just as baytrail, if the register definition is published on 01.org, a meaningful comment is also nice to have. Other part LGTM. ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Refine the cl thread implement for queue.
I think the batch buffer is just one kind of resource. The other resources, such as image buffers and data buffers, also need to be kept intact for each thread. Unless you add a lock at the NDRQueue entrance, which begins preparing all the required resources. But I think that lock would be too heavy. On Mon, 2014-05-26 at 09:37 +0800, Zhigang Gong wrote: And I just checked the clFinish and clFlush, they only need to access the batch buffer. So the root cause is that we always allocate a new batch buffer for each new kernel submitted to a queue, even if there are many kernels enqueued on the same queue. If we can maintain a uniform batch buffer for the single queue, then this issue will be solved clearly and gracefully. IMO, this is not an OpenCL spec issue. This is an implementation issue which we should solve in the future. What's your opinion? BTW, I'm ok with the current implementation. But I found you may have missed some minor comments which are embedded in my first email. Could you recheck them, address those comments, and send a new version of the patch? -Original Message- From: Zhigang Gong [mailto:zhigang.g...@linux.intel.com] Sent: Monday, May 26, 2014 9:31 AM To: 'He Junyan' Cc: 'Junyan He'; 'beignet@lists.freedesktop.org' Subject: RE: [Beignet] [PATCH] Refine the cl thread implement for queue. Ok. The key issue is that the private gpgpu data structure is still needed after each kernel execution. And the gpgpu data is different for each kernel execution, right? Could you list all of the scenarios where we need to use the gpgpu data after the kernel is submitted? I can think of the following two: 1. clFinish 2. clFlush Are there any other cases? -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of He Junyan Sent: Friday, May 23, 2014 11:36 PM To: Zhigang Gong Cc: Junyan He; beignet@lists.freedesktop.org Subject: Re: [Beignet] [PATCH] Refine the cl thread implement for queue. I think it is hard to avoid using thread local data.
Because when the queue creating, we do not know how many threads will use this queue later. The GPGPU resources will be hold in the queue, but at least every thread should have a local data to store the index to find the GPGPU data in the queue. And the thread should need to destroy the GPGPU resource when the thread exit, while the queue's life time may be much longer than the thread. OpenCL spec fail to define the relationship between the queue and the threads. This cause the dilemma. Please give some good advices if any. On Fri, 2014-05-23 at 16:33 +0800, Zhigang Gong wrote: Some minor comments as below. One thought about the usage of thread local data we are using here. The original reason why we want to use thread local data is to avoid lock as much as possible. But finally, we found to satisfy all the use scenario, we can't avoid lock any way. Now we introduce lock eventually. Then is there still good reason why we should use these thread local data any more? One possible question is as below: If one queue is used in another thread to enqueue task, does it make sense to create a thread local new gpgpu data and in this thread. Or we can just simply lock and wait for other thread to unlock the queue? On Tue, May 20, 2014 at 02:26:47PM +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Because the cl_command_queue can be used in several threads simultaneously but without add ref to it, we now handle it like this: Keep one threads_slot_array, every time the thread get gpgpu or batch buffer, if it does not have a slot, assign it. The resources are keeped in queue private, and resize it if needed. When the thread exit, the slot will be set invalid. When queue released, all the resources will be released. If user still enqueue, flush or finish the queue after it has been released, the behavior is undefined. TODO: Need to shrink the slot map. 
Signed-off-by: Junyan He junyan...@linux.intel.com --- src/cl_command_queue.c | 6 +- src/cl_command_queue_gen7.c | 2 +- src/cl_context.c| 2 +- src/cl_device_id.c | 2 +- src/cl_thread.c | 261 +--- src/cl_thread.h | 6 +- 6 files changed, 205 insertions(+), 74 deletions(-) diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 6a699c0..802d313 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -60,6 +60,7 @@ cl_command_queue_new(cl_context ctx) /* The queue also belongs to its context */ cl_context_add_ref(ctx); + useless new line. exit: return queue
Re: [Beignet] [PATCH V2] gbe_bin_generater: fix two bugs.
Some PCI IDs do not have the Gen keyword, such as Ruiling's IVBridge XX, so the command line needs to be refined. On Fri, 2014-05-23 at 19:04 +0800, Zhigang Gong wrote: From: Zhigang Gong zhigang.g...@linux.intel.com The pci id detecting method is broken on some system. And the gen pci id parsing in gbe_bin_generater is incorrect when the pci id has a-f hex digit. v2: Add VGA to filter out some nonVGA devices. Signed-off-by: Zhigang Gong zhigang.g...@linux.intel.com --- backend/src/gbe_bin_generater.cpp | 7 +-- src/GetGenID.sh | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp index 50020b5..898e2f2 100644 --- a/backend/src/gbe_bin_generater.cpp +++ b/backend/src/gbe_bin_generater.cpp @@ -34,6 +34,7 @@ #include vector #include algorithm #include stdlib.h +#include iostream #include stdio.h #include backend/program.h @@ -46,7 +47,7 @@ using namespace std; #define FILE_BUILD_FAILED 3 #define FILE_SERIALIZATION_FAILED 4 -static int gen_pci_id = 0; +static uint32_t gen_pci_id = 0; class program_build_instance { @@ -296,7 +297,9 @@ int main (int argc, const char **argv) return 1; } -gen_pci_id = (s[0] - '0') 12 | (s[1] - '0') 8 | (s[2] - '0') 4 | (s[3] - '0'); +std::stringstream str(s); +str std::hex gen_pci_id; + used_index[optind-1] = 1; // We must set the image base index here, as we invoke the backend in a non-standard way. gbe_set_image_base_index(3); diff --git a/src/GetGenID.sh b/src/GetGenID.sh index 3114bd8..f8cb0a7 100755 --- a/src/GetGenID.sh +++ b/src/GetGenID.sh @@ -1,2 +1,2 @@ #!/bin/bash -lspci -nn | grep Gen .* Graphics -i | grep \[8086:.*\] -o | awk -F : '{print $2}' | awk -F ] '{print $1}' +lspci -nn | grep VGA.*Gen.*\[8086: -i | grep \[8086:.*\] -o | awk -F : '{print $2}' | awk -F ] '{print $1}' ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Add the pci id support for gbe_generate
Yes, it's on my plan and this patch is just the first step to make the HSW's buffer copy and image copy workable. On Tue, 2014-05-20 at 07:57 +, Yang, Rong R wrote: This patch detect the building platform's pci id and generate the bin for host when building, it is necessary to generate the corresponding binary for IVB and HSW. I think support cross platform bin generate and use a string in command line is next step. -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Song, Ruiling Sent: Tuesday, May 20, 2014 2:35 PM To: He Junyan; beignet@lists.freedesktop.org Cc: Junyan He Subject: Re: [Beignet] [PATCH] Add the pci id support for gbe_generate You directly use pcid, right? What about changing to use a string as the command argument. Like 'ivb', 'hsw'? That would be meaningful for users. Thanks! Ruiling -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of He Junyan Sent: Tuesday, May 20, 2014 1:33 PM To: beignet@lists.freedesktop.org Cc: Junyan He Subject: Re: [Beignet] [PATCH] Add the pci id support for gbe_generate ping for review On Tue, 2014-05-13 at 09:34 +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Signed-off-by: Junyan He junyan...@linux.intel.com --- backend/src/gbe_bin_generater.cpp | 20 +++- src/CMakeLists.txt|8 +++- src/GetGenID.sh |2 ++ utests/CMakeLists.txt |7 ++- 4 files changed, 34 insertions(+), 3 deletions(-) create mode 100755 src/GetGenID.sh diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp index 15bdbd1..b6248ae 100644 --- a/backend/src/gbe_bin_generater.cpp +++ b/backend/src/gbe_bin_generater.cpp @@ -46,6 +46,8 @@ using namespace std; #define FILE_BUILD_FAILED 3 #define FILE_SERIALIZATION_FAILED 4 +static int gen_pci_id = 0; + class program_build_instance { protected: @@ -249,7 +251,7 @@ int main (int argc, const char **argv) argv_saved.push_back(string(argv[i])); } -while ( (oc = getopt(argc, (char * 
const *)argv, o:p:s)) != -1 ) { +while ( (oc = getopt(argc, (char * const *)argv, t:o:p:s)) != + -1 ) { switch (oc) { case 'p': { @@ -283,6 +285,22 @@ int main (int argc, const char **argv) used_index[optind-1] = 1; break; +case 't': +{ +char *s = optarg; +if (optarg[0] == '0' (optarg[1] == 'x' || optarg[1] == 'X')) +s += 2; + +if (s[0] '0' || s[0] '9') { +cout Invalid target option argument endl; +return 1; +} + +gen_pci_id = (s[0] - '0') 12 | (s[1] - '0') 8 | (s[2] - '0') 4 | (s[3] - '0'); +used_index[optind-1] = 1; +break; +} + case 's': program_build_instance::set_str_fmt_out(true); used_index[optind-1] = 1; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8164a44..f93ddcd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,6 +4,12 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/ ${CMAKE_CURRENT_SOURCE_DIR}/../include ${MESA_SOURCE_INCLUDES}) + +set(GEN_PCI_ID) +execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh +OUTPUT_VARIABLE GEN_PCI_ID) +message(STATUS Platform Gen PCI id is ${GEN_PCI_ID}) + macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES) foreach (KF ${KERNEL_FILES}) set (input_file ${KERNEL_PATH}/${KF}.cl) @@ -12,7 +18,7 @@ foreach (KF ${KERNEL_FILES}) add_custom_command( OUTPUT ${output_file} COMMAND rm -rf ${output_file} -COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} +COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} + -t${GEN_PCI_ID} DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater) endforeach (KF) endmacro (MakeKernelBinStr) diff --git a/src/GetGenID.sh b/src/GetGenID.sh new file mode 100755 index 000..3114bd8 --- /dev/null +++ b/src/GetGenID.sh @@ -0,0 +1,2 @@ +#!/bin/bash +lspci -nn | grep Gen .* Graphics -i | grep \[8086:.*\] -o | awk -F : '{print $2}' | awk -F ] '{print $1}' diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index 704438d..2a9ea66 100644 --- a/utests/CMakeLists.txt +++ 
b/utests/CMakeLists.txt @@ -180,10 +180,15 @@ set (utests_sources utest_file_map.cpp utest_helper.cpp) +set(GEN_PCI_ID) +execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh +OUTPUT_VARIABLE GEN_PCI_ID) + +message
Re: [Beignet] [PATCH] Add the pci id support for gbe_generate
ping for review On Tue, 2014-05-13 at 09:34 +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Signed-off-by: Junyan He junyan...@linux.intel.com --- backend/src/gbe_bin_generater.cpp | 20 +++- src/CMakeLists.txt|8 +++- src/GetGenID.sh |2 ++ utests/CMakeLists.txt |7 ++- 4 files changed, 34 insertions(+), 3 deletions(-) create mode 100755 src/GetGenID.sh diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp index 15bdbd1..b6248ae 100644 --- a/backend/src/gbe_bin_generater.cpp +++ b/backend/src/gbe_bin_generater.cpp @@ -46,6 +46,8 @@ using namespace std; #define FILE_BUILD_FAILED 3 #define FILE_SERIALIZATION_FAILED 4 +static int gen_pci_id = 0; + class program_build_instance { protected: @@ -249,7 +251,7 @@ int main (int argc, const char **argv) argv_saved.push_back(string(argv[i])); } -while ( (oc = getopt(argc, (char * const *)argv, o:p:s)) != -1 ) { +while ( (oc = getopt(argc, (char * const *)argv, t:o:p:s)) != -1 ) { switch (oc) { case 'p': { @@ -283,6 +285,22 @@ int main (int argc, const char **argv) used_index[optind-1] = 1; break; +case 't': +{ +char *s = optarg; +if (optarg[0] == '0' (optarg[1] == 'x' || optarg[1] == 'X')) +s += 2; + +if (s[0] '0' || s[0] '9') { +cout Invalid target option argument endl; +return 1; +} + +gen_pci_id = (s[0] - '0') 12 | (s[1] - '0') 8 | (s[2] - '0') 4 | (s[3] - '0'); +used_index[optind-1] = 1; +break; +} + case 's': program_build_instance::set_str_fmt_out(true); used_index[optind-1] = 1; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8164a44..f93ddcd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,6 +4,12 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/ ${CMAKE_CURRENT_SOURCE_DIR}/../include ${MESA_SOURCE_INCLUDES}) + +set(GEN_PCI_ID) +execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh +OUTPUT_VARIABLE GEN_PCI_ID) +message(STATUS Platform Gen PCI id is ${GEN_PCI_ID}) + macro 
(MakeKernelBinStr KERNEL_PATH KERNEL_FILES) foreach (KF ${KERNEL_FILES}) set (input_file ${KERNEL_PATH}/${KF}.cl) @@ -12,7 +18,7 @@ foreach (KF ${KERNEL_FILES}) add_custom_command( OUTPUT ${output_file} COMMAND rm -rf ${output_file} -COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} +COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} -t${GEN_PCI_ID} DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater) endforeach (KF) endmacro (MakeKernelBinStr) diff --git a/src/GetGenID.sh b/src/GetGenID.sh new file mode 100755 index 000..3114bd8 --- /dev/null +++ b/src/GetGenID.sh @@ -0,0 +1,2 @@ +#!/bin/bash +lspci -nn | grep Gen .* Graphics -i | grep \[8086:.*\] -o | awk -F : '{print $2}' | awk -F ] '{print $1}' diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index 704438d..2a9ea66 100644 --- a/utests/CMakeLists.txt +++ b/utests/CMakeLists.txt @@ -180,10 +180,15 @@ set (utests_sources utest_file_map.cpp utest_helper.cpp) +set(GEN_PCI_ID) +execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh +OUTPUT_VARIABLE GEN_PCI_ID) + +message(STATUS Platform Gen PCI id is ${GEN_PCI_ID}) SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil) ADD_CUSTOM_COMMAND( OUTPUT ${kernel_bin}.bin -COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin +COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin -t${GEN_PCI_ID} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl ) ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 1/8] HSW: align buffer's size to DWORD.
It seems a bit random. You extend the sz, which may have some overlap with others', especially in multi-thread multi-buffer cases. I found it once but really cannot reproduce it now. I think it is OK for now, and we can fix this bug later if we find it. On Wed, 2014-05-14 at 07:26 +, Yang, Rong R wrote: Run the buffer test 50+ times, all pass. -Original Message- From: He Junyan [mailto:junyan...@inbox.com] Sent: Tuesday, May 13, 2014 3:16 PM To: Yang, Rong R Cc: beignet@lists.freedesktop.org Subject: Re: [Beignet] [PATCH 1/8] HSW: align buffer's size to DWORD. This patch will cause some regression in buffer tests. On Mon, 2014-05-12 at 23:11 +0800, Yang Rong wrote: HSW: Byte scattered Read/Write require that the buffer size must be a multiple of 4 bytes. So simply alignment all buffer size to 4. Pass utest compiler_function_constant0. Because it is very light work around, align it without not check device. Signed-off-by: Yang Rong rong.r.y...@intel.com --- src/cl_mem.c | 4 1 file changed, 4 insertions(+) diff --git a/src/cl_mem.c b/src/cl_mem.c index 44482f7..5feda74 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -334,6 +334,10 @@ cl_mem_new_buffer(cl_context ctx, goto error; } + /* HSW: Byte scattered Read/Write has limitation that + the buffer size must be a multiple of 4 bytes. */ sz = + ALIGN(sz, 4); + /* Create the buffer in video memory */ mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, err); if (mem == NULL || err != CL_SUCCESS) ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 1/3] [opencl-1.2] Add checks for clCreateImage and add 1d image creating logic
Sorry, this patch set is for Opencl-1.2 branch On Thu, 2014-05-15 at 16:43 +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Add more check for Image creating according to the spec. Update the according image utest cases to pass it. The 1d image creating is also be added. Signed-off-by: Junyan He junyan...@linux.intel.com --- src/cl_api.c | 36 src/cl_mem.c | 24 ++-- utests/compiler_copy_image.cpp| 4 utests/compiler_copy_image1.cpp | 4 utests/compiler_copy_image_3d.cpp | 3 +++ utests/compiler_fill_image.cpp| 4 utests/compiler_fill_image0.cpp | 4 7 files changed, 73 insertions(+), 6 deletions(-) diff --git a/src/cl_api.c b/src/cl_api.c index 9c22819..b26936e 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -506,7 +506,43 @@ clCreateImage(cl_context context, cl_mem mem = NULL; cl_int err = CL_SUCCESS; CHECK_CONTEXT (context); + if (image_format == NULL) { +err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; +goto error; + } + if (image_format-image_channel_order CL_R || + image_format-image_channel_order CL_RGBx) { +err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; +goto error; + } + if (image_format-image_channel_data_type CL_SNORM_INT8 || + image_format-image_channel_data_type CL_FLOAT) { +err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; +goto error; + } + + if (image_desc == NULL) { +err = CL_INVALID_IMAGE_DESCRIPTOR; +goto error; + } + if (image_desc-image_type = CL_MEM_OBJECT_BUFFER || + image_desc-image_type CL_MEM_OBJECT_IMAGE1D_BUFFER) { +err = CL_INVALID_IMAGE_DESCRIPTOR; +goto error; + } + /* buffer refers to a valid buffer memory object if image_type is + CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */ + if (image_desc-image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER + image_desc-buffer) { +err = CL_INVALID_IMAGE_DESCRIPTOR; +goto error; + } + if (image_desc-num_mip_levels || image_desc-num_samples) { +err = CL_INVALID_IMAGE_DESCRIPTOR; +goto error; + } + /* Other details check for image_desc will leave to image create. 
*/ mem = cl_mem_new_image(context, flags, image_format, diff --git a/src/cl_mem.c b/src/cl_mem.c index 3f1b389..0250f0a 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -544,10 +544,22 @@ _cl_mem_new_image(cl_context ctx, err = CL_INVALID_IMAGE_SIZE; \ goto error; \ } while (0); + if (UNLIKELY(w == 0)) DO_IMAGE_ERROR; - if (UNLIKELY(h == 0)) DO_IMAGE_ERROR; + if (UNLIKELY(h == 0 image_type != CL_MEM_OBJECT_IMAGE1D)) DO_IMAGE_ERROR; - if (image_type == CL_MEM_OBJECT_IMAGE2D) { + if (image_type == CL_MEM_OBJECT_IMAGE1D) { +size_t min_pitch = bpp * w; +if (data pitch == 0) + pitch = min_pitch; + +depth = 1; +h = 1; +if (UNLIKELY(w ctx-device-image2d_max_width)) DO_IMAGE_ERROR; +if (UNLIKELY(data min_pitch pitch)) DO_IMAGE_ERROR; +if (UNLIKELY(!data pitch != 0)) DO_IMAGE_ERROR; +tiling = CL_NO_TILE; + } else if (image_type == CL_MEM_OBJECT_IMAGE2D) { size_t min_pitch = bpp * w; if (data pitch == 0) pitch = min_pitch; @@ -560,9 +572,7 @@ _cl_mem_new_image(cl_context ctx, if (cl_driver_get_ver(ctx-drv) != 6) tiling = CL_TILE_Y; depth = 1; - } - - if (image_type == CL_MEM_OBJECT_IMAGE3D) { + } else if (image_type == CL_MEM_OBJECT_IMAGE3D) { size_t min_pitch = bpp * w; if (data pitch == 0) pitch = min_pitch; @@ -580,7 +590,9 @@ _cl_mem_new_image(cl_context ctx, /* Pick up tiling mode (we do only linear on SNB) */ if (cl_driver_get_ver(ctx-drv) != 6) tiling = CL_TILE_Y; - } + } else +assert(0); + #undef DO_IMAGE_ERROR /* Tiling requires to align both pitch and height */ diff --git a/utests/compiler_copy_image.cpp b/utests/compiler_copy_image.cpp index 04c9544..dac8d50 100644 --- a/utests/compiler_copy_image.cpp +++ b/utests/compiler_copy_image.cpp @@ -1,3 +1,4 @@ +#include string.h #include utest_helper.hpp static void compiler_copy_image(void) @@ -8,6 +9,9 @@ static void compiler_copy_image(void) cl_image_desc desc; cl_sampler sampler; + memset(desc, 0x0, sizeof(cl_image_desc)); + memset(format, 0x0, sizeof(cl_image_format)); + // Setup kernel and images 
OCL_CREATE_KERNEL(test_copy_image); buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h); diff --git a/utests/compiler_copy_image1.cpp b/utests/compiler_copy_image1.cpp index a9ef3f4..fe52dbf 100644 --- a/utests/compiler_copy_image1.cpp +++ b/utests/compiler_copy_image1.cpp @@ -1,3 +1,4 @@ +#include string.h #include utest_helper.hpp static void compiler_copy_image1
Re: [Beignet] [PATCH 1/8] HSW: align buffer's size to DWORD.
This patch will cause some regressions in the buffer tests. On Mon, 2014-05-12 at 23:11 +0800, Yang Rong wrote: HSW: Byte scattered Read/Write require that the buffer size must be a multiple of 4 bytes. So simply alignment all buffer size to 4. Pass utest compiler_function_constant0. Because it is very light work around, align it without not check device. Signed-off-by: Yang Rong rong.r.y...@intel.com --- src/cl_mem.c | 4 1 file changed, 4 insertions(+) diff --git a/src/cl_mem.c b/src/cl_mem.c index 44482f7..5feda74 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -334,6 +334,10 @@ cl_mem_new_buffer(cl_context ctx, goto error; } + /* HSW: Byte scattered Read/Write has limitation that + the buffer size must be a multiple of 4 bytes. */ + sz = ALIGN(sz, 4); + /* Create the buffer in video memory */ mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, err); if (mem == NULL || err != CL_SUCCESS) ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 2/2] move enqueue_copy_image kernels outside of runtime code.
2/2 is OK On Mon, 2014-05-12 at 12:41 +0800, xionghu@intel.com wrote: From: Luo xionghu@intel.com seperate the kernel code from host code to make it clean; build the kernels offline by gbe_bin_generator to improve the performance. --- src/CMakeLists.txt | 23 ++- src/cl_context.h | 24 ++- src/cl_gt_device.h | 23 ++- src/cl_mem.c | 214 ++--- src/kernels/cl_internal_copy_buf_align1.cl | 8 - src/kernels/cl_internal_copy_buf_align16.cl| 2 +- src/kernels/cl_internal_copy_buf_align4.cl | 2 +- src/kernels/cl_internal_copy_buf_rect.cl | 15 ++ .../cl_internal_copy_buf_unalign_dst_offset.cl | 2 +- .../cl_internal_copy_buf_unalign_same_offset.cl| 2 +- .../cl_internal_copy_buf_unalign_src_offset.cl | 2 +- src/kernels/cl_internal_copy_buffer_to_image_2d.cl | 18 ++ src/kernels/cl_internal_copy_buffer_to_image_3d.cl | 19 ++ src/kernels/cl_internal_copy_image_2d_to_2d.cl | 21 ++ src/kernels/cl_internal_copy_image_2d_to_3d.cl | 22 +++ src/kernels/cl_internal_copy_image_2d_to_buffer.cl | 19 ++ src/kernels/cl_internal_copy_image_3d_to_2d.cl | 22 +++ src/kernels/cl_internal_copy_image_3d_to_3d.cl | 23 +++ src/kernels/cl_internal_copy_image_3d_to_buffer.cl | 22 +++ 19 files changed, 308 insertions(+), 175 deletions(-) delete mode 100644 src/kernels/cl_internal_copy_buf_align1.cl create mode 100644 src/kernels/cl_internal_copy_buf_rect.cl create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d.cl create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_3d.cl create mode 100644 src/kernels/cl_internal_copy_image_2d_to_2d.cl create mode 100644 src/kernels/cl_internal_copy_image_2d_to_3d.cl create mode 100644 src/kernels/cl_internal_copy_image_2d_to_buffer.cl create mode 100644 src/kernels/cl_internal_copy_image_3d_to_2d.cl create mode 100644 src/kernels/cl_internal_copy_image_3d_to_3d.cl create mode 100644 src/kernels/cl_internal_copy_image_3d_to_buffer.cl diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8164a44..ecc04ab 100644 --- a/src/CMakeLists.txt +++ 
b/src/CMakeLists.txt @@ -17,11 +17,30 @@ foreach (KF ${KERNEL_FILES}) endforeach (KF) endmacro (MakeKernelBinStr) +macro (MakeBuiltInKernelStr KERNEL_PATH KERNEL_FILES) + set (output_file ${KERNEL_PATH}/${BUILT_IN_NAME}.cl) + set (file_content) + file (REMOVE ${output_file}) + foreach (KF ${KERNEL_NAMES}) +set (input_file ${KERNEL_PATH}/${KF}.cl) +file(READ ${input_file} file_content ) +STRING(REGEX REPLACE ; ; file_content ${file_content}) +file(APPEND ${output_file} ${file_content}) + endforeach (KF) +endmacro (MakeBuiltInKernelStr) + set (KERNEL_STR_FILES) -set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4 +set (KERNEL_NAMES cl_internal_copy_buf_align4 cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset -cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset) +cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset +cl_internal_copy_buf_rect cl_internal_copy_image_2d_to_2d cl_internal_copy_image_3d_to_2d +cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d +cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer +cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d) +set (BUILT_IN_NAME cl_internal_built_in_kernel) +MakeBuiltInKernelStr (${CMAKE_CURRENT_SOURCE_DIR}/kernels/ ${KERNEL_NAMES}) MakeKernelBinStr (${CMAKE_CURRENT_SOURCE_DIR}/kernels/ ${KERNEL_NAMES}) +MakeKernelBinStr (${CMAKE_CURRENT_SOURCE_DIR}/kernels/ ${BUILT_IN_NAME}) set(OPENCL_SRC ${KERNEL_STR_FILES} diff --git a/src/cl_context.h b/src/cl_context.h index 782a9af..24281be 100644 --- a/src/cl_context.h +++ b/src/cl_context.h @@ -46,14 +46,22 @@ enum _cl_internal_ker_type { CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET, CL_ENQUEUE_COPY_BUFFER_RECT, - CL_ENQUEUE_COPY_IMAGE_0, //copy image 2d to image 2d - CL_ENQUEUE_COPY_IMAGE_1, //copy image 3d to image 2d - CL_ENQUEUE_COPY_IMAGE_2, //copy image 2d to image 3d - 
CL_ENQUEUE_COPY_IMAGE_3, //copy image 3d to image 3d - CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0, //copy image 2d to buffer - CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1, //copy image 3d tobuffer - CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0, //copy buffer to image 2d - CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1, //copy buffer to image 3d + CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, //copy image 2d to image 2d + CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, //copy image 3d to image 2d + CL_ENQUEUE_COPY_IMAGE_2D_TO_3D, //copy image 2d to
Re: [Beignet] [V2 PATCH 1/6] Update the device info description for HSW
OK, the name has been modified and a new patch has been sent. On Thu, 2014-05-08 at 07:43 -0400, Jesper Pedersen wrote: Hi, On 05/07/2014 06:02 AM, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Split the cl_device_id description for HSW into GT1, GT2 and GT3, with different parameters. Signed-off-by: Junyan He junyan...@linux.intel.com --- src/cl_device_id.c | 135 +++-- 1 file changed, 90 insertions(+), 45 deletions(-) #define DECL_INFO_STRING(BREAK, STRUCT, FIELD, STRING) \ STRUCT.FIELD = STRING; \ STRUCT.JOIN(FIELD,_sz) = sizeof(STRING); \ +hsw_device = STRUCT; \ goto BREAK; Can't this be moved to the actual Haswell block? It doesn't really make sense to assign it for all non-Haswell devices. has_break: - intel_hsw_device.vendor_id = device_id; - intel_hsw_device.platform = intel_platform; - ret = intel_hsw_device; + hsw_device-vendor_id = device_id; + hsw_device-platform = intel_platform; + ret = hsw_device; break; E.g. down here. Best regards, Jesper ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH 2/3] [opencl-1.2] Implement the clEnqueueFillBuffer API.
On Tue, 2014-04-29 at 13:57 +0800, Zhigang Gong wrote: Some minor comments as below: On Wed, Apr 23, 2014 at 04:35:25PM +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com We use the floatn's assigment to do the copy. 128 pattern size is according to double16, and because the double problem on our platform, we use to float16 to handle this. unaligned cases is not optimized now, just use the char assigment. Signed-off-by: Junyan He junyan...@linux.intel.com --- src/cl_api.c | 78 src/cl_context.c | 133 ++- src/cl_context.h | 8 src/cl_enqueue.c | 1 + src/cl_enqueue.h | 1 + src/cl_event.c | 1 + src/cl_mem.c | 102 ++ src/cl_mem.h | 3 ++ 8 files changed, 295 insertions(+), 32 deletions(-) diff --git a/src/cl_api.c b/src/cl_api.c index 1543ff4..be94bcb 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -1592,6 +1592,84 @@ error: } cl_int +clEnqueueFillBuffer(cl_command_queue command_queue, +cl_mem buffer, +const void * pattern, +size_t pattern_size, +size_t offset, +size_t size, +cl_uintnum_events_in_wait_list, +const cl_event * event_wait_list, +cl_event * event) +{ + cl_int err = CL_SUCCESS; + enqueue_data *data, no_wait_data = { 0 }; + static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128}; + int i = 0; + + CHECK_QUEUE(command_queue); + CHECK_MEM(buffer); + + if (command_queue-ctx != buffer-ctx) { +err = CL_INVALID_CONTEXT; +goto error; + } + + if (offset 0 || offset + size buffer-size) { +err = CL_INVALID_VALUE; +goto error; + } + + if (pattern == NULL) { +err = CL_INVALID_VALUE; +goto error; + } + + for (i = 0; i sizeof(valid_sz)/sizeof(size_t); i++) { coding style issue, we'd better to use sizeof(valid_sz) / sizeof(size_t) rather than the above compact style. I noticed you mixed two styles in the same patch, please fix it in the new version. OK, that needs to be refined. 
+if (valid_sz[i] == pattern_size) + break; + } + if (i == sizeof(valid_sz)/sizeof(size_t)) { +err = CL_INVALID_VALUE; +goto error; + } + + if (offset%pattern_size || size%pattern_size) { +err = CL_INVALID_VALUE; +goto error; + } + + err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size); + if (err) { +goto error; + } + + TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer-ctx); + + data = no_wait_data; + data-type = EnqueueFillBuffer; + data-queue = command_queue; + + if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, + event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { +if (event (*event)-type != CL_COMMAND_USER + (*event)-queue-props CL_QUEUE_PROFILING_ENABLE) { + cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); +} + +err = cl_command_queue_flush(command_queue); + } + + if(b_output_kernel_perf) +time_end(command_queue-ctx, beignet internal kernel : cl_fill_buffer, command_queue); + + return 0; + + error: + return err; +} + +cl_int clEnqueueCopyBuffer(cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, diff --git a/src/cl_context.c b/src/cl_context.c index 8190e6a..e2dba65 100644 --- a/src/cl_context.c +++ b/src/cl_context.c @@ -1,4 +1,4 @@ -/* +/* * Copyright © 2012 Intel Corporation * * This library is free software; you can redistribute it and/or @@ -188,6 +188,7 @@ error: LOCAL void cl_context_delete(cl_context ctx) { + int i = 0; if (UNLIKELY(ctx == NULL)) return; @@ -195,6 +196,18 @@ cl_context_delete(cl_context ctx) if (atomic_dec(ctx-ref_n) 1) return; + /* delete the internal programs. */ + for (i = CL_ENQUEUE_COPY_BUFFER_ALIGN4; i CL_INTERNAL_KERNEL_MAX; i++) { Use i = 0 here may be better or define CL_INTERNAL_KERNEL_MIN to 0 and use that macro instead of using a specific enum number. 
Because CL_ENQUEUE_COPY_BUFFER_ALIGN4 is not the first one and we only handle these 4 cases, it is hard to start from 0 or CL_INTERNAL_KERNEL_MIN. +if (ctx->internel_kernels[i]) { + cl_kernel_delete(ctx->internel_kernels[i]); + ctx->internel_kernels[i] = NULL; + + assert(ctx->internal_prgs[i]); + cl_program_delete(ctx
Re: [Beignet] [PATCH 3/3] GBE: work around baytrail-t hang issue.
The whole patch set is OK. I use the same manner to recode the gen version for HSW enabling. PCI device ID seems more precise and is useful for handling corner cases even within the same gen version. I will rebase to your patch. On Thu, 2014-04-17 at 15:06 +0800, Chuanbo Weng wrote: From: Zhigang Gong zhigang.g...@linux.intel.com There is an unknown issue with the baytrail-t platform. It will hang at utest's compiler_global_constant case. After some investigation, it turns out to be related to the DWORD GATHER READ send message on the constant cache data port. Changing to use the data cache data port works around that hang issue. Now we only fail one more case on baytrail-t compared to the IVB desktop platform, which is: profiling_exec()[FAILED] Error: Too large time from submit to start That may be caused by a kernel-related issue, and that bug will not cause serious issues for normal kernels. So after this patch, the baytrail-t platform should be in pretty good shape with beignet. Signed-off-by: Zhigang Gong zhigang.g...@linux.intel.com --- backend/src/backend/gen_encoder.cpp | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index c991661..1d1b5df 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -206,7 +206,11 @@ namespace gbe uint32_t msg_length, uint32_t response_length) { -const GenMessageTarget sfid = GEN6_SFID_DATAPORT_CONSTANT_CACHE; +// FIXME there is a unknown issue with baytrail-t platform, the DWORD scatter +// message causes a hang at unit test case compiler_global_constant. +// We workaround it to use DATA CACHE instead. +const GenMessageTarget sfid = (p->deviceID == PCI_CHIP_BAYTRAIL_T) ?
+ GEN_SFID_DATAPORT_DATA_CACHE : GEN6_SFID_DATAPORT_CONSTANT_CACHE; setMessageDescriptor(p, insn, sfid, msg_length, response_length); insn->bits3.gen7_dword_rw.msg_type = msg_type; insn->bits3.gen7_dword_rw.bti = bti; ___ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to thread specific context
So can I understand it like this: TLS (thread-local storage) is a global section mapped into each thread's space, and each thread keeps its own copy of that section, whereas thread_specific data is on the heap and uses sync functions to manage the resource for each thread? On Fri, 2013-11-08 at 02:58 +, Zou, Nanhai wrote: TLS (thread-local storage) is useful for converting legacy thread-unsafe programs into thread-safe ones. E.g. errno in glibc. But for this case, I think explicitly separating the thread-specific data is better, not only for thread safety but also for later optimization. This helps us to collect all data that will be modified during NDRange. Thanks Zou Nanhai -----Original Message----- From: beignet-boun...@lists.freedesktop.org [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Song, Ruiling Sent: Friday, November 08, 2013 10:38 AM To: Zhigang Gong; junyan...@inbox.com Cc: Junyan He; beignet@lists.freedesktop.org Subject: Re: [Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to thread specific context I am really new to the keyword __thread, and had a quick look at the docs on the web: http://gcc.gnu.org/onlinedocs/gcc-3.3.1/gcc/Thread-Local.html#Thread-Local it says: The __thread specifier may be applied to any global, file-scoped static, function-scoped static, or static data member of a class. It may not be applied to block-scoped automatic or non-static data member. From my understanding, this is not proper for our case. Thanks! Ruiling -----Original Message----- From: beignet-boun...@lists.freedesktop.org [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Zhigang Gong Sent: Friday, November 08, 2013 8:29 AM To: junyan...@inbox.com Cc: Junyan He; beignet@lists.freedesktop.org Subject: Re: [Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to thread specific context I agree with you that using thread data is better than locking. One comment: how about using thread-local storage to simplify this patch, as below: struct _cl_command_queue { ... 
__thread cl_gpgpu gpgpu; ... }; Then in the initialization stage, set it to NULL; queue-gpgpu = NULL; In the head of each functions which use queue-gpgpu, add the following code: if (queue-gpgpu == NULL) TRY_ALLOC_NO_ERR (queue-gpgpu, cl_gpgpu_new(ctx-drv)); Then we don't need to change any other code? What's your opinion? On Fri, Nov 08, 2013 at 12:58:00AM +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com We find some cases will use multi-threads to run on the same queue, executing the same kernel. This will cause the gpgpu struct which is very important for GPU context setting be destroyed because we do not implement any sync protect on it now. Move the gpgpu struct into thread specific space will fix this problem because the lib_drm will do the GPU command serialization for us. --- src/CMakeLists.txt |1 + src/cl_command_queue.c | 27 +++- src/cl_command_queue.h |9 +- src/cl_command_queue_gen7.c |7 +++-- src/cl_event.c |6 ++-- src/cl_thread.c | 72 +++ src/cl_thread.h | 34 utests/CMakeLists.txt |2 +- 8 files changed, 144 insertions(+), 14 deletions(-) create mode 100644 src/cl_thread.c create mode 100644 src/cl_thread.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1e28c6c..59d330e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -39,6 +39,7 @@ set(OPENCL_SRC cl_command_queue.c cl_command_queue.h cl_command_queue_gen7.c +cl_thread.c cl_driver.h cl_driver.cpp cl_driver_defs.c diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 3f9d95c..3530976 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -24,6 +24,7 @@ #include cl_device_id.h #include cl_mem.h #include cl_utils.h +#include cl_thread.h #include cl_alloc.h #include cl_driver.h #include cl_khr_icd.h @@ -43,7 +44,9 @@ cl_command_queue_new(cl_context ctx) queue-magic = CL_MAGIC_QUEUE_HEADER; queue-ref_n = 1; queue-ctx = ctx; - TRY_ALLOC_NO_ERR (queue-gpgpu, cl_gpgpu_new(ctx-drv)); + if ((queue-thread_data = cl_thread_data_create()) == 
NULL) { +goto error; + } /* Append the command queue in the list */ pthread_mutex_lock(ctx-queue_lock); @@ -84,9 +87,11 @@ cl_command_queue_delete(cl_command_queue queue) cl_mem_delete(queue-fulsim_out); queue-fulsim_out = NULL; } + + cl_thread_data_destroy(queue-thread_data); + queue-thread_data = NULL; cl_mem_delete(queue-perf); cl_context_delete(queue-ctx); - cl_gpgpu_delete(queue-gpgpu); cl_free(queue-wait_events); queue-magic = CL_MAGIC_DEAD_HEADER; /* For safety
Re: [Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to thread specific context
I have tried the __thread extension of GCC as you suggested. It seems OK for a global variable but not workable for a struct field. I think struct memory may be allocated from the heap, may be on the stack, or may be a global variable, so it may be impossible for the compiler to figure out how to store one of its fields in thread-specific space. As you said, there really may be a problem if one thread configures all the queue context and then creates another thread to execute NDRange. But in the current code, the GPU state is always initialized every time exec NDRange is called. On Fri, 2013-11-08 at 08:29 +0800, Zhigang Gong wrote: I agree with you that using thread data is better than locking. One comment: how about using thread-local storage to simplify this patch, as below: struct _cl_command_queue { ... __thread cl_gpgpu gpgpu; ... }; Then in the initialization stage, set it to NULL: queue->gpgpu = NULL; At the head of each function which uses queue->gpgpu, add the following code: if (queue->gpgpu == NULL) TRY_ALLOC_NO_ERR (queue->gpgpu, cl_gpgpu_new(ctx->drv)); Then we don't need to change any other code? What's your opinion? On Fri, Nov 08, 2013 at 12:58:00AM +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com We find that some cases use multiple threads to run on the same queue, executing the same kernel. This causes the gpgpu struct, which is very important for GPU context setting, to be destroyed, because we do not implement any sync protection on it now. Moving the gpgpu struct into thread-specific space will fix this problem because lib_drm will do the GPU command serialization for us. 
--- src/CMakeLists.txt |1 + src/cl_command_queue.c | 27 +++- src/cl_command_queue.h |9 +- src/cl_command_queue_gen7.c |7 +++-- src/cl_event.c |6 ++-- src/cl_thread.c | 72 +++ src/cl_thread.h | 34 utests/CMakeLists.txt |2 +- 8 files changed, 144 insertions(+), 14 deletions(-) create mode 100644 src/cl_thread.c create mode 100644 src/cl_thread.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1e28c6c..59d330e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -39,6 +39,7 @@ set(OPENCL_SRC cl_command_queue.c cl_command_queue.h cl_command_queue_gen7.c +cl_thread.c cl_driver.h cl_driver.cpp cl_driver_defs.c diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 3f9d95c..3530976 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -24,6 +24,7 @@ #include cl_device_id.h #include cl_mem.h #include cl_utils.h +#include cl_thread.h #include cl_alloc.h #include cl_driver.h #include cl_khr_icd.h @@ -43,7 +44,9 @@ cl_command_queue_new(cl_context ctx) queue-magic = CL_MAGIC_QUEUE_HEADER; queue-ref_n = 1; queue-ctx = ctx; - TRY_ALLOC_NO_ERR (queue-gpgpu, cl_gpgpu_new(ctx-drv)); + if ((queue-thread_data = cl_thread_data_create()) == NULL) { +goto error; + } /* Append the command queue in the list */ pthread_mutex_lock(ctx-queue_lock); @@ -84,9 +87,11 @@ cl_command_queue_delete(cl_command_queue queue) cl_mem_delete(queue-fulsim_out); queue-fulsim_out = NULL; } + + cl_thread_data_destroy(queue-thread_data); + queue-thread_data = NULL; cl_mem_delete(queue-perf); cl_context_delete(queue-ctx); - cl_gpgpu_delete(queue-gpgpu); cl_free(queue-wait_events); queue-magic = CL_MAGIC_DEAD_HEADER; /* For safety */ cl_free(queue); @@ -119,13 +124,15 @@ LOCAL cl_int cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k) { uint32_t i; + GET_QUEUE_THREAD_GPGPU(queue); + for (i = 0; i k-image_sz; i++) { int id = k-images[i].arg_idx; struct _cl_mem_image *image; assert(gbe_kernel_get_arg_type(k-opaque, id) == GBE_ARG_IMAGE); image = 
cl_mem_image(k-args[id].mem); set_image_info(k-curbe, k-images[i], image); -cl_gpgpu_bind_image(queue-gpgpu, k-images[i].idx, image-base.bo, image-offset, +cl_gpgpu_bind_image(gpgpu, k-images[i].idx, image-base.bo, image-offset, image-intel_fmt, image-image_type, image-w, image-h, image-depth, image-row_pitch, image-tiling); @@ -136,6 +143,8 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k) LOCAL cl_int cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k) { + GET_QUEUE_THREAD_GPGPU(queue); + /* Bind all user buffers (given by clSetKernelArg) */ uint32_t i; enum gbe_arg_type arg_type; /* kind of argument */ @@ -147,9 +156,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k) offset
Re: [Beignet] [REFINED PATCH 4/4] Add a test case for binary load.
Fix the problem with out-of-source builds. On Thu, 2013-09-12 at 14:06 +0800, junyan...@inbox.com wrote: From: Junyan He junyan...@linux.intel.com Signed-off-by: Junyan He junyan...@linux.intel.com --- utests/CMakeLists.txt| 13 +++ utests/load_program_from_bin.cpp | 77 ++ utests/utest_helper.cpp |8 ++-- utests/utest_helper.hpp |3 ++ 4 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 utests/load_program_from_bin.cpp diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index ffabc39..06188a6 100644 --- a/utests/CMakeLists.txt +++ b/utests/CMakeLists.txt @@ -141,11 +141,22 @@ set (utests_sources compiler_long_mult.cpp compiler_long_cmp.cpp compiler_bool_cross_basic_block.cpp + load_program_from_bin.cpp utest_assert.cpp utest.cpp utest_file_map.cpp utest_helper.cpp) +SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil) +ADD_CUSTOM_COMMAND( +OUTPUT ${kernel_bin}.bin +COMMAND ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl -o${kernel_bin}.bin +DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl +) + +ADD_CUSTOM_TARGET(kernel_bin.bin +DEPENDS ${kernel_bin}.bin) + if (EGL_FOUND AND MESA_SOURCE_FOUND) SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp) SET(CMAKE_CXX_FLAGS -DHAS_EGL ${CMAKE_CXX_FLAGS}) @@ -158,7 +169,9 @@ TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) ADD_EXECUTABLE(utest_run utest_run.cpp) TARGET_LINK_LIBRARIES(utest_run utests) +ADD_DEPENDENCIES (utest_run kernel_bin.bin) ADD_EXECUTABLE(flat_address_space runtime_flat_address_space.cpp) TARGET_LINK_LIBRARIES(flat_address_space utests) + diff --git a/utests/load_program_from_bin.cpp b/utests/load_program_from_bin.cpp new file mode 100644 index 000..d45c2bd --- /dev/null +++ b/utests/load_program_from_bin.cpp @@ -0,0 +1,77 @@ +#include "utest_helper.hpp" +#include "utest_file_map.hpp" +#include <cmath> +#include <algorithm> + +using namespace std; + +static void cpu(int 
global_id, float *src, float *dst) { +dst[global_id] = ceilf(src[global_id]); +} + +static void test_load_program_from_bin(void) +{ +const size_t n = 16; +float cpu_dst[16], cpu_src[16]; +cl_int status; +cl_int binary_status; +char *ker_path = NULL; + +cl_file_map_t *fm = cl_file_map_new(); +ker_path = cl_do_kiss_path(compiler_ceil.bin, device); +OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS); + +const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm); +const size_t sz = cl_file_map_size(fm); + +program = clCreateProgramWithBinary(ctx, 1, + device, sz, src, binary_status, status); + +OCL_ASSERT(program status == CL_SUCCESS); + +/* OCL requires to build the program even if it is created from a binary */ +OCL_ASSERT(clBuildProgram(program, 1, device, NULL, NULL, NULL) == CL_SUCCESS); + +kernel = clCreateKernel(program, compiler_ceil, status); +OCL_ASSERT(status == CL_SUCCESS); + +OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL); +OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL); +OCL_SET_ARG(0, sizeof(cl_mem), buf[0]); +OCL_SET_ARG(1, sizeof(cl_mem), buf[1]); +globals[0] = 16; +locals[0] = 16; + +// Run random tests +for (uint32_t pass = 0; pass 8; ++pass) { +OCL_MAP_BUFFER(0); +for (int32_t i = 0; i (int32_t) n; ++i) +cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() 15) - .75f; +OCL_UNMAP_BUFFER(0); + +// Run the kernel on GPU +OCL_NDRANGE(1); + +// Run on CPU +for (int32_t i = 0; i (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst); + +// Compare +OCL_MAP_BUFFER(1); + +#if 0 +printf( GPU:\n); +for (int32_t i = 0; i (int32_t) n; ++i) +printf( %f, ((float *)buf_data[1])[i]); +printf(\n CPU:\n); +for (int32_t i = 0; i (int32_t) n; ++i) +printf( %f, cpu_dst[i]); +printf(\n); +#endif + +for (int32_t i = 0; i (int32_t) n; ++i) +OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]); +OCL_UNMAP_BUFFER(1); +} +} + +MAKE_UTEST_FROM_FUNCTION(test_load_program_from_bin); diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp 
index b4f61df..8089799 100644 --- a/utests/utest_helper.cpp +++ b/utests/utest_helper.cpp @@ -205,8 +205,8 @@ clpanic(const char *msg, int rval) exit(-1); } -static char* -do_kiss_path(const char *file, cl_device_id device) +char* +cl_do_kiss_path(const char *file, cl_device_id device) { cl_int ver; const char *sub_path = NULL; @@ -239,7 +239,7 @@ cl_kernel_init