Re: [Beignet] [PATCH 1/3] Runtime: fix a recurrent release context error.

2017-07-09 Thread He Junyan
This patchset is OK for me.

On Tue, Jun 20, 2017 at 07:07:45PM +0800, Yang Rong wrote:
> Date: Tue, 20 Jun 2017 19:07:45 +0800
> From: Yang Rong 
> To: beignet@lists.freedesktop.org
> Cc: Yang Rong 
> Subject: [Beignet] [PATCH 1/3] Runtime: fix a recurrent release context
>  error.
> X-Mailer: git-send-email 2.1.4
> 
> Before releasing internal resources, we must set their pointers to NULL; otherwise,
> deleting these resources will call release context again.
> The ctx->built_in_prgs should be released by the application.
> 
> Signed-off-by: Yang Rong 
> ---
>  src/cl_context.c | 18 --
>  1 file changed, 8 insertions(+), 10 deletions(-)
> 
> diff --git a/src/cl_context.c b/src/cl_context.c
> index c5f3678..f3dd421 100644
> --- a/src/cl_context.c
> +++ b/src/cl_context.c
> @@ -366,9 +366,6 @@ cl_context_delete(cl_context ctx)
>++internal_ctx_refs;
>}
>  
> -  if (ctx->built_in_prgs)
> -++internal_ctx_refs;
> -
>if (ctx->image_queue)
>  ++internal_ctx_refs;
>  
> @@ -382,30 +379,31 @@ cl_context_delete(cl_context ctx)
>CL_OBJECT_INC_REF(ctx);
>  
>if (ctx->image_queue) {
> -clReleaseCommandQueue(ctx->image_queue);
> +cl_command_queue q = ctx->image_queue;
>  ctx->image_queue = NULL;
> +clReleaseCommandQueue(q);
>}
>  
>/* delete the internal programs. */
>for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
>  if (ctx->internal_kernels[i]) {
> -  cl_kernel_delete(ctx->internal_kernels[i]);
> +  cl_kernel k = ctx->internal_kernels[i];
>ctx->internal_kernels[i] = NULL;
> +  cl_kernel_delete(k);
>  
>assert(ctx->internal_prgs[i]);
> -  cl_program_delete(ctx->internal_prgs[i]);
> +  cl_program p = ctx->internal_prgs[i];
>ctx->internal_prgs[i] = NULL;
> +  cl_program_delete(p);
>  }
>  
>  if (ctx->built_in_kernels[i]) {
> -  cl_kernel_delete(ctx->built_in_kernels[i]);
> +  cl_kernel k = ctx->built_in_kernels[i];
>ctx->built_in_kernels[i] = NULL;
> +  cl_kernel_delete(k);
>  }
>}
>  
> -  cl_program_delete(ctx->built_in_prgs);
> -  ctx->built_in_prgs = NULL;
> -
>CL_OBJECT_DEC_REF(ctx);
>  
>cl_free(ctx->prop_user);
> -- 
> 2.1.4
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH newRT] Wrap all memory allocate functions.

2017-03-30 Thread He Junyan
I have already tried glibc's hsearch and tsearch.
tsearch uses a binary tree, but you can only have one search tree within one
program, which is unacceptable.
hsearch uses a hash table, but you can only insert elements and cannot
delete them. In our case, the pointer address is the key and there are
hundreds of thousands of them, so this is also unacceptable.


On Thu, Mar 30, 2017 at 07:35:20AM +, Yang, Rong R wrote:
> Date: Thu, 30 Mar 2017 07:35:20 +
> From: "Yang, Rong R" <rong.r.y...@intel.com>
> To: "junyan...@inbox.com" <junyan...@inbox.com>,
>  "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org>
> Cc: "He, Junyan" <junyan...@intel.com>
> Subject: Re: [Beignet] [PATCH newRT] Wrap all memory allocate functions.
> 
> Actually, you implement a hash table with insert/delete operations, does 
> linux has these apis?
> 
> > -Original Message-
> > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> > junyan...@inbox.com
> > Sent: Thursday, March 23, 2017 15:46
> > To: beignet@lists.freedesktop.org
> > Cc: He, Junyan <junyan...@intel.com>
> > Subject: [Beignet] [PATCH newRT] Wrap all memory allocate functions.
> > 
> > From: Junyan He <junyan...@intel.com>
> > 
> > We modify all memory allocated functions in cl_alloc file, make it
> > easy to debug all the memory leak point.
> > 
> > Signed-off-by: Junyan He <junyan...@intel.com>
> > ---
> >  src/cl_accelerator_intel.c |   4 +-
> >  src/cl_alloc.c | 197 
> > ++--
> > -
> >  src/cl_alloc.h |  43 +++--
> >  src/cl_api.c   |   3 +-
> >  src/cl_api_context.c   |   4 +-
> >  src/cl_api_kernel.c|  12 +--
> >  src/cl_command_queue.c |  12 +--
> >  src/cl_command_queue_enqueue.c |   6 +-
> >  src/cl_command_queue_gen7.c|   2 +-
> >  src/cl_context.c   |  14 +--
> >  src/cl_device_enqueue.c|   2 +-
> >  src/cl_enqueue.c   |   6 +-
> >  src/cl_event.c |  20 ++---
> >  src/cl_kernel.c|  30 +++
> >  src/cl_mem.c   |  28 +++---
> >  src/cl_program.c   |  54 +--
> >  src/cl_sampler.c   |   4 +-
> >  src/cl_utils.h |   3 -
> >  src/gen/cl_command_queue_gen.c |  12 +--
> >  src/gen/cl_kernel_gen.c|  28 +++---
> >  src/gen/cl_program_gen.c   |  12 +--
> >  src/intel/intel_batchbuffer.c  |   4 +-
> >  src/intel/intel_driver.c   |   8 +-
> >  src/intel/intel_gpgpu.c|  18 ++--
> >  src/x11/dricommon.c|   6 +-
> >  25 files changed, 342 insertions(+), 190 deletions(-)
> > 
> > diff --git a/src/cl_accelerator_intel.c b/src/cl_accelerator_intel.c
> > index ae08184..62700b2 100644
> > --- a/src/cl_accelerator_intel.c
> > +++ b/src/cl_accelerator_intel.c
> > @@ -18,7 +18,7 @@ cl_accelerator_intel_new(cl_context ctx,
> >cl_int err = CL_SUCCESS;
> > 
> >/* Allocate and inialize the structure itself */
> > -  TRY_ALLOC(accel, CALLOC(struct _cl_accelerator_intel));
> > +  TRY_ALLOC(accel, CL_CALLOC(1, sizeof(struct _cl_accelerator_intel)));
> >CL_OBJECT_INIT_BASE(accel, CL_OBJECT_ACCELERATOR_INTEL_MAGIC);
> > 
> >if (accel_type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL) {
> > @@ -81,5 +81,5 @@ cl_accelerator_intel_delete(cl_accelerator_intel accel)
> > 
> >cl_context_delete(accel->ctx);
> >CL_OBJECT_DESTROY_BASE(accel);
> > -  cl_free(accel);
> > +  CL_FREE(accel);
> >  }
> > diff --git a/src/cl_alloc.c b/src/cl_alloc.c
> > index e532569..b9ac853 100644
> > --- a/src/cl_alloc.c
> > +++ b/src/cl_alloc.c
> > @@ -1,4 +1,4 @@
> > -/*
> > +/*
> >   * Copyright © 2012 Intel Corporation
> >   *
> >   * This library is free software; you can redistribute it and/or
> > @@ -14,75 +14,204 @@
> >   * You should have received a copy of the GNU Lesser General Public
> >   * License along with this library. If not, see 
> > <http://www.gnu.org/licenses/>.
> >   *
> > - * Author: Benjamin Segovia <benjamin.sego...@intel.com>
> >   */
> > -
> >  #include "cl_alloc.h"
> >  #include "cl_utils.h"
> > -
> > +#include "cl_device_id.h"
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> > +#include 
> > +
> > +#ifdef CL_ALLO

Re: [Beignet] [PATCH] Typo in error message

2017-02-03 Thread He Junyan
Thanks for fixing it.

On Mon, Jan 30, 2017 at 03:18:09PM +0100, Giuseppe Bilotta wrote:
> Date: Mon, 30 Jan 2017 15:18:09 +0100
> From: Giuseppe Bilotta 
> To: Beignet ML 
> Cc: Giuseppe Bilotta 
> Subject: [Beignet] [PATCH] Typo in error message
> X-Mailer: git-send-email 2.11.0.745.g0978fb64a4
> 
> ---
>  src/cl_event.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/src/cl_event.c b/src/cl_event.c
> index 3e1dc224..a2b16be4 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -579,7 +579,7 @@ cl_event_exec(cl_event event, cl_int exec_to_status, 
> cl_bool ignore_depends)
>  
>  if (ret != CL_SUCCESS) {
>assert(ret < 0);
> -  DEBUGP(DL_WARNING, "Exec event %p error, type is %d, error staus is 
> %d",
> +  DEBUGP(DL_WARNING, "Exec event %p error, type is %d, error status is 
> %d",
>   event, event->event_type, ret);
>ret = cl_event_set_status(event, ret);
>assert(ret == CL_SUCCESS);
> -- 
> 2.11.0.745.g0978fb64a4
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Runtime: return CL_INVALID_EVENT_WAIT_LIST if not event in the wait list.

2016-12-28 Thread He Junyan
Thanks for catching that bug.

On Wed, Dec 28, 2016 at 06:47:01PM +0800, Yang Rong wrote:
> Date: Wed, 28 Dec 2016 18:47:01 +0800
> From: Yang Rong 
> To: beignet@lists.freedesktop.org
> Cc: Meng Mengmeng , Yang Rong
>  
> Subject: [Beignet] [PATCH] Runtime: return CL_INVALID_EVENT_WAIT_LIST if
>  not event in the wait list.
> X-Mailer: git-send-email 2.1.4
> 
> From: Meng Mengmeng 
> 
> Signed-off-by: Yang Rong 
> ---
>  src/cl_event.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/src/cl_event.c b/src/cl_event.c
> index 8173578..644a21f 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -546,7 +546,7 @@ cl_event_check_waitlist(cl_uint num_events_in_wait_list, 
> const cl_event *event_w
>  /* check the event and context */
>  for (i = 0; i < num_events_in_wait_list; i++) {
>if (!CL_OBJECT_IS_EVENT(event_wait_list[i])) {
> -err = CL_INVALID_EVENT;
> +err = CL_INVALID_EVENT_WAIT_LIST;
>  break;
>}
>  
> -- 
> 2.1.4
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Refine mem.h and improve the related macro defination.

2016-12-22 Thread He Junyan
According to my understanding, a PIPE is also a mem object.
clGetMemObjectInfo can be used to query all kinds of mem objects.
clGetPipeInfo just queries additional PIPE info.
According to the CL spec, querying CL_MEM_HOST_PTR returns
the host ptr if the image/buffer was created with CL_MEM_USE_HOST_PTR specified,
and otherwise returns NULL.
So, for a PIPE, I think it just returns NULL. Is that correct?


On Tue, Dec 20, 2016 at 09:05:41AM +0100, Simon Richter wrote:
> Date: Tue, 20 Dec 2016 09:05:41 +0100
> From: Simon Richter <simon.rich...@hogyros.de>
> To: beignet@lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH] Refine mem.h and improve the related macro
>  defination.
> 
> Hi,
> 
> On 20.12.2016 04:23, He Junyan wrote:
> 
> >>> +if (!CL_OBJECT_IS_BUFFER(memobj)) {
> 
> >> That would match pipes as well, is that intended?
> 
> > Already redefine CL_OBJECT_IS_BUFFER to just match buffer and subbuffer.
> 
> > +#define CL_OBJECT_IS_BUFFER(mem) ((mem &&  
> >\
> > +   ((cl_base_object)mem)->magic == 
> > CL_OBJECT_MEM_MAGIC && \
> > +   CL_OBJECT_GET_REF(mem) >= 1 &&  
> >\
> > +   ((cl_mem)mem)->type <= 
> > CL_MEM_SUBBUFFER_TYPE))
> 
> Exactly my point. A pipe object would not match CL_OBJECT_IS_BUFFER(),
> so the negated test would return true, and the code would look at
> memobj->host_ptr in a pipe object, which is wrong.
> 
>Simon
> 
> 




> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet



___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH V4] Add profiling feature based on new event implementation.

2016-12-22 Thread He Junyan
I notice this should be caused by the event timestamp recording sequence.
We submit the NDRange and then record the queued timestamp, which is
wrong. I have already sent another patch, "Improve event execute function",
to fix this. You can give it a try by applying that patch on top of this one.
Thanks.

On Thu, Dec 22, 2016 at 06:41:49AM +, Pan, Xiuli wrote:
> Date: Thu, 22 Dec 2016 06:41:49 +
> From: "Pan, Xiuli" <xiuli@intel.com>
> To: "junyan...@inbox.com" <junyan...@inbox.com>,
>  "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org>
> Subject: Re: [Beignet] [PATCH V4] Add profiling feature based on new event
>   implementation.
> 
> It seems still have bugs.
> Here are some logs I got, the gen timestamps is print in the function 
> cl_event_update_timestamp_gen, and the final result is print last as 
> timestamp.
> 
> gen timestamp[0] is d88bddb30
> gen timestamp[1] is d88bde2b0
> run for 8 times
> gen timestamp[2] is d88bddae0 // It is smaller than timestamp[0] we get some 
> negative value
> gen timestamp[3] is d8f002390
> timestamp[2] is ffaf
> timestamp[3] is 642485f
> gen timestamp[0] is d8f03fab0
> gen timestamp[1] is d8f0400f0
> run for 9 times
> gen timestamp[2] is d8f03fd30
> gen timestamp[3] is d954687d0
> timestamp[2] is 27f
> timestamp[3] is 6428d1f
> gen timestamp[0] is d954a9d20
> gen timestamp[1] is d954aa450
> run for 10 times
> gen timestamp[2] is d954a9d20 //It is the same as timestamp[0] we get -1
> gen timestamp[3] is d9b8df420
> timestamp[2] is 
> 
> 
> The overflow handler seems to have some problems.
> 
> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of 
> junyan...@inbox.com
> Sent: Monday, December 19, 2016 7:24 PM
> To: beignet@lists.freedesktop.org
> Subject: [Beignet] [PATCH V4] Add profiling feature based on new event 
> implementation.
> 
> From: Junyan He <junyan...@intel.com>
> 
> TODO:
> In opencl 2.0, a new profiling item called CL_PROFILING_COMMAND_COMPLETE is 
> imported. It means that we need to record the time stamp of all the child 
> events created by the "Kernel enqueing kernels" feature finish.
> This should be done after the "Kernel enqueing kernels" feature enabled.
> 
> V2:
> Update event time stamp before inserting to queue thread, avoid MT issue.
> 
> V3:
> Fixup overflow problem.
> 
> V4:
> Fixup overflow to 0xf problem.
> Just take ownership and release the event lock when calling the update timestamp
> function. The update timestamp function may make blocking system calls, so we should
> not hold the lock while calling it.
> 
> Signed-off-by: Junyan He <junyan...@intel.com>
> ---
>  src/cl_api.c   |  51 
>  src/cl_api_event.c |  41 +
>  src/cl_api_mem.c   |   9 +++
>  src/cl_base_object.c   |  29 ++---
>  src/cl_base_object.h   |  10 ++--
>  src/cl_command_queue_enqueue.c |   2 +
>  src/cl_driver.h|   4 +-
>  src/cl_enqueue.c   |   9 ---
>  src/cl_event.c | 132 
> -
>  src/cl_event.h |  10 ++--
>  src/intel/intel_gpgpu.c|  16 +++--
>  11 files changed, 195 insertions(+), 118 deletions(-)
> 
> diff --git a/src/cl_api.c b/src/cl_api.c index d7b5434..6a4f4ec 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -1312,57 +1312,6 @@ error:
>return err;
>  }
>  
> -
> -cl_int
> -clGetEventProfilingInfo(cl_event event,
> -cl_profiling_infoparam_name,
> -size_t   param_value_size,
> -void *   param_value,
> -size_t * param_value_size_ret)
> -{
> -  cl_int err = CL_SUCCESS;
> -  cl_ulong ret_val;
> -
> -  CHECK_EVENT(event);
> -  //cl_event_update_status(event, 0);
> -
> -  if (event->event_type == CL_COMMAND_USER ||
> -  !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
> -  event->status != CL_COMPLETE) {
> -err = CL_PROFILING_INFO_NOT_AVAILABLE;
> -goto error;
> -  }
> -
> -  if (param_value && param_value_size < sizeof(cl_ulong)) {
> -err = CL_INVALID_VALUE;
> -goto error;
> -  }
> -
> -  if (param_name == CL_PROFILING_COMMAND_QUEUED) {
> -ret_val = event->queued_timestamp;
> -  } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) {
> -ret_val= event->queued_timestamp + 
> cl_event_get_timestam

Re: [Beignet] [PATCH] Refine mem.h and improve the related macro defination.

2016-12-19 Thread He Junyan
On Mon, Dec 19, 2016 at 06:25:26PM +0100, Simon Richter wrote:
> Date: Mon, 19 Dec 2016 18:25:26 +0100
> From: Simon Richter 
> To: beignet@lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH] Refine mem.h and improve the related macro
>  defination.
> 
> Hi,
> 
> On 19.12.2016 10:21, junyan...@inbox.com wrote:
> 
> > --- a/src/cl_api_mem.c
> > +++ b/src/cl_api_mem.c
> > @@ -71,54 +71,54 @@ clGetMemObjectInfo(cl_mem memobj,
> >  break;
> >case CL_MEM_HOST_PTR: {
> >  ptr = 0;
> > -if (memobj->type == CL_MEM_IMAGE_TYPE) {
> > +if (!CL_OBJECT_IS_BUFFER(memobj)) {
> >ptr = (size_t)memobj->host_ptr;
> >  } else {
> 
> That would match pipes as well, is that intended?
> 
>Simon
> 
Already redefine CL_OBJECT_IS_BUFFER to just match buffer and subbuffer.

+#define CL_OBJECT_IS_BUFFER(mem) ((mem &&  
   \
+   ((cl_base_object)mem)->magic == 
CL_OBJECT_MEM_MAGIC && \
+   CL_OBJECT_GET_REF(mem) >= 1 &&  
   \
+   ((cl_mem)mem)->type <= 
CL_MEM_SUBBUFFER_TYPE))




> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet



___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 01/11] Runtime: Add CL base object for all cl objects.

2016-07-19 Thread He Junyan
The whole patch set should be pushed to dev/runtime first.
When it is stable, we can merge it back.

On Tue, Jul 19, 2016 at 07:25:47PM +0800, junyan...@inbox.com wrote:
> Date: Tue, 19 Jul 2016 19:25:47 +0800
> From: junyan...@inbox.com
> To: beignet@lists.freedesktop.org
> Subject: [Beignet] [PATCH 01/11] Runtime: Add CL base object for all cl
>  objects.
> X-Mailer: git-send-email 1.7.9.5
> 
> From: Junyan He <junyan...@intel.com>
> 
> The runtime code is a little verbose in CL object handle.
> Every CL objects should have a reference, a lock to protect itself
> and an ICD dispatcher. We can organize them to a struct and place
> it at the beginning of each CL object.
> This base object is also used to protect the CL objects MT safe.
> CL_OBJECT_LOCK/CL_OBJECT_UNLOCK macro will lock/unlock objects,
> but we should use them within one function call, and the critical
> region should be short.
> We add CL_OBJECT_TAKE_OWNERSHIP/CL_OBJECT_RELEASE_OWNERSHIP macro
> to own the object for a long time. CL_OBJECT_TAKE_OWNERSHIP will
> not hold the lock and so will not cause deadlock problems.
> For example, when we call NDRange on some memobj, we should take
> the ownership of the memobj. If another thread call NDRange on the
> same memobj, we should return some error like CL_OUT_OF_RESOURCE
> to users and protect the memobj from accessing simultaneously.
> 
> Signed-off-by: Junyan He <junyan...@intel.com>
> ---
>  src/CMakeLists.txt   |1 +
>  src/cl_base_object.c |  102 
> ++
>  src/cl_base_object.h |   77 +
>  3 files changed, 180 insertions(+)
>  create mode 100644 src/cl_base_object.c
>  create mode 100644 src/cl_base_object.h
> 
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
> index a002865..cec7cfc 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -65,6 +65,7 @@ MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" 
> "${BUILT_IN_NAME}")
>  
>  set(OPENCL_SRC
>  ${KERNEL_STR_FILES}
> +cl_base_object.c
>  cl_api.c
>  cl_alloc.c
>  cl_kernel.c
> diff --git a/src/cl_base_object.c b/src/cl_base_object.c
> new file mode 100644
> index 000..4661977
> --- /dev/null
> +++ b/src/cl_base_object.c
> @@ -0,0 +1,102 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library. If not, see 
> <http://www.gnu.org/licenses/>.
> + *
> + */
> +#include 
> +#include "cl_base_object.h"
> +
> +static pthread_t invalid_thread_id = -1;
> +
> +LOCAL void
> +cl_object_init_base(cl_base_object obj, cl_ulong magic)
> +{
> +  obj->magic = magic;
> +  obj->ref = 1;
> +  SET_ICD(obj->dispatch);
> +  pthread_mutex_init(>mutex, NULL);
> +  pthread_cond_init(>cond, NULL);
> +  obj->owner = invalid_thread_id;
> +}
> +
> +LOCAL void
> +cl_object_destroy_base(cl_base_object obj)
> +{
> +  int ref = CL_OBJECT_GET_REF(obj);
> +  if (ref != 0) {
> +DEBUGP(DL_ERROR, "CL object %p, call destroy with a reference %d", obj,
> +   ref);
> +assert(0);
> +  }
> +
> +  if (!CL_OBJECT_IS_VALID(obj)) {
> +DEBUGP(DL_ERROR,
> +   "CL object %p, call destroy while it is already a dead object", 
> obj);
> +assert(0);
> +  }
> +
> +  if (obj->owner != invalid_thread_id) {
> +DEBUGP(DL_ERROR, "CL object %p, call destroy while still has a owener 
> %d",
> +   obj, (int)obj->owner);
> +assert(0);
> +  }
> +
> +  obj->magic = CL_OBJECT_INVALID_MAGIC;
> +  pthread_mutex_destroy(>mutex);
> +  pthread_cond_destroy(>cond);
> +}
> +
> +LOCAL cl_int
> +cl_object_take_ownership(cl_base_object obj, cl_int wait)
> +{
> +  pthread_t self;
> +
> +  assert(CL_OBJECT_IS_VALID(obj));
> +
> +  self = pthread_self();
> +
> +  pthread_mutex_lock(>mutex);
> +  if (pthread_equal(obj->owner, invalid_thread_id)) {
> +obj->owner = self;
> +pthread_mutex_

Re: [Beignet] [PATCH] Runtime: Add CL base object for all cl objects.

2016-07-19 Thread He Junyan
On Fri, Jul 15, 2016 at 11:50:06AM +0200, Simon Richter wrote:
> Date: Fri, 15 Jul 2016 11:50:06 +0200
> From: Simon Richter 
> To: beignet@lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH] Runtime: Add CL base object for all cl
>  objects.
> 
> Hi,
> 
> On 14.07.2016 10:15, junyan...@inbox.com wrote:
> 
> > The runtime code is a little verbose in CL object handle.
> > Every CL objects should have a reference, a lock to protect itself
> > and an ICD dispatcher. We can organize them to a struct and place
> > it at the beginning of each CL object.
> 
> Does that mean that only a single call to DEFINE_ICD() and SET_ICD()
> remains? If so, can/should these be inlined?
Yes, it is; there is no point in defining a macro for it.
Thanks.

> 
>Simon
> 
> 



> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet


GET FREE 5GB EMAIL - Check out spam free email with many cool features!
Visit http://www.inbox.com/email to find out more!


___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 0/5] Add support for kernel debugging

2016-07-13 Thread He Junyan
Hi Mircea,
Thanks a lot for your contribution.
I have several questions here.
According to my understanding, your debug feature strongly depends on
the libigfxdbgxchg64.so lib, which I think is a binary and which I cannot
find in your patchset. I notice you get a SIP for some system routine
and allocate a buffer object to copy that system routine into. How the user
can control the execution of the GPU program is unclear from this patchset. So
is it possible to open the source code of libigfxdbgxchg64.so? Or, at
least, can you provide that lib and a document about how to use it?

Thanks


On Fri, Jul 08, 2016 at 02:39:34PM +0200, Mircea Gherzan wrote:
> Date: Fri,  8 Jul 2016 14:39:34 +0200
> From: Mircea Gherzan 
> To: beignet@lists.freedesktop.org
> Cc: Mircea Gherzan , fabian.schn...@intel.com
> Subject: [Beignet] [PATCH 0/5] Add support for kernel debugging
> X-Mailer: git-send-email 1.7.0.7
> 
> This patch series enables debugging OpenCL Beignet shaders with the GDB port
> for Intel(R) GPUs.
>   
> Enabling debugging in the Beignet codebase involves:
> * checking if the debugger is present
> * setting a breakpoint on the first instruction of a kernel,
> * getting the "debug system routine" that dumps the content of the EU 
> registers
>   to a "debug surface" once a breakpoint is encountered in the shader,
> * setting up the BOs for the system routine and for the debug surface,
> * writing the right MMIO registers (via batch buffer commands) in order
>   to enable the shader debug in the hardware.
> * notifying the debugger infrastructure that a certain kernel is under debug,
>   in order to prevent the "Debug Companion Driver" from auto-resuming
>   the kernel.
> 
> The interaction with the debugger is done via the debugger interchange 
> library.
> This library as well as other debugger open-source components (the kernel 
> driver,
> libraries, GDB) are delivered in the Intel(R) Parallel Studio XE.
> 
> Tested on HSW, BDW and SKL.
> 
> Mircea Gherzan (5):
>   backend: add support for kernel debugging
>   runtime: add support for the interchange library of the debugger
>   runtime: use the "-debug" build option if the debugger is active
>   runtime: set the kernel name in the cl_gpgpu_kernel structure
>   runtime: support for the debug system routine, surface and MMIO
> registers
> 
>  backend/src/backend/context.cpp |   4 +-
>  backend/src/backend/context.hpp |   4 +-
>  backend/src/backend/gen_context.cpp |  12 ++-
>  backend/src/backend/gen_context.hpp |   2 +-
>  backend/src/backend/gen_program.cpp |   7 +-
>  backend/src/backend/gen_program.hpp |   8 +-
>  backend/src/backend/program.cpp |   5 +-
>  src/CMakeLists.txt  |   1 +
>  src/cl_command_queue_gen7.c |   6 +-
>  src/cl_context.c|   4 +
>  src/cl_context.h|   2 +
>  src/cl_driver.h |   4 +
>  src/cl_driver_defs.c|   2 +-
>  src/cl_program.c|  40 
>  src/intel/intel_debugger.c  | 158 +
>  src/intel/intel_debugger.h  |  70 +
>  src/intel/intel_defines.h   |  11 +++
>  src/intel/intel_gpgpu.c | 192 
> ++--
>  src/intel/intel_gpgpu.h |   7 ++
>  19 files changed, 516 insertions(+), 23 deletions(-)
>  create mode 100644 src/intel/intel_debugger.c
>  create mode 100644 src/intel/intel_debugger.h
> 
> -- 
> 1.8.3.1
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet


Can't remember your password? Do you need a strong and secure password?
Use Password manager! It stores your passwords & protects your account.
Check it out at http://mysecurelogon.com/manager


___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [Printf v2][PATCH 07/12] Add the implementation of printf ir instruction.

2016-02-04 Thread He Junyan
Patches 06 and 07 have the same title?
I think it is a typo here.
Please correct it.
Everything else is OK; just rename this one and
the whole patchset can be pushed later.

My patch with the printf test cases can also be pushed together with it.

On Mon, Feb 01, 2016 at 03:42:16PM +0800, yan.w...@linux.intel.com wrote:
> Date: Mon,  1 Feb 2016 15:42:16 +0800
> From: yan.w...@linux.intel.com
> To: beignet@lists.freedesktop.org
> Cc: Yan Wang <yan.w...@linux.intel.com>
> Subject: [Beignet] [Printf v2][PATCH 07/12] Add the implementation of
>  printf ir instruction.
> X-Mailer: git-send-email 2.5.0
> 
> From: Yan Wang <yan.w...@linux.intel.com>
> 
> Contributor: Junyan He <junyan...@linux.intel.com>
> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
> ---
>  backend/src/llvm/llvm_gen_backend.cpp | 95 
> +--
>  1 file changed, 80 insertions(+), 15 deletions(-)
> 
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
> b/backend/src/llvm/llvm_gen_backend.cpp
> index dba9dba..4870285 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -486,6 +486,9 @@ namespace gbe
>  typedef map>::iterator PtrOrigMapIter;
>  // map pointer source to bti
>  map BtiMap;
> +// map printf pointer source to bti
> +int printfBti;
> +uint32_t printfNum;
>  // map ptr to its bti register
>  map BtiValueMap;
>  // map ptr to it's base
> @@ -520,6 +523,8 @@ namespace gbe
>  unit(unit),
>  ctx(unit),
>  regTranslator(ctx),
> +printfBti(-1),
> +printfNum(0),
>  LI(0),
>  TheModule(0),
>  btiBase(BTI_RESERVED_NUM),
> @@ -586,6 +591,7 @@ namespace gbe
>addrStoreInst.clear();
>// Reset for next function
>btiBase = BTI_RESERVED_NUM;
> +  printfBti = -1;
>return false;
>  }
>  /*! Given a possible pointer value, find out the interested escape like
> @@ -594,7 +600,7 @@ namespace gbe
>  /*! For all possible pointers, GlobalVariable, function pointer argument,
>  alloca instruction, find their pointer escape points */
>  void analyzePointerOrigin(Function );
> -unsigned getNewBti(Value *origin, bool isImage);
> +unsigned getNewBti(Value *origin, bool force);
>  void assignBti(Function );
>  bool isSingleBti(Value *Val);
>  Value *getBtiRegister(Value *v);
> @@ -717,11 +723,10 @@ namespace gbe
>  // handle load of dword/qword with unaligned address
>  void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, 
> ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, 
> bool fixedBTI);
>  void visitInstruction(Instruction ) {NOT_SUPPORTED;}
> -void* getPrintfInfo(CallInst* inst)
> -{
> -  if ([inst])
> -return (void*)[inst];
> -  return NULL;
> +ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) {
> +  if (unit.printfs.find(inst) == unit.printfs.end())
> +return NULL;
> +  return [inst];
>  }
>  private:
>void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug 
> infomation in context for subsequently passing to Gen insn
> @@ -1127,21 +1132,15 @@ namespace gbe
>  }
>}
>  
> -  unsigned GenWriter::getNewBti(Value *origin, bool isImage) {
> +  unsigned GenWriter::getNewBti(Value *origin, bool force) {
>  unsigned new_bti = 0;
> -if (isImage) {
> +if (force) {
>new_bti = btiBase;
>incBtiBase();
>return new_bti;
>  }
>  
> -if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
> -  new_bti = btiBase;
> -  incBtiBase();
> -} else if 
> (origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
> -  new_bti = btiBase;
> -  incBtiBase();
> -} else if 
> (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
> +if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
>new_bti = btiBase;
>incBtiBase();
>  }
> @@ -3716,6 +3715,16 @@ namespace gbe
>  this->newRegister();
>  break;
>case GEN_OCL_PRINTF:
> +this->newRegister();  // fall through
> +  case GEN_OCL_PUTS:
> +  {
> + // We need a new BTI as printf output.
> + if (printfBti < 0) {
> +   printfBti = this->getNewBti(, true);
> +   ctx.getFunction().getPrintfSet()->setBufBTI(printfBti);
> + }
> + break;
> +  }
>case GEN_OCL_CALC_TIMESTAMP:
> 

Re: [Beignet] [Printf][PATCH 06/11] Implement emision of printf instruction.

2016-01-27 Thread He Junyan
On Thu, Jan 21, 2016 at 11:30:21AM +0800, Yan Wang wrote:
> Date: Thu, 21 Jan 2016 11:30:21 +0800
> From: Yan Wang <yan.w...@linux.intel.com>
> To: beignet@lists.freedesktop.org
> Cc: Yan Wang <yan.w...@linux.intel.com>
> Subject: [Beignet] [Printf][PATCH 06/11] Implement emision of printf
>  instruction.
> X-Mailer: git-send-email 2.5.0
> 
> Contributor: Junyan He <junyan...@linux.intel.com>
> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
> ---
>  backend/src/ir/context.hpp|  5 ++
>  backend/src/llvm/llvm_gen_backend.cpp | 89 
> ---
>  2 files changed, 78 insertions(+), 16 deletions(-)
> 
I think it would be better to put the type TUPLE logic in a separate patch.
> diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
> index b95741f..877d639 100644
> --- a/backend/src/ir/context.hpp
> +++ b/backend/src/ir/context.hpp
> @@ -149,6 +149,11 @@ namespace ir {
>GBE_ASSERTM(fn != NULL, "No function currently defined");
>return fn->file.appendArrayTuple(reg, regNum);
>  }
> +/*! Make a tuple from an array of types */
> +INLINE Tuple arrayTypeTuple(const ir::Type *type, uint32_t num) {
> +  GBE_ASSERTM(fn != NULL, "No function currently defined");
> +  return fn->file.appendArrayTypeTuple((uint8_t*)type, num);
> +}
>  /*! We just use variadic templates to forward instruction functions */
>  #define DECL_INSN(NAME, FAMILY) \
>  template  INLINE void NAME(Args...args);
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
> b/backend/src/llvm/llvm_gen_backend.cpp
> index dba9dba..cc736d7 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -486,6 +486,9 @@ namespace gbe
>  typedef map>::iterator PtrOrigMapIter;
>  // map pointer source to bti
>  map BtiMap;
> +// map printf pointer source to bti
> +int printfBti;
> +uint32_t printfNum;
>  // map ptr to its bti register
>  map BtiValueMap;
>  // map ptr to it's base
> @@ -520,6 +523,8 @@ namespace gbe
>  unit(unit),
>  ctx(unit),
>  regTranslator(ctx),
> +printfBti(-1),
Also need to reset printfBti for each runOnFunction.

> +printfNum(0),
>  LI(0),
>  TheModule(0),
>  btiBase(BTI_RESERVED_NUM),
> @@ -594,7 +599,7 @@ namespace gbe
>  /*! For all possible pointers, GlobalVariable, function pointer argument,
>  alloca instruction, find their pointer escape points */
>  void analyzePointerOrigin(Function );
> -unsigned getNewBti(Value *origin, bool isImage);
> +unsigned getNewBti(Value *origin, bool force);
>  void assignBti(Function );
>  bool isSingleBti(Value *Val);
>  Value *getBtiRegister(Value *v);
> @@ -717,12 +722,7 @@ namespace gbe
>  // handle load of dword/qword with unaligned address
>  void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, 
> ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, 
> bool fixedBTI);
>  void visitInstruction(Instruction ) {NOT_SUPPORTED;}
> -void* getPrintfInfo(CallInst* inst)
> -{
> -  if ([inst])
> -return (void*)[inst];
> -  return NULL;
> -}
> +ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) { return 
> [inst]; }

I think
ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) {
  if (unit.printfs.find(inst) == unit.printfs.end())
return NULL;

 return &unit.printfs[inst];
 }

would be better

>  private:
>void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug 
> infomation in context for subsequently passing to Gen insn
>ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t 
> index = 0u);
> @@ -1127,21 +1127,15 @@ namespace gbe
>  }
>}
>  
> -  unsigned GenWriter::getNewBti(Value *origin, bool isImage) {
> +  unsigned GenWriter::getNewBti(Value *origin, bool force) {
>  unsigned new_bti = 0;
> -if (isImage) {
> +if (force) {
>new_bti = btiBase;
>incBtiBase();
>return new_bti;
>  }
>  
> -if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
> -  new_bti = btiBase;
> -  incBtiBase();
> -} else if 
> (origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
> -  new_bti = btiBase;
> -  incBtiBase();
> -} else if 
> (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
> +if (origin->getName().equals(StringRef("__gen

Re: [Beignet] [PATCH 00/18] Enable profiling by line number.

2016-01-27 Thread He Junyan
Ping for review

On Thu, Dec 24, 2015 at 07:01:52PM +0800, junyan...@inbox.com wrote:
> Date: Thu, 24 Dec 2015 19:01:52 +0800
> From: junyan...@inbox.com
> To: beignet@lists.freedesktop.org
> Subject: [PATCH 00/18] Enable profiling by line number.
> X-Mailer: git-send-email 1.7.9.5
> 
> From: Junyan He <junyan...@linux.intel.com>
> 
> This patch set will let the user specify the line numbers in the source
> code to insert the profiling watch points.
> As the first step, we just use the env var OCL_PROFILING_LINES to control
> the kernel name and line numbers. The format is:
> KERNEL_NAME:PROFILING_MODE:LINE_NUMBER0,LINE_NUMBER1,LINE_NUMBER2,...
> for example:
> export OCL_PROFILING_LINES="builtin_atanpi_float8:2:2,6,7,8,15"
> will insert watch points at 2 6 7 8 15 lines in the kernel named 
> builtin_atanpi_float8.
> We have 3 PROFILING_MODE,
>   level 1: just brief timestamp with line number.
>Total log number is 6
>  Line2:Timestamp:   190  Thread Exec:6
>  Line6:Timestamp:  1174  Thread Exec:6
>  Line7:Timestamp:  3092  Thread Exec:6
>  Line8:Timestamp:  3105  Thread Exec:6
>  Line   15:Timestamp:  3241  Thread Exec:6
> 
>level 2: timestamp with source, plus:
>Format: Average Timestamp  Exec number Source
> |  __kernel void 
> builtin_atanpi_float8(__global float *dst,  __global float *src1, __global 
> int *vector) {
>TS:  190  Num:   6 > |int i = get_global_id(0);
> |float8 x1 = (float8) (src1[i * (*vector) 
> + 0],src1[i * (*vector) + 1],src1[i * (*vector) + 2],src1[i * (*vector) + 
> 3],src1[i * (*vector) + 4],src1[i * (*vector) + 5],src1[i * (*vector) + 
> 6],src1[i * (*vector) + 7]);
> |  
> |float8 ret;
>TS: 1174  Num:   6 > |ret = atanpi(x1);
>TS: 3092  Num:   6 > |dst[i * (*vector) + 0] = ret[0];
>TS: 3105  Num:   6 > |dst[i * (*vector) + 1] = ret[1];
> |dst[i * (*vector) + 2] = ret[2];
> |dst[i * (*vector) + 3] = ret[3];
> |dst[i * (*vector) + 4] = ret[4];
> |dst[i * (*vector) + 5] = ret[5];
> |dst[i * (*vector) + 6] = ret[6];
> |dst[i * (*vector) + 7] = ret[7];
>TS: 3241  Num:   6 > |  };
> 
> 
>level 3: output the detail logs, add all logs as:
> Log 0  ---
>| fix functions id:   7 simd:   16   kernel id:0  |
>| thread id:0  EU id:   8  sub slice id: 0 slice id 0 |
>| dispatch Mask:   1 prolog:  6860  epilog: 19548 |
>| globalX:   3~   3  globalY:   0~   0  globalZ:   0~   0 |
>|  ts0 :   201  | ts1 :  1180  | ts2 : 12417  |
>|  ts3 : 12430  | ts4 : 12637  | ts5 : 0  |
>|  ts6 : 0  | ts7 : 0  | ts8 : 0  |
>|  ts9 : 0  | ts10: 0  | ts11: 0  |
>|  ts12: 0  | ts13: 0  | ts14: 0  |
>|  ts15: 0  | ts16: 0  | ts17: 0  |
>|  ts18: 0  | ts19: 0  |  |
> Log 1  ---
>| fix functions id:   7 simd:   16   kernel id:0  |
>| thread id:0  EU id:   8  sub slice id: 1 slice id 0 |
>| dispatch Mask:   1 prolog:  6877  epilog: 19569 |
>| globalX:   4~   4  globalY:   0~   0  globalZ:   0~   0 |
>|  ts0 :   209  | ts1 :  1190  | ts2 : 12423  |
>|  ts3 : 12436  | ts4 : 12643  | ts5 : 0  |
>|  ts6 : 0  | ts7 : 0  | ts8 : 0  |
>|  ts9 : 0  | ts10: 0  | ts11: 0  |
>|  ts12: 0  | ts13: 0  | ts14: 0  |
>|  ts15: 0  | ts16: 0  | ts17: 0  |
>|  ts18: 0  | ts19: 0  |  |
>.
>.
> 
> 
> 
> Some problems:
> 1. On BDW, the timestamp sometimes gives invalid huge value.
>It may be a HW issue or feature, we need to check it further.
> 2. Sometimes the line number of instruction is different from the
>source code. This is caused by optimization and we can notice
>and analyse it by Gen IR or ASM. I will send a patch to set
>optimization level later.
> 3. Some line numbers are missing

Re: [Beignet] [Printf][PATCH 04/11] Add the implementation of printf ir instruction.

2016-01-27 Thread He Junyan
On Thu, Jan 21, 2016 at 11:29:41AM +0800, Yan Wang wrote:
> Date: Thu, 21 Jan 2016 11:29:41 +0800
> From: Yan Wang <yan.w...@linux.intel.com>
> To: beignet@lists.freedesktop.org
> Cc: Yan Wang <yan.w...@linux.intel.com>
> Subject: [Beignet] [Printf][PATCH 04/11] Add the implementation of printf
>  ir instruction.
> X-Mailer: git-send-email 2.5.0
> 
> Contributor: Junyan He <junyan...@linux.intel.com>
> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
> ---
>  backend/src/ir/function.hpp|  8 ++
>  backend/src/ir/instruction.cpp | 57 
> +-
>  backend/src/ir/instruction.hpp | 13 ++
>  backend/src/ir/instruction.hxx |  1 +
>  backend/src/ir/register.cpp|  8 ++
>  backend/src/ir/register.hpp| 21 
>  6 files changed, 107 insertions(+), 1 deletion(-)
> 
> diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
> index 78250cf..5785bee 100644
> --- a/backend/src/ir/function.hpp
> +++ b/backend/src/ir/function.hpp
> @@ -341,6 +341,14 @@ namespace ir {
>  INLINE void setRegister(Tuple ID, uint32_t which, Register reg) {
>file.set(ID, which, reg);
>  }
I think we need to extract this logic into the type TUPLE patch.
> +/*! Get the type from the tuple vector */
> +INLINE uint8_t getType(Tuple ID, uint32_t which) const {
> +  return file.getType(ID, which);
> +}
> +/*! Set the type into the tuple vector */
> +INLINE void setType(Tuple ID, uint32_t which, uint8_t type) {
> +  file.setType(ID, which, type);
> +}
>  /*! Get the register file */
>  INLINE const RegisterFile (void) const { return file; }
>  /*! Get the given value ie immediate from the function */
> diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
> index bb5aac5..652c1fb 100644
> --- a/backend/src/ir/instruction.cpp
> +++ b/backend/src/ir/instruction.cpp
> @@ -994,6 +994,40 @@ namespace ir {
>  Register dst[1];
>  };
>  
> +class ALIGNED_INSTRUCTION PrintfInstruction :
> +  public BasePolicy,
> +  public TupleSrcPolicy,
> +  public NDstPolicy<PrintfInstruction, 1>
> +{
> +  public:
> +INLINE PrintfInstruction(Register dst, Tuple srcTuple, Tuple 
> typeTuple,
> + uint8_t srcNum, uint8_t bti, uint16_t num) {
> +  this->opcode = OP_PRINTF;
> +  this->dst[0] = dst;
> +  this->src = srcTuple;
> +  this->type = typeTuple;
> +  this->srcNum = srcNum;
> +  this->bti = bti;
> +  this->num = num;
> +}
> +INLINE bool wellFormed(const Function , std::string ) 
> const;
> +INLINE void out(std::ostream , const Function ) const;
> +
> +uint32_t getNum(void) const { return this->num; }
> +uint32_t getBti(void) const { return this->bti; }
> +Type getType(const Function& fn, uint32_t ID) const {
> +  GBE_ASSERTM(ID < this->srcNum, "Out-of-bound types");
> +  return (Type)fn.getType(type, ID);
> +}
> +
> +uint32_t srcNum:8;//!< Source Number
> +uint32_t bti:8;   //!< The BTI
> +uint32_t num:16;  //!< The printf statement number of one kernel.
> +Tuple src;
> +Tuple type;
> +Register dst[1];
> +};
> +
>  #undef ALIGNED_INSTRUCTION
>  
>  /
> @@ -1473,6 +1507,10 @@ namespace ir {
>return true;
>  }
>  
> +INLINE bool PrintfInstruction::wellFormed(const Function , 
> std::string ) const {
> +  return true;
> +}
> +
>  #undef CHECK_TYPE
>  
>  /
> @@ -1702,6 +1740,11 @@ namespace ir {
>  
>out << "TheadID Map at SLM: " << this->slmAddr;
>  }
> +
> +INLINE void PrintfInstruction::out(std::ostream , const Function 
> ) const {
> +  this->outOpcode(out);
> +}
> +
>} /* namespace internal */
>  
>std::ostream << (std::ostream , AddressSpace addrSpace) {
> @@ -1862,6 +1905,10 @@ START_INTROSPECTION(WorkGroupInstruction)
>  #include "ir/instruction.hxx"
>  END_INTROSPECTION(WorkGroupInstruction)
>  
> +START_INTROSPECTION(PrintfInstruction)
> +#include "ir/instruction.hxx"
> +END_INTROSPECTION(PrintfInstruction)
> +
>  #undef END_INTROSPECTION
>  #undef START_INTROSPECTION
>  #undef DECL_INSN
> @@ -2008,7 +2055,8 @@ END_FU

Re: [Beignet] [Printf][PATCH 03/11] Reconstruct printf parser.

2016-01-27 Thread He Junyan
On Thu, Jan 21, 2016 at 11:29:24AM +0800, Yan Wang wrote:
> Date: Thu, 21 Jan 2016 11:29:24 +0800
> From: Yan Wang <yan.w...@linux.intel.com>
> To: beignet@lists.freedesktop.org
> Cc: Yan Wang <yan.w...@linux.intel.com>
> Subject: [Beignet] [Printf][PATCH 03/11] Reconstruct printf parser.
> X-Mailer: git-send-email 2.5.0
> 
> Contributor: Junyan He <junyan...@linux.intel.com>
> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
> ---
>  backend/src/ir/unit.cpp |   1 -
>  backend/src/ir/unit.hpp |   2 +-
>  backend/src/llvm/llvm_gen_backend.cpp   |   4 +-
>  backend/src/llvm/llvm_printf_parser.cpp | 112 
> ++--
>  4 files changed, 53 insertions(+), 66 deletions(-)
> 
> diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
> index a350c60..5604244 100644
> --- a/backend/src/ir/unit.cpp
> +++ b/backend/src/ir/unit.cpp
> @@ -34,7 +34,6 @@ namespace ir {
>Unit::~Unit(void) {
>  for (const auto  : functions) GBE_DELETE(pair.second);
>  delete profilingInfo;
> -for (const auto  : printfs) GBE_DELETE(pair.second);
>}
>Function *Unit::getFunction(const std::string ) const {
>  auto it = functions.find(name);
> diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
> index 10a1af6..9b9e41f 100644
> --- a/backend/src/ir/unit.hpp
> +++ b/backend/src/ir/unit.hpp
> @@ -47,7 +47,7 @@ namespace ir {
>public:
>  typedef map<std::string, Function*> FunctionSet;
>  /*! Moved from printf pass */
> -map<llvm::CallInst*, PrintfSet::PrintfFmt*> printfs;
> +map<llvm::CallInst*, PrintfSet::PrintfFmt> printfs;
>  /*! Create an empty unit */
>  Unit(PointerSize pointerSize = POINTER_32_BITS);
>  /*! Release everything (*including* the function pointers) */
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
> b/backend/src/llvm/llvm_gen_backend.cpp
> index dec023c..dba9dba 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -719,8 +719,8 @@ namespace gbe
>  void visitInstruction(Instruction ) {NOT_SUPPORTED;}
>  void* getPrintfInfo(CallInst* inst)
>  {
> -  if (unit.printfs[inst])
> -return (void*)unit.printfs[inst];
> +  if ([inst])
> +return (void*)[inst];
>return NULL;
>  }
>  private:
> diff --git a/backend/src/llvm/llvm_printf_parser.cpp 
> b/backend/src/llvm/llvm_printf_parser.cpp
> index 1c88981..13ce099 100644
> --- a/backend/src/llvm/llvm_printf_parser.cpp
> +++ b/backend/src/llvm/llvm_printf_parser.cpp
> @@ -293,41 +293,21 @@ error:
>public:
>  static char ID;
>  typedef std::pair<Instruction*, bool> PrintfInst;
> -std::vector deadprintfs;
>  Module* module;
>  IRBuilder<>* builder;
>  Type* intTy;
> -Value* pbuf_ptr;
> -Value* index_buf_ptr;
> -Value* g1Xg2Xg3;
> -Value* wg_offset;
> -int out_buf_sizeof_offset;
>  ir::Unit 
> -int printf_num;
> -int totalSizeofSize;
> -
> -struct PrintfParserInfo {
> -  llvm::CallInst* call;
> -  PrintfSet::PrintfFmt* printf_fmt;
> -};
>  
>  PrintfParser(ir::Unit ) : FunctionPass(ID),
> -unit(unit)
> +  unit(unit)
>  {
>module = NULL;
>builder = NULL;
>intTy = NULL;
> -  out_buf_sizeof_offset = 0;
> -  pbuf_ptr = NULL;
> -  index_buf_ptr = NULL;
> -  g1Xg2Xg3 = NULL;
> -  wg_offset = NULL;
> -  printf_num = 0;
> -  totalSizeofSize = 0;
>  }
>  
> -bool parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, 
> int& sizeof_size);
> -bool generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& 
> dst_type, int& sizeof_size);
> +bool parseOnePrintfInstruction(CallInst * call);
> +bool generateOneParameterInst(PrintfSlot& slot, Value* arg, Value*& 
> new_arg);
>  
>  virtual const char *getPassName() const
>  {
> @@ -337,7 +317,7 @@ error:
>  virtual bool runOnFunction(llvm::Function );
>};
>  
> -  bool PrintfParser::parseOnePrintfInstruction(CallInst * call, 
> PrintfParserInfo& info, int& sizeof_size)
> +  bool PrintfParser::parseOnePrintfInstruction(CallInst * call)
>{
>  CallSite CS(call);
>  CallSite::arg_iterator CI_FMT = CS.arg_begin();
> @@ -359,16 +339,44 @@ error:
>  PrintfSet::PrintfFmt* printf_fmt = NULL;

Maybe we can check whether the printf string is just "" here.
   if (fmt.size() == 0) {
  return false; // A null string, do nothing.

Re: [Beignet] [Printf][PATCH 06/11] Implement emision of printf instruction.

2016-01-27 Thread He Junyan
After applying the printf patch set, I find that the last test still
fails; please help to check.

On Thu, Jan 28, 2016 at 12:33:05PM +0800, He Junyan wrote:
> Date: Thu, 28 Jan 2016 12:33:05 +0800
> From: He Junyan <junyan...@inbox.com>
> To: beignet@lists.freedesktop.org
> Subject: Re: [Beignet] [Printf][PATCH 06/11] Implement emision of printf
>  instruction.
> 
> On Thu, Jan 21, 2016 at 11:30:21AM +0800, Yan Wang wrote:
> > Date: Thu, 21 Jan 2016 11:30:21 +0800
> > From: Yan Wang <yan.w...@linux.intel.com>
> > To: beignet@lists.freedesktop.org
> > Cc: Yan Wang <yan.w...@linux.intel.com>
> > Subject: [Beignet] [Printf][PATCH 06/11] Implement emision of printf
> >  instruction.
> > X-Mailer: git-send-email 2.5.0
> > 
> > Contributor: Junyan He <junyan...@linux.intel.com>
> > Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
> > ---
> >  backend/src/ir/context.hpp|  5 ++
> >  backend/src/llvm/llvm_gen_backend.cpp | 89 
> > ---
> >  2 files changed, 78 insertions(+), 16 deletions(-)
> > 
> I think it is better to write another patch to type TUPLE logic
> > diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
> > index b95741f..877d639 100644
> > --- a/backend/src/ir/context.hpp
> > +++ b/backend/src/ir/context.hpp
> > @@ -149,6 +149,11 @@ namespace ir {
> >GBE_ASSERTM(fn != NULL, "No function currently defined");
> >return fn->file.appendArrayTuple(reg, regNum);
> >  }
> > +/*! Make a tuple from an array of types */
> > +INLINE Tuple arrayTypeTuple(const ir::Type *type, uint32_t num) {
> > +  GBE_ASSERTM(fn != NULL, "No function currently defined");
> > +  return fn->file.appendArrayTypeTuple((uint8_t*)type, num);
> > +}
> >  /*! We just use variadic templates to forward instruction functions */
> >  #define DECL_INSN(NAME, FAMILY) \
> >  template  INLINE void NAME(Args...args);
> > diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
> > b/backend/src/llvm/llvm_gen_backend.cpp
> > index dba9dba..cc736d7 100644
> > --- a/backend/src/llvm/llvm_gen_backend.cpp
> > +++ b/backend/src/llvm/llvm_gen_backend.cpp
> > @@ -486,6 +486,9 @@ namespace gbe
> >  typedef map>::iterator PtrOrigMapIter;
> >  // map pointer source to bti
> >  map BtiMap;
> > +// map printf pointer source to bti
> > +int printfBti;
> > +uint32_t printfNum;
> >  // map ptr to its bti register
> >  map BtiValueMap;
> >  // map ptr to it's base
> > @@ -520,6 +523,8 @@ namespace gbe
> >  unit(unit),
> >  ctx(unit),
> >  regTranslator(ctx),
> > +printfBti(-1),
> Also need to reset printfBti for each runOnFunction.
> 
> > +printfNum(0),
> >  LI(0),
> >  TheModule(0),
> >  btiBase(BTI_RESERVED_NUM),
> > @@ -594,7 +599,7 @@ namespace gbe
> >  /*! For all possible pointers, GlobalVariable, function pointer 
> > argument,
> >  alloca instruction, find their pointer escape points */
> >  void analyzePointerOrigin(Function );
> > -unsigned getNewBti(Value *origin, bool isImage);
> > +unsigned getNewBti(Value *origin, bool force);
> >  void assignBti(Function );
> >  bool isSingleBti(Value *Val);
> >  Value *getBtiRegister(Value *v);
> > @@ -717,12 +722,7 @@ namespace gbe
> >  // handle load of dword/qword with unaligned address
> >  void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, 
> > ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, 
> > bool fixedBTI);
> >  void visitInstruction(Instruction ) {NOT_SUPPORTED;}
> > -void* getPrintfInfo(CallInst* inst)
> > -{
> > -  if ([inst])
> > -return (void*)[inst];
> > -  return NULL;
> > -}
> > +ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) { return 
> > [inst]; }
> 
> I think
> ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) {
> if (unit.printfs.find(inst) == unit.printfs.end())
>   return NULL;
>   
>return [inst];
>}
> 
> would be better
> 
> >  private:
> >void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug 
> > infomation in context for subsequently passing to Gen insn
> >ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, 
> > int32_t index = 0u)

Re: [Beignet] beignet git with llvm 3.7.1 + haswell (gen 7.5) failing

2016-01-22 Thread He Junyan
I think the correct steps to use FP64 should be:
1. Query the device using clGetDeviceInfo with CL_DEVICE_EXTENSIONS.
2. Check whether the extension string has "cl_khr_fp64"
3. If so, then you can enable DOUBLE support with
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
You can refer to the cl_check_double helper function in utests.

Haswell could indeed use double before, but not as a full-featured one.
Because of hardware limitations, the precision of double is
even lower than that of float on PRE-HASWELL platforms, and double
division is also unavailable on HASWELL. So we decided to totally
disable double support on PRE-BDW platforms.

A better user experience is in our plan. We really should give
more useful diagnostic messages and handle compile errors more
gracefully, rather than with a simple ASSERT.


On Thu, Jan 21, 2016 at 06:50:36AM +, Song, Ruiling wrote:
> Date: Thu, 21 Jan 2016 06:50:36 +
> From: "Song, Ruiling" <ruiling.s...@intel.com>
> To: Paulo Dias <paulo.miguel.d...@gmail.com>, "He, Junyan"
>  <junyan...@intel.com>
> Cc: "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org>
> Subject: Re: [Beignet] beignet git with llvm 3.7.1 + haswell (gen 7.5)
>  failing
> 
> Yes, I agree with you. But at least as I know haswell hardware does not 
> support double well.
> 
> Hi Junyan,
> 
> Can we handle it graciously? Do you have any idea?
> 
> Thanks!
> Ruiling
> 
> From: Paulo Dias [mailto:paulo.miguel.d...@gmail.com]
> Sent: Wednesday, January 20, 2016 8:37 PM
> To: Song, Ruiling <ruiling.s...@intel.com>
> Cc: beignet@lists.freedesktop.org
> Subject: Re: [Beignet] beignet git with llvm 3.7.1 + haswell (gen 7.5) failing
> 
> it does, but beignet should fail graciously with an error message then, not 
> segfault. and it used to work even with haswell.
> 
> groo@hydra:~/devel/opencl/tools-master$ ./cl-demo 1 10
> Choose platform:
> [0] Intel
> [1] Mesa
> Enter choice: 0
> Choose device:
> [0] Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile
> Enter choice: 0
> -
> NAME: Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile
> VENDOR: Intel
> PROFILE: FULL_PROFILE
> VERSION: OpenCL 1.2 beignet 1.2
> EXTENSIONS: cl_khr_global_int32_base_atomics 
> cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics 
> cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store 
> cl_khr_image2d_from_buffer cl_khr_spir cl_khr_icd cl_intel_accelerator 
> cl_intel_motion_estimation
> DRIVER_VERSION: 1.2
> 
> Type: GPU
> EXECUTION_CAPABILITIES: Kernel Native
> GLOBAL_MEM_CACHE_TYPE: Read-Write (2)
> CL_DEVICE_LOCAL_MEM_TYPE: Global (2)
> SINGLE_FP_CONFIG: 0x6
> QUEUE_PROPERTIES: 0x2
> 
> VENDOR_ID: 32902
> MAX_COMPUTE_UNITS: 20
> MAX_WORK_ITEM_DIMENSIONS: 3
> MAX_WORK_GROUP_SIZE: 512
> PREFERRED_VECTOR_WIDTH_CHAR: 16
> PREFERRED_VECTOR_WIDTH_SHORT: 8
> PREFERRED_VECTOR_WIDTH_INT: 4
> PREFERRED_VECTOR_WIDTH_LONG: 2
> PREFERRED_VECTOR_WIDTH_FLOAT: 4
> PREFERRED_VECTOR_WIDTH_DOUBLE: 0
> MAX_CLOCK_FREQUENCY: 1000
> ADDRESS_BITS: 32
> MAX_MEM_ALLOC_SIZE: 1610612736
> IMAGE_SUPPORT: 1
> MAX_READ_IMAGE_ARGS: 128
> MAX_WRITE_IMAGE_ARGS: 8
> IMAGE2D_MAX_WIDTH: 8192
> IMAGE2D_MAX_HEIGHT: 8192
> IMAGE3D_MAX_WIDTH: 8192
> IMAGE3D_MAX_HEIGHT: 8192
> IMAGE3D_MAX_DEPTH: 2048
> MAX_SAMPLERS: 16
> MAX_PARAMETER_SIZE: 1024
> MEM_BASE_ADDR_ALIGN: 1024
> MIN_DATA_TYPE_ALIGN_SIZE: 128
> GLOBAL_MEM_CACHELINE_SIZE: 64
> GLOBAL_MEM_CACHE_SIZE: 8192
> GLOBAL_MEM_SIZE: 2147483648
> MAX_CONSTANT_BUFFER_SIZE: 134217728
> MAX_CONSTANT_ARGS: 8
> LOCAL_MEM_SIZE: 65536
> ERROR_CORRECTION_SUPPORT: 0
> PROFILING_TIMER_RESOLUTION: 80
> ENDIAN_LITTLE: 1
> AVAILABLE: 1
> COMPILER_AVAILABLE: 1
> MAX_WORK_GROUP_SIZES: 512 512 512
> -
> ASSERTION FAILED: 0
>   at file 
> /build/beignet-4N2m2_/beignet-1.2.0~git201601200931.13f504c~padoka0/backend/src/backend/gen_encoder.cpp,
>  function virtual void gbe::GenEncoder::handleDouble(gbe::GenEncoder*, 
> uint32_t, gbe::GenRegister, gbe::GenRegister, gbe::GenRegister), line 634
> Trace/breakpoint trap (core dumped)
> 
> | Paulo Dias
> | paulo.miguel.d...@gmail.com<mailto:paulo.miguel.d...@gmail.com>
> 
> Tempora mutantur, nos et mutamur in illis.
> 
> On Mon, Jan 18, 2016 at 12:10 AM, Song, Ruiling 
> <ruiling.s...@intel.com<mailto:ruiling.s...@intel.com>> wrote:
> 
> Haswell does not support double data type, and Beignet does not expose the 
> extension. Looks like cl-demo use double data type?
> 
> 
> 
> Thanks!
> 

Re: [Beignet] [PATCH V2] Fix the bug of crash when we pass -I path with spaces.

2016-01-20 Thread He Junyan
V2 just fixes some typos; please ignore the previous one.
This patch should be merged to master and release1.1

On Wed, Jan 20, 2016 at 05:57:20PM +0800, junyan...@inbox.com wrote:
> Date: Wed, 20 Jan 2016 17:57:20 +0800
> From: junyan...@inbox.com
> To: beignet@lists.freedesktop.org
> Subject: [Beignet] [PATCH V2] Fix the bug of crash when we pass -I path
>  with spaces.
> X-Mailer: git-send-email 1.7.9.5
> 
> From: Junyan He <junyan...@linux.intel.com>
> 
> We failed to handle -I "/XX X/YY YY/" like path passed
> from the build option. We need to consider the spaces
> here and pass it correctly to Clang.
> 
> Signed-off-by: Junyan He <junyan...@linux.intel.com>
> ---
>  backend/src/backend/program.cpp | 51 
> +
>  1 file changed, 47 insertions(+), 4 deletions(-)
> 
> diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
> index f886d03..c8bc688 100644
> --- a/backend/src/backend/program.cpp
> +++ b/backend/src/backend/program.cpp
> @@ -770,17 +770,60 @@ namespace gbe {
>  bool useDefaultCLCVersion = true;
>  
>  if (options) {
> -  char *str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
> -  memcpy(str, options, strlen(options) + 1);
> -  std::string optionStr(str);
> +  char *c_str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
> +  memcpy(c_str, options, strlen(options) + 1);
> +  std::string optionStr(c_str);
>const std::string unsupportedOptions("-cl-denorms-are-zero, 
> -cl-strict-aliasing, -cl-opt-disable,"
> "-cl-no-signed-zeros, 
> -cl-fp32-correctly-rounded-divide-sqrt");
>  
>const std::string uncompatiblePCHOptions = 
> ("-cl-single-precision-constant, -cl-fast-relaxed-math, -cl-std=CL1.1, 
> -cl-finite-math-only");
>const std::string fastMathOption = ("-cl-fast-relaxed-math");
>while (end != std::string::npos) {
> +/* need to handle -I"/XX X/X XX" with spaces first. */
> +if (optionStr[start] == '-' && optionStr[start + 1] == 'I') {
> +  end = start + 2;
> +  while(end < optionStr.size() && optionStr[end] == ' ') // Ignore 
> the spaces
> +end++;
> +
> +  if (end == optionStr.size()) { //reach the end and no content, 
> ignore
> +free(c_str);
> +return true;
> +  }
> +
> +  if (optionStr[end] != '"') { // just a normal path without " "
> +clOpt.push_back("-I");
> +start = end;
> +continue;
> +  }
> +
> +  end++;
> +  start = end;
> +  clOpt.push_back("-I");
> +
> +  /* find the second " */
> +  while (end < optionStr.size() && optionStr[end] != '"')
> +end++;
> +
> +  if (optionStr[end] != '"') {
> +free(c_str);
> +return false;
> +  }
> +
> +  if (end == start + 1) { // the case of "", ignore
> +start = end + 1;
> +continue;
> +  }
> +
> +  std::string IPath = optionStr.substr(start, end - start);
> +  clOpt.push_back(IPath.c_str());
> +  start = end + 1;
> +  continue;
> +}
> +
> +
>  end = optionStr.find(' ', start);
>  std::string str = optionStr.substr(start, end - start);
> +
>  start = end + 1;
>  if(str.size() == 0)
>continue;
> @@ -822,7 +865,7 @@ namespace gbe {
>  
>  clOpt.push_back(str);
>}
> -  free(str);
> +  free(c_str);
>  }
>  
>  if (useDefaultCLCVersion) {
> -- 
> 1.9.1
> 
> 
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Workgroup reduce add optimization

2016-01-03 Thread He Junyan
I think the basic idea is OK and the result is also OK.
Please pay attention to the code format; we prefer spaces rather
than TABs for indentation.

And some comments below.

On Wed, Dec 23, 2015 at 05:32:19PM +0200, Grigore Lupescu wrote:
> Date: Wed, 23 Dec 2015 17:32:19 +0200
> From: Grigore Lupescu 
> To: beignet@lists.freedesktop.org
> Subject: [Beignet]  [PATCH] Workgroup reduce add optimization
> X-Mailer: git-send-email 2.1.4
> 
> Signed-off-by: Grigore Lupescu 
> ---
>  backend/src/backend/gen_context.cpp | 48 
> -
>  1 file changed, 32 insertions(+), 16 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp 
> b/backend/src/backend/gen_context.cpp
> index a2e11a4..52e988e 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -2943,21 +2943,38 @@ namespace gbe
>}
>  }
>}
> -} else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD) {
> -  GBE_ASSERT(tmp.type == theVal.type);
> -  GenRegister v = GenRegister::toUniform(tmp, theVal.type);
> -  for (uint32_t i = 0; i < simd; i++) {
> -p->ADD(threadData, threadData, v);
> -v.subnr += typeSize(theVal.type);
> -if (v.subnr == 32) {
> -  v.subnr = 0;
> -  v.nr++;
> -}
> -  }
> -}
> -
> -p->pop();
> -  }
> +} else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD){
> +
> + tmp.hstride = GEN_HORIZONTAL_STRIDE_1;
> + tmp.vstride = GEN_VERTICAL_STRIDE_4;
> + tmp.width = GEN_WIDTH_4;
> +
> + GBE_ASSERT(tmp.type == theVal.type);
> + GenRegister partialSum = tmp;
> +
> + /* adjust offset, compute add with ADD4/ADD */
> + for (uint32_t i = 1; i < simd/4; i++){
> + tmp = tmp.suboffset(tmp, 4);
> + GenNativeInstruction* insnQ1 = p->next(GEN_OPCODE_ADD);
> + p->setHeader(insnQ1);
> + p->setDst(insnQ1, partialSum);
> + p->setSrc0(insnQ1, partialSum);
> + p->setSrc1(insnQ1, tmp);
> + insnQ1->header.execution_size = GEN_WIDTH_4;
> + }
I think it is not good to generate the instruction directly here.
Maybe you can set simd=4 and call p->ADD.
We want to keep all instruction generation in gen_encoder.cpp

> +
> + partialSum = GenRegister::toUniform(partialSum, theVal.type);
> + for (uint32_t i = 0; i < 4; i++){
> + p->ADD(threadData, threadData, partialSum);
> + partialSum.subnr += typeSize(theVal.type);
> + if (partialSum.subnr == 32) {
> + partialSum.subnr = 0;
> + partialSum.nr++;
> + }
I think you can also use suboffset here.
> + }
> + }
> + p->pop();
> +}
>  
>  #define SEND_RESULT_MSG() \
>  do { \
> @@ -3123,7 +3140,6 @@ do { \
>  p->curr.predicate = GEN_PREDICATE_NONE;
>  p->WAIT(2);
>  p->patchJMPI(jip, (p->n_instruction() - jip), 0);
> -
>  /* Do something when get the msg. */
>  p->curr.execWidth = simd;
>  p->MOV(dst, msgData);
> -- 
> 2.1.4
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] add Broxton support

2015-12-10 Thread He Junyan
OK, that's good for me.
Thanks for enabling BXT.

On Fri, Dec 04, 2015 at 03:22:20AM +0800, Guo Yejun wrote:
> Date: Fri,  4 Dec 2015 03:22:20 +0800
> From: Guo Yejun 
> To: beignet@lists.freedesktop.org
> Cc: Guo Yejun 
> Subject: [Beignet] [PATCH] add Broxton support
> X-Mailer: git-send-email 1.9.1
> 
> special versions of linux kernel and libdrm are needed.
> utest and conformance test PASSED.
> 
> Signed-off-by: Guo Yejun 
> ---
>  GetGenID.sh|   2 +-
>  backend/src/backend/gen8_context.cpp   |   2 +-
>  backend/src/backend/gen8_context.hpp   |   2 +
>  backend/src/backend/gen9_context.cpp   | 110 
> +
>  backend/src/backend/gen9_context.hpp   |  22 ++
>  backend/src/backend/gen_insn_selection.cpp |  11 +++
>  backend/src/backend/gen_insn_selection.hpp |   7 ++
>  backend/src/backend/gen_program.cpp|  17 -
>  backend/src/gbe_bin_generater.cpp  |   4 ++
>  src/cl_device_data.h   |   9 ++-
>  src/cl_device_id.c |  34 +++--
>  src/intel/intel_gpgpu.c|   5 +-
>  12 files changed, 213 insertions(+), 12 deletions(-)
> 
> diff --git a/GetGenID.sh b/GetGenID.sh
> index 7acf9bd..30296da 100755
> --- a/GetGenID.sh
> +++ b/GetGenID.sh
> @@ -1,5 +1,5 @@
>  #!/bin/bash
> -genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a 
> 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 
> 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26)
> +genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a 
> 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 
> 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26 
> 5a84)
>  pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F 
> ] '{print $1}'))
>  n=${#pciid[*]}
>  i=0
> diff --git a/backend/src/backend/gen8_context.cpp 
> b/backend/src/backend/gen8_context.cpp
> index 71d900f..7455bfc 100644
> --- a/backend/src/backend/gen8_context.cpp
> +++ b/backend/src/backend/gen8_context.cpp
> @@ -417,7 +417,7 @@ namespace gbe
>  GBE_ASSERT(0);
>}
>  
> -  static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0)
> +  GenRegister Gen8Context::unpacked_ud(GenRegister reg, uint32_t offset)
>{
>  if(reg.hstride == GEN_HORIZONTAL_STRIDE_0) {
>if(offset == 0)
> diff --git a/backend/src/backend/gen8_context.hpp 
> b/backend/src/backend/gen8_context.hpp
> index 537aef5..cc415c6 100644
> --- a/backend/src/backend/gen8_context.hpp
> +++ b/backend/src/backend/gen8_context.hpp
> @@ -76,6 +76,8 @@ namespace gbe
>  
>  virtual void emitF64DIVInstruction(const SelectionInstruction );
>  
> +static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0);
> +
>protected:
>  virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, 
> int sz = 0);
>  virtual void subTimestamps(GenRegister& t0, GenRegister& t1, 
> GenRegister& tmp);
> diff --git a/backend/src/backend/gen9_context.cpp 
> b/backend/src/backend/gen9_context.cpp
> index c35293a..47b1496 100644
> --- a/backend/src/backend/gen9_context.cpp
> +++ b/backend/src/backend/gen9_context.cpp
> @@ -55,4 +55,114 @@ namespace gbe
>p->WAIT();
>  p->pop();
>}
> +
> +  void BxtContext::newSelection(void) {
> +this->sel = GBE_NEW(SelectionBxt, *this);
> +  }
> +
> +  void BxtContext::calculateFullU64MUL(GenRegister src0, GenRegister src1, 
> GenRegister dst_h,
> + GenRegister dst_l, GenRegister 
> s0l_s1h, GenRegister s0h_s1l)
> +  {
> +src0.type = src1.type = GEN_TYPE_UD;
> +dst_h.type = dst_l.type = GEN_TYPE_UL;
> +s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL;
> +
> +//GenRegister tmp;
> +
> +GenRegister s0l = unpacked_ud(src0);
> +GenRegister s1l = unpacked_ud(src1);
> +GenRegister s0h = unpacked_ud(s0l_s1h); //s0h only used before s0l_s1h, 
> reuse s0l_s1h
> +GenRegister s1h = unpacked_ud(dst_l); //s1h only used before dst_l, 
> reuse dst_l
> +
> +p->MOV(s0h, GenRegister::offset(s0l, 0, 4));
> +p->MOV(s1h, GenRegister::offset(s1l, 0, 4));
> +
> +/* High 32 bits X High 32 bits. */
> +p->MUL(dst_h, s0h, s1h);
> +/* High 32 bits X low 32 bits. */
> +p->MUL(s0h_s1l, s0h, s1l);
> +/* Low 32 bits X high 32 bits. */
> +p->MUL(s0l_s1h, s0l, s1h);
> +/* Low 32 bits X low 32 bits. */
> +p->MUL(dst_l, s0l, s1l);
> +
> +/*  Because the max product of s0l*s1h is (2^N - 1) * (2^N - 1) = 2^2N + 
> 1 - 2^(N+1), here N = 32
> +The max of addding 2 32bits integer to it is
> +2^2N + 1 - 2^(N+1) + 2*(2^N - 1) = 2^2N - 1
> +which means the product s0h_s1l adds dst_l's high 32 bits and then 
> adds s0l_s1h's low 32 bits will not
> +overflow and have no 

Re: [Beignet] [PATCH 07/13] Backend: Add WORKGROUP_OP instruction selection.

2015-12-09 Thread He Junyan
On Wed, Dec 09, 2015 at 08:18:29AM +, Yang, Rong R wrote:
> Date: Wed, 9 Dec 2015 08:18:29 +
> From: "Yang, Rong R" <rong.r.y...@intel.com>
> To: "junyan...@inbox.com" <junyan...@inbox.com>,
>  "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org>
> Subject: Re: [Beignet] [PATCH 07/13] Backend: Add WORKGROUP_OP instruction
>   selection.
> 
> 
> 
> > -Original Message-
> > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> > junyan...@inbox.com
> > Sent: Tuesday, December 1, 2015 16:11
> > To: beignet@lists.freedesktop.org
> > Subject: [Beignet] [PATCH 07/13] Backend: Add WORKGROUP_OP instruction
> > selection.
> > 
> > From: Junyan He <junyan...@linux.intel.com>
> > 
> > Signed-off-by: Junyan He <junyan...@linux.intel.com>
> > ---
> >  backend/src/backend/gen_context.cpp|3 ++
> >  backend/src/backend/gen_context.hpp|1 +
> >  .../src/backend/gen_insn_gen7_schedule_info.hxx|1 +
> >  backend/src/backend/gen_insn_selection.cpp |   34
> > 
> >  backend/src/backend/gen_insn_selection.hpp |1 +
> >  backend/src/backend/gen_insn_selection.hxx |1 +
> >  6 files changed, 41 insertions(+)
> > 
> > diff --git a/backend/src/backend/gen_context.cpp
> > b/backend/src/backend/gen_context.cpp
> > index 43fa7fa..5c819b7 100644
> > --- a/backend/src/backend/gen_context.cpp
> > +++ b/backend/src/backend/gen_context.cpp
> > @@ -2844,6 +2844,9 @@ namespace gbe
> >  } p->pop();
> >}
> > 
> > +  void GenContext::emitWorkGroupOpInstruction(const
> > + SelectionInstruction ) {  }
> > +
> >void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset,
> > int sz) {
> >  if (sz == 0)
> >sz = 8;
> > diff --git a/backend/src/backend/gen_context.hpp
> > b/backend/src/backend/gen_context.hpp
> > index da9bbbe..22ec0ea 100644
> > --- a/backend/src/backend/gen_context.hpp
> > +++ b/backend/src/backend/gen_context.hpp
> > @@ -179,6 +179,7 @@ namespace gbe
> >  virtual void emitF64DIVInstruction(const SelectionInstruction );
> >  void emitCalcTimestampInstruction(const SelectionInstruction );
> >  void emitStoreProfilingInstruction(const SelectionInstruction );
> > +void emitWorkGroupOpInstruction(const SelectionInstruction );
> >  void scratchWrite(const GenRegister header, uint32_t offset, uint32_t
> > reg_num, uint32_t reg_type, uint32_t channel_mode);
> >  void scratchRead(const GenRegister dst, const GenRegister header,
> > uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t
> > channel_mode);
> >  unsigned beforeMessage(const SelectionInstruction , GenRegister
> > bti, GenRegister flagTemp, GenRegister btiTmp, unsigned desc); diff --git
> > a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> > b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> > index 739cc04..8ef422f 100644
> > --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> > +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> > @@ -47,3 +47,4 @@ DECL_GEN7_SCHEDULE(I64SATSUB,   20,40,  
> > 20)
> >  DECL_GEN7_SCHEDULE(F64DIV,  20,40,  20)
> >  DECL_GEN7_SCHEDULE(CalcTimestamp,   80,1,1)
> >  DECL_GEN7_SCHEDULE(StoreProfiling,  80,1,1)
> > +DECL_GEN7_SCHEDULE(WorkGroupOp,80, 1,
> >  1)
> > diff --git a/backend/src/backend/gen_insn_selection.cpp
> > b/backend/src/backend/gen_insn_selection.cpp
> > index 5b08958..536d347 100644
> > --- a/backend/src/backend/gen_insn_selection.cpp
> > +++ b/backend/src/backend/gen_insn_selection.cpp
> > @@ -680,6 +680,9 @@ namespace gbe
> >  void I64REM(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int 
> > tmp_int);
> >  /*! double division */
> >  void F64DIV(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmpNum);
> > +/*! Work Group Operations */
> > +void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
> > GenRegister nextThreadID,
> > + GenRegister threadID, GenRegister threadn,
> > + GenRegister tmp);
> >  /* common functions for both binary instruction and sel_cmp and
> > compare instruction.
> > It will handle the IMM or normal register assignment, and will try 
> > to avoid
> > LOADI
> > as much as possible. */
> > @@ -1897,6 +190

Re: [Beignet] [PATCH] Add benchmark for workgroup functions

2015-12-06 Thread He Junyan
Hi Grigore,

I notice that you just reuse the kernel from utest as the benchmark kernel.
In this kernel, we call the workgroup function only once, while the time
diff calculated by your benchmark here includes the whole process of executing
a kernel on the GPU.
The OCL_NDRANGE itself and the LOAD and STORE in the kernel may take more
time than the workgroup function. So I think it is hard for us to judge the
performance based on this time diff.
I think maybe you can rewrite the kernel to call the workgroup function, for
example _add, more than 100 times within one kernel, and then the time diff may
be more meaningful.


On Fri, Dec 04, 2015 at 03:37:28PM +0200, Grigore Lupescu wrote:
> Date: Fri,  4 Dec 2015 15:37:28 +0200
> From: Grigore Lupescu 
> To: beignet@lists.freedesktop.org
> Subject: [Beignet]  [PATCH] Add benchmark for workgroup functions
> X-Mailer: git-send-email 2.1.4
> 
> Signed-off-by: Grigore Lupescu 
> ---
>  benchmark/CMakeLists.txt|   3 +-
>  benchmark/benchmark_workgroup_functions.cpp | 176 
> 
>  2 files changed, 178 insertions(+), 1 deletion(-)
>  create mode 100644 benchmark/benchmark_workgroup_functions.cpp
> 
> diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
> index dd33829..fd7fd7d 100644
> --- a/benchmark/CMakeLists.txt
> +++ b/benchmark/CMakeLists.txt
> @@ -18,7 +18,8 @@ set (benchmark_sources
>benchmark_copy_buffer_to_image.cpp
>benchmark_copy_image_to_buffer.cpp
>benchmark_copy_buffer.cpp
> -  benchmark_copy_image.cpp)
> +  benchmark_copy_image.cpp
> +  benchmark_workgroup_functions.cpp)
>  
>  
>  SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
> diff --git a/benchmark/benchmark_workgroup_functions.cpp 
> b/benchmark/benchmark_workgroup_functions.cpp
> new file mode 100644
> index 000..81403a0
> --- /dev/null
> +++ b/benchmark/benchmark_workgroup_functions.cpp
> @@ -0,0 +1,176 @@
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "utest_helper.hpp"
> +#include 
> +
> +double benchmark_workgroup_add_uint(void)
> +{
> + cl_int ret;
> + struct timeval start,stop;
> + const size_t set_size = 256;
> + const size_t set_num = set_size * set_size;
> + size_t set_num_work = set_num;
> + uint32_t* src = NULL; /* input set will be generated */
> +
> + cl_mem sub_buf_in;
> + cl_mem sub_buf_out;
> + cl_buffer_region buf_region_in;
> + cl_buffer_region buf_region_out;
> +
> + buf_region_in.size = set_size * sizeof(uint32_t);
> + buf_region_in.origin = 0;
> + buf_region_out.size = set_size * sizeof(uint32_t);
> + buf_region_out.origin = 0;
> +
> + /* Each set is of the form (1, 0, 0, ..0) */
> + src = (uint32_t*)calloc(sizeof(uint32_t), set_num * set_size);
> + OCL_ASSERT(src != NULL);
> + for(uint32_t i = 0; i < set_num * set_size; i++)
> + if((i % set_size) == 0)
> + src[i] = 1;
> +
> + /* Setup kernel and buffers */
> + OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
> + "compiler_workgroup_reduce_add_uint");
> + OCL_CREATE_BUFFER(buf[0], 0, (set_num * set_size) * sizeof(uint32_t), 
> NULL);
> + OCL_CREATE_BUFFER(buf[1], 0, (set_num * set_size) * sizeof(uint32_t), 
> NULL);
> +
> + OCL_MAP_BUFFER(0);
> + memcpy(buf_data[0], src, set_num* set_size * sizeof(uint32_t));
> + OCL_UNMAP_BUFFER(0);
> +
> + globals[0] = set_size;
> + locals[0] = set_size;
> +
> + /* Measure performance */
> + gettimeofday(,0);
> + while(set_num_work > 0){
> + /* Perform reductions, subBuffers with offsets */
> + for(uint32_t i = 0; i < set_num; i++){
> + sub_buf_in = clCreateSubBuffer(buf[0], 0,
> + CL_BUFFER_CREATE_TYPE_REGION, 
> _region_in, );
> + OCL_ASSERT(ret == 0);
> + sub_buf_out = clCreateSubBuffer(buf[1], 0,
> + CL_BUFFER_CREATE_TYPE_REGION, 
> _region_out, );
> + OCL_ASSERT(ret == 0);
> +
> + OCL_SET_ARG(0, sizeof(cl_mem), _buf_in);
> + OCL_SET_ARG(1, sizeof(cl_mem), _buf_out);
> + OCL_NDRANGE(1);
> +
> + buf_region_in.origin += set_size * sizeof(uint32_t);
> + buf_region_out.origin += set_size * sizeof(uint32_t);
> + }
> + /* Prepare memory for next set of reductions */
> + OCL_MAP_BUFFER(0);
> + OCL_MAP_BUFFER(1);
> + for (uint32_t i = 0; i < set_num_work; i++) {
> + ((uint32_t *)buf_data[0])[i] =
> + ((uint32_t *)buf_data[1])[i * set_size];
> + }
> + OCL_UNMAP_BUFFER(0);
> + OCL_UNMAP_BUFFER(1);
> +
> + set_num_work /= set_size;
> +   

Re: [Beignet] [PATCH 01/13] Backend: Add sr0 reg helper function.

2015-12-01 Thread He Junyan
This is V2.

V2:
   Just rebase to master and ping for review.


On Tue, Dec 01, 2015 at 04:10:28PM +0800, junyan...@inbox.com wrote:
> Date: Tue,  1 Dec 2015 16:10:28 +0800
> From: junyan...@inbox.com
> To: beignet@lists.freedesktop.org
> Subject: [Beignet] [PATCH 01/13] Backend: Add sr0 reg helper function.
> X-Mailer: git-send-email 1.7.9.5
> 
> From: Junyan He <junyan...@linux.intel.com>
> 
> sr0 is used to specify the state register where we can get the
> state of each EU thread.
> 
> Signed-off-by: Junyan He <junyan...@linux.intel.com>
> ---
>  backend/src/backend/gen75_context.cpp |8 +---
>  backend/src/backend/gen_register.hpp  |   10 ++
>  2 files changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/backend/src/backend/gen75_context.cpp 
> b/backend/src/backend/gen75_context.cpp
> index 7d407c3..fa8b029 100644
> --- a/backend/src/backend/gen75_context.cpp
> +++ b/backend/src/backend/gen75_context.cpp
> @@ -44,13 +44,7 @@ namespace gbe
>  p->push();
>p->curr.execWidth = 1;
>p->curr.predicate = GEN_PREDICATE_NONE;
> -  GenRegister sr0 = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
> -GEN_ARF_STATE,
> -1,
> -GEN_TYPE_UD,
> -GEN_VERTICAL_STRIDE_8,
> -GEN_WIDTH_8,
> -GEN_HORIZONTAL_STRIDE_1);
> +  GenRegister sr0 = GenRegister::sr(0, 1);
>p->SHR(sr0, slm_index, GenRegister::immud(16));
>  p->pop();
>}
> diff --git a/backend/src/backend/gen_register.hpp 
> b/backend/src/backend/gen_register.hpp
> index 5c813be..aa0744b 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -828,6 +828,16 @@ namespace gbe
>   GEN_HORIZONTAL_STRIDE_0);
>  }
>  
> +static INLINE GenRegister sr(uint32_t nr, uint32_t subnr = 0) {
> +  return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
> + GEN_ARF_STATE | nr,
> + subnr,
> + GEN_TYPE_UD,
> + GEN_VERTICAL_STRIDE_8,
> + GEN_WIDTH_8,
> + GEN_HORIZONTAL_STRIDE_1);
> +}
> +
>  static INLINE GenRegister notification0(uint32_t subnr) {
>return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
>   GEN_ARF_NOTIFICATION_COUNT,
> -- 
> 1.7.9.5
> 
> 
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 21/21 V3] CMake: Add -lrt to the link command of libcl.so

2015-11-16 Thread He Junyan
Sorry, this patch does not belong to this patch set; please ignore it.

On Tue, Nov 17, 2015 at 07:40:23AM +0800, junyan...@inbox.com wrote:
> Date: Tue, 17 Nov 2015 07:40:23 +0800
> From: junyan...@inbox.com
> To: beignet@lists.freedesktop.org
> Subject: [Beignet] [PATCH 21/21 V3] CMake: Add -lrt to the link command of
>  libcl.so
> X-Mailer: git-send-email 1.7.9.5
> 
> From: Junyan He <junyan...@linux.intel.com>
> 
> The clock_gettime will cause the linkage error on some
> version of GCC, we need to add -lrt at the end of the
> link command line.
> 
> Signed-off-by: Junyan He <junyan...@linux.intel.com>
> ---
>  src/CMakeLists.txt |1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
> index c917e76..4c5112c 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -144,6 +144,7 @@ add_library(cl SHARED ${OPENCL_SRC})
>  ADD_DEPENDENCIES(cl ${GIT_SHA1})
>  target_link_libraries(
>cl
> +  rt
>${X11_LIBRARIES}
>${XEXT_LIBRARIES}
>${XFIXES_LIBRARIES}
> -- 
> 1.7.9.5
> 
> 
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/7 V2] Backend: Delete the useless MOV_DF instruction.

2015-11-05 Thread He Junyan
V2:

Fix uniform bug in conversion.
Delete verbose printf in utests.
Fix a bug on BSW when converting half to double.

On Thu, Nov 05, 2015 at 04:15:41PM +0800, junyan...@inbox.com wrote:
> Date: Thu,  5 Nov 2015 16:15:41 +0800
> From: junyan...@inbox.com
> To: beignet@lists.freedesktop.org
> Subject: [Beignet] [PATCH 1/7 V2] Backend: Delete the useless MOV_DF
>  instruction.
> X-Mailer: git-send-email 1.7.9.5
> 
> From: Junyan He <junyan...@linux.intel.com>
> 
> Because just platform after BDW will support double,
> the special instruction for double MOV is not needed
> anymore.
> 
> Signed-off-by: Junyan He <junyan...@linux.intel.com>
> ---
>  backend/src/backend/gen75_encoder.cpp  | 36 -
>  backend/src/backend/gen75_encoder.hpp  |  1 -
>  backend/src/backend/gen8_encoder.cpp   | 36 -
>  backend/src/backend/gen8_encoder.hpp   |  1 -
>  backend/src/backend/gen_context.cpp|  3 ---
>  backend/src/backend/gen_encoder.cpp| 43 
> --
>  backend/src/backend/gen_encoder.hpp|  2 --
>  backend/src/backend/gen_insn_selection.cpp | 23 +---
>  backend/src/backend/gen_insn_selection.hxx |  1 -
>  9 files changed, 1 insertion(+), 145 deletions(-)
> 
> diff --git a/backend/src/backend/gen75_encoder.cpp 
> b/backend/src/backend/gen75_encoder.cpp
> index 135be02..5d1a964 100644
> --- a/backend/src/backend/gen75_encoder.cpp
> +++ b/backend/src/backend/gen75_encoder.cpp
> @@ -251,42 +251,6 @@ namespace gbe
>  pop();
>}
>  
> -  void Gen75Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister 
> tmp) {
> -GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && 
> dest.type == GEN_TYPE_F));
> -GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F);
> -int w = curr.execWidth;
> -GenRegister r0;
> -r0 = GenRegister::h2(r);
> -push();
> -curr.execWidth = 4;
> -curr.predicate = GEN_PREDICATE_NONE;
> -curr.noMask = 1;
> -MOV(r0, src0);
> -MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
> -curr.noMask = 0;
> -curr.quarterControl = 0;
> -curr.nibControl = 0;
> -MOV(dest, r0);
> -curr.nibControl = 1;
> -MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r0, 4));
> -pop();
> -if (w == 16) {
> -  push();
> -  curr.execWidth = 4;
> -  curr.predicate = GEN_PREDICATE_NONE;
> -  curr.noMask = 1;
> -  MOV(r0, GenRegister::suboffset(src0, 8));
> -  MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
> -  curr.noMask = 0;
> -  curr.quarterControl = 1;
> -  curr.nibControl = 0;
> -  MOV(GenRegister::suboffset(dest, 8), r0);
> -  curr.nibControl = 1;
> -  MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r0, 4));
> -  pop();
> -}
> -  }
> -
>void Gen75Encoder::JMPI(GenRegister src, bool longjmp) {
>  alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
>}
> diff --git a/backend/src/backend/gen75_encoder.hpp 
> b/backend/src/backend/gen75_encoder.hpp
> index e494f29..f5044c0 100644
> --- a/backend/src/backend/gen75_encoder.hpp
> +++ b/backend/src/backend/gen75_encoder.hpp
> @@ -42,7 +42,6 @@ namespace gbe
>  virtual void JMPI(GenRegister src, bool longjmp = false);
>  /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump 
> distance */
>  virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
> -virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp 
> = GenRegister::null());
>  virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double 
> value);
>  virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
> GenRegister bti, uint32_t srcNum);
>  virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
> bti, uint32_t elemNum);
> diff --git a/backend/src/backend/gen8_encoder.cpp 
> b/backend/src/backend/gen8_encoder.cpp
> index 55fc3fb..98c3917 100644
> --- a/backend/src/backend/gen8_encoder.cpp
> +++ b/backend/src/backend/gen8_encoder.cpp
> @@ -260,42 +260,6 @@ namespace gbe
>  MOV(dest, value);
>}
>  
> -  void Gen8Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister 
> tmp) {
> -GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && 
> dest.type == GEN_TYPE_F));
> -GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F);
> -int w = curr.execWidth;
> -GenRegister r0;
> -r0 = GenRegister::h2(r);
> -push();
> -curr.e

Re: [Beignet] [V2 PATCH 0/8] Implement double division on BDW

2015-10-14 Thread He Junyan
Ping for review.

On Fri, Sep 18, 2015 at 05:58:11PM +0800, junyan...@inbox.com wrote:
> Date: Fri, 18 Sep 2015 17:58:11 +0800
> From: junyan...@inbox.com
> To: beignet@lists.freedesktop.org
> Subject: [Beignet] [V2 PATCH 0/8] Implement double division on BDW
> X-Mailer: git-send-email 1.7.9.5
> 
> From: Junyan He <junyan...@linux.intel.com>
> 
> We use the macro:
> r0 = 0, r6 = a, r7 = b, r1 = 1
> 
> math.eo.f0.0 (4) r8.acc2 r6.noacc r7.noacc 0xE
> (-f0.0) if
> madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2   // Step(1), q0=a*y0
> madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2 // Step(2), e0=(1-b*y0)
> madm (4) r11.acc5 r6.noacc -r7.noacc r9.acc3 // Step(3), r0=a-b*q0
> madm (4) r12.acc6 r8.acc2 r10.acc4 r8.acc2   // Step(4), y1=y0+e0*y0
> madm (4) r13.acc7 r1.noacc -r7.noacc r12.acc6// Step(5), e1=(1-b*y1)
> madm (4) r8.acc8 r8.acc2 r10.acc4 r12.acc6   // Step(6), y2=y0+e0*y1
> madm (4) r9.acc9 r9.acc3 r11.acc5 r12.acc6   // Step(7), q1=q0+r0*y1
> madm (4) r12.acc2 r12.acc6 r8.acc8 r13.acc7  // Step(8), y3=y1+e1*y2
> madm (4) r11.acc3 r6.noacc -r7.noacc r9.acc9 // Step(9), r1=a-b*q1
> 
> madm (4) r8.noacc r9.acc9 r11.acc3 r12.acc2  // Step(10), q=q1+r1*y3
> endif
> 
> to implement hi precision double division on BDW.
> 
> 
> V2:
> 1. Correct the spelling slips.
> 2. Fix some bugs for double registers format.
> 3. Redefine the handle double logic and delete the double support on pre-gen7
> 4. Declare fp64 extension support on BDW.
> 5. Consider the uniform case for F64DIV.
> 
> With this patch set, the +-*/ is basically OK on BDW platform.
> All pre-gen7 platforms will not support double any more.
> Conversion and bitcast between double and other types are not OK now.
> 
> Signed-off-by: Junyan He <junyan...@linux.intel.com>
> ---
> backend/src/backend/gen/gen_mesa_disasm.c   | 134 
> 
> backend/src/backend/gen75_encoder.hpp   |   4 -
> backend/src/backend/gen7_encoder.hpp|   4 -
> backend/src/backend/gen8_context.cpp| 145 
> ++
> backend/src/backend/gen8_context.hpp|   2 +
> backend/src/backend/gen8_encoder.cpp| 164 
> +-
> backend/src/backend/gen8_encoder.hpp|  12 ++-
> backend/src/backend/gen8_instruction.hpp|  86 
> backend/src/backend/gen_context.cpp |   4 +
> backend/src/backend/gen_context.hpp |   1 +
> backend/src/backend/gen_defs.hpp|  13 +++
> backend/src/backend/gen_encoder.cpp |  52 ++
> backend/src/backend/gen_encoder.hpp |   3 +-
> backend/src/backend/gen_insn_gen7_schedule_info.hxx |   1 +
> backend/src/backend/gen_insn_selection.cpp  |  54 +-
> backend/src/backend/gen_insn_selection.hxx  |   1 +
> backend/src/backend/gen_register.hpp|   6 +-
> kernels/compiler_double_4.cl|   5 -
> kernels/compiler_double_div.cl  |  11 ++
> src/cl_device_id.c  |   3 +
> src/cl_extensions.c |  21 
> src/cl_extensions.h |   2 +
> utests/CMakeLists.txt   |   2 +
> utests/compiler_double.cpp  |   5 +-
> utests/compiler_double_4.cpp|  40 
> utests/compiler_double_div.cpp  |  80 +++
> utests/utest_helper.cpp |  19 
> utests/utest_helper.hpp |   3 +
> 
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Backend: Refine ConvertInstruction logic in insn_selection

2015-10-14 Thread He Junyan
Ping for review.

On Tue, Sep 22, 2015 at 06:29:23PM +0800, junyan...@inbox.com wrote:
> Date: Tue, 22 Sep 2015 18:29:23 +0800
> From: junyan...@inbox.com
> To: beignet@lists.freedesktop.org
> Subject: [Beignet] [PATCH] Backend: Refine ConvertInstruction logic in
>  insn_selection
> X-Mailer: git-send-email 1.7.9.5
> 
> From: Junyan He <junyan...@linux.intel.com>
> 
> The ConvertInstruction now need to handle a lot of special
> cases instead of simple MOV. The judgement of native long
> support, half support and reg restriction of long type and
> the situation very complicated. The current code logic is
> too verbose and hard to read. We now use sub routine functions
> to make it clear and readable.
> 
> Signed-off-by: Junyan He <junyan...@linux.intel.com>
> ---
>  backend/src/backend/gen_insn_selection.cpp |  780 
> +---
>  1 file changed, 475 insertions(+), 305 deletions(-)
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp 
> b/backend/src/backend/gen_insn_selection.cpp
> index ab00269..4800f7f 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -4124,148 +4124,132 @@ namespace gbe
>return false;
>  }
>  
> -INLINE bool emitOne(Selection::Opaque , const ir::ConvertInstruction 
> , bool ) const
> +INLINE void convertBetweenHalfFloat(Selection::Opaque , const 
> ir::ConvertInstruction , bool ) const
>  {
>using namespace ir;
>const Type dstType = insn.getDstType();
>const Type srcType = insn.getSrcType();
> -  const RegisterFamily dstFamily = getFamily(dstType);
> -  const RegisterFamily srcFamily = getFamily(srcType);
>const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
>const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
>const Opcode opcode = insn.getOpcode();
> -  sel.push();
> -if (sel.isScalarReg(insn.getDst(0)) == true) {
> -  sel.curr.execWidth = 1;
> -  sel.curr.predicate = GEN_PREDICATE_NONE;
> -  sel.curr.noMask = 1;
> -}
> -  if(opcode == ir::OP_SAT_CVT)
> -sel.curr.saturate = 1;
>  
> -  // We need two instructions to make the conversion
>if (opcode == OP_F16TO32) {
>  sel.F16TO32(dst, src);
>} else if (opcode == OP_F32TO16) {
> +// We need two instructions to make the conversion
>  GenRegister unpacked;
>  unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, 
> sel.isScalarReg(insn.getSrc(0;
>  sel.push();
> -  if (sel.isScalarReg(insn.getSrc(0))) {
> -sel.curr.execWidth = 1;
> -sel.curr.predicate = GEN_PREDICATE_NONE;
> -sel.curr.noMask = 1;
> -  }
> -  sel.F32TO16(unpacked, src);
> +if (sel.isScalarReg(insn.getSrc(0))) {
> +  sel.curr.execWidth = 1;
> +  sel.curr.predicate = GEN_PREDICATE_NONE;
> +  sel.curr.noMask = 1;
> +}
> +sel.F32TO16(unpacked, src);
>  sel.pop();
>  sel.MOV(dst, unpacked);
> -  } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && 
> srcFamily == FAMILY_DWORD) {//convert i32 to small int and half
> -GenRegister unpacked;
> -if (dstFamily == FAMILY_WORD) {
> -  uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
> -
> -   /* The special case, when dst is half, float->word->half will lose 
> accuracy. */
> -   if (dstType == TYPE_HALF) {
> -GBE_ASSERT(sel.hasHalfType());
> -type = GEN_TYPE_HF;
> -  }
> +  } else {
> +GBE_ASSERT("Not conversion between float and half\n");
> +  }
> +}
>  
> -  if (!sel.isScalarReg(dst.reg())) {
> -unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, 
> sel.isScalarReg(insn.getSrc(0;
> -unpacked = GenRegister::retype(unpacked, type);
> -  } else
> -unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type);
> -} else {
> -  const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : 
> GEN_TYPE_B;
> -  if (!sel.isScalarReg(dst.reg())) {
> -unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, 
> sel.isScalarReg(insn.getSrc(0;
> -unpacked = GenRegister::retype(unpacked, type);
> -  } else
> -unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
> -}
> +INLINE void convert32bitsToSmall(Selection::Opaque , const 
> ir::ConvertInstruction , bool ) const
> +{
>

Re: [Beignet] [PATCH] fix __kernel function called in __kernel issue.

2015-10-12 Thread He Junyan
LGTM, thanks

On Sat, Oct 10, 2015 at 06:55:45AM -0400, xionghu@intel.com wrote:
> Date: Sat, 10 Oct 2015 06:55:45 -0400
> From: xionghu@intel.com
> To: beignet@lists.freedesktop.org
> Cc: xionghu@intel.com
> Subject: [Beignet] [PATCH] fix __kernel function called in __kernel issue.
> X-Mailer: git-send-email 1.9.1
> 
> From: Luo Xionghu 
> 
> the printfPaser variables g1Xg2Xg3 and wg_offset should be reinit after
> the builder is deleted, or else the variables will be freed and caused
> memory leak;
> query the Constants related to the globallist by name instead: the
> GenWriter pass will be called by the number of __kernel functions in the
> module, since the globallist is always the same, constant index is
> not simply increased in different kernel function.
> 
> this patch could fix fdo bug: 
> https://bugs.freedesktop.org/show_bug.cgi?id=90472.
> 
> Signed-off-by: Luo Xionghu 
> ---
>  backend/src/llvm/llvm_gen_backend.cpp   | 4 +---
>  backend/src/llvm/llvm_printf_parser.cpp | 2 ++
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 4905415..1a65ee0 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2443,7 +2443,6 @@ namespace gbe
>{
>  // Allocate a address register for each global variable
>  const Module::GlobalListType  = TheModule->getGlobalList();
> -size_t j = 0;
>  for(auto i = globalList.begin(); i != globalList.end(); i ++) {
>const GlobalVariable  = *i;
>if(!v.isConstantUsed()) continue;
> @@ -2475,8 +2474,7 @@ namespace gbe
>  GBE_ASSERT(v.hasInitializer());
>  this->newRegister(const_cast());
>  ir::Register reg = 
> regTranslator.getScalar(const_cast(), 0);
> -ir::Constant  = unit.getConstantSet().getConstant(j ++);
> -GBE_ASSERT(con.getName() == v.getName());
> +ir::Constant  = unit.getConstantSet().getConstant(v.getName());
>  ctx.LOADI(ir::TYPE_S32, reg, 
> ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
>} else {
>  if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
> diff --git a/backend/src/llvm/llvm_printf_parser.cpp 
> b/backend/src/llvm/llvm_printf_parser.cpp
> index 3d84457..7ebda65 100644
> --- a/backend/src/llvm/llvm_printf_parser.cpp
> +++ b/backend/src/llvm/llvm_printf_parser.cpp
> @@ -755,6 +755,8 @@ error:
>  
>  deadprintfs.clear();
>  delete builder;
> +g1Xg2Xg3 = NULL;
> +wg_offset = NULL;
>  
>  return changed;
>}
> -- 
> 1.9.1
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/7] add debugloc for inserted llvm instructions

2015-10-08 Thread He Junyan
This patch set basically works, but has some problems.

Common issues:
1. Please pay attention to the code format: lines should not
   begin with a TAB and should not have trailing SPACEs.
2. Some temporary comments should be deleted, and there is no need for
   the author name in the code.
3. I think the patches should be reorganised by stage of the backend,
   one patch for each stage.

Details are as follows:


On Fri, Sep 18, 2015 at 05:01:51PM +0800, Bai Yannan wrote:
> Date: Fri, 18 Sep 2015 17:01:51 +0800
> From: Bai Yannan 
> To: beignet@lists.freedesktop.org
> Cc: Bai Yannan 
> Subject: [Beignet] [PATCH 1/7] add debugloc for inserted llvm instructions
> X-Mailer: git-send-email 1.9.1
> 
> add debugloc when llvm instuctions inserted, the debugloc is inherited 
> from the contiguous one.
> 
> Signed-off-by: Bai Yannan 
> ---
>  backend/src/backend/program.cpp  |   7 +
>  backend/src/llvm/llvm_gen_backend.cpp|  25 ++
>  backend/src/llvm/llvm_loadstore_optimization.cpp |  18 ++
>  backend/src/llvm/llvm_printf_parser.cpp  |  20 ++
>  backend/src/llvm/llvm_sampler_fix.cpp|  17 ++
>  backend/src/llvm/llvm_scalarize.cpp  |  18 ++
>  backend/src/llvm/llvm_timestamp.cpp  | 337 
> +++
>  backend/src/llvm/llvm_to_gen.cpp |  10 +-
>  8 files changed, 451 insertions(+), 1 deletion(-)
>  create mode 100644 backend/src/llvm/llvm_timestamp.cpp
> 
> diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
> index f5865c2..af817de 100644
> --- a/backend/src/backend/program.cpp
> +++ b/backend/src/backend/program.cpp
> @@ -49,6 +49,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #ifdef GBE_COMPILER_AVAILABLE
>  /* Not defined for LLVM 3.0 */
> @@ -554,6 +555,12 @@ namespace gbe {
>  args.push_back("stringInput.cl");
>  args.push_back("-ffp-contract=off");
>  
> + if(getenv("OCL_PROFILING")) {
> + char * isProfiling =  getenv("OCL_PROFILING");
> + if(*isProfiling == '1')
> + args.push_back("-g");
> + }
I think here we need to use the BVAR or IVAR auxiliary functions instead of
calling the system getenv directly.

> +
>  // The compiler invocation needs a DiagnosticsEngine so it can report 
> problems
>  std::string ErrorString;
>  llvm::raw_string_ostream ErrorInfo(ErrorString);
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 4905415..238370a 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -108,6 +108,8 @@
>  
>  #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
>  #include "llvm/IR/Mangler.h"
> +#include "llvm/IR/DebugLoc.h"
> +#include "llvm/IR/DebugInfo.h"
>  #else
>  #include "llvm/Target/Mangler.h"
>  #endif
> @@ -178,6 +180,20 @@
>  
>  using namespace llvm;
>  
> +#define OCL_PROFILING (bool)(getenv("OCL_PROFILING")[0]-48)
> +#define SETDEBUGLOCATION(BUILDER, INSN)  \   
> + if(OCL_PROFILING) { 
> \
> + llvm::BasicBlock *bb = INSN->getParent();   \
> + llvm::BasicBlock::iterator iter =bb->begin();   \
> + while(!(iter++)->isIdenticalTo(INSN))   ;   \
> + llvm::MDNode *N = iter->getMetadata("dbg"); \
> + llvm::DebugLoc dg = iter->getDebugLoc();\
> + while(!N)   N = (++iter)->getMetadata("dbg");   \
> + BUILDER.SetCurrentDebugLocation(dg);\
> + }
> +// end define SETDEBUGLOCATION
I notice that all the SETDEBUGLOCATION macros have almost the same logic.
I would prefer to rewrite them as a single function and place it in some common place.

> +
> +
>  namespace gbe
>  {
>/*! Gen IR manipulates only scalar types */
> @@ -977,6 +993,7 @@ namespace gbe
>Value *trueVal = getPointerBase((*iter).second[0]);
>Value *falseVal = getPointerBase((*iter).second[1]);
>Builder.SetInsertPoint(si);
> +   SETDEBUGLOCATION(Builder, si);
>Value *base = Builder.CreateSelect(si->getCondition(), trueVal, 
> falseVal);
>pointerBaseMap.insert(std::make_pair(ptr, base));
>  return base;
> @@ -984,6 +1001,7 @@ namespace gbe
>PHINode *phi = dyn_cast(ptr);
>IRBuilder<> Builder(phi->getParent());
>Builder.SetInsertPoint(phi);
> +   SETDEBUGLOCATION(Builder, phi);
>  
>PHINode *basePhi = Builder.CreatePHI(ptr->getType(), 
> phi->getNumIncomingValues());
>unsigned srcNum = pointers.size();
> @@ -997,7 +1015,10 @@ namespace gbe
>  IRBuilder<> Builder2(phi->getIncomingBlock(x));
>  BasicBlock *predBB = phi->getIncomingBlock(x);
>  if (predBB->getTerminator())
> +   

Re: [Beignet] [PATCH 4/7] fix bug that LOADI cannot inherits debug info

2015-10-08 Thread He Junyan
On Fri, Sep 18, 2015 at 05:01:54PM +0800, Bai Yannan wrote:
> Date: Fri, 18 Sep 2015 17:01:54 +0800
> From: Bai Yannan 
> To: beignet@lists.freedesktop.org
> Cc: Bai Yannan 
> Subject: [Beignet] [PATCH 4/7] fix bug that LOADI cannot inherits debug info
> X-Mailer: git-send-email 1.9.1
> 
> Signed-off-by: Bai Yannan 
> ---
>  backend/src/backend/gen_context.cpp|  4 +---
>  backend/src/backend/gen_insn_selection.cpp | 12 +++-
>  backend/src/backend/program.cpp|  4 +++-
>  backend/src/ir/function.cpp|  7 +++
>  backend/src/ir/function.hpp|  1 +
>  backend/src/llvm/llvm_gen_backend.cpp  | 31 
> --
>  6 files changed, 44 insertions(+), 15 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp 
> b/backend/src/backend/gen_context.cpp
> index 411336e..9264cd9 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -254,6 +254,7 @@ namespace gbe
>void GenContext::emitLabelInstruction(const SelectionInstruction ) {
>  const ir::LabelIndex label(insn.index);
>  this->labelPos.insert(std::make_pair(label, p->store.size()));
> + SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitUnaryInstruction(const SelectionInstruction ) {
> @@ -631,9 +632,6 @@ namespace gbe
>  const GenRegister dst = ra->genReg(insn.dst(0));
>  const GenRegister src0 = ra->genReg(insn.src(0));
>  const GenRegister src1 = ra->genReg(insn.src(1));
> - // debug
> - if(insn.dbginfo.hasdbginfo)
> - std::cout<<"*** "<   
>  switch (insn.opcode) {
>case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
> diff --git a/backend/src/backend/gen_insn_selection.cpp 
> b/backend/src/backend/gen_insn_selection.cpp
> index e861b7c..5ad665d 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -1104,26 +1104,19 @@ namespace gbe
>   {
>   SelectionInstruction  = *it;
>   if(!selinsn.dbginfo.hasdbginfo)
> - //SET_SELINSN_DBGINFO(selinsn)
> - {   selinsn.dbginfo.line = line;
> - selinsn.dbginfo.col = col;  
> - selinsn.dbginfo.hasdbginfo = true;}
> - //else break;
> + SET_SELINSN_DBGINFO(selinsn)
>   }
>   else
>   for(auto it = this->blockList.rbegin(); it!= 
> this->blockList.rend(); it++)
>   {
>   SelectionBlock  = *it;
> - for(auto jt = block.insnList.rbegin(); jt!= 
> block.insnList.rend(); jt++)
> + for(auto jt = block.insnList.begin(); jt!= 
> block.insnList.end(); jt++)
>   {
>   SelectionInstruction  = *jt;
>   if(!selinsn.dbginfo.hasdbginfo)
>   SET_SELINSN_DBGINFO(selinsn)
> - else goto OVER;
>   }
>   }
> - OVER:
> - ;
>}
>  #undef SET_SELINSN_DBGINFO
>  
> @@ -4768,6 +4761,7 @@ namespace gbe
>const uint32_t simdWidth = sel.ctx.getSimdWidth();
>GBE_ASSERTM(label < sel.ctx.getMaxLabel(), "We reached the maximum 
> label number which is reserved for barrier handling");
>sel.LABEL(label);
> +   SET_SELOPAQUE_DBGINFO(insn);
>  
>if(!insn.getParent()->needIf)
>  return true;
> diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
> index af817de..e317230 100644
> --- a/backend/src/backend/program.cpp
> +++ b/backend/src/backend/program.cpp
> @@ -557,8 +557,10 @@ namespace gbe {
>  
>   if(getenv("OCL_PROFILING")) {
>   char * isProfiling =  getenv("OCL_PROFILING");
> - if(*isProfiling == '1')
> + if(*isProfiling == '1'){
> + args.push_back("-o0");
>   args.push_back("-g");
> + }
>   }
>  
>  // The compiler invocation needs a DiagnosticsEngine so it can report 
> problems
> diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
> index f87f23a..439b34c 100644
> --- a/backend/src/ir/function.cpp
> +++ b/backend/src/ir/function.cpp
> @@ -389,6 +389,13 @@ namespace ir {
>  return const_cast();
>}
>  
> +  Instruction *BasicBlock::getSuccessorInstruction(Instruction *pos) {
> + for(auto it = this->begin();it != this->end();it++)
> + if(it == pos)   
> + return &(*(++it));
> + return &(*(--this->end()));
> +  }
> +  
>LabelIndex BasicBlock::getLabelIndex(void) const {
>  const Instruction *first = 

Re: [Beignet] [PATCH 7/7] refine pass debug info from llvm ir to gen insn

2015-10-08 Thread He Junyan
On Fri, Sep 18, 2015 at 05:01:57PM +0800, Bai Yannan wrote:
> Date: Fri, 18 Sep 2015 17:01:57 +0800
> From: Bai Yannan 
> To: beignet@lists.freedesktop.org
> Cc: Bai Yannan , Lv Meng 
> Subject: [Beignet] [PATCH 7/7] refine pass debug info from llvm ir to gen
>  insn
> X-Mailer: git-send-email 1.9.1
> 
> Add line and col to ctx to pass debug infomation
> 
> Signed-off-by: Bai Yannan 
> Signed-off-by: Lv Meng 
> ---
>  backend/src/backend/gen_context.cpp   |  93 +++---
>  backend/src/backend/gen_encoder.cpp   |  24 +++---
>  backend/src/backend/gen_encoder.hpp   |   7 +-
>  backend/src/ir/context.cpp|   2 +
>  backend/src/ir/context.hpp|   3 +
>  backend/src/llvm/llvm_gen_backend.cpp | 142 
> ++
>  backend/src/llvm/llvm_to_gen.cpp  |   4 +-
>  7 files changed, 93 insertions(+), 182 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp 
> b/backend/src/backend/gen_context.cpp
> index 43c0b25..18a02a0 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -92,11 +92,6 @@ namespace gbe
>}
>  
>  #define OCL_PROFILING (bool)(getenv("OCL_PROFILING")[0]-48)
> -#define SET_GENINSN_DBGINFO(INSN)\
> -  if(INSN.dbginfo.hasdbginfo)\
> -   p->setDbginfo(INSN.dbginfo.line,INSN.dbginfo.col);\
> -  else p->setDbginfo(0,0)
> -
>void GenContext::emitInstructionStream(void) {
>  // Emit Gen ISA
>  for (auto  : *sel->blockList)
> @@ -106,6 +101,9 @@ namespace gbe
>// no more virtual register here in that part of the code generation
>GBE_ASSERT(insn.state.physicalFlag);
>p->curr = insn.state;
> +   //meng
> +   p->line = insn.dbginfo.line;
> +   p->col = insn.dbginfo.col;
>switch (opcode) {
>  #define DECL_SELECTION_IR(OPCODE, FAMILY) \
>case SEL_OP_##OPCODE: this->emit##FAMILY(insn); break;
> @@ -119,7 +117,6 @@ namespace gbe
>   instruction prefetcher prefetch into an invalide page */
>  for(int i = 0; i < 8; i++)
>   p->NOP();
> - p->setDbginfo(0,0);
>}
>  
>bool GenContext::patchBranches(void) {
> @@ -255,7 +252,7 @@ namespace gbe
>void GenContext::emitLabelInstruction(const SelectionInstruction ) {
>  const ir::LabelIndex label(insn.index);
>  this->labelPos.insert(std::make_pair(label, p->store.size()));
> - SET_GENINSN_DBGINFO(insn);
> + //SET_GENINSN_DBGINFO(insn);
This seems to be a temporary commented-out line; it should not be in the patch.
>}
>  
>void GenContext::emitUnaryInstruction(const SelectionInstruction ) {
> @@ -330,7 +327,7 @@ namespace gbe
>  break;
>default: NOT_IMPLEMENTED;
>  }
> - SET_GENINSN_DBGINFO(insn);
> + //SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitUnaryWithTempInstruction(const SelectionInstruction 
> ) {
> @@ -489,7 +486,7 @@ namespace gbe
>default:
>  NOT_IMPLEMENTED;
>  }
> - SET_GENINSN_DBGINFO(insn);
> + //SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitBinaryWithTempInstruction(const SelectionInstruction 
> ) {
> @@ -590,7 +587,7 @@ namespace gbe
>default:
>  NOT_IMPLEMENTED;
>  }
> - SET_GENINSN_DBGINFO(insn);
> + //SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitSimdShuffleInstruction(const SelectionInstruction 
> ) {
> @@ -682,7 +679,7 @@ namespace gbe
>  break;
>default: NOT_IMPLEMENTED;
>  }
> - SET_GENINSN_DBGINFO(insn);
> + //SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::collectShifter(GenRegister dest, GenRegister src) {
> @@ -777,7 +774,7 @@ namespace gbe
>  }
>  storeTopHalf(dest, e);
>  storeBottomHalf(dest, f);
> - SET_GENINSN_DBGINFO(insn);
> + //SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitI64MADSATInstruction(const SelectionInstruction 
> ) {
> @@ -909,7 +906,7 @@ namespace gbe
>  }
>  storeTopHalf(dest, g);
>  storeBottomHalf(dest, h);
> - SET_GENINSN_DBGINFO(insn);
> + //SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitI64HADDInstruction(const SelectionInstruction ) {
> @@ -937,7 +934,7 @@ namespace gbe
>  p->OR(c, c, d);
>  storeBottomHalf(dest, a);
>  storeTopHalf(dest, c);
> - SET_GENINSN_DBGINFO(insn);
> + //SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitI64RHADDInstruction(const SelectionInstruction ) 
> {
> @@ -968,7 +965,7 @@ namespace gbe
>  p->OR(c, c, d);
>  storeBottomHalf(dest, a);
>  storeTopHalf(dest, c);
> - SET_GENINSN_DBGINFO(insn);
> + //SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitI64ShiftInstruction(const SelectionInstruction ) 
> {
> @@ -1075,7 +1072,7 @@ namespace gbe
>default:
>  NOT_IMPLEMENTED;
>  }
> - SET_GENINSN_DBGINFO(insn);
> + 

Re: [Beignet] [PATCH 3/7] pass dbginfo from gen ir to geninsn

2015-10-08 Thread He Junyan
On Fri, Sep 18, 2015 at 05:01:53PM +0800, Bai Yannan wrote:
> Date: Fri, 18 Sep 2015 17:01:53 +0800
> From: Bai Yannan 
> To: beignet@lists.freedesktop.org
> Cc: Bai Yannan 
> Subject: [Beignet] [PATCH 3/7] pass dbginfo from gen ir to geninsn
> X-Mailer: git-send-email 1.9.1
> 
> 1, pass debug infomation first from gen ir to selection ir;
> 2, pass debug infomation from selection ir to gen instruction;
> 3, print line and column binded with ASM into a log file.
> 
> Signed-off-by: Bai Yannan 
> ---
>  backend/src/backend/gen_context.cpp| 54 +++
>  backend/src/backend/gen_defs.hpp   |  4 ++
>  backend/src/backend/gen_encoder.cpp| 15 ++
>  backend/src/backend/gen_encoder.hpp|  4 ++
>  backend/src/backend/gen_insn_selection.cpp | 84 
> +-
>  backend/src/backend/gen_insn_selection.hpp |  7 +++
>  6 files changed, 167 insertions(+), 1 deletion(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp 
> b/backend/src/backend/gen_context.cpp
> index 25fdf08..411336e 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -91,6 +91,11 @@ namespace gbe
>  return i;
>}
>  
> +#define SET_GENINSN_DBGINFO(INSN)\
> +  if(INSN.dbginfo.hasdbginfo)\
> +   p->setDbginfo(INSN.dbginfo.line,INSN.dbginfo.col);\
> +  else p->setDbginfo(0,0)
> +
>void GenContext::emitInstructionStream(void) {
>  // Emit Gen ISA
>  for (auto  : *sel->blockList)
> @@ -106,12 +111,14 @@ namespace gbe
>  #include "backend/gen_insn_selection.hxx"
>  #undef DECL_INSN
>}
> +   //p->setDbginfo(insn.dbginfo.line,insn.dbginfo.col);
>p->pop();
>  }
>  /* per spec, pad the instruction stream with 8 nop to avoid
>   instruction prefetcher prefetch into an invalide page */
>  for(int i = 0; i < 8; i++)
>   p->NOP();
> + p->setDbginfo(0,0);
>}
>  
>bool GenContext::patchBranches(void) {
> @@ -241,6 +248,7 @@ namespace gbe
>p->curr.execWidth = this->simdWidth;
>p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
>  p->pop();
> + //SET_GENINSN_DBGINFO(0);
>}
>  
>void GenContext::emitLabelInstruction(const SelectionInstruction ) {
> @@ -320,6 +328,7 @@ namespace gbe
>  break;
>default: NOT_IMPLEMENTED;
>  }
> + SET_GENINSN_DBGINFO(insn);
I think that SET_GENINSN_DBGINFO should be added to a common place, such as
instruction.cpp.


>}
>  
>void GenContext::emitUnaryWithTempInstruction(const SelectionInstruction 
> ) {
> @@ -478,6 +487,7 @@ namespace gbe
>default:
>  NOT_IMPLEMENTED;
>  }
> + SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitBinaryWithTempInstruction(const SelectionInstruction 
> ) {
> @@ -578,6 +588,7 @@ namespace gbe
>default:
>  NOT_IMPLEMENTED;
>  }
> + SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitSimdShuffleInstruction(const SelectionInstruction 
> ) {
> @@ -620,6 +631,10 @@ namespace gbe
>  const GenRegister dst = ra->genReg(insn.dst(0));
>  const GenRegister src0 = ra->genReg(insn.src(0));
>  const GenRegister src1 = ra->genReg(insn.src(1));
> + // debug
> + if(insn.dbginfo.hasdbginfo)
> + std::cout<<"*** "< + 
>  switch (insn.opcode) {
>case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
>case SEL_OP_SEL_INT64:
> @@ -668,6 +683,7 @@ namespace gbe
>  break;
>default: NOT_IMPLEMENTED;
>  }
> + SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::collectShifter(GenRegister dest, GenRegister src) {
> @@ -762,6 +778,7 @@ namespace gbe
>  }
>  storeTopHalf(dest, e);
>  storeBottomHalf(dest, f);
> + SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitI64MADSATInstruction(const SelectionInstruction 
> ) {
> @@ -893,6 +910,7 @@ namespace gbe
>  }
>  storeTopHalf(dest, g);
>  storeBottomHalf(dest, h);
> + SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitI64HADDInstruction(const SelectionInstruction ) {
> @@ -920,6 +938,7 @@ namespace gbe
>  p->OR(c, c, d);
>  storeBottomHalf(dest, a);
>  storeTopHalf(dest, c);
> + SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitI64RHADDInstruction(const SelectionInstruction ) 
> {
> @@ -950,6 +969,7 @@ namespace gbe
>  p->OR(c, c, d);
>  storeBottomHalf(dest, a);
>  storeTopHalf(dest, c);
> + SET_GENINSN_DBGINFO(insn);
>}
>  
>void GenContext::emitI64ShiftInstruction(const SelectionInstruction ) 
> {
> @@ -1056,6 +1076,7 @@ namespace gbe
>default:
>  NOT_IMPLEMENTED;
>  }
> + SET_GENINSN_DBGINFO(insn);
>}
>void GenContext::setFlag(GenRegister flagReg, GenRegister src) {
>  p->push();
> @@ -1211,6 +1232,7 @@ 

Re: [Beignet] [PATCH 6/8] Backend: Implement FDIV64 on BDW.

2015-09-15 Thread He Junyan
On Tue, Sep 15, 2015 at 06:00:57AM -0700, Matt Turner wrote:
> Date: Tue, 15 Sep 2015 06:00:57 -0700
> From: Matt Turner <matts...@gmail.com>
> To: "junyan.he" <junyan...@inbox.com>
> Cc: "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org>
> Subject: Re: [Beignet] [PATCH 6/8] Backend: Implement FDIV64 on BDW.
> 
> On Tue, Sep 15, 2015 at 4:15 AM,  <junyan...@inbox.com> wrote:
> > From: Junyan He <junyan...@linux.intel.com>
> >
> > According to the document, we use a set of instructions
> > to implement double type division.
> >
> > Signed-off-by: Junyan He <junyan...@linux.intel.com>
> > ---
> >  backend/src/backend/gen8_context.cpp | 68 
> > 
> >  backend/src/backend/gen8_context.hpp |  2 ++
> >  2 files changed, 70 insertions(+)
> >
> > diff --git a/backend/src/backend/gen8_context.cpp 
> > b/backend/src/backend/gen8_context.cpp
> > index b497ee5..f465832 100644
> > --- a/backend/src/backend/gen8_context.cpp
> > +++ b/backend/src/backend/gen8_context.cpp
> > @@ -924,6 +924,74 @@ namespace gbe
> >  this->unpackLongVec(src, dst, p->curr.execWidth);
> >}
> >
> > +  void Gen8Context::emitF64DIVInstruction(const SelectionInstruction 
> > ) {
> > +/* Macro for Double Precision IEEE Compliant fdiv
> > +
> > +   Set Rounding Mode in CR to RNE
> > +   GRF are initialized: r0 = 0, r6 = a, r7 = b, r1 = 1
> > +   The default data type for the macro is :df
> > +
> > +   math.eo.f0.0 (4) r8.acc2 r6.noacc r7.noacc 0xE
> > +   (-f0.0) if
> > +   madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2   // Step(1), q0=a*y0
> > +   madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2 // Step(2), 
> > e0=(1-b*y0)
> > +   madm (4) r11.acc5 r6.noacc -r7.noacc r9.acc3 // Step(3), 
> > r0=a-b*q0
> > +   madm (4) r12.acc6 r8.acc2 r10.acc4 r8.acc2   // Step(4), 
> > y1=y0+e0*y0
> > +   madm (4) r13.acc7 r1.noacc -r7.noacc r12.acc6// Step(5), 
> > e1=(1-b*y1)
> > +   madm (4) r8.acc8 r8.acc2 r10.acc4 r12.acc6   // Step(6), 
> > y2=y0+e0*y1
> > +   madm (4) r9.acc9 r9.acc3 r11.acc5 r12.acc6   // Step(7), 
> > q1=q0+r0*y1
> > +   madm (4) r12.acc2 r12.acc6 r8.acc8 r13.acc7  // Step(8), 
> > y3=y1+e1*y2
> > +   madm (4) r11.acc3 r6.noacc -r7.noacc r9.acc9 // Step(9), 
> > r1=a-b*q1
> > +
> > +   Change Rounding Mode in CR if required
> > +   Implicit Accumulator for destination is NULL
> > +
> > +   madm (4) r8.noacc r9.acc9 r11.acc3 r12.acc2  // Step(10), 
> > q=q1+r1*y3
> > +   endif */
> 
> I don't see an IF or an ENDIF instruction emitted in the code below.
> Is that intentional, or am I misreading the code?
> 
Here, we use f0.1 as the predication for all the instructions, like:
(-f0.1) madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2 
(-f0.1) madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2
.
I avoid using IF-ENDIF here because we would need to calculate the number of
instructions within the IF clause, which is not convenient.

> > +GenRegister r6 = GenRegister::retype(ra->genReg(insn.src(0)), 
> > GEN_TYPE_DF);
> > +GenRegister r7 = GenRegister::retype(ra->genReg(insn.src(1)), 
> > GEN_TYPE_DF);
> > +GenRegister r8 = GenRegister::retype(ra->genReg(insn.dst(0)), 
> > GEN_TYPE_DF);
> > +const GenRegister r0 = GenRegister::retype(ra->genReg(insn.dst(1)), 
> > GEN_TYPE_DF);
> > +const GenRegister r1 = GenRegister::retype(ra->genReg(insn.dst(2)), 
> > GEN_TYPE_DF);
> > +const GenRegister r9 = GenRegister::retype(ra->genReg(insn.dst(3)), 
> > GEN_TYPE_DF);
> > +const GenRegister r10 = GenRegister::retype(ra->genReg(insn.dst(4)), 
> > GEN_TYPE_DF);
> > +const GenRegister r11 = GenRegister::retype(ra->genReg(insn.dst(5)), 
> > GEN_TYPE_DF);
> > +const GenRegister r12 = GenRegister::retype(ra->genReg(insn.dst(6)), 
> > GEN_TYPE_DF);
> > +const GenRegister r13 = GenRegister::retype(ra->genReg(insn.dst(7)), 
> > GEN_TYPE_DF);
> > +Gen8Encoder *p8 = reinterpret_cast(p);
> > +p->push(); {
> > +  p->curr.execWidth = 4;
> > +  p->curr.predicate = GEN_PREDICATE_NONE;
> > +  p->curr.noMask= 1;
> > +  p->MOV(r1, GenRegister::immdf(1.0d));
> > +  p->MOV(r0, GenRegister::immdf(0.0d));
> > +
> > +  for (int i = 0; i < (simdWidth == 16 ? 4 : 2); i++) {
> > +p->curr.predicate 

Re: [Beignet] [PATCH 5/8] Backend: Add the MADM function to gen8 encoder.

2015-09-15 Thread He Junyan
On Tue, Sep 15, 2015 at 05:57:13AM -0700, Matt Turner wrote:
> Date: Tue, 15 Sep 2015 05:57:13 -0700
> From: Matt Turner <matts...@gmail.com>
> To: "junyan.he" <junyan...@inbox.com>
> Cc: "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org>
> Subject: Re: [Beignet] [PATCH 5/8] Backend: Add the MADM function to gen8
>  encoder.
> 
> On Tue, Sep 15, 2015 at 4:15 AM,  <junyan...@inbox.com> wrote:
> > From: Junyan He <junyan...@linux.intel.com>
> >
> > Signed-off-by: Junyan He <junyan...@linux.intel.com>
> > ---
> >  backend/src/backend/gen8_encoder.cpp | 56 
> > 
> >  backend/src/backend/gen8_encoder.hpp |  2 ++
> >  backend/src/backend/gen_defs.hpp |  2 ++
> >  3 files changed, 60 insertions(+)
> >
> > diff --git a/backend/src/backend/gen8_encoder.cpp 
> > b/backend/src/backend/gen8_encoder.cpp
> > index 0af27a3..002a8b5 100644
> > --- a/backend/src/backend/gen8_encoder.cpp
> > +++ b/backend/src/backend/gen8_encoder.cpp
> > @@ -591,4 +591,60 @@ namespace gbe
> >   this->setSrc0WithAcc(insn, src0, src0Acc);
> >   this->setSrc1WithAcc(insn, src1, src1Acc);
> >}
> > +
> > +  void Gen8Encoder::MADM(GenRegister dst, GenRegister src0, GenRegister 
> > src1, GenRegister src2,
> > +  uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc, uint32_t 
> > src2Acc)
> > +  {
> > +GenNativeInstruction *insn = this->next(GEN_OPCODE_MADM);
> > +Gen8NativeInstruction *gen8_insn = >gen8_insn;
> > +assert(dst.file == GEN_GENERAL_REGISTER_FILE);
> > +assert(src0.file == GEN_GENERAL_REGISTER_FILE);
> > +assert(src1.file == GEN_GENERAL_REGISTER_FILE);
> > +assert(src2.file == GEN_GENERAL_REGISTER_FILE);
> > +assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == 
> > GEN_HORIZONTAL_STRIDE_0);
> > +assert(src0.type == GEN_TYPE_DF || src0.type == GEN_TYPE_F);
> > +assert(src0.type == dst.type);
> > +assert(src0.type == src1.type);
> > +assert(src0.type == src2.type);
> > +int32_t dataType = src0.type == GEN_TYPE_DF ? 3 : 0;
> > +
> > +this->setHeader(insn);
> > +gen8_insn->bits1.da3srcacc.dest_reg_nr = dst.nr;
> > +gen8_insn->bits1.da3srcacc.dest_subreg_nr = dst.subnr / 16;
> > +gen8_insn->bits1.da3srcacc.dst_specal_acc = dstAcc;
> > +gen8_insn->bits1.da3srcacc.src_type = dataType;
> > +gen8_insn->bits1.da3srcacc.dest_type = dataType;
> > +gen8_insn->header.access_mode = GEN_ALIGN_16;
> > +
> > +assert(src0.file == GEN_GENERAL_REGISTER_FILE);
> > +assert(src0.address_mode == GEN_ADDRESS_DIRECT);
> > +assert(src0.nr < 128);
> > +gen8_insn->bits2.da3srcacc.src0_specal_acc = src0Acc;
> > +gen8_insn->bits2.da3srcacc.src0_subreg_nr = src0.subnr / 4 ;
> > +gen8_insn->bits2.da3srcacc.src0_reg_nr = src0.nr;
> > +gen8_insn->bits1.da3srcacc.src0_abs = src0.absolute;
> > +gen8_insn->bits1.da3srcacc.src0_negate = src0.negation;
> > +gen8_insn->bits2.da3srcacc.src0_rep_ctrl = src0.vstride == 
> > GEN_VERTICAL_STRIDE_0;
> > +
> > +assert(src1.file == GEN_GENERAL_REGISTER_FILE);
> > +assert(src1.address_mode == GEN_ADDRESS_DIRECT);
> > +assert(src1.nr < 128);
> > +gen8_insn->bits2.da3srcacc.src1_specal_acc = src1Acc;
> > +gen8_insn->bits2.da3srcacc.src1_subreg_nr_low = (src1.subnr / 4) & 0x3;
> > +gen8_insn->bits3.da3srcacc.src1_subreg_nr_high = (src1.subnr / 4) >> 2;
> > +gen8_insn->bits2.da3srcacc.src1_rep_ctrl = src1.vstride == 
> > GEN_VERTICAL_STRIDE_0;
> > +gen8_insn->bits3.da3srcacc.src1_reg_nr = src1.nr;
> > +gen8_insn->bits1.da3srcacc.src1_abs = src1.absolute;
> > +gen8_insn->bits1.da3srcacc.src1_negate = src1.negation;
> > +
> > +assert(src2.file == GEN_GENERAL_REGISTER_FILE);
> > +assert(src2.address_mode == GEN_ADDRESS_DIRECT);
> > +assert(src2.nr < 128);
> > +gen8_insn->bits3.da3srcacc.src2_specal_acc = src2Acc;
> > +gen8_insn->bits3.da3srcacc.src2_subreg_nr = src2.subnr / 4;
> > +gen8_insn->bits3.da3srcacc.src2_rep_ctrl = src2.vstride == 
> > GEN_VERTICAL_STRIDE_0;
> > +gen8_insn->bits3.da3srcacc.src2_reg_nr = src2.nr;
> > +gen8_insn->bits1.da3srcacc.src2_abs = src2.absolute;
> > +gen8_insn->bits1.da3srcacc.src2_negate = src2.negation;
> > +  }
> >  } /* End of the name space. */
> > diff --git a/b

Re: [Beignet] [PATCH 00/19 V2] Add Profiling support in beignet.

2015-09-10 Thread He Junyan
I think it would be better to do that after we integrate the binary_to_source
feature. Then it will be easy and clear to describe how to use
the profiling feature.

On Thu, Sep 10, 2015 at 05:56:17AM +, Zou, Nanhai wrote:
> Date: Thu, 10 Sep 2015 05:56:17 +
> From: "Zou, Nanhai" 
> To: "junyan...@inbox.com" ,
>  "beignet@lists.freedesktop.org" 
> Subject: Re: [Beignet] [PATCH 00/19 V2] Add Profiling support in beignet.
> 
> It will be nice if you can add a simple how-to-profile-your-kernel document 
> in docs/howto
> 
> Thanks
> Zou Nanhai
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 19/19] runtime: Add fp16 extension to BDW later platform.

2015-07-01 Thread He Junyan

The half float can work for BSW,
I will send a standalone patch to enable it later.


On 2015年06月19日 15:18, Yang, Rong R wrote:

One concern: Does cherryview support half?

The other part of the patchset LGTM.


-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
junyan...@inbox.com
Sent: Thursday, June 11, 2015 19:26
To: beignet@lists.freedesktop.org
Cc: Junyan He
Subject: [Beignet] [PATCH 19/19] runtime: Add fp16 extension to BDW later
platform.

From: Junyan He junyan...@linux.intel.com

Signed-off-by: Junyan He junyan...@linux.intel.com
---
  src/cl_device_id.c   | 123 ++---
--
  src/cl_device_id.h   |   1 +
  src/cl_extensions.c  |  29 ++--
  src/cl_extensions.h  |   2 +
  src/cl_gt_device.h   |   1 +
  src/cl_platform_id.c |   2 +-
  6 files changed, 102 insertions(+), 56 deletions(-)

diff --git a/src/cl_device_id.c b/src/cl_device_id.c index 215f7f2..09171f8
100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -26,6 +26,7 @@
  #include cl_khr_icd.h
  #include cl_thread.h
  #include CL/cl.h
+#include CL/cl_ext.h
  #include cl_gbe_loader.h
  #include cl_alloc.h

@@ -398,6 +399,8 @@ baytrail_t_device_break:
  case PCI_CHIP_BROADWLL_U_GT1:
DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name,
Intel(R) HD Graphics BroadWell ULX GT1);
  brw_gt1_break:
+  /* For Gen8 and later, half float is suppported and we will enable
cl_khr_fp16. */
+  cl_intel_platform_enable_fp16_extension(intel_platform);
intel_brw_gt1_device.vendor_id = device_id;
intel_brw_gt1_device.platform = intel_platform;
ret = intel_brw_gt1_device;
@@ -414,6 +417,7 @@ brw_gt1_break:
  case PCI_CHIP_BROADWLL_U_GT2:
DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name,
Intel(R) HD Graphics BroadWell ULX GT2);
  brw_gt2_break:
+  cl_intel_platform_enable_fp16_extension(intel_platform);
intel_brw_gt2_device.vendor_id = device_id;
intel_brw_gt2_device.platform = intel_platform;
ret = intel_brw_gt2_device;
@@ -430,6 +434,7 @@ brw_gt2_break:
  case PCI_CHIP_BROADWLL_U_GT3:
DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name,
Intel(R) HD Graphics BroadWell ULX GT2);
  brw_gt3_break:
+  cl_intel_platform_enable_fp16_extension(intel_platform);
intel_brw_gt3_device.vendor_id = device_id;
intel_brw_gt3_device.platform = intel_platform;
ret = intel_brw_gt3_device;
@@ -447,61 +452,65 @@ chv_break:
break;


- case PCI_CHIP_SKYLAKE_ULT_GT1:
-   DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device,
name, Intel(R) HD Graphics Skylake ULT GT1);
- case PCI_CHIP_SKYLAKE_ULX_GT1:
-   DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device,
name, Intel(R) HD Graphics Skylake ULX GT1);
- case PCI_CHIP_SKYLAKE_DT_GT1:
-   DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device,
name, Intel(R) HD Graphics Skylake Desktop GT1);
- case PCI_CHIP_SKYLAKE_HALO_GT1:
-   DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device,
name, Intel(R) HD Graphics Skylake Halo GT1);
- case PCI_CHIP_SKYLAKE_SRV_GT1:
-   DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device,
name, Intel(R) HD Graphics Skylake Server GT1);
+case PCI_CHIP_SKYLAKE_ULT_GT1:
+  DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R)
HD Graphics Skylake ULT GT1);
+case PCI_CHIP_SKYLAKE_ULX_GT1:
+  DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R)
HD Graphics Skylake ULX GT1);
+case PCI_CHIP_SKYLAKE_DT_GT1:
+  DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R)
HD Graphics Skylake Desktop GT1);
+case PCI_CHIP_SKYLAKE_HALO_GT1:
+  DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, Intel(R)
HD Graphics Skylake Halo GT1);
+case PCI_CHIP_SKYLAKE_SRV_GT1:
+  DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name,
+ Intel(R) HD Graphics Skylake Server GT1);
  skl_gt1_break:
-   intel_skl_gt1_device.vendor_id = device_id;
-   intel_skl_gt1_device.platform = intel_platform;
-   ret = intel_skl_gt1_device;
-   break;
-
- case PCI_CHIP_SKYLAKE_ULT_GT2:
-   DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device,
name, Intel(R) HD Graphics Skylake ULT GT2);
- case PCI_CHIP_SKYLAKE_ULT_GT2F:
-   DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device,
name, Intel(R) HD Graphics Skylake ULT GT2F);
- case PCI_CHIP_SKYLAKE_ULX_GT2:
-   DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device,
name, Intel(R) HD Graphics Skylake ULX GT2);
- case PCI_CHIP_SKYLAKE_DT_GT2:
-   DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device,
name, Intel(R) HD Graphics Skylake Desktop GT2);
- case PCI_CHIP_SKYLAKE_HALO_GT2:
-   DECL_INFO_STRING(skl_gt2_break

Re: [Beignet] thread safety and OpenMP

2015-06-30 Thread He, Junyan
As far as we know, beignet is thread safe.
Every thread has its own command buffer and has no
relationship with the others.
Do you use sub-buffers to divide the image?
If you can provide some source code or a test case, it would
help a lot.


-Original Message-
From: Song, Ruiling 
Sent: Wednesday, July 01, 2015 10:40 AM
To: Gerald Baier; beignet@lists.freedesktop.org
Cc: He, Junyan
Subject: RE: [Beignet] thread safety and OpenMP



 -Original Message-
 From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
 Gerald Baier
 Sent: Sunday, June 28, 2015 9:56 PM
 To: beignet@lists.freedesktop.org
 Subject: [Beignet] thread safety and OpenMP
 
 I'm using Beignet for image processing, where basically the image is
 subdivided into tiles which are then processed by several threads using
 OpenMP tasks. I noticed that some of the tiles are occasionally messed up. If


Using OpenMP or OpenCL?
Per the OpenCL spec, an OpenCL driver should be thread-safe, and Junyan implemented
multi-thread support in beignet. That is to say, Beignet is thread-safe.
I am not sure whether Junyan has more comments on how to debug the issue. By
the way, if you can provide a test case to reproduce the issue, it would be
very helpful.

Thanks!
Ruiling
 I use only one thread everything works fine, also the same program runs as
 expected on nvidia GPUs with multiple threads. Hence the question whether
 Beignet is thread safe and how I could debug my program?
 
 Here's my configuration:
 device name: Intel(R) HD Graphics IvyBridge M GT2 device version: OpenCL
 1.2 beignet 1.0.3 (git-9e0ca6f)
 
 Best regards,
 Gerald
 
 ___
 Beignet mailing list
 Beignet@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/8] Backend: Add half float as a new type.

2015-05-25 Thread He Junyan

After some research, I found that the F16C feature is only supported on
IVB and later platforms and needs at least GCC 4.6.
This would cause some compatibility issues. The beignet project
may be cross-compiled on an old platform using an old version of
GCC, and someone may use another compiler to build it.
Because of this, I still prefer to use software emulation
for half float. It's verbose, but it will not have any side effect
on runtime performance.



On 2015年05月22日 14:28, He Junyan wrote:

Thanks for your information.
I will do some research for it.


On 2015年05月22日 05:51, Matt Turner wrote:

On Thu, May 21, 2015 at 1:25 AM, junyan...@inbox.com wrote:

From: Junyan He junyan...@linux.intel.com

Because the CPU of X86 does not support half float
instructions, there is no support for half float operations.
So we introduce the half class to handle the operations for
half float using llvm's APFloat utility.

Ivybridge and newer have the F16C instruction set
(http://en.wikipedia.org/wiki/F16C) which offers instructions to
convert half-precision <-> single-precision floats.

I don't know if it's valuable to use it, but it's there.
___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet




___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet




___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/2] [opencl-2.0] enable create image 2d from buffer in clCreateImage.

2015-05-22 Thread He Junyan

Some comments,

On 2015年04月03日 13:39, xionghu@intel.com wrote:

From: Luo Xionghu xionghu@intel.com

this patch allows create 2d image with a cl buffer.

Signed-off-by: Luo Xionghu xionghu@intel.com
---
  src/cl_api.c |  3 ++-
  src/cl_mem.c | 67 +++-
  2 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/src/cl_api.c b/src/cl_api.c
index cd4020e..25e621a 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -549,8 +549,9 @@ clCreateImage(cl_context context,
  goto error;
}
/* buffer refers to a valid buffer memory object if image_type is
- CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */
+ CL_MEM_OBJECT_IMAGE1D_BUFFER or CL_MEM_OBJECT_IMAGE2D. Otherwise it must 
be NULL. */
if (image_desc-image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER 
+  image_desc-image_type != CL_MEM_OBJECT_IMAGE2D 
   image_desc-buffer) {
  err = CL_INVALID_IMAGE_DESCRIPTOR;
  goto error;
diff --git a/src/cl_mem.c b/src/cl_mem.c
index b41ec14..3c5667e 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -971,26 +971,47 @@ _cl_mem_new_image_from_buffer(cl_context ctx,
if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, bpp)) != 
CL_SUCCESS))
  goto error;
  
-  // Per bspec, a image should has a at least 2 line vertical alignment,

-  // thus we can't simply attach a buffer to a 1d image surface which has the 
same size.
-  // We have to create a new image, and copy the buffer data to this new image.
-  // And replace all the buffer object's reference to this image.
-  image = _cl_mem_new_image(ctx, flags, image_format, image_desc-image_type,
+  if(image_desc-image_type == CL_MEM_OBJECT_IMAGE2D) {

Spec says:
The restrictions are:
all the values specified in image_desc except for mem_object must match 
the image descriptor information associated with mem_object.
the channel data type specified in image_format must match the channel 
data type associated with mem_object.


So I think we may need to add some checks here.


+image = _cl_mem_new_image(ctx, flags, image_format, image_desc-image_type,
+ image_desc-image_width, image_desc-image_height, 
image_desc-image_depth,
+ image_desc-image_row_pitch, 
image_desc-image_slice_pitch,
+ image_desc-buffer, errcode_ret);
  ~~~ Why is image_desc->buffer passed here?

+  } else if (image_desc-image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+// Per bspec, a image should has a at least 2 line vertical alignment,
+// thus we can't simply attach a buffer to a 1d image surface which has 
the same size.
+// We have to create a new image, and copy the buffer data to this new 
image.
+// And replace all the buffer object's reference to this image.
+image = _cl_mem_new_image(ctx, flags, image_format, image_desc-image_type,
  mem_buffer-base.size / bpp, 0, 0, 0, 0, NULL, 
errcode_ret);
+  }
+  else
+assert(0);
+
if (image == NULL)
  return NULL;
-  void *src = cl_mem_map(buffer, 0);
-  void *dst = cl_mem_map(image, 1);
-  //
-  // FIXME, we could use copy buffer to image to do this on GPU latter.
-  // currently the copy buffer to image function doesn't support 1D image.
-  //
-  // There is a potential risk that this buffer was mapped and the caller
-  // still hold the pointer and want to access it again. This scenario is
-  // not explicitly forbidden in the spec, although it should not be permitted.
-  memcpy(dst, src, mem_buffer-base.size);
-  cl_mem_unmap(buffer);
-  cl_mem_unmap(image);
+
+  if(image_desc-image_type == CL_MEM_OBJECT_IMAGE2D)
+  {
+size_t origin[] = {0,0,0};
+size_t region[] = {image_desc-image_width, image_desc-image_height, 1};
+clEnqueueCopyBufferToImage(ctx-queues, buffer, image, 0, origin, region, 
0, NULL, NULL);
+  }
+  else if (image_desc-image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+  {
+// FIXME, we could use copy buffer to image to do this on GPU latter.
+// currently the copy buffer to image function doesn't support 1D image.
+//
+// There is a potential risk that this buffer was mapped and the caller
+// still hold the pointer and want to access it again. This scenario is
+// not explicitly forbidden in the spec, although it should not be 
permitted.
+void *src = cl_mem_map(buffer, 0);
+void *dst = cl_mem_map(image, 1);
+memcpy(dst, src, mem_buffer-base.size);
+cl_mem_unmap(image);
+cl_mem_unmap(buffer);
+  }
+  else
+assert(0);
  
if (err != 0)

  goto error;
@@ -1025,12 +1046,20 @@ cl_mem_new_image(cl_context context,
  {
switch (image_desc-image_type) {
case CL_MEM_OBJECT_IMAGE1D:
-  case CL_MEM_OBJECT_IMAGE2D:
case CL_MEM_OBJECT_IMAGE3D:
  return _cl_mem_new_image(context, flags, image_format, 
image_desc-image_type,
   image_desc-image_width, 

Re: [Beignet] [PATCH 1/8] Backend: Add half float as a new type.

2015-05-22 Thread He Junyan

Thanks for your information.
I will do some research for it.


On 2015年05月22日 05:51, Matt Turner wrote:

On Thu, May 21, 2015 at 1:25 AM,  junyan...@inbox.com wrote:

From: Junyan He junyan...@linux.intel.com

Because the CPU of X86 does not support half float
instructions, there is no support for half float operations.
So we introduce the half class to handle the operations for
half float using llvm's APFloat utility.

Ivybridge and newer have the F16C instruction set
(http://en.wikipedia.org/wiki/F16C) which offers instructions to
convert between half-precision and single-precision floats.

I don't know if it's valuable to use it, but it's there.
___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet




___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH OpenCL 2.0] Backend: Update the workgroup instructions for llvm backend to gen.

2015-05-02 Thread He Junyan

I have modified it in the new patch set.
Just ignore this one. Thanks.


On 2015年04月30日 13:49, Zhigang Gong wrote:

Junyan,

I haven't found any new response addressing this comment from Rong and me.
Did you miss this comment, or did I miss your new patch?

Thanks,
Zhigang Gong.

On Thu, Apr 02, 2015 at 12:53:30PM +0800, Zhigang Gong wrote:

Right, especially for those builtin functions that don't care about the sign.
Junyan, could you refine your patch accordingly?
Thanks.

On Tue, Mar 24, 2015 at 07:39:03AM +, Yang, Rong R wrote:

Zhigang has added the function OCLIntrinsicMap.find to handle overloaded function 
names, so only one DECL_LLVM_GEN_FUNCTION is needed for each group of overloaded 
functions, and GenWriter::emitCallInst then gets the corresponding argument type. This 
reduces the number of DECL_LLVM_GEN_FUNCTION entries significantly. Can you also use this method?


-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
junyan...@inbox.com
Sent: Tuesday, March 24, 2015 14:40
To: beignet@lists.freedesktop.org
Cc: Junyan He
Subject: [Beignet] [PATCH OpenCL 2.0] Backend: Update the workgroup
instructions for llvm backend to gen.

From: Junyan He junyan...@linux.intel.com

Signed-off-by: Junyan He junyan...@linux.intel.com
---
  backend/src/llvm/llvm_gen_ocl_function.hxx |   87

  1 file changed, 87 insertions(+)

diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx
b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 9536a3c..947fadc 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -160,3 +160,90 @@ DECL_LLVM_GEN_FUNCTION(REGION,
__gen_ocl_region)

  // printf function
  DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
+
+// work group function
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_1D,
+_Z30__gen_ocl_work_group_broadcastij)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_2D,
+_Z30__gen_ocl_work_group_broadcastijj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_3D,
+_Z30__gen_ocl_work_group_broadcastijjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_1D,
+_Z30__gen_ocl_work_group_broadcastjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_2D,
+_Z30__gen_ocl_work_group_broadcastjjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_3D,
+_Z30__gen_ocl_work_group_broadcast)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_LONG_1D,
+_Z30__gen_ocl_work_group_broadcastlj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_LONG_2D,
+_Z30__gen_ocl_work_group_broadcastljj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_lONG_3D,
+_Z30__gen_ocl_work_group_broadcastljjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_1D,
+_Z30__gen_ocl_work_group_broadcastmj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_2D,
+_Z30__gen_ocl_work_group_broadcastmjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_3D,
+_Z30__gen_ocl_work_group_broadcastmjjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_1D,
+_Z30__gen_ocl_work_group_broadcastfj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_2D,
+_Z30__gen_ocl_work_group_broadcastfjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_3D,
+_Z30__gen_ocl_work_group_broadcastfjjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_1D,
+_Z30__gen_ocl_work_group_broadcastdj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_2D,
+_Z30__gen_ocl_work_group_broadcastdjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_3D,
+_Z30__gen_ocl_work_group_broadcastdjjj)
+
+// work group reduce
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_DOUBLE,
+_Z31__gen_ocl_work_group_reduce_addd)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_FLOAT,
+_Z31__gen_ocl_work_group_reduce_addf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_INT,
+_Z31__gen_ocl_work_group_reduce_addi)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_UINT,
+_Z31__gen_ocl_work_group_reduce_addj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_LONG,
+_Z31__gen_ocl_work_group_reduce_addl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_ULONG,
+_Z31__gen_ocl_work_group_reduce_addm)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXD,
+_Z31__gen_ocl_work_group_reduce_maxd)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXF,
+_Z31__gen_ocl_work_group_reduce_maxf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXI,
+_Z31__gen_ocl_work_group_reduce_maxi)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXJ,
+_Z31__gen_ocl_work_group_reduce_maxj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXL,
+_Z31__gen_ocl_work_group_reduce_maxl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXM,
+_Z31__gen_ocl_work_group_reduce_maxm)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MIND,
+_Z31__gen_ocl_work_group_reduce_mind)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINF,
+_Z31__gen_ocl_work_group_reduce_minf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINI,
+_Z31__gen_ocl_work_group_reduce_mini)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINJ,
+_Z31__gen_ocl_work_group_reduce_minj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINL

Re: [Beignet] [PATCH] BDW: Refine I64HADD and I64RHADD.

2015-03-23 Thread He Junyan

OK, this is a better way to avoid the use of addc.
I think tmp_dst can also be avoided here to save
one tmp register.


On 2015年03月23日 15:44, Song, Ruiling wrote:

Good idea, the patch LGTM.


-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
Yang Rong
Sent: Monday, March 23, 2015 2:26 PM
To: beignet@lists.freedesktop.org
Cc: Yang, Rong R
Subject: [Beignet] [PATCH] BDW: Refine I64HADD and I64RHADD.

HADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1)), and
RHADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1)).

Signed-off-by: Yang Rong rong.r.y...@intel.com
---
  backend/src/backend/gen8_context.cpp   | 114
-
  backend/src/backend/gen_insn_selection.cpp |   8 +-
  2 files changed, 20 insertions(+), 102 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp
b/backend/src/backend/gen8_context.cpp
index 3f57cf6..b136902 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -651,58 +651,21 @@ namespace gbe
  GenRegister tmp0 = ra-genReg(insn.dst(1));
  GenRegister tmp1 = ra-genReg(insn.dst(2));
  GenRegister tmp_dst = ra-genReg(insn.dst(3));
-int execWidth = p-curr.execWidth;

  /* Src0 and Src1 are always unsigned long type.*/
  GBE_ASSERT(src0.type == GEN_TYPE_UL  src1.type ==
GEN_TYPE_UL);
  dst.type = src0.type;
-tmp0.type = tmp1.type = GEN_TYPE_UD;
+tmp0.type = tmp1.type = GEN_TYPE_UL;
  tmp_dst.type = GEN_TYPE_UL;

  GBE_ASSERT(tmp_dst.subnr == 0);
-GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
-  GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr,
tmp_dst.subnr), GEN_TYPE_UD);
-GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-  GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4),
GEN_TYPE_UD) :
-  GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth /
8, tmp_dst.subnr), GEN_TYPE_UD);
-GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-  GenRegister::retype(src0, GEN_TYPE_UD) :
GenRegister::unpacked_ud(src0.nr, src0.subnr);
-GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-  GenRegister::retype(GenRegister::offset(src0, 0, 4),
GEN_TYPE_UD) :
-  GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
-GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-  GenRegister::retype(src1, GEN_TYPE_UD) :
GenRegister::unpacked_ud(src1.nr, src1.subnr);
-GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-  GenRegister::retype(GenRegister::offset(src1, 0, 4),
GEN_TYPE_UD) :
-  GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
-
-GenRegister acc0 = GenRegister::retype(GenRegister::acc(),
GEN_TYPE_D);
-p-push();
-p-curr.execWidth = 8;
-p-ADDC(dl, s0l, s1l);
-p-MOV(tmp0, acc0);
-p-ADDC(dh, s0h, s1h);
-p-MOV(tmp1, acc0);
-p-ADDC(dh, dh, tmp0);
-p-MOV(tmp0, acc0);
-p-ADD(tmp1, tmp0, tmp1);
-
-if (execWidth == 16) {
-  p-curr.quarterControl = 1;
-  p-ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1),
GenRegister::Qn(s1l, 1));
-  p-MOV(GenRegister::Qn(tmp0, 1), acc0);
-  p-ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1),
GenRegister::Qn(s1h, 1));
-  p-MOV(GenRegister::Qn(tmp1, 1), acc0);
-  p-ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1),
GenRegister::Qn(tmp0, 1));
-  p-MOV(GenRegister::Qn(tmp0, 1), acc0);
-  p-ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1),
GenRegister::Qn(tmp1, 1));
-}
-p-pop();
-
-packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD),
GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
-
-p-SHR(dst, dst, GenRegister::immud(1));
-p-SHL(tmp_dst, tmp1, GenRegister::immud(63));
+//hadd = (src01) + (src11) + ((src00x1)  (src10x1))
+p-AND(tmp0, src0, GenRegister::immud(1));
+p-AND(tmp1, src1, GenRegister::immud(1));
+p-AND(tmp_dst, tmp0, tmp1);
+p-SHR(tmp0, src0, GenRegister::immud(1));
+p-SHR(tmp1, src1, GenRegister::immud(1));
+p-ADD(dst, tmp0, tmp1);
  p-ADD(dst, dst, tmp_dst);
}

@@ -714,66 +677,21 @@ namespace gbe
  GenRegister tmp0 = ra-genReg(insn.dst(1));
  GenRegister tmp1 = ra-genReg(insn.dst(2));
  GenRegister tmp_dst = ra-genReg(insn.dst(3));
-int execWidth = p-curr.execWidth;

  /* Src0 and Src1 are always unsigned long type.*/
  GBE_ASSERT(src0.type == GEN_TYPE_UL  src1.type ==
GEN_TYPE_UL);
  dst.type = src0.type;
-tmp0.type = tmp1.type = GEN_TYPE_UD;
+tmp0.type = tmp1.type = GEN_TYPE_UL;
  tmp_dst.type = GEN_TYPE_UL;

  GBE_ASSERT(tmp_dst.subnr == 0);
-GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
-  GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr,
tmp_dst.subnr), GEN_TYPE_UD);
-GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-  

Re: [Beignet] [V2 PATCH 5/7] Backend: Handle the bswap using indirect mode access.

2015-03-08 Thread He Junyan


On 2015年03月09日 09:11, Zhigang Gong wrote:

On Fri, Mar 06, 2015 at 03:24:00PM +0800, junyan...@inbox.com wrote:

From: Junyan He junyan...@linux.intel.com

The swap for short will be like:
mov(1)   a01:UD0xe600e61UD{ align1 WE_all };
mov(1)   a0.11:UD  0xe620e63UD{ align1 WE_all };
mov(1)   a0.21:UD  0xe640e65UD{ align1 WE_all };
mov(1)   a0.31:UD  0xe660e67UD{ align1 WE_all };
mov(8)   g1141:UB  g[a0]VxH,1,0:UB  { align1 WE_all 1Q };
mov(8)   g114.81:UBg[a0 8]VxH,1,0:UB{ align1 WE_all 1Q };
mov(8)   g114.161:UB   g[a0 16]VxH,1,0:UB   { align1 WE_all 1Q };
mov(8)   g114.241:UB   g[a0 24]VxH,1,0:UB   { align1 WE_all 1Q };
mov(16)  g1131:UW  g1148,8,1:UW { align1 WE_normal 1H };

Signed-off-by: Junyan He junyan...@linux.intel.com
---
  backend/src/backend/gen_context.cpp|  112 
  backend/src/backend/gen_insn_selection.cpp |9 +++
  backend/src/backend/gen_insn_selection.hxx |1 +
  3 files changed, 122 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 6856510..46b4a06 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -297,6 +297,118 @@ namespace gbe
p-MOV(dst.top_half(this-simdWidth), GenRegister::immud(0));
  break;
}
+  case SEL_OP_BSWAP: {
+uint32_t simd = p-curr.execWidth;
+GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
+uint16_t new_a0[16];
+memset(new_a0, 0, sizeof(new_a0));
+
+GBE_ASSERT(src.type == dst.type);
+uint32_t start_addr = src.nr*32 + src.subnr;
+
+if (simd == 1) {
+  GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
+   dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+  if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+GBE_ASSERT(start_addr = 0);
+new_a0[0] = start_addr + 3;
+new_a0[1] = start_addr + 2;
+new_a0[2] = start_addr + 1;
+new_a0[3] = start_addr;
+this-setA0Content(new_a0, 0, 4);
+
+p-push();
+p-curr.execWidth = 4;
+p-curr.predicate = GEN_PREDICATE_NONE;
+p-curr.noMask = 1;
+GenRegister ind_src = 
GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+a0[0], new_a0[0] - a0[0]);
+GenRegister dst_ = dst;
+dst_.type = GEN_TYPE_UB;
+dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
+dst_.width = GEN_WIDTH_4;
+dst_.vstride = GEN_VERTICAL_STRIDE_4;
+p-MOV(dst_, ind_src);
+p-pop();
+  } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+p-MOV(GenRegister::retype(dst, GEN_TYPE_UB),
+GenRegister::retype(GenRegister::offset(src, 0, 1), 
GEN_TYPE_UB));
+p-MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), 
GEN_TYPE_UB),
+GenRegister::retype(src, GEN_TYPE_UB));
+  } else {
+GBE_ASSERT(0);
+  }
+} else {
+  if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+GBE_ASSERT(src.subnr == 0);

The above assertion is not correct, because a valid simd8 or simd16 BSWAP 
instruction may have a
uniform source register. We can't assume that the source register is never a 
uniform value.
I think the uniform case will be handled in the "if (simd == 1)" case just 
above.
I find that if src is uniform, the dst always seems to be uniform too, and the 
simd will be 1 here.

+GBE_ASSERT(dst.subnr == 0);
+GBE_ASSERT(tmp.subnr == 0);
+GBE_ASSERT(start_addr = 0);
+new_a0[0] = start_addr + 3;
+new_a0[1] = start_addr + 2;
+new_a0[2] = start_addr + 1;
+new_a0[3] = start_addr;
+new_a0[4] = start_addr + 7;
+new_a0[5] = start_addr + 6;
+new_a0[6] = start_addr + 5;
+new_a0[7] = start_addr + 4;
+this-setA0Content(new_a0, 56);
+
+p-push();
+p-curr.execWidth = 8;
+p-curr.predicate = GEN_PREDICATE_NONE;
+p-curr.noMask = 1;
+GenRegister ind_src = 
GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+a0[0], new_a0[0] - a0[0]);
+p-MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+for (int i = 1; i  4; i++) {
+  ind_src.addr_imm += 8;
+  p-MOV(GenRegister::offset(GenRegister::retype(tmp, 
GEN_TYPE_UB), 0, 8*i), ind_src);
+}
+if (simd == 16) {
+  for (int i = 0; i  4; i++) {
+ind_src.addr_imm += 8;
+p-MOV(GenRegister::offset(GenRegister::retype(tmp, 
GEN_TYPE_UB), 1, 8*i), ind_src);
+  }
+}
+p-pop();
+
+p-MOV(dst, tmp

Re: [Beignet] compiler_fill_image_1d_array intermittent failure

2015-02-03 Thread He Junyan

It's really a bug.
Because of a HW limitation, the vertical stride is aligned to at least 2.
For a 1D array image, the data is therefore stored with gaps. The size calculated 
for the memset is right,
but the actual image size is twice as big.
Using clEnqueueWriteImage is safe, and I will fix it later.

On 2015年02月04日 07:11, Rebecca N. Palmer wrote:
Both [3.18 kernel] and the 3.16 kernel have a different intermittent 
failure I have

yet to investigate:

compiler_fill_image_1d_array()[FAILED]
 Error: dst[j*w + i] == 0
   at file
/home/rnpalmer/Debian/builds/stackbuild/beignet/utests/compiler_fill_image_1d_array.cpp, 


function compiler_fill_image_1d_array, line 63


The kernel itself (fill j<7,i<32 with 0x03020100) is working 
correctly; the problem is that the initial memset() clear (line 30) 
sometimes only clears half the array (exactly half, i.e. j<4 is filled 
with 0, but the rest is left at whatever it was before).


___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/8] SKL: Add skl pci ids and device.

2015-01-29 Thread He Junyan

Hi,

It seems that gen9_context.hpp and gen9_context.cpp are missing from the patch 
set, so it cannot be compiled.



On 2015年01月29日 16:16, Yang Rong wrote:

SKL adds the new GT4 device type.

Signed-off-by: Yang Rong rong.r.y...@intel.com
---
  src/cl_device_data.h |  45 +++
  src/cl_device_id.c   | 122 +--
  2 files changed, 164 insertions(+), 3 deletions(-)

diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index 0d25ca4..d6f8209 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -230,5 +230,50 @@
  #define IS_BROADWELL(devid) (IS_BRW_GT1(devid) || IS_BRW_GT2(devid) || 
IS_BRW_GT3(devid))
  #define IS_GEN8(devid)  IS_BROADWELL(devid)
  
+/* SKL */

+#define PCI_CHIP_SKYLAKE_ULT_GT1   0x1906   /* Intel(R) Skylake ULT - GT1 
*/
+#define PCI_CHIP_SKYLAKE_ULT_GT2   0x1916   /* Intel(R) Skylake ULT - GT2 
*/
+#define PCI_CHIP_SKYLAKE_ULT_GT3   0x1926   /* Intel(R) Skylake ULT - GT3 
*/
+#define PCI_CHIP_SKYLAKE_ULT_GT2F  0x1921   /* Intel(R) Skylake ULT - GT2F 
*/
+#define PCI_CHIP_SKYLAKE_ULX_GT1   0x190E   /* Intel(R) Skylake ULX - GT1 
*/
+#define PCI_CHIP_SKYLAKE_ULX_GT2   0x191E   /* Intel(R) Skylake ULX - GT2 
*/
+#define PCI_CHIP_SKYLAKE_DT_GT10x1902   /* Intel(R) Skylake 
Desktop - GT1 */
+#define PCI_CHIP_SKYLAKE_DT_GT20x1912   /* Intel(R) Skylake 
Desktop - GT2 */
+#define PCI_CHIP_SKYLAKE_HALO_GT1  0x190B   /* Intel(R) Skylake HALO - GT1 
*/
+#define PCI_CHIP_SKYLAKE_HALO_GT2  0x191B   /* Intel(R) Skylake HALO - GT2 
*/
+#define PCI_CHIP_SKYLAKE_HALO_GT3  0x192B   /* Intel(R) Skylake HALO - GT3 
*/
+#define PCI_CHIP_SKYLAKE_HALO_GT4  0x193B   /* Intel(R) Skylake HALO - GT4 
*/
+#define PCI_CHIP_SKYLAKE_SRV_GT1   0x190A   /* Intel(R) Skylake Server - 
GT1 */
+#define PCI_CHIP_SKYLAKE_SRV_GT2   0x191A   /* Intel(R) Skylake Server - 
GT2 */
+#define PCI_CHIP_SKYLAKE_SRV_GT3   0x192A   /* Intel(R) Skylake Server - 
GT3 */
+#define PCI_CHIP_SKYLAKE_SRV_GT4   0x193A   /* Intel(R) Skylake Server - 
GT4 */
+
+#define IS_SKL_GT1(devid)   \
+  (devid == PCI_CHIP_SKYLAKE_ULT_GT1 ||   \
+   devid == PCI_CHIP_SKYLAKE_ULX_GT1 || \
+   devid == PCI_CHIP_SKYLAKE_DT_GT1 || \
+   devid == PCI_CHIP_SKYLAKE_HALO_GT1 || \
+   devid == PCI_CHIP_SKYLAKE_SRV_GT1)
+
+#define IS_SKL_GT2(devid)   \
+  (devid == PCI_CHIP_SKYLAKE_ULT_GT2 ||   \
+   devid == PCI_CHIP_SKYLAKE_ULT_GT2F ||   \
+   devid == PCI_CHIP_SKYLAKE_ULX_GT2 || \
+   devid == PCI_CHIP_SKYLAKE_DT_GT2 || \
+   devid == PCI_CHIP_SKYLAKE_HALO_GT2 || \
+   devid == PCI_CHIP_SKYLAKE_SRV_GT2)
+
+#define IS_SKL_GT3(devid)   \
+  (devid == PCI_CHIP_SKYLAKE_ULT_GT3 ||   \
+   devid == PCI_CHIP_SKYLAKE_HALO_GT3 || \
+   devid == PCI_CHIP_SKYLAKE_SRV_GT3)
+
+#define IS_SKL_GT4(devid)   \
+  (devid == PCI_CHIP_SKYLAKE_HALO_GT4 || \
+   devid == PCI_CHIP_SKYLAKE_SRV_GT4)
+
+#define IS_SKYLAKE(devid) (IS_SKL_GT1(devid) || IS_SKL_GT2(devid) || 
IS_SKL_GT3(devid) || IS_SKL_GT4(devid))
+#define IS_GEN9(devid)  IS_SKYLAKE(devid)
+
  #endif /* __CL_DEVICE_DATA_H__ */
  
diff --git a/src/cl_device_id.c b/src/cl_device_id.c

index 3032a38..9d83ab2 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -140,6 +140,51 @@ static struct _cl_device_id intel_brw_gt3_device = {
  #include cl_gen75_device.h
  };
  
+/* XXX we clone brw now */

+static struct _cl_device_id intel_skl_gt1_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 6,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 2,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include cl_gen75_device.h
+};
+
+static struct _cl_device_id intel_skl_gt2_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 24,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 3,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include cl_gen75_device.h
+};
+
+static struct _cl_device_id intel_skl_gt3_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 48,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 6,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include cl_gen75_device.h
+};
+
+static struct _cl_device_id intel_skl_gt4_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 72,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 9,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include cl_gen75_device.h
+};
+
  
  LOCAL cl_device_id

  cl_get_gt_device(void)
@@ -378,6 +423,62 @@ brw_gt3_break:
ret = intel_brw_gt3_device;
break;
  
+	  case PCI_CHIP_SKYLAKE_ULT_GT1:

+   DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, 
Intel(R) HD Graphics Skylake ULT GT1);
+ case PCI_CHIP_SKYLAKE_ULX_GT1:
+   DECL_INFO_STRING(skl_gt1_break, 

Re: [Beignet] [Patch V2 8/8] SKL: fix skl LD fail.

2015-01-29 Thread He Junyan

Except for some formatting problems, this patchset LGTM and passes all the
utest cases on my platform.


On 2015年01月30日 10:59, Yang Rong wrote:

SKL's LD message payload order is changed from u, lod, v, w to u, v, lod, w.
Add Gen9Context and Selection9 to handle it.
SKL still uses Gen8Encoder.

Signed-off-by: Yang Rong rong.r.y...@intel.com
---
  backend/src/CMakeLists.txt |  2 +
  backend/src/backend/gen9_context.cpp   | 31 ++
  backend/src/backend/gen9_context.hpp   | 50 ++
  backend/src/backend/gen_insn_selection.cpp | 67 --
  backend/src/backend/gen_insn_selection.hpp |  7 
  backend/src/backend/gen_program.cpp|  3 +-
  6 files changed, 147 insertions(+), 13 deletions(-)
  create mode 100644 backend/src/backend/gen9_context.cpp
  create mode 100644 backend/src/backend/gen9_context.hpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index ce83c62..951 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -103,6 +103,8 @@ set (GBE_SRC
  backend/gen75_context.cpp
  backend/gen8_context.hpp
  backend/gen8_context.cpp
+backend/gen9_context.hpp
+backend/gen9_context.cpp
  backend/gen_program.cpp
  backend/gen_program.hpp
  backend/gen_program.h
diff --git a/backend/src/backend/gen9_context.cpp 
b/backend/src/backend/gen9_context.cpp
new file mode 100644
index 000..79ca275
--- /dev/null
+++ b/backend/src/backend/gen9_context.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see http://www.gnu.org/licenses/.
+ *
+ */
+
+/**
+ * \file gen9_context.cpp
+ */
+
+#include backend/gen9_context.hpp
+#include backend/gen_insn_selection.hpp
+
+namespace gbe
+{
+  void Gen9Context::newSelection(void) {
+this-sel = GBE_NEW(Selection9, *this);
+  }
+}
diff --git a/backend/src/backend/gen9_context.hpp 
b/backend/src/backend/gen9_context.hpp
new file mode 100644
index 000..672b4fc
--- /dev/null
+++ b/backend/src/backend/gen9_context.hpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see http://www.gnu.org/licenses/.
+ *
+ */
+
+/**
+ * \file gen9_context.hpp
+ */
+#ifndef __GBE_gen9_CONTEXT_HPP__
+#define __GBE_gen9_CONTEXT_HPP__
+
+#include backend/gen8_context.hpp
+#include backend/gen8_encoder.hpp
+
+namespace gbe
+{
+  /* This class is used to implement the HSW
+ specific logic for context. */
+  class Gen9Context : public Gen8Context
+  {
+  public:
+virtual ~Gen9Context(void) { };
+Gen9Context(const ir::Unit unit, const std::string name, uint32_t 
deviceID, bool relaxMath = false)
+: Gen8Context(unit, name, deviceID, relaxMath) {
+};
+   
+   protected:
+   virtual GenEncoder* generateEncoder(void) {
+   return GBE_NEW(Gen8Encoder, this-simdWidth, 9, 
deviceID);
+   }
+
+  private:
+virtual void newSelection(void);
+  };
+}
+#endif /* __GBE_GEN9_CONTEXT_HPP__ */
+
diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 65842ff..4d0b979 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -249,6 +249,9 @@ namespace gbe
  this-vectorList.push_back(vec);
}
  
+#define LD_MSG_ORDER_IVB 7

+#define LD_MSG_ORDER_SKL 9
+
///
// Maximal munch selection on DAG
///
@@ -358,6 +361,8 @@ namespace gbe
  void setHas32X32Mul(bool b) { bHas32X32Mul = b; }
  bool hasLongType() const { return 

Re: [Beignet] [PATCH 16/27] Modify the convert logic in gen selection.

2015-01-17 Thread He Junyan


On 四, 2015-01-08 at 13:14 +0800, Zhigang Gong wrote:
 On Tue, Jan 06, 2015 at 06:01:54PM +0800, junyan...@inbox.com wrote:
  From: Junyan He junyan...@linux.intel.com
  
  The conversion logic is too complicated.
  We split it more clearly for each case.
  Notice: For I64 to I8, the conversion can not be completed
  within one step because of the hardware hstride restriction.
  So we need to convert it to i32 and than convert it to i8.
Typo here: "than" should be "then".
  
  Signed-off-by: Junyan He junyan...@linux.intel.com
  ---
   backend/src/backend/gen8_context.cpp   |   8 +-
   backend/src/backend/gen_insn_selection.cpp | 195 
  -
   2 files changed, 168 insertions(+), 35 deletions(-)
  
  diff --git a/backend/src/backend/gen8_context.cpp 
  b/backend/src/backend/gen8_context.cpp
  index cffb10d..18a3425 100644
  --- a/backend/src/backend/gen8_context.cpp
  +++ b/backend/src/backend/gen8_context.cpp
  @@ -55,7 +55,9 @@ namespace gbe
 {
   switch (insn.opcode) {
 case SEL_OP_CONVI64_TO_I:
  -
  +/* Should never come to here, just use the common OPCODE. */
  +GBE_ASSERT(0);
  +break;
 default:
   GenContext::emitUnaryInstruction(insn);
   }
  @@ -65,7 +67,9 @@ namespace gbe
 {
   switch (insn.opcode) {
 case SEL_OP_CONVI_TO_I64:
  -
  +/* Should never come to here, just use the common OPCODE. */
  +GBE_ASSERT(0);
  +break;
 default:
   GenContext::emitUnaryWithTempInstruction(insn);
   }
  diff --git a/backend/src/backend/gen_insn_selection.cpp 
  b/backend/src/backend/gen_insn_selection.cpp
  index b6a13bf..60f45f7 100644
  --- a/backend/src/backend/gen_insn_selection.cpp
  +++ b/backend/src/backend/gen_insn_selection.cpp
  @@ -349,9 +349,17 @@ namespace gbe
 const ir::RegisterData regData = getRegisterData(reg);
 return regData.isUniform();
   }
  +INLINE bool isLongReg(const ir::Register reg) const {
  +  const ir::RegisterData regData = getRegisterData(reg);
  +  return regData.family == ir::FAMILY_QWORD;
  +}
  +
  +INLINE GenRegister unpacked_ud(const ir::Register reg) const {
  +  return GenRegister::unpacked_ud(reg, isScalarReg(reg));
  +}
   
   INLINE GenRegister unpacked_uw(const ir::Register reg) const {
  -  return GenRegister::unpacked_uw(reg, isScalarReg(reg));
  +  return GenRegister::unpacked_uw(reg, isScalarReg(reg), 
  isLongReg(reg));
   }
   
   INLINE GenRegister unpacked_ub(const ir::Register reg) const {
  @@ -3658,7 +3666,7 @@ namespace gbe
 sel.F32TO16(unpacked, src);
   sel.pop();
   sel.MOV(dst, unpacked);
  -  } else if (dstFamily != FAMILY_DWORD  dstFamily != FAMILY_QWORD  
  (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
  +  } else if (dstFamily != FAMILY_DWORD  dstFamily != FAMILY_QWORD  
  srcFamily == FAMILY_DWORD) {//convert i32 to small int
   GenRegister unpacked;
   if (dstFamily == FAMILY_WORD) {
 const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : 
  GEN_TYPE_W;
  @@ -3675,27 +3683,115 @@ namespace gbe
 } else
   unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), 
  type);
   }
  -if(srcFamily == FAMILY_QWORD) {
  +
  +sel.push();
  +if (sel.isScalarReg(insn.getSrc(0))) {
  +  sel.curr.execWidth = 1;
  +  sel.curr.predicate = GEN_PREDICATE_NONE;
  +  sel.curr.noMask = 1;
  +}
  +sel.MOV(unpacked, src);
  +sel.pop();
  +
  +if (unpacked.reg() != dst.reg())
  +  sel.MOV(dst, unpacked);
  +  } else if (dstFamily == FAMILY_WORD  srcFamily == FAMILY_QWORD) { 
  //convert i64 to i16
  +const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : 
  GEN_TYPE_W;
  +GenRegister unpacked;
  +if (!sel.isScalarReg(dst.reg())) {
  +  if (sel.hasLongType()) {
  +unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, 
  sel.isScalarReg(insn.getSrc(0;
  +  } else {
  +unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, 
  sel.isScalarReg(insn.getSrc(0;
  +  }
  +  unpacked = GenRegister::retype(unpacked, type);
  +} else {
  +  unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type);
  +}
  +
  +if(!sel.hasLongType()) {
 
 You already removed (|| srcFamily == FAMILY_QWORD) at line 3658, so why
 still keep the following code, which converts the I64 source operand to I32?
 It looks incorrect to me. The following else branch should be placed here
 unconditionally.
 
I think here we are converting 64 bits to 16 bits: we first MOV the 64-bit
value to 32 bits and then MOV it to 16 bits.
I did not modify the original logic here, but we can certainly optimize it.

 GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD

Re: [Beignet] [PATCH 2/6] Add long type support for disasm.

2015-01-06 Thread He Junyan
The logic for printing immediates is added in [PATCH 10/27]
of the new patchset.


On 二, 2015-01-06 at 00:50 +, Yang, Rong R wrote:
 We also need to add printing of long/ulong immediates.
 
  -Original Message-
  From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
  junyan...@inbox.com
  Sent: Wednesday, December 24, 2014 00:13
  To: beignet@lists.freedesktop.org
  Cc: Junyan He
  Subject: [Beignet] [PATCH 2/6] Add long type support for disasm.
  
  From: Junyan He junyan...@linux.intel.com
  
  Signed-off-by: Junyan He junyan...@linux.intel.com
  ---
   backend/src/backend/gen/gen_mesa_disasm.c |   13 +
   1 file changed, 9 insertions(+), 4 deletions(-)
  
  diff --git a/backend/src/backend/gen/gen_mesa_disasm.c
  b/backend/src/backend/gen/gen_mesa_disasm.c
  index 162d459..2ebbc98 100644
  --- a/backend/src/backend/gen/gen_mesa_disasm.c
  +++ b/backend/src/backend/gen/gen_mesa_disasm.c
  @@ -265,7 +265,7 @@ static const char *access_mode[2] = {
 [1] = align16,
   };
  
  -static const char *reg_encoding[8] = {
  +static const char *reg_encoding[10] = {
 [0] = :UD,
 [1] = :D,
 [2] = :UW,
  @@ -273,10 +273,12 @@ static const char *reg_encoding[8] = {
 [4] = :UB,
 [5] = :B,
 [6] = :DF,
  -  [7] = :F
  +  [7] = :F,
  +  [8] = :Q,
  +  [9] = :UQ
   };
  
  -int reg_type_size[8] = {
  +int reg_type_size[10] = {
 [0] = 4,
 [1] = 4,
 [2] = 2,
  @@ -284,7 +286,9 @@ int reg_type_size[8] = {
 [4] = 1,
 [5] = 1,
 [6] = 8,
  -  [7] = 4
  +  [7] = 4,
  +  [8] = 8,
  +  [9] = 8
   };
  
   static const char *reg_file[4] = {
  @@ -983,6 +987,7 @@ static int imm(FILE *file, uint32_t type, const void*
  inst)
 break;
   case GEN_TYPE_F:
 format(file, %-gF, GEN_BITS_FIELD(inst, bits3.f));
  +  break;
 }
 return 0;
   }
  --
  1.7.9.5
  
  
  
  ___
  Beignet mailing list
  Beignet@lists.freedesktop.org
  http://lists.freedesktop.org/mailman/listinfo/beignet
 ___
 Beignet mailing list
 Beignet@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/beignet



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Fix PrintfState copying.

2014-12-15 Thread He Junyan


On 二, 2014-12-09 at 12:41 +0800, Yan Wang wrote:
 PrintfState includes std::string object and shouldn't be copied by
 malloc/memcpy.
 
 Signed-off-by: Yan Wang yan.w...@linux.intel.com
 ---
  backend/src/ir/printf.hpp | 23 +++
  1 file changed, 19 insertions(+), 4 deletions(-)
 
 diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
 index b9f7619..8ea5976 100644
 --- a/backend/src/ir/printf.hpp
 +++ b/backend/src/ir/printf.hpp
 @@ -75,6 +75,23 @@ namespace gbe
char conversion_specifier;
int out_buf_sizeof_offset;  // Should *global_total_size to get the 
 full offset.
std::string str;//if %s, the string store here.
 +
 +  PrintfState(void) {
 +  }
I think that if we treat PrintfState as an object and use a constructor
to initialize it, we had better initialize all of its members to default
values here.

 +
 +  PrintfState(const PrintfState  other) {
 +left_justified = other.left_justified;
 +sign_symbol = other.sign_symbol;
 +alter_form = other.alter_form;
 +zero_padding = other.zero_padding;
 +vector_n = other.vector_n;
 +min_width = other.min_width;
 +precision = other.precision;
 +length_modifier = other.length_modifier;
 +conversion_specifier = other.conversion_specifier;
 +out_buf_sizeof_offset = other.out_buf_sizeof_offset;
 +str = other.str;
 +  }
  };
  
  enum {
 @@ -106,8 +123,7 @@ namespace gbe
  
PrintfSlot(PrintfState * st) {
  type = PRINTF_SLOT_TYPE_STATE;
 -state = (PrintfState *)malloc(sizeof(PrintfState));
 -memcpy(state, st, sizeof(PrintfState));
 +state = new PrintfState(*st);
}
  
PrintfSlot(const PrintfSlot  other) {
 @@ -119,8 +135,7 @@ namespace gbe
type = PRINTF_SLOT_TYPE_STRING;
  } else if (other.type == PRINTF_SLOT_TYPE_STATE) {
type = PRINTF_SLOT_TYPE_STATE;
 -  state = (PrintfState *)malloc(sizeof(PrintfState));
 -  memcpy(state, other.state, sizeof(PrintfState));
 +  state = new PrintfState(*other.state);
  } else {
type = PRINTF_SLOT_TYPE_NONE;
ptr = NULL;



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] runtime: fix one bug in BDW image.

2014-11-11 Thread He Junyan
It looks good to me.

On 三, 2014-11-12 at 14:12 +0800, Zhigang Gong wrote:
 As we still have the image 1d array workaround, we need to
 fix it for BDW as well.
 
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  src/intel/intel_gpgpu.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)
 
 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
 index c6ea17f..b6e19db 100644
 --- a/src/intel/intel_gpgpu.c
 +++ b/src/intel/intel_gpgpu.c
 @@ -1028,8 +1028,10 @@ intel_get_surface_type(cl_mem_object_type type)
  static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, 
 cl_mem_object_type type)
  {
uint32_t surface_type;
 -  if (((IS_IVYBRIDGE(gpgpu-drv-device_id) || 
 IS_HASWELL(gpgpu-drv-device_id))) 
 -  index = 128 + BTI_RESERVED_NUM 
 +  if (((IS_IVYBRIDGE(gpgpu-drv-device_id) ||
 +IS_HASWELL(gpgpu-drv-device_id) ||
 +IS_BROADWELL(gpgpu-drv-device_id))) 
 +  index = BTI_MAX_IMAGE_NUM + BTI_RESERVED_NUM 
type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
  surface_type = I965_SURFACE_2D;
else



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 6/6] BDW: Add function intel_gpgpu_bind_buf for gen8.

2014-10-24 Thread He Junyan
This patchset LGTM

On 一, 2014-09-29 at 13:37 +0800, Yang Rong wrote:
 From: Junyan He junyan...@linux.intel.com
 
 Must call cl_bind_buf instead of intel_gpgpu_bind_buf directly in intel_gpgpu.
 
 Signed-off-by: Junyan He junyan...@linux.intel.com
 ---
  src/intel/intel_gpgpu.c | 36 +++-
  1 file changed, 27 insertions(+), 9 deletions(-)
 
 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
 index 6b8fa38..eedfe31 100644
 --- a/src/intel/intel_gpgpu.c
 +++ b/src/intel/intel_gpgpu.c
 @@ -818,13 +818,13 @@ intel_gpgpu_setup_bti_gen8(intel_gpgpu_t *gpgpu, 
 drm_intel_bo *buf,
ss0-ss8_9.surface_base_addr_lo = (buf-offset64 + internal_offset)  
 0x;
ss0-ss8_9.surface_base_addr_hi = ((buf-offset64 + internal_offset)  
 32)  0x;
dri_bo_emit_reloc(gpgpu-aux_buf.bo,
 -  I915_GEM_DOMAIN_RENDER,
 -  I915_GEM_DOMAIN_RENDER,
 -  internal_offset,
 -  gpgpu-aux_offset.surface_heap_offset +
 -  heap-binding_table[index] +
 -  offsetof(gen8_surface_state_t, ss1),
 -  buf);
 +I915_GEM_DOMAIN_RENDER,
 +I915_GEM_DOMAIN_RENDER,
 +internal_offset,
 +gpgpu-aux_offset.surface_heap_offset +
 +heap-binding_table[index] +
 +offsetof(gen8_surface_state_t, ss1),
 +buf);
  }
  
  static int
 @@ -981,6 +981,18 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo 
 *buf, uint32_t offset,
intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti);
  }
  
 +static void
 +intel_gpgpu_bind_buf_gen8(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t 
 offset,
 +  uint32_t internal_offset, uint32_t size, uint8_t 
 bti)
 +{
 +  assert(gpgpu-binded_n  max_buf_n);
 +  gpgpu-binded_buf[gpgpu-binded_n] = buf;
 +  gpgpu-target_buf_offset[gpgpu-binded_n] = internal_offset;
 +  gpgpu-binded_offset[gpgpu-binded_n] = offset;
 +  gpgpu-binded_n++;
 +  intel_gpgpu_setup_bti_gen8(gpgpu, buf, internal_offset, size, bti);
 +}
 +
  static int
  intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
  {
 @@ -1011,7 +1023,7 @@ intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t 
 offset, uint32_t size, uint
drm_intel_bufmgr *bufmgr = gpgpu-drv-bufmgr;
gpgpu-stack_b.bo = drm_intel_bo_alloc(bufmgr, STACK, size, 64);
  
 -  intel_gpgpu_bind_buf(gpgpu, gpgpu-stack_b.bo, offset, 0, size, bti);
 +  cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)gpgpu-stack_b.bo, offset, 
 0, size, bti);
  }
  
  static void
 @@ -1427,7 +1439,7 @@ intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, 
 uint32_t i, uint32_t size, uint
}
memset(bo-virtual, 0, size);
drm_intel_bo_unmap(bo);
 -  intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, size, bti);
 +  cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)bo, offset, 0, size, bti);
return 0;
  }
  
 @@ -1526,6 +1538,12 @@ intel_set_gpgpu_callbacks(int device_id)
cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb 
 *)intel_gpgpu_set_printf_info;
cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb 
 *)intel_gpgpu_get_printf_info;
  
 +  if (IS_BROADWELL(device_id)) {
 +cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *)intel_gpgpu_bind_buf_gen8;
 +cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb 
 *)intel_gpgpu_get_cache_ctrl_gen8;
 +return;
 +  }
 +
if (IS_HASWELL(device_id)) {
  cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) 
 intel_gpgpu_bind_image_gen75;
  cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) 
 intel_gpgpu_alloc_constant_buffer_gen75;



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] Problems with recent beignet

2014-10-17 Thread He Junyan
Yes, we have found this problem.
The new version of beignet requires libdrm 2.4.52 or later;
older versions of libdrm will cause the build to fail.
We have already added a libdrm version check.

On 四, 2014-10-16 at 21:53 -0700, Andi Kleen wrote:
 I tried the current beignet master on my OpenSUSE 13.1 HSW and 
 ran into the following new problems (older versions worked):
 
 - With my DRM version I have the build always fails with:
 
 src/intel/intel_gpgpu.c: In function
 ‘intel_gpgpu_setup_bti_gen8’:
 src/intel/intel_gpgpu.c:922:39: error:
 ‘drm_intel_bo’ has no member named ‘offset64’
ss0-ss8.surface_base_addr_lo = (buf-offset64 + internal_offset) 
0x;
^
 src/intel/intel_gpgpu.c:923:40: error:
 ‘drm_intel_bo’ has no member named ‘offset64’
ss0-ss9.surface_base_addr_hi = ((buf-offset64 + internal_offset) 
32)  0x;
 ^
 src//beignet/src/intel/intel_gpgpu.c: In function
 ‘intel_gpgpu_bind_image_gen8’:
 [ 25%] src/intel/intel_gpgpu.c:1112:40: error:
 ‘drm_intel_bo’ has no member named ‘offset64’
ss-ss8.surface_base_addr_lo = obj_bo-offset64  0x;
 ^
 /home/ak/src/beignet/src/intel/intel_gpgpu.c:1113:41: error:
 ‘drm_intel_bo’ has no member named ‘offset64’
ss-ss9.surface_base_addr_hi = (obj_bo-offset64  32)  0x;
 
 I just commented out these lines because they seem to be only used on GEN8.
 Probably would be good to have a cmake test that tests for these fields
 and disables gen8 or falls back to plain offset ?
 
 - With that fixed the first utest always bails out with:
 
 builtin_acos_float()utest_run:
 /home/ak/src/beignet/src/intel/intel_gpgpu.c:703:
 intel_gpgpu_check_binded_buf_address: Assertion
 `gpgpu-binded_buf[i]-offset != 0' failed.
 Interrupt signal (SIGABRT) received.
 summary:
 --
   total: 684
   run: 1
   pass: 0
   fail: 1
 
 
 The assert mentions the same name as above, but I believe it's a different 
 field.
 I tried to bisect that and ended up with the following commit. Not sure
 if that is correct? Unfortunately it doesn't cleanly revert from master
 for testing.
 
 commit 8c1ed91f0af6ab8284fe06b4c582b55c7d925816
 Author: Zhigang Gong zhigang.g...@intel.com
 Date:   Fri Sep 12 13:45:40 2014 +0800
 
 GBE: fix multiple files compilation bugs.
 
 If we want to link multiple files together, and one kernel
 function need refer other kernel functions in other files,
 we must not set those functions as linked once attribute.
 
 
 -Andi
 ___
 Beignet mailing list
 Beignet@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/beignet



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] BDW: Also need set Shader Channel Select for constant buffer in BDW.

2014-10-16 Thread He Junyan
OK, LGTM

On 四, 2014-10-16 at 15:11 +0800, Yang Rong wrote:
 Signed-off-by: Yang Rong rong.r.y...@intel.com
 ---
  src/intel/intel_gpgpu.c | 6 ++
  1 file changed, 6 insertions(+)
 
 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
 index 259882a..167d8d9 100644
 --- a/src/intel/intel_gpgpu.c
 +++ b/src/intel/intel_gpgpu.c
 @@ -907,6 +907,12 @@ intel_gpgpu_setup_bti_gen8(intel_gpgpu_t *gpgpu, 
 drm_intel_bo *buf, uint32_t int
memset(ss0, 0, sizeof(gen8_surface_state_t));
ss0-ss0.surface_type = I965_SURFACE_BUFFER;
ss0-ss0.surface_format = format;
 +  if(format != I965_SURFACEFORMAT_RAW) {
 +ss0-ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
 +ss0-ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
 +ss0-ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
 +ss0-ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
 +  }
ss0-ss2.width  = s  0x7f;   /* bits 6:0 of sz */
assert(ss0-ss2.width  0x03);
ss0-ss2.height = (s  7)  0x3fff; /* bits 20:7 of sz */



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 8/8] BDW: Correct scratch buffer of BDW.

2014-10-09 Thread He Junyan
This patch set causes the displacement_map_element case to hang every time,
but no regression was found on previous platforms.
We can find and fix the bug later.


On 一, 2014-09-29 at 13:38 +0800, Yang Rong wrote:
 BDW's scratch buffer change to power 2 alignment from 1024.
 
 Signed-off-by: Yang Rong rong.r.y...@intel.com
 ---
  backend/src/backend/gen8_context.cpp |  2 +-
  src/intel/intel_gpgpu.c  | 22 ++
  2 files changed, 19 insertions(+), 5 deletions(-)
 
 diff --git a/backend/src/backend/gen8_context.cpp 
 b/backend/src/backend/gen8_context.cpp
 index a8bed64..f7484ca 100644
 --- a/backend/src/backend/gen8_context.cpp
 +++ b/backend/src/backend/gen8_context.cpp
 @@ -46,7 +46,7 @@ namespace gbe
uint32_t Gen8Context::alignScratchSize(uint32_t size){
  if(size == 0)
return 0;
 -uint32_t i = 2048;
 +uint32_t i = 1024;
  while(i  size) i *= 2;
  return i;
}
 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
 index fa7333e..d65b1a2 100644
 --- a/src/intel/intel_gpgpu.c
 +++ b/src/intel/intel_gpgpu.c
 @@ -342,16 +342,28 @@ uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t 
 size) {
  }
  
  uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
 +//align in backend, if non pow2, must align when alloc scratch bo.
 +assert((size  (size - 1)) == 0);
  size = size  11;
  uint32_t index = 0;
  while((size = 1)  0)
index++;   //get leading one
  
 -//non pow 2 size
 -if(size  (size - 1)) index++;
  return index;
  }
  
 +uint32_t intel_gpgpu_get_scratch_index_gen8(uint32_t size) {
 +//align in backend, if non pow2, must align when alloc scratch bo.
 +assert((size  (size - 1)) == 0);
 +size = size  10;
 +uint32_t index = 0;
 +while((size = 1)  0)
 +  index++;   //get leading one
 +
 +return index;
 +}
 +
 +
  static cl_int
  intel_gpgpu_get_max_curbe_size(uint32_t device_id)
  {
 @@ -1142,7 +1154,9 @@ intel_gpgpu_build_idrt_gen8(intel_gpgpu_t *gpgpu, 
 cl_gpgpu_kernel *kernel)
/* group_threads_num should not be set to 0 even if the barrier is 
 disabled per bspec */
desc-desc6.group_threads_num = kernel-thread_n;
desc-desc6.barrier_enable = kernel-use_slm;
 -  if (slm_sz = 4*KB)
 +  if (slm_sz == 0)
 +slm_sz = 0;
 +  else if (slm_sz = 4*KB)
  slm_sz = 4*KB;
else if (slm_sz = 8*KB)
  slm_sz = 8*KB;
 @@ -1666,7 +1680,7 @@ intel_set_gpgpu_callbacks(int device_id)
  cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) 
 intel_gpgpu_bind_image_gen75;
  intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
  cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb 
 *)intel_gpgpu_get_cache_ctrl_gen8;
 -intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75;
 +intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
  intel_gpgpu_post_action = intel_gpgpu_post_action_gen75;
  intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb
  intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen8;



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 5/5] BDW: Add class Gen8Context.

2014-10-08 Thread He Junyan
This patchset is OK and will not cause regressions on previous platforms.
In this patch set, GenEncoder becomes a pure virtual class and
all platform encoders derive from it.
However, GenContext still represents the Gen7 context. I think it would be
better to follow the same approach as the encoder to make the architecture
clearer.



On 一, 2014-09-29 at 13:37 +0800, Yang Rong wrote:
 Now Gen8Context is almost the same as Gen75Context, but Gen8Context is still
 derived from GenContext for clarity.
 
 Signed-off-by: Yang Rong rong.r.y...@intel.com
 ---
  backend/src/CMakeLists.txt   |   2 +
  backend/src/backend/gen8_context.cpp | 113 
 +++
  backend/src/backend/gen8_context.hpp |  63 +++
  backend/src/backend/gen_program.cpp  |   3 +
  4 files changed, 181 insertions(+)
  create mode 100644 backend/src/backend/gen8_context.cpp
  create mode 100644 backend/src/backend/gen8_context.hpp
 
 diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
 index 2daa630..c5d388e 100644
 --- a/backend/src/CMakeLists.txt
 +++ b/backend/src/CMakeLists.txt
 @@ -96,6 +96,8 @@ set (GBE_SRC
  backend/gen_context.cpp
  backend/gen75_context.hpp
  backend/gen75_context.cpp
 +backend/gen8_context.hpp
 +backend/gen8_context.cpp
  backend/gen_program.cpp
  backend/gen_program.hpp
  backend/gen_program.h
 diff --git a/backend/src/backend/gen8_context.cpp 
 b/backend/src/backend/gen8_context.cpp
 new file mode 100644
 index 000..a9914f6
 --- /dev/null
 +++ b/backend/src/backend/gen8_context.cpp
 @@ -0,0 +1,113 @@
 +/*
 + * Copyright © 2012 Intel Corporation
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2 of the License, or (at your option) any later version.
 + *
 + * This library is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with this library. If not, see 
 http://www.gnu.org/licenses/.
 + *
 + */
 +
 +/**
 + * \file gen8_context.cpp
 + */
 +
 +#include backend/gen8_context.hpp
 +#include backend/gen8_encoder.hpp
 +#include backend/gen_program.hpp
 +#include backend/gen_defs.hpp
 +#include backend/gen_encoder.hpp
 +#include backend/gen_insn_selection.hpp
 +#include backend/gen_insn_scheduling.hpp
 +#include backend/gen_reg_allocation.hpp
 +#include sys/cvar.hpp
 +#include ir/function.hpp
 +#include ir/value.hpp
 +#include cstring
 +
 +namespace gbe
 +{
 +  void Gen8Context::emitSLMOffset(void) {
 +if(kernel-getUseSLM() == false)
 +  return;
 +
 +const GenRegister slm_offset = 
 ra-genReg(GenRegister::ud1grf(ir::ocl::slmoffset));
 +const GenRegister slm_index = GenRegister::ud1grf(0, 0);
 +//the slm index is hold in r0.0 24-27 bit, in 4K unit, shift left 12 to 
 get byte unit
 +p-push();
 +  p-curr.execWidth = 1;
 +  p-curr.predicate = GEN_PREDICATE_NONE;
 +  p-SHR(slm_offset, slm_index, GenRegister::immud(12));
 +p-pop();
 +  }
 +
 +  void Gen8Context::allocSLMOffsetCurbe(void) {
 +if(fn.getUseSLM())
 +  allocCurbeReg(ir::ocl::slmoffset, GBE_CURBE_SLM_OFFSET);
 +  }
 +
 +  uint32_t Gen8Context::alignScratchSize(uint32_t size){
 +if(size == 0)
 +  return 0;
 +uint32_t i = 2048;
 +while(i  size) i *= 2;
 +return i;
 +  }
 +
 +  void Gen8Context::emitStackPointer(void) {
 +using namespace ir;
 +
 +// Only emit stack pointer computation if we use a stack
 +if (kernel-getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) = 0)
 +  return;
 +
 +// Check that everything is consistent in the kernel code
 +const uint32_t perLaneSize = kernel-getStackSize();
 +const uint32_t perThreadSize = perLaneSize * this-simdWidth;
 +GBE_ASSERT(perLaneSize  0);
 +GBE_ASSERT(isPowerOf2(perLaneSize) == true);
 +GBE_ASSERT(isPowerOf2(perThreadSize) == true);
 +
 +// Use shifts rather than muls which are limited to 32x16 bit sources
 +const uint32_t perLaneShift = logi2(perLaneSize);
 +const uint32_t perThreadShift = logi2(perThreadSize);
 +const GenRegister selStatckPtr = this-simdWidth == 8 ?
 +  GenRegister::ud8grf(ir::ocl::stackptr) :
 +  GenRegister::ud16grf(ir::ocl::stackptr);
 +const GenRegister stackptr = ra-genReg(selStatckPtr);
 +const GenRegister selStackBuffer = 
 GenRegister::ud1grf(ir::ocl::stackbuffer);
 +const GenRegister bufferptr = ra-genReg(selStackBuffer);
 +
 +// We compute the per-lane stack pointer here
 +p-push();
 +  p-curr.execWidth = 1;
 +  p-curr.predicate = GEN_PREDICATE_NONE;
 +  //p-AND(GenRegister::ud1grf(126,0), 

Re: [Beignet] [PATCH] Add long support for printf

2014-09-17 Thread He Junyan
Sorry, this is V2

V2:
Replace all the long and ulong with int64_t

On 四, 2014-09-18 at 12:39 +0800, junyan...@inbox.com wrote:
 From: Junyan He junyan...@linux.intel.com
 
 Signed-off-by: Junyan He junyan...@linux.intel.com
 ---
  backend/src/ir/printf.cpp   |   25 -
  backend/src/llvm/llvm_printf_parser.cpp |   22 +++---
  kernels/test_printf.cl  |3 +++
  3 files changed, 38 insertions(+), 12 deletions(-)
 
 diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
 index 9d60402..e99aad5 100644
 --- a/backend/src/ir/printf.cpp
 +++ b/backend/src/ir/printf.cpp
 @@ -149,20 +149,35 @@ namespace gbe
  switch (slot.state-conversion_specifier) {
case PRINTF_CONVERSION_D:
case PRINTF_CONVERSION_I:
 -PRINT_SOMETHING(int, d);
 +if (slot.state-length_modifier == PRINTF_LM_L)
 +  PRINT_SOMETHING(uint64_t, d);
 +else
 +  PRINT_SOMETHING(int, d);
  break;
  
case PRINTF_CONVERSION_O:
 -PRINT_SOMETHING(int, o);
 +if (slot.state-length_modifier == PRINTF_LM_L)
 +  PRINT_SOMETHING(uint64_t, o);
 +else
 +  PRINT_SOMETHING(int, o);
  break;
case PRINTF_CONVERSION_U:
 -PRINT_SOMETHING(int, u);
 +if (slot.state-length_modifier == PRINTF_LM_L)
 +  PRINT_SOMETHING(uint64_t, u);
 +else
 +  PRINT_SOMETHING(int, u);
  break;
case PRINTF_CONVERSION_X:
 -PRINT_SOMETHING(int, X);
 +if (slot.state-length_modifier == PRINTF_LM_L)
 +  PRINT_SOMETHING(uint64_t, X);
 +else
 +  PRINT_SOMETHING(int, X);
  break;
case PRINTF_CONVERSION_x:
 -PRINT_SOMETHING(int, x);
 +if (slot.state-length_modifier == PRINTF_LM_L)
 +  PRINT_SOMETHING(uint64_t, x);
 +else
 +  PRINT_SOMETHING(int, x);
  break;
  
case PRINTF_CONVERSION_C:
 diff --git a/backend/src/llvm/llvm_printf_parser.cpp 
 b/backend/src/llvm/llvm_printf_parser.cpp
 index 00e1ef8..29684ba 100644
 --- a/backend/src/llvm/llvm_printf_parser.cpp
 +++ b/backend/src/llvm/llvm_printf_parser.cpp
 @@ -640,14 +640,22 @@ error:
case PRINTF_CONVERSION_U:
case PRINTF_CONVERSION_x:
case PRINTF_CONVERSION_X:
 -/* If the bits change, we need to consider the signed. */
 -if (arg-getType() != Type::getInt32Ty(module-getContext())) {
 -  arg = builder-CreateIntCast(arg, 
 Type::getInt32Ty(module-getContext()), sign);
 -}
 +if (slot.state-length_modifier == PRINTF_LM_L) { /* we would 
 rather print long. */
 +  if (arg-getType() != Type::getInt64Ty(module-getContext())) {
 +arg = builder-CreateIntCast(arg, 
 Type::getInt64Ty(module-getContext()), sign);
 +  }
 +  dst_type = Type::getInt64PtrTy(module-getContext(), 1);
 +  sizeof_size = sizeof(int64_t);
 +} else {
 +  /* If the bits change, we need to consider the signed. */
 +  if (arg-getType() != Type::getInt32Ty(module-getContext())) {
 +arg = builder-CreateIntCast(arg, 
 Type::getInt32Ty(module-getContext()), sign);
 +  }
  
 -/* Int to Int, just store. */
 -dst_type = Type::getInt32PtrTy(module-getContext(), 1);
 -sizeof_size = sizeof(int);
 +  /* Int to Int, just store. */
 +  dst_type = Type::getInt32PtrTy(module-getContext(), 1);
 +  sizeof_size = sizeof(int);
 +}
  return true;
  
case PRINTF_CONVERSION_C:
 diff --git a/kernels/test_printf.cl b/kernels/test_printf.cl
 index 84bb478..c2844f4 100644
 --- a/kernels/test_printf.cl
 +++ b/kernels/test_printf.cl
 @@ -7,6 +7,7 @@ test_printf(void)
uint a = 'x';
float f = 5.0f;
int3 vec;
 +  ulong cc = 1004294967296;
vec.x = x;
vec.y = y;
vec.z = z;
 @@ -15,6 +16,8 @@ test_printf(void)
  printf(--- Welcome to the printf test of %s ---\n, Intel Beignet);
  
  printf(### output a char is %c\n, a);
 +
 +printf(@@@ A long value is %ld\n, cc);
}
  
if (x % 15 == 0)



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet

Re: [Beignet] [PATCH] GBE/libocl: fix build dependency issue.

2014-09-17 Thread He Junyan
LGTM

On 四, 2014-09-18 at 08:35 +0800, Zhigang Gong wrote:
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  backend/src/libocl/CMakeLists.txt | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/backend/src/libocl/CMakeLists.txt 
 b/backend/src/libocl/CMakeLists.txt
 index f015eec..b0074b3 100644
 --- a/backend/src/libocl/CMakeLists.txt
 +++ b/backend/src/libocl/CMakeLists.txt
 @@ -72,7 +72,7 @@ MACRO(GENERATE_HEADER_PY _mod)
   COMMAND ${PYTHON_EXECUTABLE} ${LIBOCL_SOURCE_DIR}/script/gen_vector.py 
 ${def_name} ${output_name} 1
   #COMMAND echo echo \\#endif  ${output_name}
   COMMAND echo \\#endif  ${output_name}
 - DEPENDS ${tmpl_name}
 + DEPENDS ${tmpl_name} ${def_name} 
 ${LIBOCL_SOURCE_DIR}/script/gen_vector.py
   COMMENT Generate the header by python: ${output_name}
   )
  ENDMACRO(GENERATE_HEADER_PY)
 @@ -85,7 +85,7 @@ MACRO(GENERATE_SOURCE_PY _mod)
   COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/src/
   COMMAND cat ${tmpl_name}  ${output_name}
   COMMAND ${PYTHON_EXECUTABLE} ${LIBOCL_SOURCE_DIR}/script/gen_vector.py 
 ${def_name} ${output_name} 0
 - DEPENDS ${tmpl_name}
 + DEPENDS ${tmpl_name} ${def_name} 
 ${LIBOCL_SOURCE_DIR}/script/gen_vector.py
   COMMENT Generate the source by python: ${output_name}
   )
  ENDMACRO(GENERATE_SOURCE_PY)



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] GBE: Output linkModules's error message.

2014-09-16 Thread He Junyan
LGTM
thanks

On 三, 2014-09-17 at 11:33 +0800, Ruiling Song wrote:
 Signed-off-by: Ruiling Song ruiling.s...@intel.com
 ---
  backend/src/llvm/llvm_bitcode_link.cpp |5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)
 
 diff --git a/backend/src/llvm/llvm_bitcode_link.cpp 
 b/backend/src/llvm/llvm_bitcode_link.cpp
 index d845479..1365b32 100644
 --- a/backend/src/llvm/llvm_bitcode_link.cpp
 +++ b/backend/src/llvm/llvm_bitcode_link.cpp
 @@ -204,9 +204,10 @@ namespace gbe
  
  /* We use beignet's bitcode as dst because it will have a lot of
 lazy functions which will not be loaded. */
 -if(Linker::LinkModules(clonedLib, mod, Linker::DestroySource, NULL)) {
 +std::string errorMsg;
 +if(Linker::LinkModules(clonedLib, mod, Linker::DestroySource, 
 errorMsg)) {
delete clonedLib;
 -  printf(Fatal Error: link the bitcode error\n);
 +  printf(Fatal Error: link the bitcode error:\n%s\n, errorMsg.c_str());
return NULL;
  }
  



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] GBE/libocl: fix a regression after libocl change.

2014-09-15 Thread He Junyan
LGTM, thanks

On 二, 2014-09-16 at 09:57 +0800, Zhigang Gong wrote:
 Ping for review.
 
 On Fri, Sep 12, 2014 at 05:38:06PM +0800, Zhigang Gong wrote:
  Signed-off-by: Zhigang Gong zhigang.g...@intel.com
  ---
   backend/src/libocl/tmpl/ocl_math.tmpl.cl | 8 
   1 file changed, 4 insertions(+), 4 deletions(-)
  
  diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl 
  b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
  index c397ca2..f61d107 100644
  --- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
  +++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
  @@ -3204,9 +3204,6 @@ OVERLOADABLE float pown(float x, int n) {
   }
   
   OVERLOADABLE float rootn(float x, int n) {
  -  if (__ocl_math_fastpath_flag)
  -return __gen_ocl_internal_fastpath_rootn(x, n);
  -
 float ax,re;
 int sign = 0;
 if( n == 0 )return NAN;
  @@ -3233,7 +3230,10 @@ OVERLOADABLE float rootn(float x, int n) {
 ax = __gen_ocl_fabs(x);
 if(x 0.0f  (n1))
   sign = 1;
  -  re = __gen_ocl_internal_pow(ax,1.f/n);
  +  if (__ocl_math_fastpath_flag)
  +re = __gen_ocl_pow(ax, 1.f/n);
  +  else
  +re = __gen_ocl_internal_pow(ax,1.f/n);
 if(sign)
   re = -re;
 return re;
  -- 
  1.8.3.2
  
  ___
  Beignet mailing list
  Beignet@lists.freedesktop.org
  http://lists.freedesktop.org/mailman/listinfo/beignet
 ___
 Beignet mailing list
 Beignet@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/beignet



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] GBE/libocl: add missing vector builtin definition for fma.

2014-09-15 Thread He Junyan
LGTM, thanks

On 二, 2014-09-16 at 09:57 +0800, Zhigang Gong wrote:
 Ping for review.
 
 On Fri, Sep 12, 2014 at 05:18:16PM +0800, Zhigang Gong wrote:
  Signed-off-by: Zhigang Gong zhigang.g...@intel.com
  ---
   backend/src/libocl/script/ocl_math.def | 2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)
  
  diff --git a/backend/src/libocl/script/ocl_math.def 
  b/backend/src/libocl/script/ocl_math.def
  index ff1d5d6..4baded4 100644
  --- a/backend/src/libocl/script/ocl_math.def
  +++ b/backend/src/libocl/script/ocl_math.def
  @@ -26,7 +26,7 @@ gentype fabs (gentype)
   gentype fdim (gentype x, gentype y)
   gentype floor (gentype)
   # XXX we use madd for fma
  -#gentype fma (gentype a, gentype b, gentype c)
  +gentype fma (gentype a, gentype b, gentype c)
   gentype fmax (gentype x, gentype y)
   gentypef fmax (gentypef x, float y)
   gentyped fmax (gentyped x, double y)
  -- 
  1.8.3.2
  
  ___
  Beignet mailing list
  Beignet@lists.freedesktop.org
  http://lists.freedesktop.org/mailman/listinfo/beignet
 ___
 Beignet mailing list
 Beignet@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/beignet



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/3] GBE: fix multiple files compilation bugs.

2014-09-12 Thread He Junyan
The patchset is OK


On 五, 2014-09-12 at 14:33 +0800, Zhigang Gong wrote:
 If we want to link multiple files together, and one kernel
 function needs to refer to kernel functions in other files,
 we must not give those functions the link-once linkage attribute.
 
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  backend/src/backend/gen_program.cpp | 4 +++-
  backend/src/llvm/llvm_to_gen.cpp| 5 +++--
  2 files changed, 6 insertions(+), 3 deletions(-)
 
 diff --git a/backend/src/backend/gen_program.cpp 
 b/backend/src/backend/gen_program.cpp
 index 3e16fd6..5324587 100644
 --- a/backend/src/backend/gen_program.cpp
 +++ b/backend/src/backend/gen_program.cpp
 @@ -56,6 +56,7 @@
  #include backend/gen_reg_allocation.hpp
  #include ir/unit.hpp
  #include llvm/llvm_to_gen.hpp
 +#include llvm/llvm_gen_backend.hpp
  
  #include clang/CodeGen/CodeGenAction.h
  
 @@ -371,9 +372,10 @@ namespace gbe {
}
  
for (llvm::Module::iterator I = src-begin(), E = src-end(); I != E; 
 ++I) {
 +llvm::Function *F = llvm::dyn_castllvm::Function(I);
 +if (F  isKernelFunction(*F)) continue;
  I-setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage);
}
 -
llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)-module;
llvm::Linker::LinkModules( dst,
   src,
 diff --git a/backend/src/llvm/llvm_to_gen.cpp 
 b/backend/src/llvm/llvm_to_gen.cpp
 index 755793e..3cb0e5b 100644
 --- a/backend/src/llvm/llvm_to_gen.cpp
 +++ b/backend/src/llvm/llvm_to_gen.cpp
 @@ -86,10 +86,11 @@ namespace gbe
  FPM.add(new DataLayout(DL));
  #endif
  
 +// XXX remove the verifier pass to workaround a non-fatal error.
  #if LLVM_VERSION_MAJOR == 3  LLVM_VERSION_MINOR =5
 -FPM.add(createVerifierPass(true));
 +//FPM.add(createVerifierPass(true));
  #else
 -FPM.add(createVerifierPass());
 +//FPM.add(createVerifierPass());
  #endif
  FPM.add(new TargetLibraryInfo(*libraryInfo));
  FPM.add(createTypeBasedAliasAnalysisPass());



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] Another build failure with current master

2014-09-12 Thread He Junyan
I suggest that you delete the PCH file in the system install dir
and try again.

On 五, 2014-09-12 at 07:52 -0400, Yichao Yu wrote:
 On Fri, Sep 12, 2014 at 7:32 AM, Yichao Yu yyc1...@gmail.com wrote:
  Hi,
 
  I've got an error when generating src/kernels/cl_internal_*_str.c's.
 
  The error message reads
 
  error: OpenCL version was  in PCH file but is currently ��
  build the file 
  /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/src/kernels//cl_internal_built_in_kernel.cl
  failed
 
  Note the non-ascii (non-utf8) characters even when I set locale to C.
 
  The command line executed was
 
  cd /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/src
   
  OCL_BITCODE_BIN=/home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libocl/lib/beignet.bc
  OCL_HEADER_DIR=/home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libocl/include/
  OCL_PCH_OBJECT=/home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libocl/lib/beignet.pch
  LD_LIBRARY_PATH=/home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src
  /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/gbe_bin_generater
  -s 
  /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/src/kernels//cl_internal_built_in_kernel.cl
  -o/home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/src/kernels//cl_internal_built_in_kernel_str.c
 
  Doesn't work with either CLANG or GCC (probably unrelated). llvm version 
  3.5.0.
 
 Oops, I forgot to save my PKGBUILD before recompiling the package. With
 -DCOMPILER=CLANG the gbe_bin_generater (btw I think it should be spelled
 generator...) actually segfaults with the following backtrace (and
 although the gcc version doesn't segfault, the error message does look
 like some sort of memory corruption):
 
 #0  0x77285335 in clang::Diagnostic::FormatDiagnostic(char
 const*, char const*, llvm::SmallVectorImplchar) const ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #1  0x772860f3 in clang::Diagnostic::FormatDiagnostic(char
 const*, char const*, llvm::SmallVectorImplchar) const ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #2  0x7694a77a in
 clang::TextDiagnosticPrinter::HandleDiagnostic(clang::DiagnosticsEngine::Level,
 clang::Diagnostic const) ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #3  0x77287b50 in
 clang::DiagnosticIDs::EmitDiag(clang::DiagnosticsEngine,
 clang::DiagnosticIDs::Level) const ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #4  0x77287cc6 in
 clang::DiagnosticIDs::ProcessDiag(clang::DiagnosticsEngine) const ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #5  0x77280c4c in
 clang::DiagnosticsEngine::EmitCurrentDiagnostic(bool) ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #6  0x7698af9c in clang::DiagnosticBuilder::Emit() [clone .part.41] ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #7  0x7698eecd in checkLanguageOptions(clang::LangOptions
 const, clang::LangOptions const, clang::DiagnosticsEngine*) ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #8  0x769a4b90 in
 clang::ASTReader::ParseLanguageOptions(llvm::SmallVectorunsigned
 long, 64u const, bool, clang::ASTReaderListener) ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #9  0x769a7409 in
 clang::ASTReader::ReadControlBlock(clang::serialization::ModuleFile,
 llvm::SmallVectorImplclang::ASTReader::ImportedModule,
 clang::serialization::ModuleFile const*, unsigned int) () from
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #10 0x769a7e8a in
 clang::ASTReader::ReadASTCore(llvm::StringRef,
 clang::serialization::ModuleKind, clang::SourceLocation,
 clang::serialization::ModuleFile*,
 llvm::SmallVectorImplclang::ASTReader::ImportedModule, long, long,
 unsigned int) ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #11 0x769c19c3 in clang::ASTReader::ReadAST(std::string
 const, clang::serialization::ModuleKind, clang::SourceLocation,
 unsigned int) ()
from 
 /home/yuyichao/projects/mlinux/pkg/all/beignet-git/src/beignet/build/backend/src/libgbe.so
 #12 0x768f152d in
 clang::CompilerInstance::createPCHExternalASTSource(llvm::StringRef,
 std::string const, bool, bool, clang::Preprocessor,
 clang::ASTContext, void*, bool, bool, bool) () from
 

Re: [Beignet] [PATCH] GBE/libocl: fix the wrong prototype of scalar native_powr.

2014-09-10 Thread He Junyan
It was my typo; thanks for fixing it.


On 三, 2014-09-10 at 16:23 +0800, Zhigang Gong wrote:
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  backend/src/libocl/tmpl/ocl_math.tmpl.h | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h 
 b/backend/src/libocl/tmpl/ocl_math.tmpl.h
 index 7a7d12f..1d860b7 100644
 --- a/backend/src/libocl/tmpl/ocl_math.tmpl.h
 +++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h
 @@ -109,7 +109,7 @@ OVERLOADABLE float native_exp10(float x);
  OVERLOADABLE float native_log(float x);
  OVERLOADABLE float native_log2(float x);
  OVERLOADABLE float native_log10(float x);
 -OVERLOADABLE float native_powr(float x);
 +OVERLOADABLE float native_powr(float x, float y);
  OVERLOADABLE float native_recip(float x);
  OVERLOADABLE float native_rsqrt(float x);
  OVERLOADABLE float native_sin(float x);



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH v2] GBE: remove the user defined macro cl_khr_fp64.

2014-09-04 Thread He Junyan
LGTM,
thanks.

On 四, 2014-09-04 at 13:59 +0800, Zhigang Gong wrote:
 This is not a predefined macro according to the spec. Let's not
 define it by default. This patch also disables fp64 when entering
 user kernels.
 
 v2:
 Some internal .cl files require cl_khr_fp64 to be enabled. Fixed that issue
 by moving the enable macro to ocl_types.h.
 
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 Reviewed-by: Junyan He junyan...@linux.intel.com
 ---
  backend/src/backend/program.cpp|  2 --
  backend/src/libocl/CMakeLists.txt  |  2 +-
  backend/src/libocl/include/ocl.h   |  1 +
  backend/src/libocl/include/ocl_types.h |  3 ---
  backend/src/libocl/src/ocl_async.cl|  1 +
  backend/src/libocl/src/ocl_image.cl| 26 +-
  backend/src/libocl/src/ocl_vload.cl|  1 +
  7 files changed, 17 insertions(+), 19 deletions(-)
 
 diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
 index 42cd989..98e8a34 100644
 --- a/backend/src/backend/program.cpp
 +++ b/backend/src/backend/program.cpp
 @@ -516,8 +516,6 @@ namespace gbe {
  }
  
  args.push_back(-cl-kernel-arg-info);
 -args.push_back(-Dcl_khr_fp64);
 -
  args.push_back(-mllvm);
  args.push_back(-inline-threshold=20);
  #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
 diff --git a/backend/src/libocl/CMakeLists.txt 
 b/backend/src/libocl/CMakeLists.txt
 index d4e3a53..fb93da1 100644
 --- a/backend/src/libocl/CMakeLists.txt
 +++ b/backend/src/libocl/CMakeLists.txt
 @@ -129,7 +129,7 @@ FOREACH(M ${OCL_BASH_GENERATED_MODULES})
  ENDFOREACH(M) 
  
 
 -SET (CLANG_OCL_FLAGS -fno-builtin -Dcl_khr_fp64 -ffp-contract=off 
 -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
 +SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -cl-kernel-arg-info 
 -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
  
  MACRO(ADD_CL_TO_BC_TARGET _file)
  # CMake seems can not add pattern rule, use MACRO to replace.
 diff --git a/backend/src/libocl/include/ocl.h 
 b/backend/src/libocl/include/ocl.h
 index a7d03e6..d4a8805 100644
 --- a/backend/src/libocl/include/ocl.h
 +++ b/backend/src/libocl/include/ocl.h
 @@ -19,5 +19,6 @@
  #include ocl_sync.h
  #include ocl_vload.h
  #include ocl_workitem.h
 +#pragma OPENCL EXTENSION cl_khr_fp64 : disable
  
  #endif
 diff --git a/backend/src/libocl/include/ocl_types.h 
 b/backend/src/libocl/include/ocl_types.h
 index 05a2dae..87e9bf5 100644
 --- a/backend/src/libocl/include/ocl_types.h
 +++ b/backend/src/libocl/include/ocl_types.h
 @@ -1,10 +1,7 @@
  #ifndef __OCL_TYPES_H__
  #define __OCL_TYPES_H__
  
 -#ifdef cl_khr_fp64
  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 -#endif
 -
  #include ocl_defines.h
  
  #define NULL ((void*)0)
 diff --git a/backend/src/libocl/src/ocl_async.cl 
 b/backend/src/libocl/src/ocl_async.cl
 index 57d6859..e6f9a36 100644
 --- a/backend/src/libocl/src/ocl_async.cl
 +++ b/backend/src/libocl/src/ocl_async.cl
 @@ -1,3 +1,4 @@
 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable
  #include ocl_async.h
  #include ocl_sync.h
  #include ocl_workitem.h
 diff --git a/backend/src/libocl/src/ocl_image.cl 
 b/backend/src/libocl/src/ocl_image.cl
 index 00c3e8f..7202802 100644
 --- a/backend/src/libocl/src/ocl_image.cl
 +++ b/backend/src/libocl/src/ocl_image.cl
 @@ -188,7 +188,7 @@ OVERLOADABLE int __gen_compute_array_index(int index, 
 image2d_array_t image)
  #define FIXUP_FLOAT_COORD(tmpCoord)\
{\
  if (tmpCoord  0  tmpCoord  -0x1p-20f)  \
 -  tmpCoord += -0x1p-9; \
 +  tmpCoord += -0x1p-9f; \
}
  
  DECL_IMAGE(GEN_FIX_1, image1d_t, int4, i)
 @@ -229,7 +229,7 @@ DECL_IMAGE_INFO_COMMON(image1d_buffer_t)
  #define FIXUP_FLOAT_COORD(tmpCoord)\
{\
  if (tmpCoord.s0  0  tmpCoord.s0  -0x1p-20f)\
 -  tmpCoord.s0 += -0x1p-9;  \
 +  tmpCoord.s0 += -0x1p-9f;  \
  if (tmpCoord.s1  0  tmpCoord.s1  -0x1p-20f)\
tmpCoord.s1 += -0x1p-9f; \
}
 @@ -258,7 +258,7 @@ DECL_IMAGE(0, image2d_t, float4, f, 2)
  #define FIXUP_FLOAT_COORD(tmpCoord)\
{\
  if (tmpCoord.s0  0  tmpCoord.s0  -0x1p-20f)\
 -  tmpCoord.s0 += -0x1p-9;  \
 +  tmpCoord.s0 += -0x1p-9f;  \
}
  
  DECL_IMAGE(GEN_FIX_1, image1d_array_t, int4, i, 2)
 @@ -306,12 +306,12 @@ OVERLOADABLE size_t 
 get_image_array_size(image1d_array_t image)
  
  #define FIXUP_FLOAT_COORD(tmpCoord) \
{ \
 -if (tmpCoord.s0  0

Re: [Beignet] [PATCH] Fix a bug for runtime_barrier_list.cpp, local var not inited.

2014-09-01 Thread He Junyan
That's better than memset.
It's OK.


On 一, 2014-09-01 at 09:27 +0800, Zhigang Gong wrote:
 I just checked the test case. This may not be the best fix. The issue should be
 the first access to all the events, which also includes the uninitialized
 events 3, 4 and 5. The following patch should be better.
 -  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
 +  for (cl_uint i = 0; i  3; ++i) {
 
 clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), 
 status, NULL);
 OCL_ASSERT(status = CL_SUBMITTED);
   }
 
 Any thought?
 
 On Mon, Sep 01, 2014 at 10:04:46AM +0800, junyan...@inbox.com wrote:
  From: Junyan He junyan...@linux.intel.com
  
  Signed-off-by: Junyan He junyan...@linux.intel.com
  ---
   utests/runtime_barrier_list.cpp |3 +++
   1 file changed, 3 insertions(+)
  
  diff --git a/utests/runtime_barrier_list.cpp 
  b/utests/runtime_barrier_list.cpp
  index 6987d5e..e176771 100644
  --- a/utests/runtime_barrier_list.cpp
  +++ b/utests/runtime_barrier_list.cpp
  @@ -1,3 +1,4 @@
  +#include string.h
   #include utest_helper.hpp
   
   #define BUFFERSIZE  32*1024
  @@ -10,6 +11,8 @@ void runtime_barrier_list(void)
 cl_int status = 0;
 cl_int value = 34;
   
  +  memset(ev, 0, sizeof(cl_event)*5);
  +
 // Setup kernel and buffers
 OCL_CREATE_KERNEL(compiler_event);
 OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
  -- 
  1.7.9.5
  
  
  
  ___
  Beignet mailing list
  Beignet@lists.freedesktop.org
  http://lists.freedesktop.org/mailman/listinfo/beignet
 ___
 Beignet mailing list
 Beignet@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/beignet



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] utests: fix two utest bugs.

2014-09-01 Thread He Junyan
That's OK.

On 二, 2014-09-02 at 10:36 +0800, Zhigang Gong wrote:
 Similar to the bug found by Junyan, some events are
 accessed before being assigned.
 
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  utests/runtime_event.cpp   | 2 +-
  utests/runtime_marker_list.cpp | 2 +-
  2 files changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
 index b974f6a..f8170a3 100644
 --- a/utests/runtime_event.cpp
 +++ b/utests/runtime_event.cpp
 @@ -28,7 +28,7 @@ void runtime_event(void)
locals[0] = 32;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, ev[0], 
 ev[2]);
  
 -  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
 +  for (cl_uint i = 0; i  3; ++i) {
  clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), 
 status, NULL);
  OCL_ASSERT(status = CL_SUBMITTED);
}
 diff --git a/utests/runtime_marker_list.cpp b/utests/runtime_marker_list.cpp
 index fc77156..f64b1d1 100644
 --- a/utests/runtime_marker_list.cpp
 +++ b/utests/runtime_marker_list.cpp
 @@ -34,7 +34,7 @@ void runtime_marker_list(void)
  
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, ev[0], 
 ev[2]);
  
 -  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
 +  for (cl_uint i = 0; i  3; ++i) {
  clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), 
 status, NULL);
  OCL_ASSERT(status = CL_SUBMITTED);
}



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 00/18] Using bit code ocl lib to replace the huge header file.

2014-08-28 Thread He Junyan
The dependencies of these files are complex here and may cause some
inconvenience because CMake lacks the flexibility.
After discussing with Zhigang, I realize that it is important to keep the
build consistent with the whole project, and I will rewrite all the build
files using CMake.


On 三, 2014-08-27 at 23:17 -0400, Yichao Yu wrote:
 On Wed, Aug 27, 2014 at 10:50 PM, Song, Ruiling ruiling.s...@intel.com 
 wrote:
  Out-of-source build (mkdir build; cmake ../; make) does not work after
  applying your patch. It simply prints "No targets specified and no makefile
  found.  Stop." and stops building.
 
 Do you mean `mkdir build; cd build; cmake ../; make`? It feels like a
 "no makefile found" error cannot be caused by changes in the cmake files
 without a configure-time error.
 
 
  After applying your patch, utest compiler_copy_image1 failed on my machine.
 
  And it seems that you are missing the copyright header in the new files you added.
 
  -Original Message-
  From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of 
  junyan...@inbox.com
  Sent: Tuesday, August 12, 2014 3:31 PM
  To: beignet@lists.freedesktop.org
  Cc: Junyan He
  Subject: [Beignet] [PATCH 00/18] Using bit code ocl lib to replace the huge 
  header file.
 
  From: Junyan He junyan...@linux.intel.com
 
  The PCH file is growing too big. It contains too many defines and macros,
  which need a lot of time to parse and may cause some conflicts with
  user-defined macros. This patch set will extract the function prototypes
  from their definitions into the header files and compile all the functions'
  definitions into an LLVM bitcode file as a CL library. This approach is also
  compatible with libclc, and we can switch to libclc if libclc is stable and
  extensive enough.
  This patch set may make the compile time slower than the PCH version,
  because the header file parsing and bitcode linking are not optimized
  yet, and we will continue to improve it.
 
  TODO:
  Math functions' fast and standard switch in the lib.
  Header file parsing optimization, may use PCH or PTH.
  Linking optimization.
 
  Signed-off-by: Junyan He junyan...@linux.intel.com
  ---
   backend/CMakeLists.txt| 3 +
   backend/src/CMakeLists.txt|   141 +-
   backend/src/GBEConfig.h.in| 2 +
   backend/src/backend/program.cpp   |   239 +-
   backend/src/builtin_vector_proto.def  |   295 -
   backend/src/gen_as.sh |   101 -
   backend/src/gen_builtin_vector.py |   414 -
   backend/src/gen_convert.sh|   553 -
   backend/src/libocl/Makefile.in|81 +
   backend/src/libocl/include/ocl.h  |23 +
   backend/src/libocl/include/ocl_async.h|49 +
   backend/src/libocl/include/ocl_atom.h |84 +
   backend/src/libocl/include/ocl_common.inh |21 +
   backend/src/libocl/include/ocl_defines.inh|23 +
   backend/src/libocl/include/ocl_float.h|79 +
   backend/src/libocl/include/ocl_geometric.h|39 +
   backend/src/libocl/include/ocl_image.h|   161 +
   backend/src/libocl/include/ocl_integer.inh|   160 +
   backend/src/libocl/include/ocl_math.inh   |   103 +
   backend/src/libocl/include/ocl_misc.h |   122 +
   backend/src/libocl/include/ocl_printf.h   |15 +
   backend/src/libocl/include/ocl_relational.inh |78 +
   backend/src/libocl/include/ocl_sync.h |18 +
   backend/src/libocl/include/ocl_types.h|   104 +
   backend/src/libocl/include/ocl_vload.h|   143 +
   backend/src/libocl/include/ocl_workitem.h |15 +
   backend/src/libocl/lib/ocl_async.cl   |69 +
   backend/src/libocl/lib/ocl_atom.cl|   122 +
   backend/src/libocl/lib/ocl_barrier.ll |39 +
   backend/src/libocl/lib/ocl_common.inc |49 +
   backend/src/libocl/lib/ocl_geometric.cl   |96 +
   backend/src/libocl/lib/ocl_image.cl   |   412 +
   backend/src/libocl/lib/ocl_integer.inc|   352 +
   backend/src/libocl/lib/ocl_math.inc   |  3316 +
   backend/src/libocl/lib/ocl_memcpy.ll  |   336 +
   backend/src/libocl/lib/ocl_memset.ll  |   127 +
   backend/src/libocl/lib/ocl_misc.cl|   201 +
   backend/src/libocl/lib/ocl_relational.inc |   151 +
   backend/src/libocl/lib/ocl_sync.cl|14 +
   backend/src/libocl/lib/ocl_vload.cl   |   257 +
   backend/src/libocl/lib/ocl_workitem.cl|40 +
   backend/src/libocl/script/gen_as.sh   |   124 +
   backend/src/libocl/script/gen_common.inc  |11 +
   backend/src/libocl/script/gen_convert.sh  |   653 +
   backend/src/libocl/script/gen_vector.py   |   382 +
   backend/src/libocl/script/ocl_common.def  |22 +
   backend/src/libocl/script/ocl_integer.def |31 +
   backend/src/libocl/script

Re: [Beignet] [PATCH] GBE: clear deadprintfs when current function is done.

2014-08-26 Thread He Junyan
OK,
Thanks for finding this bug.

On 二, 2014-08-26 at 15:39 +0800, Ruiling Song wrote:
 It should be cleared, to prevent invalid pointers from staying there
 when processing the next Function.
 
 Signed-off-by: Ruiling Song ruiling.s...@intel.com
 ---
  backend/src/llvm/llvm_printf_parser.cpp |1 +
  1 file changed, 1 insertion(+)
 
 diff --git a/backend/src/llvm/llvm_printf_parser.cpp 
 b/backend/src/llvm/llvm_printf_parser.cpp
 index e02f5aa..00e1ef8 100644
 --- a/backend/src/llvm/llvm_printf_parser.cpp
 +++ b/backend/src/llvm/llvm_printf_parser.cpp
 @@ -616,6 +616,7 @@ error:
  prf.first-eraseFromParent();
  }
  
 +deadprintfs.clear();
  delete builder;
  
  return changed;



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect performance in some cases

2014-07-10 Thread He Junyan
Some comments,

On 五, 2014-07-04 at 12:00 +0800, Lv Meng wrote:
 Signed-off-by: Lv Meng meng...@intel.com
 ---
  src/CMakeLists.txt  |  3 ++-
  src/cl_context.h|  1 +
  src/cl_mem.c| 27 
 +++--
  src/kernels/cl_internal_copy_buf_rect_align4.cl | 15 ++
  4 files changed, 43 insertions(+), 3 deletions(-)
  create mode 100644 src/kernels/cl_internal_copy_buf_rect_align4.cl
 
 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
 index 46426d9..dff8fdf 100644
 --- a/src/CMakeLists.txt
 +++ b/src/CMakeLists.txt
 @@ -41,7 +41,8 @@ set (KERNEL_STR_FILES)
  set (KERNEL_NAMES cl_internal_copy_buf_align4
  cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
  cl_internal_copy_buf_unalign_dst_offset 
 cl_internal_copy_buf_unalign_src_offset
 -cl_internal_copy_buf_rect cl_internal_copy_image_1d_to_1d 
 cl_internal_copy_image_2d_to_2d
 +cl_internal_copy_buf_rect cl_internal_copy_buf_rect_align4
 +cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d
  cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d 
 cl_internal_copy_image_3d_to_3d
  cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer
  cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
 diff --git a/src/cl_context.h b/src/cl_context.h
 index 75afbf6..f8342d3 100644
 --- a/src/cl_context.h
 +++ b/src/cl_context.h
 @@ -47,6 +47,7 @@ enum _cl_internal_ker_type {
CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
CL_ENQUEUE_COPY_BUFFER_RECT,
 +  CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
CL_ENQUEUE_COPY_IMAGE_1D_TO_1D, //copy image 1d to image 1d
CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, //copy image 2d to image 2d
CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, //copy image 3d to image 2d
 diff --git a/src/cl_mem.c b/src/cl_mem.c
 index 70bc3eb..b78258f 100644
 --- a/src/cl_mem.c
 +++ b/src/cl_mem.c
 @@ -1396,9 +1396,20 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem 
 src_buf, cl_mem dst_buf,
 size_t dst_row_pitch, size_t dst_slice_pitch) {
cl_int ret;
cl_kernel ker;
 +  cl_int index;
size_t global_off[] = {0,0,0};
size_t global_sz[] = {1,1,1};
size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
 +  // the src and dst mem rect is continuous, the copy is degraded to buf 
 copy  
 +  if((region[0] == dst_row_pitch)  (region[0] == src_row_pitch) 
 +  (region[1] * src_row_pitch == src_slice_pitch)  (region[1] * 
 dst_row_pitch == dst_slice_pitch)){
 +cl_int src_offset = src_origin[2]*src_slice_pitch + 
 src_origin[1]*src_row_pitch + src_origin[0];
 +cl_int dst_offset = dst_origin[2]*dst_slice_pitch + 
 dst_origin[1]*dst_row_pitch + dst_origin[0];
 +cl_int size = region[0]*region[1]*region[2];
 +ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size);
 +return ret;
 +  }
 +
if(region[1] == 1) local_sz[1] = 1;
if(region[2] == 1) local_sz[2] = 1;
global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
 @@ -1413,8 +1424,20 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem 
 src_buf, cl_mem dst_buf,
/* setup the kernel and run. */
extern char cl_internal_copy_buf_rect_str[];
extern size_t cl_internal_copy_buf_rect_str_size;
 -
 -  ker = cl_context_get_static_kernel_from_bin(queue-ctx, 
 CL_ENQUEUE_COPY_BUFFER_RECT,
 +  index = CL_ENQUEUE_COPY_BUFFER_RECT;
 +  if( (src_offset % 4== 0)  (dst_offset % 4== 0)  (src_row_pitch % 4== 
 0)  (dst_row_pitch % 4== 0)
 +   (src_slice_pitch % 4== 0)  (dst_slice_pitch % 4== 0)  (global_sz[0] 
 % 4 == 0) ){  
 +global_sz[0] /= 4;
 +src_offset /= 4;
 +dst_offset /= 4;
 +src_row_pitch /= 4;
 +dst_row_pitch /= 4;
 +src_slice_pitch /= 4;
 +dst_slice_pitch /= 4;
 +index = CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4;
 +  }
 +
 +  ker = cl_context_get_static_kernel_from_bin(queue-ctx, index,
cl_internal_copy_buf_rect_str, 
 (size_t)cl_internal_copy_buf_rect_str_size, NULL);

I think you are using the wrong source string here.
For the align-4 rect buffer, you should use
cl_internal_copy_buf_rect_align4_str here, which I notice already
exists in cl_internal_copy_buf_rect_align4_str.c.
I think you should separate the aligned and unaligned cases, as cl_mem_fill does.

  
if (!ker)
 diff --git a/src/kernels/cl_internal_copy_buf_rect_align4.cl 
 b/src/kernels/cl_internal_copy_buf_rect_align4.cl
 new file mode 100644
 index 000..fbfe7b2
 --- /dev/null
 +++ b/src/kernels/cl_internal_copy_buf_rect_align4.cl
 @@ -0,0 +1,15 @@
 +kernel void __cl_copy_buffer_rect_align4 ( global int* src, global int* dst,
 +  unsigned int region0, unsigned int 
 region1, unsigned int region2,
 +  unsigned int src_offset, unsigned 
 int dst_offset,
 +  

Re: [Beignet] [PATCH v2] runtime: fix a gpgpu event and thread local gpgpu handling bug.

2014-07-03 Thread He Junyan
OK, LGTM.

On 四, 2014-07-03 at 14:14 +0800, Zhigang Gong wrote:
 When pending a command queue, we need to record the whole gpgpu
 structure, not just the batch buffer, for the following reasons:
 
 1. We need to keep the private buffers, for example the printf buffers.
 2. We need to make sure this gpgpu will not be reused by another enqueue.
 
 v2:
 Don't try to flush all user events attached to the queue.
 We just need to flush the current event when doing a command queue flush.
 
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  src/cl_api.c  |  3 +-
  src/cl_command_queue.c| 14 +++--
  src/cl_command_queue.h|  4 +++
  src/cl_driver.h   |  8 ++
  src/cl_driver_defs.c  |  4 +--
  src/cl_enqueue.c  |  2 +-
  src/cl_event.c| 26 ++---
  src/cl_event.h|  7 +++--
  src/cl_thread.c   | 20 +
  src/cl_thread.h   |  3 ++
  src/intel/intel_batchbuffer.c | 13 -
  src/intel/intel_batchbuffer.h |  1 -
  src/intel/intel_gpgpu.c   | 66 
 +--
  13 files changed, 97 insertions(+), 74 deletions(-)
 
 diff --git a/src/cl_api.c b/src/cl_api.c
 index d54ada6..8759027 100644
 --- a/src/cl_api.c
 +++ b/src/cl_api.c
 @@ -69,7 +69,7 @@ handle_events(cl_command_queue queue, cl_int num, const 
 cl_event *wait_list,
cl_event* event, enqueue_data* data, cl_command_type type)
  {
cl_int status = cl_event_wait_events(num, wait_list, queue);
 -  cl_event e;
 +  cl_event e = NULL;
if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
  e = cl_event_new(queue-ctx, queue, type, event!=NULL);
  
 @@ -85,6 +85,7 @@ handle_events(cl_command_queue queue, cl_int num, const 
 cl_event *wait_list,
cl_event_new_enqueue_callback(e, data, num, wait_list);
  }
}
 +  queue-current_event = e;
return status;
  }
  
 diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
 index 8426c4e..cd268aa 100644
 --- a/src/cl_command_queue.c
 +++ b/src/cl_command_queue.c
 @@ -28,6 +28,7 @@
  #include cl_alloc.h
  #include cl_driver.h
  #include cl_khr_icd.h
 +#include cl_event.h
  #include performance.h
  
  #include assert.h
 @@ -421,10 +422,9 @@ error:
return err;
  }
  
 -LOCAL cl_int
 -cl_command_queue_flush(cl_command_queue queue)
 +LOCAL void
 +cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
  {
 -  GET_QUEUE_THREAD_GPGPU(queue);
size_t global_wk_sz[3];
void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz);
  
 @@ -447,7 +447,15 @@ cl_command_queue_flush(cl_command_queue queue)
  global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0;
  cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz);
}
 +}
  
 +LOCAL cl_int
 +cl_command_queue_flush(cl_command_queue queue)
 +{
 +  GET_QUEUE_THREAD_GPGPU(queue);
 +  cl_command_queue_flush_gpgpu(queue, gpgpu);
 +  if (queue-current_event)
 +cl_event_flush(queue-current_event);
cl_invalid_thread_gpgpu(queue);
return CL_SUCCESS;
  }
 diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
 index b79d63a..bd70f25 100644
 --- a/src/cl_command_queue.h
 +++ b/src/cl_command_queue.h
 @@ -41,6 +41,7 @@ struct _cl_command_queue {
cl_intwait_events_num;   /* Number of Non-complete user events 
 */
cl_intwait_events_size;  /* The size of array that wait_events 
 point to */
cl_event  last_event;/* The last event in the queue, for 
 enqueue mark used */
 +  cl_event  current_event; /* Current event. */
cl_command_queue_properties  props;  /* Queue properties */
cl_command_queue prev, next; /* We chain the command queues 
 together */
void *thread_data;   /* Used to store thread context data 
 */
 @@ -82,6 +83,9 @@ cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, 
 cl_mem);
  /* Flush for the command queue */
  extern cl_int cl_command_queue_flush(cl_command_queue);
  
 +/* Flush for the specified gpgpu */
 +extern void cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu);
 +
  /* Wait for the completion of the command queue */
  extern cl_int cl_command_queue_finish(cl_command_queue);
  
 diff --git a/src/cl_driver.h b/src/cl_driver.h
 index 2999eb7..3d1d8d8 100644
 --- a/src/cl_driver.h
 +++ b/src/cl_driver.h
 @@ -197,13 +197,9 @@ extern cl_gpgpu_event_new_cb *cl_gpgpu_event_new;
  typedef int (cl_gpgpu_event_update_status_cb)(cl_gpgpu_event, int);
  extern cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status;
  
 -/* pending flush the batch buffer of this event */
 -typedef void (cl_gpgpu_event_pending_cb)(cl_gpgpu, cl_gpgpu_event);
 -extern cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending;
 -
  /* flush the batch buffer of this event */
 -typedef void (cl_gpgpu_event_resume_cb)(cl_gpgpu_event);
 -extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume;
 +typedef void 

Re: [Beignet] [PATCH] runtime: recover the maximum read image args to 128.

2014-07-03 Thread He Junyan
That's OK

On Thu, 2014-07-03 at 12:53 +0800, Zhigang Gong wrote:
 To comply with the full profile.
 
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  src/cl_gt_device.h | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
 index 97ba7e2..63c9047 100644
 --- a/src/cl_gt_device.h
 +++ b/src/cl_gt_device.h
 @@ -39,7 +39,7 @@
  .address_bits = 32,
  .max_mem_alloc_size = 256 * 1024 * 1024,
  .image_support = CL_TRUE,
 -.max_read_image_args = 16,
 +.max_read_image_args = 128,
  .max_write_image_args = 8,
  .image_max_array_size = 2048,
  .image2d_max_width = 8192,



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 3/5] Add %f and %c support for printf.

2014-06-20 Thread He Junyan


On Fri, 2014-06-20 at 07:18 +, Yang, Rong R wrote:
 Two comments.
 
 -Original Message-
 From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of 
 junyan...@inbox.com
 Sent: Wednesday, June 18, 2014 2:42 PM
 To: beignet@lists.freedesktop.org
 Cc: Junyan He
 Subject: [Beignet] [PATCH 3/5] Add %f and %c support for printf.
 
 From: Junyan He junyan...@linux.intel.com
 
 Add the %c and %f support for printf.
 Also add the int to float and int to char conversion.
 Some minor errors such as wrong index flags have been fixed.
 
 Signed-off-by: Junyan He junyan...@linux.intel.com
 ---
  backend/src/ir/printf.cpp   | 69 +++
  backend/src/ir/printf.hpp   |  4 ++
  backend/src/llvm/llvm_printf_parser.cpp | 72 
 +
  3 files changed, 93 insertions(+), 52 deletions(-)
 
 diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp index 
 0a943ac..4bd7f2d 100644
 --- a/backend/src/ir/printf.cpp
 +++ b/backend/src/ir/printf.cpp
 @@ -17,18 +17,18 @@
   */
  
  /**
 - * \file sampler.cpp
 + * \file printf.cpp
   *
   */
  
  #include stdarg.h
  #include printf.hpp
 -#include ocl_common_defines.h
  
  namespace gbe
  {
namespace ir
{
 +
  pthread_mutex_t PrintfSet::lock = PTHREAD_MUTEX_INITIALIZER;
  
  uint32_t PrintfSet::append(PrintfFmt* fmt, Unit unit) @@ -43,35 +43,21 
 @@ namespace gbe
}
  
/* Update the total size of size. */
 -  sizeOfSize = slots.back()-state-out_buf_sizeof_offset
 -   + getPrintfBufferElementSize(slots.size() - 1);
 +  if (slots.size()  0)
 +sizeOfSize = slots.back()-state-out_buf_sizeof_offset
 + + getPrintfBufferElementSize(slots.size() - 1);
  
return (uint32_t)fmts.size();
  }
  
 -/* ugly here. We can not build the va_list dynamically:(
 -   And I have tried
 -   va_list arg; arg = some_ptr;
 -   This works very OK on 32bits platform but can not even
 -   pass the compiling in the 64bits platform.
 -   sizeof(arg) = 4 in 32bits platform but
 -   sizeof(arg) = 24 in 64bits platform.
 -   We can not assume the platform here. */
 -void vfprintf_wrap(std::string fmt, vectorint contents)
 -{
 -  int* ptr = NULL;
 -  size_t num = contents.size()  32 ? contents.size() : 32;
 -  ptr = (int *)calloc(32, sizeof(int)); //should be enough
 -  for (size_t i = 0; i  num; i++) {
 -ptr[i] = contents[i];
 -  }
 -
 -  printf(fmt.c_str(), ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], 
 ptr[6], ptr[7],
 - ptr[8], ptr[9], ptr[10], ptr[11], ptr[12], ptr[13], ptr[14], 
 ptr[15], ptr[16],
 - ptr[17], ptr[18], ptr[19], ptr[20], ptr[21], ptr[22], ptr[23], 
 ptr[24], ptr[25],
 - ptr[26], ptr[27], ptr[28], ptr[29], ptr[30], ptr[31]);
 -  free(ptr);
 -}
 +#define PRINT_SOMETHING(target_ty, conv)  do {  \
 +  pf_str = pf_str + std::string(#conv); \
 +  printf(pf_str.c_str(),\
 + ((target_ty *)((char *)buf_addr + 
 slot.state-out_buf_sizeof_offset * \
 +global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
 + [k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i]);\
 +  pf_str = ;  \
 +} while (0)
  
  void PrintfSet::outputPrintf(void* index_addr, void* buf_addr, size_t 
 global_wk_sz0,
   size_t global_wk_sz1, size_t global_wk_sz2) 
 @@ -79,15 +65,15 @@ namespace gbe
LockOutput lock;
size_t i, j, k;
std::string pf_str;
 -  vectorint* contents = NULL;
 +  int stmt = 0;
 +
for (auto pf : fmts) {
  for (i = 0; i  global_wk_sz0; i++) {
for (j = 0; j  global_wk_sz1; j++) {
  for (k = 0; k  global_wk_sz2; k++) {
 -  int flag = ((int *)index_addr)[k*global_wk_sz0*global_wk_sz1 + 
 j*global_wk_sz0 + i];
 +  int flag = ((int 
 + *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2 + 
 + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
if (flag) {
  pf_str = ;
 -contents = new vectorint();
  for (auto slot : pf) {
if (slot.type == PRINTF_SLOT_TYPE_STRING) {
  pf_str = pf_str + std::string(slot.str); @@ -98,23 
 +84,34 @@ namespace gbe
switch (slot.state-conversion_specifier) {
  case PRINTF_CONVERSION_D:
  case PRINTF_CONVERSION_I:
 -  contents-push_back(((int *)((char *)buf_addr + 
 slot.state-out_buf_sizeof_offset
 -   * global_wk_sz0 * 
 global_wk_sz1 * global_wk_sz2))
 -  [k

Re: [Beignet] [PATCH 2/2] runtime: fix image1d buffer allocation.

2014-06-20 Thread He Junyan
Spec says:

For a 1D image buffer
object, the image pixels are taken from the buffer object’s data store.
When the contents of a
buffer object’s data store are modified, those changes are reflected in
the contents of the 1D
image buffer object and vice-versa at corresponding synchronization
points. 

NOTE:
Concurrent reading from, writing to and copying between both a buffer
object and 1D image
buffer object associated with the buffer object is undefined. Only
reading from both a buffer
object and 1D image buffer object associated with the buffer object is
defined.


So the
corresponding synchronization points seem very important.
If the user holds the mapped buffer address, this may cause some problems.


On Fri, 2014-06-20 at 15:47 +0800, Zhigang Gong wrote:
 Per bspec, an image should have at least a 2-line vertical alignment,
 thus we can't simply attach a buffer to a 1d image surface which has the same 
 size.
 We have to create a new image, and copy the buffer data to this new image,
 and then replace all of the buffer object's references with this image.
 
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  src/cl_mem.c | 73 
 
  1 file changed, 54 insertions(+), 19 deletions(-)
 
 diff --git a/src/cl_mem.c b/src/cl_mem.c
 index a1d3b25..b27e64a 100644
 --- a/src/cl_mem.c
 +++ b/src/cl_mem.c
 @@ -480,6 +480,23 @@ error:
goto exit;
  }
  
 +void cl_mem_replace_buffer(cl_mem buffer, cl_buffer new_bo)
 +{
 +  cl_buffer_unreference(buffer-bo);
 +  buffer-bo = new_bo;
 +  cl_buffer_reference(new_bo);
 +  if (buffer-type != CL_MEM_SUBBUFFER_TYPE)
 +return;
 +
 +  struct _cl_mem_buffer *it = ((struct _cl_mem_buffer*)buffer)-sub_next;
 +  for( ; it != (struct _cl_mem_buffer*)buffer; it = it-sub_next)
 +  {
 +cl_buffer_unreference(it-base.bo);
 +it-base.bo = new_bo;
 +cl_buffer_reference(new_bo);
 +  }
 +}
 +
  void
  cl_mem_copy_image_region(const size_t *origin, const size_t *region,
   void *dst, size_t dst_row_pitch, size_t 
 dst_slice_pitch,
 @@ -598,10 +615,12 @@ _cl_mem_new_image(cl_context ctx,
  
if (UNLIKELY(w == 0)) DO_IMAGE_ERROR;
if (UNLIKELY(h == 0  (image_type != CL_MEM_OBJECT_IMAGE1D 
 -  image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY)))
 +  image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY 
 +  image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)))
  DO_IMAGE_ERROR;
  
 -  if (image_type == CL_MEM_OBJECT_IMAGE1D) {
 +  if (image_type == CL_MEM_OBJECT_IMAGE1D ||
 +  image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
  size_t min_pitch = bpp * w;
  if (data  pitch == 0)
pitch = min_pitch;
 @@ -809,27 +828,43 @@ _cl_mem_new_image_from_buffer(cl_context ctx,
  merged_flags = 
 ~(CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS);
  merged_flags |= flags  
 (CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS);
}
 -
 -  /* Because the buffer is NO_TILING, the image should be no tiling. */
 -  image = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, merged_flags, 
 CL_FALSE, err);
 -  if (image == NULL || err != CL_SUCCESS)
 -goto error;
 -
 -  cl_buffer_reference(buffer-bo);
 -  image-bo = buffer-bo;
 -  image-size = buffer-size;
 -  /* If it is a sub buffer, we need to start from the sub offset. */
 +  struct _cl_mem_buffer *mem_buffer = (struct _cl_mem_buffer*)buffer;
if (buffer-type == CL_MEM_SUBBUFFER_TYPE) {
  offset = ((struct _cl_mem_buffer *)buffer)-sub_offset;
 +mem_buffer = mem_buffer-parent;
}
 -  if (image-flags  CL_MEM_USE_HOST_PTR) {
 -/* Now point to the right offset if buffer is a SUB_BUFFER. */
 -image-host_ptr = buffer-host_ptr + offset;
 -  }
 +  /* Get the size of each pixel */
 +  if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, bpp)) != 
 CL_SUCCESS))
 +goto error;
  
 -  cl_mem_image_init(cl_mem_image(image), image_desc-image_width, 1, 
 image_desc-image_type,
 -1, *image_format, intel_fmt, bpp, 
 image_desc-image_width*bpp, 0, CL_NO_TILE,
 -0, 0, offset);
 +  // Per bspec, a image should has a at least 2 line vertical alignment,
 +  // thus we can't simply attach a buffer to a 1d image surface which has 
 the same size.
 +  // We have to create a new image, and copy the buffer data to this new 
 image.
 +  // And replace all the buffer object's reference to this image.
 +  image = _cl_mem_new_image(ctx, flags, image_format, image_desc-image_type,
 +mem_buffer-base.size / bpp, 0, 0, 0, 0, NULL, 
 errcode_ret);
 +  if (image == NULL)
 +return NULL;
 +  void *src = cl_mem_map(buffer);
 +  void *dst = cl_mem_map(image);
 +  //
 +  // FIXME, we could use copy buffer to image to do this on GPU latter.
 +  // currently the copy buffer to image function doesn't support 1D image.
 +  memcpy(dst, src, mem_buffer-base.size);
 +  cl_mem_unmap(buffer);
 +  cl_mem_unmap(image);
 +
 +  if (err != 0)
 +goto error;
 + 
 +  // Now 

Re: [Beignet] [PATCH] driver: fix a potential Null reference.

2014-06-17 Thread He Junyan
There really is a risk here.
Thanks for fixing it.


On Tue, 2014-06-17 at 11:18 +0800, Zhigang Gong wrote:
 cl_gpgpu_flush may be called when the batch buffer has been
 released. We need to check whether there is a valid buffer
 before we really take the following actions.
 
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  src/intel/intel_gpgpu.c | 2 ++
  1 file changed, 2 insertions(+)
 
 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
 index 1da6400..6af6e40 100644
 --- a/src/intel/intel_gpgpu.c
 +++ b/src/intel/intel_gpgpu.c
 @@ -555,6 +555,8 @@ intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
  static void
  intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
  {
 +  if (!gpgpu-batch || !gpgpu-batch-buffer)
 +return;
intel_batchbuffer_emit_mi_flush(gpgpu-batch);
intel_batchbuffer_flush(gpgpu-batch);
intel_gpgpu_check_binded_buf_address(gpgpu);



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 2/2] runtime: fix some image array related bugs.

2014-06-17 Thread He Junyan
Thanks for finding the problem.
I think the key point is that for an array image it always uses the
slice_pitch rather than the image_row_pitch.
This patch set looks good to me.

As you mentioned, I will improve my utest case later, and you can
push my first 2 patches first to make your patch set work.


On Wed, 2014-06-18 at 10:25 +0800, Zhigang Gong wrote:
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  src/cl_api.c   | 5 -
  src/cl_device_id.c | 1 +
  src/cl_device_id.h | 1 +
  src/cl_gt_device.h | 1 +
  4 files changed, 7 insertions(+), 1 deletion(-)
 
 diff --git a/src/cl_api.c b/src/cl_api.c
 index 327f02b..d91 100644
 --- a/src/cl_api.c
 +++ b/src/cl_api.c
 @@ -674,7 +674,10 @@ clGetSupportedImageFormats(cl_context ctx,
  err = CL_INVALID_VALUE;
  goto error;
}
 -  if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE2D 
 +  if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE1D 
 +   image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY 
 +   image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY 
 +   image_type != CL_MEM_OBJECT_IMAGE2D 
 image_type != CL_MEM_OBJECT_IMAGE3D)) {
  err = CL_INVALID_VALUE;
  goto error;
 diff --git a/src/cl_device_id.c b/src/cl_device_id.c
 index af8e90c..578b548 100644
 --- a/src/cl_device_id.c
 +++ b/src/cl_device_id.c
 @@ -363,6 +363,7 @@ cl_get_device_info(cl_device_id device,
  DECL_FIELD(IMAGE_SUPPORT, image_support)
  DECL_FIELD(MAX_READ_IMAGE_ARGS, max_read_image_args)
  DECL_FIELD(MAX_WRITE_IMAGE_ARGS, max_write_image_args)
 +DECL_FIELD(IMAGE_MAX_ARRAY_SIZE, image_max_array_size)
  DECL_FIELD(IMAGE2D_MAX_WIDTH, image2d_max_width)
  DECL_FIELD(IMAGE2D_MAX_HEIGHT, image2d_max_height)
  DECL_FIELD(IMAGE3D_MAX_WIDTH, image3d_max_width)
 diff --git a/src/cl_device_id.h b/src/cl_device_id.h
 index a5449a7..769bfd2 100644
 --- a/src/cl_device_id.h
 +++ b/src/cl_device_id.h
 @@ -51,6 +51,7 @@ struct _cl_device_id {
cl_uint  max_read_image_args;
cl_uint  max_write_image_args;
size_t   image2d_max_width;
 +  size_t   image_max_array_size;
size_t   image2d_max_height;
size_t   image3d_max_width;
size_t   image3d_max_height;
 diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
 index b8bda5e..6d03123 100644
 --- a/src/cl_gt_device.h
 +++ b/src/cl_gt_device.h
 @@ -41,6 +41,7 @@
  .image_support = CL_TRUE,
  .max_read_image_args = 128,
  .max_write_image_args = 8,
 +.image_max_array_size = 2048,
  .image2d_max_width = 8192,
  .image2d_max_height = 8192,
  .image3d_max_width = 8192,



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 3/3] Add the test cases for 1D Image Array

2014-06-16 Thread He Junyan
for this patch set, the compare_image_2d_and_1d_array output:

## x y is (30, 15), color1 is (2 1 30 15), color2 is (2 1 30 30)
## x y is (31, 0), color1 is (2 1 31 0), color2 is (2 1 31 0)
## x y is (31, 1), color1 is (2 1 31 1), color2 is (2 1 31 2)
## x y is (31, 2), color1 is (2 1 31 2), color2 is (2 1 31 4)
## x y is (31, 3), color1 is (2 1 31 3), color2 is (2 1 31 6)
## x y is (31, 4), color1 is (2 1 31 4), color2 is (2 1 31 8)
## x y is (31, 5), color1 is (2 1 31 5), color2 is (2 1 31 10)
## x y is (31, 6), color1 is (2 1 31 6), color2 is (2 1 31 12)
## x y is (31, 7), color1 is (2 1 31 7), color2 is (2 1 31 14)
## x y is (31, 8), color1 is (2 1 31 8), color2 is (2 1 31 16)
## x y is (31, 9), color1 is (2 1 31 9), color2 is (2 1 31 18)
## x y is (31, 10), color1 is (2 1 31 10), color2 is (2 1 31 20)
## x y is (31, 11), color1 is (2 1 31 11), color2 is (2 1 31 22)
## x y is (31, 12), color1 is (2 1 31 12), color2 is (2 1 31 24)
## x y is (31, 13), color1 is (2 1 31 13), color2 is (2 1 31 26)
## x y is (31, 14), color1 is (2 1 31 14), color2 is (2 1 31 28)
## x y is (31, 15), color1 is (2 1 31 15), color2 is (2 1 31 30)

color1 is the result of image2d_t and color2 is the result of
image1d_array_t.
The h of the image1d_array_t always seems to be twice that of the image2d_t.
I have not been able to find the problem so far; any ideas?




On Tue, 2014-06-17 at 12:07 +0800, junyan...@inbox.com wrote:
 From: Junyan He junyan...@linux.intel.com
 
 Signed-off-by: Junyan He junyan...@linux.intel.com
 ---
  kernels/compare_image_2d_and_1d_array.cl | 12 +
  kernels/test_get_image_info_array.cl | 25 ++
  utests/CMakeLists.txt|  2 +
  utests/compare_image_2d_and_1d_array.cpp | 78 
 
  utests/compiler_get_image_info_array.cpp | 64 ++
  5 files changed, 181 insertions(+)
  create mode 100644 kernels/compare_image_2d_and_1d_array.cl
  create mode 100644 kernels/test_get_image_info_array.cl
  create mode 100644 utests/compare_image_2d_and_1d_array.cpp
  create mode 100644 utests/compiler_get_image_info_array.cpp
 
 diff --git a/kernels/compare_image_2d_and_1d_array.cl 
 b/kernels/compare_image_2d_and_1d_array.cl
 new file mode 100644
 index 000..ff25834
 --- /dev/null
 +++ b/kernels/compare_image_2d_and_1d_array.cl
 @@ -0,0 +1,12 @@
 +__kernel void
 +compare_image_2d_and_1d_array(image2d_t a1, image1d_array_t a2, sampler_t 
 sampler)
 +{
 +  int2 coord;
 +  int4 color1;
 +  int4 color2;
 +  coord.x =  get_global_id(0);
 +  coord.y = get_global_id(1);
 +  color1 = read_imagei(a1, sampler, coord);
 +  color2 = read_imagei(a2, sampler, coord);
 +  printf(## x y is (%d, %d), color1 is (%d %d %d %d), color2 is (%d 
 %d %d %d)\n, coord.x, coord.y, color1.x, color1.y, color1.z, color1.w, 
 color2.x, color2.y, color2.z, color2.w);
 +}
 diff --git a/kernels/test_get_image_info_array.cl 
 b/kernels/test_get_image_info_array.cl
 new file mode 100644
 index 000..333da77
 --- /dev/null
 +++ b/kernels/test_get_image_info_array.cl
 @@ -0,0 +1,25 @@
 +__kernel void
 +test_get_image_info_array(__write_only image1d_array_t a1, __write_only 
 image2d_array_t a2, __global int *result)
 +{
 +  int w, h, array_sz;
 +
 +  w = get_image_width(a1);
 +  array_sz = (int)get_image_array_size(a1);
 +  int channel_data_type = get_image_channel_data_type(a1);
 +  int channel_order = get_image_channel_order(a1);
 +  result[0] = w;
 +  result[1] = array_sz;
 +  result[2] = channel_data_type;
 +  result[3] = channel_order;
 +
 +  w = get_image_width(a2);
 +  h = get_image_height(a2);
 +  array_sz = (int)get_image_array_size(a2);
 +  channel_data_type = get_image_channel_data_type(a2);
 +  channel_order = get_image_channel_order(a2);
 +  result[4] = w;
 +  result[5] = h;
 +  result[6] = array_sz;
 +  result[7] = channel_data_type;
 +  result[8] = channel_order;
 +}
 diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
 index f0e62e2..641a73b 100644
 --- a/utests/CMakeLists.txt
 +++ b/utests/CMakeLists.txt
 @@ -122,6 +122,7 @@ set (utests_sources
compiler_volatile.cpp
compiler_copy_image1.cpp
compiler_get_image_info.cpp
 +  compiler_get_image_info_array.cpp
compiler_vect_compare.cpp
compiler_vector_load_store.cpp
compiler_vector_inc.cpp
 @@ -182,6 +183,7 @@ set (utests_sources
enqueue_fill_buf.cpp
enqueue_built_in_kernels.cpp
image_1D_buffer.cpp
 +  compare_image_2d_and_1d_array.cpp
utest_assert.cpp
utest.cpp
utest_file_map.cpp
 diff --git a/utests/compare_image_2d_and_1d_array.cpp 
 b/utests/compare_image_2d_and_1d_array.cpp
 new file mode 100644
 index 000..f989049
 --- /dev/null
 +++ b/utests/compare_image_2d_and_1d_array.cpp
 @@ -0,0 +1,78 @@
 +#include string.h
 +#include utest_helper.hpp
 +
 +static void compare_image_2d_and_1d_array(void)
 +{
 +  const int w = 64

Re: [Beignet] [PATCH] HSW: Fix potential issue of GT3 when calc stack address.

2014-06-12 Thread He Junyan
Tested on my HSW platform, 
no obvious regression found.


On Thu, 2014-06-12 at 19:42 +0800, Yang Rong wrote:
 GT3 has 4 half slices, so we should shift left by 2 bits, and should also enlarge 
 the stack buffer size;
 otherwise, if thread scheduling is unbalanced, accesses may go out of bounds.
 Per bspec, the scratch size needs to be set to 2X the desired size.
 
 Signed-off-by: Yang Rong rong.r.y...@intel.com
 ---
  backend/src/backend/gen75_context.cpp | 4 ++--
  src/cl_command_queue_gen7.c   | 6 ++
  src/intel/intel_gpgpu.c   | 3 +++
  3 files changed, 11 insertions(+), 2 deletions(-)
 
 diff --git a/backend/src/backend/gen75_context.cpp 
 b/backend/src/backend/gen75_context.cpp
 index aedd4d3..da0db85 100644
 --- a/backend/src/backend/gen75_context.cpp
 +++ b/backend/src/backend/gen75_context.cpp
 @@ -92,12 +92,12 @@ namespace gbe
p-curr.predicate = GEN_PREDICATE_NONE;
//p-AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), 
 GenRegister::immud(0x1ff));
p-AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), 
 GenRegister::immud(0x7f));
 -  p-AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), 
 GenRegister::immud(0x80));
 +  p-AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), 
 GenRegister::immud(0x180));
p-SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), 
 GenRegister::immud(7));
p-curr.execWidth = this-simdWidth;
p-SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
p-curr.execWidth = 1;
 -  p-SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), 
 GenRegister::immud(1));
 +  p-SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), 
 GenRegister::immud(2));
p-ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), 
 GenRegister::ud1grf(126, 4));
p-SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), 
 GenRegister::immud(perThreadShift));
p-curr.execWidth = this-simdWidth;
 diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
 index 9680535..af3030c 100644
 --- a/src/cl_command_queue_gen7.c
 +++ b/src/cl_command_queue_gen7.c
 @@ -244,6 +244,12 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
assert(offset = 0);
stack_sz *= gbe_kernel_get_simd_width(ker-opaque);
stack_sz *= device-max_compute_unit;
 +  /* Because HSW calc stack offset per thread is relative with half slice, 
 when
 + thread schedule in half slice is not balance, would out of bound. 
 Because
 + the max half slice is 4 in GT4, multiply stack size with 4 for safe.
 +   */
 +  if(cl_driver_get_ver(ctx-drv) == 75)
 +stack_sz *= 4;
cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl());
  }
  
 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
 index 5093583..cae843b 100644
 --- a/src/intel/intel_gpgpu.c
 +++ b/src/intel/intel_gpgpu.c
 @@ -833,6 +833,9 @@ intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t 
 per_thread_size)
drm_intel_bufmgr *bufmgr = gpgpu-drv-bufmgr;
drm_intel_bo* old = gpgpu-scratch_b.bo;
uint32_t total = per_thread_size * gpgpu-max_threads;
 +  /* Per Bspec, scratch should 2X the desired size, otherwise luxmark may 
 hang */
 +  if (IS_HASWELL(gpgpu-drv-device_id))
 +  total *= 2;
  
gpgpu-per_thread_scratch = per_thread_size;
  



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Fix the 3D failed problem because the un-inited parameters

2014-06-12 Thread He Junyan
Sorry, this patch is for opencl-1.2

On Thu, 2014-06-12 at 14:55 +0800, junyan...@inbox.com wrote:
 From: Junyan He junyan...@linux.intel.com
 
 Signed-off-by: Junyan He junyan...@linux.intel.com
 ---
  utests/compiler_fill_image_3d.cpp   | 4 
  utests/compiler_fill_image_3d_2.cpp | 4 
  2 files changed, 8 insertions(+)
 
 diff --git a/utests/compiler_fill_image_3d.cpp 
 b/utests/compiler_fill_image_3d.cpp
 index ac0b7e0..ec96e80 100644
 --- a/utests/compiler_fill_image_3d.cpp
 +++ b/utests/compiler_fill_image_3d.cpp
 @@ -1,3 +1,4 @@
 +#include string.h
  #include utest_helper.hpp
  
  static void compiler_fill_image_3d(void)
 @@ -9,6 +10,9 @@ static void compiler_fill_image_3d(void)
cl_image_format format;
cl_image_desc desc;
  
 +  memset(desc, 0x0, sizeof(cl_image_desc));
 +  memset(format, 0x0, sizeof(cl_image_format));
 +
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = CL_UNSIGNED_INT8;
desc.image_type = CL_MEM_OBJECT_IMAGE3D;
 diff --git a/utests/compiler_fill_image_3d_2.cpp 
 b/utests/compiler_fill_image_3d_2.cpp
 index 4c56036..410ace8 100644
 --- a/utests/compiler_fill_image_3d_2.cpp
 +++ b/utests/compiler_fill_image_3d_2.cpp
 @@ -1,3 +1,4 @@
 +#include string.h
  #include utest_helper.hpp
  
  static void compiler_fill_image_3d_2(void)
 @@ -8,6 +9,9 @@ static void compiler_fill_image_3d_2(void)
cl_image_format format;
cl_image_desc desc;
  
 +  memset(desc, 0x0, sizeof(cl_image_desc));
 +  memset(format, 0x0, sizeof(cl_image_format));
 +
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = CL_UNSIGNED_INT8;
desc.image_type = CL_MEM_OBJECT_IMAGE3D;



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [[OpenCL-1.2]] GBE: Enable some implemented Opencl 1.2 functions in icd table.

2014-06-10 Thread He Junyan
OK, 
if it is mandatory, I agree.
LGTM.


On Tue, 2014-06-10 at 14:07 +0800, Zhigang Gong wrote:
 Just as we discussed, for the mandatory APIs, we had better just keep them
 as NULL if we haven't implemented them.
 
 And if we want to set some stub function, we should not set the stub
 function here. We should implement a dummy
 function in the cl_api.c and just put a NOT_SUPPORT there. And then remove
 the CL_1_2_NOTYET here. In one word,
 we don't need to add a fake stub function here.
 
 Any further comments?
 
  -Original Message-
  From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
 He
  Junyan
  Sent: Tuesday, June 10, 2014 12:33 PM
  To: Zhigang Gong
  Cc: beignet@lists.freedesktop.org
  Subject: Re: [Beignet] [[OpenCL-1.2]] GBE: Enable some implemented Opencl
  1.2 functions in icd table.
  
  hi,
  
  I want to add a fake stub function here, printf the warning of not
 implement,
  and return CL_SOME_ERRORXX, the null function always cause the program
  crash.
  
  On Tue, 2014-06-10 at 09:01 +0800, Zhigang Gong wrote:
   Signed-off-by: Zhigang Gong zhigang.g...@intel.com
   ---
src/cl_khr_icd.c | 14 +++---
1 file changed, 7 insertions(+), 7 deletions(-)
  
   diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c index
   d601134..3a7dec2 100644
   --- a/src/cl_khr_icd.c
   +++ b/src/cl_khr_icd.c
   @@ -142,19 +142,19 @@ struct _cl_icd_dispatch const
   cl_khr_icd_dispatch = {  #ifdef CL_VERSION_1_2
  (void *) NULL,
  CL_1_2_NOTYET(clCreateSubDevices),
   -  CL_1_2_NOTYET(clRetainDevice),
   -  CL_1_2_NOTYET(clReleaseDevice),
   -  CL_1_2_NOTYET(clCreateImage),
   -  CL_1_2_NOTYET(clCreateProgramWithBuiltInKernels),
   +  clRetainDevice,
   +  clReleaseDevice,
   +  clCreateImage,
   +  clCreateProgramWithBuiltInKernels,
  CL_1_2_NOTYET(clCompileProgram),
  CL_1_2_NOTYET(clLinkProgram),
  CL_1_2_NOTYET(clUnloadPlatformCompiler),
  CL_1_2_NOTYET(clGetKernelArgInfo),
   -  CL_1_2_NOTYET(clEnqueueFillBuffer),
   +  clEnqueueFillBuffer,
  CL_1_2_NOTYET(clEnqueueFillImage),
  CL_1_2_NOTYET(clEnqueueMigrateMemObjects),
   -  CL_1_2_NOTYET(clEnqueueMarkerWithWaitList),
   -  CL_1_2_NOTYET(clEnqueueBarrierWithWaitList),
   +  clEnqueueMarkerWithWaitList,
   +  clEnqueueBarrierWithWaitList,
  CL_1_2_NOTYET(clGetExtensionFunctionAddressForPlatform),
  CL_GL_INTEROP(clCreateFromGLTexture),
  (void *) NULL,
  
  
  
  ___
  Beignet mailing list
  Beignet@lists.freedesktop.org
  http://lists.freedesktop.org/mailman/listinfo/beignet
 



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Handle the difference timestamp count, got from drm_intel_reg_read, between 32bit system and 64bit system.

2014-06-10 Thread He Junyan
I verified this patch on both my 32-bit and 64-bit IVB platforms.
Both results are OK, and both pass the profiling_exec test case.


On Tue, 2014-06-10 at 16:17 +0800, Yang Rong wrote:
 On x86_64 systems, the low 32 bits of the timestamp count are stored in the high 32 
 bits of the result
 returned by drm_intel_reg_read, and bits 32-35 are lost; on i386 systems, the 
 timestamp count matches bspec.
 This seems to be a kernel readq bug. So shift by 32 bits on x86_64, and keep only 32 
 bits of data on i386.
 
 Signed-off-by: Yang Rong rong.r.y...@intel.com
 ---
  src/intel/intel_gpgpu.c | 31 ---
  1 file changed, 16 insertions(+), 15 deletions(-)
 
 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
 index a1bd672..7aa5563 100644
 --- a/src/intel/intel_gpgpu.c
 +++ b/src/intel/intel_gpgpu.c
 @@ -1170,12 +1170,16 @@ 
 intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* 
 ret_ts)
drm_intel_bufmgr *bufmgr = gpgpu-drv-bufmgr;
  
drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, result);
 -  if (IS_HASWELL(gpgpu-drv-device_id)) {
 -result = result  0x000F;
 -  } else {
 -result = result  0xF000;
 -result = result  28;
 -  }
 +  /* In x86_64 system, the low 32bits of timestamp count are stored in the 
 high 32 bits of
 + result which got from drm_intel_reg_read, and 32-35 bits are lost; but 
 match bspec in
 + i386 system. It seems the kernel readq bug. So shift 32 bit in x86_64, 
 and only remain
 + 32 bits data in i386.
 +  */
 +#ifdef __i386__
 +  result = result  0x0;
 +#else
 +  result = result  32;
 +#endif  /* __i386__  */
result *= 80;
  
*ret_ts = result;
 @@ -1195,15 +1199,12 @@ intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* 
 gpgpu, intel_event_t *event,
uint64_t* ptr = event-ts_buf-virtual;
result = ptr[index];
  
 -  if (IS_HASWELL(gpgpu-drv-device_id))
 -result = (result  0xF) * 80; //convert to nanoseconds
 -  else
 -/* According to BSpec, the timestamp counter should be 36 bits,
 -   but comparing to the timestamp counter from IO control reading,
 -   we find the first 4 bits seems to be fake. In order to keep the
 -   timestamp counter conformable, we just skip the first 4 bits.
 - */
 -result = ((result  0x0)  4) * 80; //convert to nanoseconds
 +  /* According to BSpec, the timestamp counter should be 36 bits,
 + but comparing to the timestamp counter from IO control reading,
 + we find the first 4 bits seems to be fake. In order to keep the
 + timestamp counter conformable, we just skip the first 4 bits.
 +  */
 +  result = (result  0x0) * 80; //convert to nanoseconds
*ret_ts = result;
  
drm_intel_gem_bo_unmap_gtt(event-ts_buf);



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [[OpenCL-1.2]] GBE: Enable some implemented Opencl 1.2 functions in icd table.

2014-06-09 Thread He Junyan
hi,

I want to add a fake stub function here that prints a "not implemented"
warning and returns CL_SOME_ERRORXX; a NULL function pointer always causes
the program to crash.

On Tue, 2014-06-10 at 09:01 +0800, Zhigang Gong wrote:
 Signed-off-by: Zhigang Gong zhigang.g...@intel.com
 ---
  src/cl_khr_icd.c | 14 +++---
  1 file changed, 7 insertions(+), 7 deletions(-)
 
 diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
 index d601134..3a7dec2 100644
 --- a/src/cl_khr_icd.c
 +++ b/src/cl_khr_icd.c
 @@ -142,19 +142,19 @@ struct _cl_icd_dispatch const cl_khr_icd_dispatch = {
  #ifdef CL_VERSION_1_2
(void *) NULL,
CL_1_2_NOTYET(clCreateSubDevices),
 -  CL_1_2_NOTYET(clRetainDevice),
 -  CL_1_2_NOTYET(clReleaseDevice),
 -  CL_1_2_NOTYET(clCreateImage),
 -  CL_1_2_NOTYET(clCreateProgramWithBuiltInKernels),
 +  clRetainDevice,
 +  clReleaseDevice,
 +  clCreateImage,
 +  clCreateProgramWithBuiltInKernels,
CL_1_2_NOTYET(clCompileProgram),
CL_1_2_NOTYET(clLinkProgram),
CL_1_2_NOTYET(clUnloadPlatformCompiler),
CL_1_2_NOTYET(clGetKernelArgInfo),
 -  CL_1_2_NOTYET(clEnqueueFillBuffer),
 +  clEnqueueFillBuffer,
CL_1_2_NOTYET(clEnqueueFillImage),
CL_1_2_NOTYET(clEnqueueMigrateMemObjects),
 -  CL_1_2_NOTYET(clEnqueueMarkerWithWaitList),
 -  CL_1_2_NOTYET(clEnqueueBarrierWithWaitList),
 +  clEnqueueMarkerWithWaitList,
 +  clEnqueueBarrierWithWaitList,
CL_1_2_NOTYET(clGetExtensionFunctionAddressForPlatform),
CL_GL_INTEROP(clCreateFromGLTexture),
(void *) NULL,



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH V2] add [opencl-1.2] API clCreateSubDevice.

2014-06-09 Thread He Junyan
That's OK


On Fri, 2014-06-06 at 05:07 +0800, xionghu@intel.com wrote:
 From: Luo xionghu@intel.com
 
 creates an array of sub-devices that each reference a non-intersecting
 set of compute units within in_device, according to a partition scheme
 given by properties.
 ---
  src/cl_api.c   | 10 --
  src/cl_device_id.c |  6 ++
  src/cl_device_id.h |  7 +++
  src/cl_gt_device.h |  7 ++-
  4 files changed, 27 insertions(+), 3 deletions(-)
 
 diff --git a/src/cl_api.c b/src/cl_api.c
 index 8598088..8264970 100644
 --- a/src/cl_api.c
 +++ b/src/cl_api.c
 @@ -242,8 +242,14 @@ clCreateSubDevices(cl_device_id 
 in_device,
 cl_device_id *   out_devices,
 cl_uint *num_devices_ret)
  {
 -  NOT_IMPLEMENTED;
 -  return 0;
 +  /* Check parameter consistency */
 +  if (UNLIKELY(out_devices == NULL  num_devices_ret == NULL))
 +return CL_INVALID_VALUE;
 +  if (UNLIKELY(in_device == NULL  properties == NULL))
 +return CL_INVALID_VALUE;
 +
 +  *num_devices_ret = 0;
 +  return CL_INVALID_DEVICE_PARTITION_COUNT;
  }
  
  cl_int
 diff --git a/src/cl_device_id.c b/src/cl_device_id.c
 index 8ec7741..df37519 100644
 --- a/src/cl_device_id.c
 +++ b/src/cl_device_id.c
 @@ -393,6 +393,12 @@ cl_get_device_info(cl_device_id device,
  DECL_STRING_FIELD(OPENCL_C_VERSION, opencl_c_version)
  DECL_STRING_FIELD(EXTENSIONS, extensions);
  DECL_STRING_FIELD(BUILT_IN_KERNELS, built_in_kernels)
 +DECL_FIELD(PARENT_DEVICE, parent_device)
 +DECL_FIELD(PARTITION_MAX_SUB_DEVICES, partition_max_sub_device)
 +DECL_FIELD(PARTITION_PROPERTIES, partition_property)
 +DECL_FIELD(PARTITION_AFFINITY_DOMAIN, affinity_domain)
 +DECL_FIELD(PARTITION_TYPE, partition_type)
 +DECL_FIELD(REFERENCE_COUNT, device_reference_count)
  
  case CL_DRIVER_VERSION:
if (param_value_size_ret) {
 diff --git a/src/cl_device_id.h b/src/cl_device_id.h
 index 2bbe98e..a5449a7 100644
 --- a/src/cl_device_id.h
 +++ b/src/cl_device_id.h
 @@ -98,6 +98,13 @@ struct _cl_device_id {
/* Kernel specific info that we're assigning statically */
size_t wg_sz;
size_t preferred_wg_sz_mul;
 +  /* SubDevice specific info */
 +  cl_device_id parent_device;
 +  cl_uint  partition_max_sub_device;
 +  cl_device_partition_property partition_property[3];
 +  cl_device_affinity_domainaffinity_domain;
 +  cl_device_partition_property partition_type[3];
 +  cl_uint  device_reference_count;
  };
  
  /* Get a device from the given platform */
 diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
 index cab2c58..b8bda5e 100644
 --- a/src/cl_gt_device.h
 +++ b/src/cl_gt_device.h
 @@ -101,5 +101,10 @@ DECL_INFO_STRING(built_in_kernels, 
 __cl_copy_region_align4;
  
  DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
  #undef DECL_INFO_STRING
 -
 +.parent_device = NULL,
 +.partition_max_sub_device = 1,
 +.partition_property = {0},
 +.affinity_domain = 0,
 +.partition_type = {0},
 +.device_reference_count = 1,
  



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 5/8] HSW: Use the drm flag I915_EXEC_ENABLE_SLM to set L3 control config.

2014-06-05 Thread He Junyan
Hi

I see that this patch uses
drm_intel_gem_context_create, which calls the kernel's
DRM_IOCTL_I915_GEM_CONTEXT_CREATE ioctl.
That ioctl is only implemented after kernel version 3.10.

So if the kernel version is before 3.10,
assert(driver->ctx);
will fail.

This means Beignet no longer supports old kernel versions,
so I think we should update the README.


On Mon, 2014-05-12 at 23:12 +0800, Yang Rong wrote:
 Because LRI commands will be converted to NOOP, add the I915_EXEC_ENABLE_SLM
 flag to the drm kernel driver, to enable SLM in the L3. Set the flag when
 the application uses SLM. Still keep the L3 config in the batch buffer for fulsim.
 Also create and use OpenCL's own context when executing, to avoid affecting
 other contexts.
 
 Signed-off-by: Yang Rong rong.r.y...@intel.com
 ---
  src/intel/intel_batchbuffer.c | 10 +-
  src/intel/intel_batchbuffer.h |  3 +++
  src/intel/intel_driver.c  | 19 ++
  src/intel/intel_driver.h  |  1 +
  src/intel/intel_gpgpu.c   | 46 
 +++
  5 files changed, 74 insertions(+), 5 deletions(-)
 
 diff --git a/src/intel/intel_batchbuffer.c b/src/intel/intel_batchbuffer.c
 index 62eedd0..19dc901 100644
 --- a/src/intel/intel_batchbuffer.c
 +++ b/src/intel/intel_batchbuffer.c
 @@ -74,6 +74,7 @@ intel_batchbuffer_reset(intel_batchbuffer_t *batch, size_t 
 sz)
batch-ptr = batch-map;
batch-atomic = 0;
batch-last_bo = batch-buffer;
 +  batch-enable_slm = 0;
  }
  
  LOCAL void
 @@ -119,7 +120,14 @@ intel_batchbuffer_flush(intel_batchbuffer_t *batch)
if (!is_locked)
  intel_driver_lock_hardware(batch-intel);
  
 -  dri_bo_exec(batch-buffer, used, 0, 0, 0);
 +  int flag = I915_EXEC_RENDER;
 +  if(batch-enable_slm) {
 +/* use the hard code here temp, must change to
 + * I915_EXEC_ENABLE_SLM when it drm accept the patch */
 +flag |= (113);
 +  }
 +  drm_intel_gem_bo_context_exec(batch-buffer, batch-intel-ctx, used, 
 flag);
 +
if (!is_locked)
  intel_driver_unlock_hardware(batch-intel);
  
 diff --git a/src/intel/intel_batchbuffer.h b/src/intel/intel_batchbuffer.h
 index 74f1790..0c3bc13 100644
 --- a/src/intel/intel_batchbuffer.h
 +++ b/src/intel/intel_batchbuffer.h
 @@ -83,6 +83,9 @@ typedef struct intel_batchbuffer
uint32_t size;
uint8_t *map;
uint8_t *ptr;
 +  /** HSW: can't set LRI in batch buffer, set I915_EXEC_ENABLE_SLM
 +   *  flag when call exec. */
 +  uint8_t enable_slm;
int atomic;
  } intel_batchbuffer_t;
  
 diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
 index ef97835..08d6bc0 100644
 --- a/src/intel/intel_driver.c
 +++ b/src/intel/intel_driver.c
 @@ -106,6 +106,7 @@ intel_driver_delete(intel_driver_t *driver)
  {
if (driver == NULL)
  return;
 +
if (driver-bufmgr)
  drm_intel_bufmgr_destroy(driver-bufmgr);
cl_free(driver);
 @@ -139,6 +140,21 @@ intel_driver_memman_init(intel_driver_t *driver)
drm_intel_bufmgr_gem_enable_reuse(driver-bufmgr);
  }
  
 +static void
 +intel_driver_context_init(intel_driver_t *driver)
 +{
 +  driver-ctx = drm_intel_gem_context_create(driver-bufmgr);
 +  assert(driver-ctx);
 +}
 +
 +static void
 +intel_driver_context_destroy(intel_driver_t *driver)
 +{
 +  if(driver-ctx)
 +drm_intel_gem_context_destroy(driver-ctx);
 +  driver-ctx = NULL;
 +}
 +
  static void 
  intel_driver_init(intel_driver_t *driver, int dev_fd)
  {
 @@ -151,6 +167,7 @@ intel_driver_init(intel_driver_t *driver, int dev_fd)
intel_driver_get_param(driver, I915_PARAM_CHIPSET_ID, driver-device_id);
assert(res);
intel_driver_memman_init(driver);
 +  intel_driver_context_init(driver);
  
  #if EMULATE_GEN
driver-gen_ver = EMULATE_GEN;
 @@ -364,6 +381,7 @@ intel_get_device_id(void)
assert(driver != NULL);
intel_driver_open(driver, NULL);
intel_device_id = driver-device_id;
 +  intel_driver_context_destroy(driver);
intel_driver_close(driver);
intel_driver_terminate(driver);
intel_driver_delete(driver);
 @@ -376,6 +394,7 @@ cl_intel_driver_delete(intel_driver_t *driver)
  {
if (driver == NULL)
  return;
 +  intel_driver_context_destroy(driver);
intel_driver_close(driver);
intel_driver_terminate(driver);
intel_driver_delete(driver);
 diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
 index a01d881..34efbbb 100644
 --- a/src/intel/intel_driver.h
 +++ b/src/intel/intel_driver.h
 @@ -78,6 +78,7 @@ typedef struct _XDisplay Display;
  typedef struct intel_driver
  {
dri_bufmgr *bufmgr;
 +  drm_intel_context *ctx;
int fd;
int device_id;
int gen_ver;
 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
 index 603a075..103a4b2 100644
 --- a/src/intel/intel_gpgpu.c
 +++ b/src/intel/intel_gpgpu.c
 @@ -118,6 +118,8 @@ struct intel_gpgpu
  
  typedef struct intel_gpgpu intel_gpgpu_t;
  
 +typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
 +intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL;
  
  static void
  

Re: [Beignet] [PATCH] Fix cl_event_get_timestamp for submit and queued

2014-06-04 Thread He Junyan
So, you apply the same logic as in the function
intel_gpgpu_event_get_exec_timestamp here.

The BSpec really does have some mistakes.
But for most IVB platforms,
-result = result  0xF000;
-result = result  28;
works very well.

So I think if you want to correct this, you should add a PCI ID check
like the one for HSW.
 An IS_<platform>(gpgpu->drv->device_id) check should be added here.

If you do not know how to do it, please tell us the PCI ID of your
Baytrail-I E3827.



On Wed, 2014-06-04 at 16:15 +, michael.j.fergu...@l-3com.com wrote:
 commit a9ab94503348068579e8e816e80eb62598fd7f5f
 Author: Michael Ferguson michael.j.fergu...@l-3com.com
 Date:   Fri May 30 11:32:36 2014 -0600
 
 Fix cl_event_get_timestamp for submit and queued
 
 The cl_gpgpu_event_get_gpu_cur_timestamp function did not apply the same 
 logic as the cl_gpgpu_event_get_exec_timestamp regarding the timestamp 
 counter on the Baytrail, which resulted in a bogus GPU current timestamp.
 
 Tests on the Baytrail-I E3827 indicated the following clock values in the 
 profiling_exec test before this patch:
 
 queued = 1920
 submit = 1920
 start  = 2762442307840
 end= 2762442351360
 
 Obviously these values were not correct for the queued and submit 
 counters. After applying this patch the values in the profiling_exec test 
 indicated:
 
 queued = 320306542080
 submit = 320306617600
 start  = 320308817920
 end= 320308857600
 
 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
 index bde9bd5..22e04f5 100644
 --- a/src/intel/intel_gpgpu.c
 +++ b/src/intel/intel_gpgpu.c
 @@ -1138,8 +1138,12 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* 
 gpgpu, uint64_t* ret_ts)
if (IS_HASWELL(gpgpu-drv-device_id)) {
  result = result  0x000F;
} else {
 -result = result  0xF000;
 -result = result  28;
 +/* According to BSpec, the timestamp counter should be 36 bits,
 +   but comparing to the timestamp counter from IO control reading,
 +   we find the first 4 bits seems to be fake. In order to keep the
 +   timestamp counter conformable, we just skip the first 4 bits.
 + */
 +result = (result  0x0)  4;
}
result *= 80;
 
 ___
 Beignet mailing list
 Beignet@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/beignet



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Fix timestamp on HASWELL

2014-05-30 Thread He Junyan
OK, This patch LGTM

On Mon, 2014-05-26 at 19:25 +0800, Li Peng wrote:
 The GPU timestamp should be lower 36 bit on HASWELL
 
 Signed-off-by: Li Peng peng...@intel.com
 ---
  src/cl_driver.h |  2 +-
  src/cl_event.c  |  4 ++--
  src/intel/intel_gpgpu.c | 26 +-
  3 files changed, 20 insertions(+), 12 deletions(-)
 
 diff --git a/src/cl_driver.h b/src/cl_driver.h
 index 9dc2330..3e01c92 100644
 --- a/src/cl_driver.h
 +++ b/src/cl_driver.h
 @@ -193,7 +193,7 @@ typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
  extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
  
  /* Get a event time stamp */
 -typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu_event, int, 
 uint64_t*);
 +typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, 
 cl_gpgpu_event, int, uint64_t*);
  extern cl_gpgpu_event_get_exec_timestamp_cb 
 *cl_gpgpu_event_get_exec_timestamp;
  
  /* Get current GPU time stamp */
 diff --git a/src/cl_event.c b/src/cl_event.c
 index 727ee1f..30e0e06 100644
 --- a/src/cl_event.c
 +++ b/src/cl_event.c
 @@ -514,11 +514,11 @@ cl_int cl_event_get_timestamp(cl_event event, 
 cl_profiling_info param_name)
  event-timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
  return CL_SUCCESS;
} else if(param_name == CL_PROFILING_COMMAND_START) {
 -cl_gpgpu_event_get_exec_timestamp(event-gpgpu_event, 0, ret_val);
 +cl_gpgpu_event_get_exec_timestamp(gpgpu, event-gpgpu_event, 0, 
 ret_val);
  event-timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
  return CL_SUCCESS;
} else if (param_name == CL_PROFILING_COMMAND_END) {
 -cl_gpgpu_event_get_exec_timestamp(event-gpgpu_event, 1, ret_val);
 +cl_gpgpu_event_get_exec_timestamp(gpgpu, event-gpgpu_event, 1, 
 ret_val);
  event-timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
  return CL_SUCCESS;
}
 diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
 index b7b712f..2ab2bb7 100644
 --- a/src/intel/intel_gpgpu.c
 +++ b/src/intel/intel_gpgpu.c
 @@ -1110,8 +1110,12 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* 
 gpgpu, uint64_t* ret_ts)
drm_intel_bufmgr *bufmgr = gpgpu-drv-bufmgr;
  
drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, result);
 -  result = result  0xF000;
 -  result = result  28;
 +  if (IS_HASWELL(gpgpu-drv-device_id)) {
 +result = result  0x000F;
 +  } else {
 +result = result  0xF000;
 +result = result  28;
 +  }
result *= 80;
  
*ret_ts = result;
 @@ -1120,8 +1124,8 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* 
 gpgpu, uint64_t* ret_ts)
  
  /* Get the GPU execute time. */
  static void
 -intel_gpgpu_event_get_exec_timestamp(intel_event_t *event,
 -int index, uint64_t* ret_ts)
 +intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t 
 *event,
 +  int index, uint64_t* ret_ts)
  {
uint64_t result = 0;
  
 @@ -1131,11 +1135,15 @@ intel_gpgpu_event_get_exec_timestamp(intel_event_t 
 *event,
uint64_t* ptr = event-ts_buf-virtual;
result = ptr[index];
  
 -  /* According to BSpec, the timestamp counter should be 36 bits,
 - but comparing to the timestamp counter from IO control reading,
 - we find the first 4 bits seems to be fake. In order to keep the
 - timestamp counter conformable, we just skip the first 4 bits. */
 -  result = ((result  0x0)  4) * 80; //convert to nanoseconds
 +  if (IS_HASWELL(gpgpu-drv-device_id))
 +result = (result  0xF) * 80; //convert to nanoseconds
 +  else
 +/* According to BSpec, the timestamp counter should be 36 bits,
 +   but comparing to the timestamp counter from IO control reading,
 +   we find the first 4 bits seems to be fake. In order to keep the
 +   timestamp counter conformable, we just skip the first 4 bits.
 + */
 +result = ((result  0x0)  4) * 80; //convert to nanoseconds
*ret_ts = result;
  
drm_intel_gem_bo_unmap_gtt(event-ts_buf);



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 4/5] HSW: enable the surface's cache in HSW.

2014-05-29 Thread He Junyan
I tested the whole patch series on my HSW platform and
didn't find any obvious regressions.


On Fri, 2014-05-30 at 09:22 +0800, Zhigang Gong wrote:
 On Fri, May 30, 2014 at 12:37:33AM +0800, Yang Rong wrote:
  HSW's surface cache control is changed, correct it. And also disable
  exec flag for slm. When kernel parse cmd finish, need remove it totally
  
  Signed-off-by: Yang Rong rong.r.y...@intel.com
  ---
   src/cl_command_queue.c  |  4 +--
   src/cl_command_queue_gen7.c |  4 +--
   src/cl_device_id.c  |  2 +-
   src/cl_driver.h | 19 +-
   src/cl_driver_defs.c|  1 +
   src/intel/intel_gpgpu.c | 61 
  -
   6 files changed, 62 insertions(+), 29 deletions(-)
  
   LOCAL cl_int
  diff --git a/src/cl_device_id.c b/src/cl_device_id.c
  index 018da95..538c88a 100644
  --- a/src/cl_device_id.c
  +++ b/src/cl_device_id.c
  @@ -86,7 +86,7 @@ static struct _cl_device_id intel_hsw_gt2_device = {
 .max_compute_unit = 140,
 .max_thread_per_unit = 7,
 .max_work_item_sizes = {512, 512, 512},
  -  .max_work_group_size = 512,
  +  .max_work_group_size = 1024,
 Why change max work group size in this patch?
 
   static void
   intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
   {
  -  const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */
  +  const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache 
  Control value */
 BEGIN_BATCH(gpgpu-batch, 10);
 OUT_BATCH(gpgpu-batch, CMD_STATE_BASE_ADDRESS | 8);
 /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write 
  Back */
  @@ -233,12 +245,12 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
 ADVANCE_BATCH(gpgpu-batch);
   }
   
  -uint32_t get_scratch_index_gen7(uint32_t size) {
  +uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) {
 return size / 1024 - 1;
   }
   
  -uint32_t get_scratch_index_gen75(uint32_t size) {
  -size = size  12;
  +uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
  +size = size  11;
 So this patch also fix the scratch configuration? right? If it is expected, I 
 think you
 may need to add related info into the commit log.
 
  @@ -411,25 +421,29 @@ static void
   intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
   {
 /* still set L3 in batch buffer for fulsim. */
  -  BEGIN_BATCH(gpgpu-batch, 6);
  +  BEGIN_BATCH(gpgpu-batch, 9);
  +  OUT_BATCH(gpgpu-batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
  +  OUT_BATCH(gpgpu-batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
  +  OUT_BATCH(gpgpu-batch, 0x0061);
  +
 OUT_BATCH(gpgpu-batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
 OUT_BATCH(gpgpu-batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
  +
 if (use_slm)
  -OUT_BATCH(gpgpu-batch, gpgpu_l3_config_reg1[8]);
  +OUT_BATCH(gpgpu-batch, gpgpu_l3_config_reg1[12]);
 can we change to use a specific value here rather than to pick a value from 
 magic array?
 Just as baytrail, if the register definition is published on 01.org, a 
 meaningful comment
 is also nice to have.
 
 Other part LGTM.
 
 ___
 Beignet mailing list
 Beignet@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/beignet



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Refine the cl thread implement for queue.

2014-05-26 Thread He Junyan
I think the batch buffer is just one kind of resource.
Other resources, such as image buffers and data buffers, also
need to be kept intact for each thread,
unless you add a lock at the NDRQueue entrance, where we begin to prepare
all the required resources. But I think that lock would be too heavy.



On Mon, 2014-05-26 at 09:37 +0800, Zhigang Gong wrote:
 And I just checked the clFinish and clFlush, they only need to access the
 batch buffer.
 So the root cause is that we always allocate a new batch buffer for a new
 kernel submitting
 for a queue. Even if there are many kernel enqueuing on the same queue.
 
 If we can maintain a uniform batch buffer for the single queue, then this
 issue will be
 solved clearly and gracefully.
 
 IMO, this is not the OpenCL spec issue. This is a implementation issue which
 we should solved
 In the future. What's your opinion?
 
 BTW, I'm ok with current implementation. But I found you may missed some
 minor comments
 Which embedded in my first email. Could you recheck it and solve those
 comment and
 Send a new version of the patch?
 
  -Original Message-
  From: Zhigang Gong [mailto:zhigang.g...@linux.intel.com]
  Sent: Monday, May 26, 2014 9:31 AM
  To: 'He Junyan'
  Cc: 'Junyan He'; 'beignet@lists.freedesktop.org'
  Subject: RE: [Beignet] [PATCH] Refine the cl thread implement for queue.
  
  Ok. The key issue is that the private gpgpu data structure is still needed
 after
  each kernel execution.
  And the gpgpu data is different for each kernel execution, right?
  Could you list all of the scenarios where we need to use the gpgpu data
 after
  the kernel submitting?
  I can think of the following two:
  1. clFinish
  2. clFlush
  
  Is there any other cases?
  
   -Original Message-
   From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf
   Of He Junyan
   Sent: Friday, May 23, 2014 11:36 PM
   To: Zhigang Gong
   Cc: Junyan He; beignet@lists.freedesktop.org
   Subject: Re: [Beignet] [PATCH] Refine the cl thread implement for queue.
  
   I think it is hard to avoid using thread local data.
   Because when the queue creating, we do not know how many threads will
   use this queue later.
   The GPGPU resources will be hold in the queue, but at least every
   thread should have a local data to store the index to find the GPGPU
 data in
  the queue.
   And the thread should need to destroy the GPGPU resource when the
   thread exit, while the queue's life time may be much longer than the
 thread.
  
   OpenCL spec fail to define the relationship between the queue and the
  threads.
   This cause the dilemma.
  
   Please give some good advices if any.
  
   On Fri, 2014-05-23 at 16:33 +0800, Zhigang Gong wrote:
Some minor comments as below.
   
One thought about the usage of thread local data we are using here.
The original reason why we want to use thread local data is to avoid
lock as much as possible. But finally, we found to satisfy all the
use scenario, we can't avoid lock any way. Now we introduce lock
eventually. Then is there still good reason why we should use these
thread local data any more?
   
One possible question is as below:
   
If one queue is used in another thread to enqueue task, does it make
sense to create a thread local new gpgpu data and in this thread. Or
we can just simply lock and wait for other thread to unlock the queue?
   
On Tue, May 20, 2014 at 02:26:47PM +0800, junyan...@inbox.com wrote:
 From: Junyan He junyan...@linux.intel.com

 Because the cl_command_queue can be used in several threads
 simultaneously but without add ref to it, we now handle it like
 this:
 Keep one threads_slot_array, every time the thread get gpgpu or
 batch buffer, if it does not have a slot, assign it.
 The resources are keeped in queue private, and resize it if needed.
 When the thread exit, the slot will be set invalid.
 When queue released, all the resources will be released. If user
 still enqueue, flush or finish the queue after it has been
 released, the
   behavior is undefined.
 TODO: Need to shrink the slot map.

 Signed-off-by: Junyan He junyan...@linux.intel.com
 ---
  src/cl_command_queue.c  |   6 +-
  src/cl_command_queue_gen7.c |   2 +-
  src/cl_context.c|   2 +-
  src/cl_device_id.c  |   2 +-
  src/cl_thread.c | 261
   +---
  src/cl_thread.h |   6 +-
  6 files changed, 205 insertions(+), 74 deletions(-)

 diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index
 6a699c0..802d313 100644
 --- a/src/cl_command_queue.c
 +++ b/src/cl_command_queue.c
 @@ -60,6 +60,7 @@ cl_command_queue_new(cl_context ctx)
/* The queue also belongs to its context */
cl_context_add_ref(ctx);

 +
useless new line.
  exit:
return queue

Re: [Beignet] [PATCH V2] gbe_bin_generater: fix two bugs.

2014-05-23 Thread He Junyan
Some PCI ID entries do not contain the "Gen" keyword, such as Ruiling's
 IVBridge XX
So the command line needs to be refined.

On Fri, 2014-05-23 at 19:04 +0800, Zhigang Gong wrote:
 From: Zhigang Gong zhigang.g...@linux.intel.com
 
 The pci id detecting method is broken on some system.
 And the gen pci id parsing in gbe_bin_generater is incorrect when
 the pci id has a-f hex digit.
 
 v2:
 Add VGA to filter out some nonVGA devices.
 Signed-off-by: Zhigang Gong zhigang.g...@linux.intel.com
 ---
  backend/src/gbe_bin_generater.cpp | 7 +--
  src/GetGenID.sh   | 2 +-
  2 files changed, 6 insertions(+), 3 deletions(-)
 
 diff --git a/backend/src/gbe_bin_generater.cpp 
 b/backend/src/gbe_bin_generater.cpp
 index 50020b5..898e2f2 100644
 --- a/backend/src/gbe_bin_generater.cpp
 +++ b/backend/src/gbe_bin_generater.cpp
 @@ -34,6 +34,7 @@
  #include vector
  #include algorithm
  #include stdlib.h
 +#include iostream
  #include stdio.h
  
  #include backend/program.h
 @@ -46,7 +47,7 @@ using namespace std;
  #define FILE_BUILD_FAILED 3
  #define FILE_SERIALIZATION_FAILED 4
  
 -static int gen_pci_id = 0;
 +static uint32_t gen_pci_id = 0;
  
  class program_build_instance {
  
 @@ -296,7 +297,9 @@ int main (int argc, const char **argv)
  return 1;
  }
  
 -gen_pci_id = (s[0] - '0')  12 | (s[1] - '0')  8 | (s[2] - 
 '0')  4 | (s[3] - '0');
 +std::stringstream str(s);
 +str  std::hex  gen_pci_id;
 +
  used_index[optind-1] = 1;
  // We must set the image base index here, as we invoke the 
 backend in a non-standard way.
  gbe_set_image_base_index(3);
 diff --git a/src/GetGenID.sh b/src/GetGenID.sh
 index 3114bd8..f8cb0a7 100755
 --- a/src/GetGenID.sh
 +++ b/src/GetGenID.sh
 @@ -1,2 +1,2 @@
  #!/bin/bash
 -lspci -nn | grep Gen .* Graphics -i  | grep \[8086:.*\] -o | awk -F : 
 '{print $2}' | awk -F ] '{print $1}'
 +lspci -nn | grep VGA.*Gen.*\[8086: -i  | grep \[8086:.*\] -o | awk -F : 
 '{print $2}' | awk -F ] '{print $1}'



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Add the pci id support for gbe_generate

2014-05-20 Thread He Junyan
Yes, it's in my plan, and this patch is just the first step to make
HSW's buffer copy and image copy work.


On Tue, 2014-05-20 at 07:57 +, Yang, Rong R wrote:
 This patch detect the building platform's pci id and generate the bin for 
 host when building, it is necessary to generate the corresponding binary for 
 IVB and HSW.
 I think support cross platform bin generate and use a string in command line 
 is next step.
 
 -Original Message-
 From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of 
 Song, Ruiling
 Sent: Tuesday, May 20, 2014 2:35 PM
 To: He Junyan; beignet@lists.freedesktop.org
 Cc: Junyan He
 Subject: Re: [Beignet] [PATCH] Add the pci id support for gbe_generate
 
 You directly use pcid, right?
 What about changing to use a string as the command argument. Like 'ivb', 
 'hsw'?
 That would be meaningful for users.
 
 Thanks!
 Ruiling
 
 -Original Message-
 From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of He 
 Junyan
 Sent: Tuesday, May 20, 2014 1:33 PM
 To: beignet@lists.freedesktop.org
 Cc: Junyan He
 Subject: Re: [Beignet] [PATCH] Add the pci id support for gbe_generate
 
 ping for review
 
 On Tue, 2014-05-13 at 09:34 +0800, junyan...@inbox.com wrote:
  From: Junyan He junyan...@linux.intel.com
  
  Signed-off-by: Junyan He junyan...@linux.intel.com
  ---
   backend/src/gbe_bin_generater.cpp |   20 +++-
   src/CMakeLists.txt|8 +++-
   src/GetGenID.sh   |2 ++
   utests/CMakeLists.txt |7 ++-
   4 files changed, 34 insertions(+), 3 deletions(-)  create mode 100755 
  src/GetGenID.sh
  
  diff --git a/backend/src/gbe_bin_generater.cpp
  b/backend/src/gbe_bin_generater.cpp
  index 15bdbd1..b6248ae 100644
  --- a/backend/src/gbe_bin_generater.cpp
  +++ b/backend/src/gbe_bin_generater.cpp
  @@ -46,6 +46,8 @@ using namespace std;  #define FILE_BUILD_FAILED 3 
  #define FILE_SERIALIZATION_FAILED 4
   
  +static int gen_pci_id = 0;
  +
   class program_build_instance {
   
   protected:
  @@ -249,7 +251,7 @@ int main (int argc, const char **argv)
   argv_saved.push_back(string(argv[i]));
   }
   
  -while ( (oc = getopt(argc, (char * const *)argv, o:p:s)) != -1 ) {
  +while ( (oc = getopt(argc, (char * const *)argv, t:o:p:s)) !=
  + -1 ) {
   switch (oc) {
   case 'p':
   {
  @@ -283,6 +285,22 @@ int main (int argc, const char **argv)
   used_index[optind-1] = 1;
   break;
   
  +case 't':
  +{
  +char *s = optarg;
  +if (optarg[0] == '0'  (optarg[1] == 'x' || optarg[1] == 'X'))
  +s += 2;
  +
  +if (s[0]  '0' || s[0]  '9') {
  +cout  Invalid target option argument  endl;
  +return 1;
  +}
  +
  +gen_pci_id = (s[0] - '0')  12 | (s[1] - '0')  8 | (s[2] - 
  '0')  4 | (s[3] - '0');
  +used_index[optind-1] = 1;
  +break;
  +}
  +
   case 's':
   program_build_instance::set_str_fmt_out(true);
   used_index[optind-1] = 1; diff --git a/src/CMakeLists.txt 
  b/src/CMakeLists.txt index 8164a44..f93ddcd 100644
  --- a/src/CMakeLists.txt
  +++ b/src/CMakeLists.txt
  @@ -4,6 +4,12 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}
   ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
   ${CMAKE_CURRENT_SOURCE_DIR}/../include
   ${MESA_SOURCE_INCLUDES})
  +
  +set(GEN_PCI_ID)
  +execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh
  +OUTPUT_VARIABLE GEN_PCI_ID)
  +message(STATUS Platform Gen PCI id is  ${GEN_PCI_ID})
  +
   macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)  foreach (KF
  ${KERNEL_FILES})
 set (input_file ${KERNEL_PATH}/${KF}.cl) @@ -12,7 +18,7 @@ foreach 
  (KF ${KERNEL_FILES})
 add_custom_command(
   OUTPUT ${output_file}
   COMMAND rm -rf ${output_file}
  -COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file}
  +COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} 
  + -t${GEN_PCI_ID}
   DEPENDS ${input_file}
  ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
   endforeach (KF)
   endmacro (MakeKernelBinStr)
  diff --git a/src/GetGenID.sh b/src/GetGenID.sh new file mode 100755 
  index 000..3114bd8
  --- /dev/null
  +++ b/src/GetGenID.sh
  @@ -0,0 +1,2 @@
  +#!/bin/bash
  +lspci -nn | grep Gen .* Graphics -i  | grep \[8086:.*\] -o | awk -F : 
  '{print $2}' | awk -F ] '{print $1}'
  diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index
  704438d..2a9ea66 100644
  --- a/utests/CMakeLists.txt
  +++ b/utests/CMakeLists.txt
  @@ -180,10 +180,15 @@ set (utests_sources
 utest_file_map.cpp
 utest_helper.cpp)
   
  +set(GEN_PCI_ID)
  +execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh
  +OUTPUT_VARIABLE GEN_PCI_ID)
  +
  +message

Re: [Beignet] [PATCH] Add the pci id support for gbe_generate

2014-05-19 Thread He Junyan
ping for review

On Tue, 2014-05-13 at 09:34 +0800, junyan...@inbox.com wrote:
 From: Junyan He junyan...@linux.intel.com
 
 Signed-off-by: Junyan He junyan...@linux.intel.com
 ---
  backend/src/gbe_bin_generater.cpp |   20 +++-
  src/CMakeLists.txt|8 +++-
  src/GetGenID.sh   |2 ++
  utests/CMakeLists.txt |7 ++-
  4 files changed, 34 insertions(+), 3 deletions(-)
  create mode 100755 src/GetGenID.sh
 
 diff --git a/backend/src/gbe_bin_generater.cpp 
 b/backend/src/gbe_bin_generater.cpp
 index 15bdbd1..b6248ae 100644
 --- a/backend/src/gbe_bin_generater.cpp
 +++ b/backend/src/gbe_bin_generater.cpp
 @@ -46,6 +46,8 @@ using namespace std;
  #define FILE_BUILD_FAILED 3
  #define FILE_SERIALIZATION_FAILED 4
  
 +static int gen_pci_id = 0;
 +
  class program_build_instance {
  
  protected:
 @@ -249,7 +251,7 @@ int main (int argc, const char **argv)
  argv_saved.push_back(string(argv[i]));
  }
  
 -while ( (oc = getopt(argc, (char * const *)argv, o:p:s)) != -1 ) {
 +while ( (oc = getopt(argc, (char * const *)argv, t:o:p:s)) != -1 ) {
  switch (oc) {
  case 'p':
  {
 @@ -283,6 +285,22 @@ int main (int argc, const char **argv)
  used_index[optind-1] = 1;
  break;
  
 +case 't':
 +{
 +char *s = optarg;
 +if (optarg[0] == '0'  (optarg[1] == 'x' || optarg[1] == 'X'))
 +s += 2;
 +
 +if (s[0]  '0' || s[0]  '9') {
 +cout  Invalid target option argument  endl;
 +return 1;
 +}
 +
 +gen_pci_id = (s[0] - '0')  12 | (s[1] - '0')  8 | (s[2] - 
 '0')  4 | (s[3] - '0');
 +used_index[optind-1] = 1;
 +break;
 +}
 +
  case 's':
  program_build_instance::set_str_fmt_out(true);
  used_index[optind-1] = 1;
 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
 index 8164a44..f93ddcd 100644
 --- a/src/CMakeLists.txt
 +++ b/src/CMakeLists.txt
 @@ -4,6 +4,12 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
  ${CMAKE_CURRENT_SOURCE_DIR}/../include
  ${MESA_SOURCE_INCLUDES})
 +
 +set(GEN_PCI_ID)
 +execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh
 +OUTPUT_VARIABLE GEN_PCI_ID)
 +message(STATUS Platform Gen PCI id is  ${GEN_PCI_ID})
 +
  macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
  foreach (KF ${KERNEL_FILES})
set (input_file ${KERNEL_PATH}/${KF}.cl)
 @@ -12,7 +18,7 @@ foreach (KF ${KERNEL_FILES})
add_custom_command(
  OUTPUT ${output_file}
  COMMAND rm -rf ${output_file}
 -COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file}
 +COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} 
 -t${GEN_PCI_ID}
  DEPENDS ${input_file} 
 ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
  endforeach (KF)
  endmacro (MakeKernelBinStr)
 diff --git a/src/GetGenID.sh b/src/GetGenID.sh
 new file mode 100755
 index 000..3114bd8
 --- /dev/null
 +++ b/src/GetGenID.sh
 @@ -0,0 +1,2 @@
 +#!/bin/bash
 +lspci -nn | grep Gen .* Graphics -i  | grep \[8086:.*\] -o | awk -F : 
 '{print $2}' | awk -F ] '{print $1}'
 diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
 index 704438d..2a9ea66 100644
 --- a/utests/CMakeLists.txt
 +++ b/utests/CMakeLists.txt
 @@ -180,10 +180,15 @@ set (utests_sources
utest_file_map.cpp
utest_helper.cpp)
  
 +set(GEN_PCI_ID)
 +execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh
 +OUTPUT_VARIABLE GEN_PCI_ID)
 +
 +message(STATUS Platform Gen PCI id is  ${GEN_PCI_ID})
  SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
  ADD_CUSTOM_COMMAND(
  OUTPUT ${kernel_bin}.bin
 -COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin
 +COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin 
 -t${GEN_PCI_ID}
  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater 
 ${kernel_bin}.cl
  )
  



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/8] HSW: align buffer's size to DWORD.

2014-05-15 Thread He Junyan
The regression seems a bit random.
You extend sz, which may overlap with other buffers, especially
in multi-threaded, multi-buffer cases.
I hit it once but really cannot reproduce it now.
I think it is OK for now, and we can fix this bug later if we find it.


On Wed, 2014-05-14 at 07:26 +, Yang, Rong R wrote:
 Run the buffer test 50+ times, all pass.
 
 -Original Message-
 From: He Junyan [mailto:junyan...@inbox.com] 
 Sent: Tuesday, May 13, 2014 3:16 PM
 To: Yang, Rong R
 Cc: beignet@lists.freedesktop.org
 Subject: Re: [Beignet] [PATCH 1/8] HSW: align buffer's size to DWORD.
 
 This patch will cause some regression in buffer tests
 
 
 On Mon, 2014-05-12 at 23:11 +0800, Yang Rong wrote:
  HSW: Byte scattered Read/Write requires that the buffer size be a 
  multiple of 4 bytes.
   So simply align all buffer sizes to 4. Passes utest 
  compiler_function_constant0.
  
  Because it is a very light workaround, align without checking the device.
  
  Signed-off-by: Yang Rong rong.r.y...@intel.com
  ---
   src/cl_mem.c | 4 
   1 file changed, 4 insertions(+)
  
  diff --git a/src/cl_mem.c b/src/cl_mem.c index 44482f7..5feda74 100644
  --- a/src/cl_mem.c
  +++ b/src/cl_mem.c
  @@ -334,6 +334,10 @@ cl_mem_new_buffer(cl_context ctx,
   goto error;
 }
   
  +  /* HSW: Byte scattered Read/Write has limitation that
  + the buffer size must be a multiple of 4 bytes. */  sz = 
  + ALIGN(sz, 4);
  +
 /* Create the buffer in video memory */
 mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, 
  err);
 if (mem == NULL || err != CL_SUCCESS)
 
 
 



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/3] [opencl-1.2] Add checks for clCreateImage and add 1d image creating logic

2014-05-15 Thread He Junyan
Sorry, this patch set is for the OpenCL-1.2 branch.


On Thu, 2014-05-15 at 16:43 +0800, junyan...@inbox.com wrote:
 From: Junyan He junyan...@linux.intel.com
 
 Add more checks for image creation according to the spec.
 Update the corresponding image utest cases so that they pass.
 1D image creation is also added.
 
 Signed-off-by: Junyan He junyan...@linux.intel.com
 ---
  src/cl_api.c  | 36 
  src/cl_mem.c  | 24 ++--
  utests/compiler_copy_image.cpp|  4 
  utests/compiler_copy_image1.cpp   |  4 
  utests/compiler_copy_image_3d.cpp |  3 +++
  utests/compiler_fill_image.cpp|  4 
  utests/compiler_fill_image0.cpp   |  4 
  7 files changed, 73 insertions(+), 6 deletions(-)
 
 diff --git a/src/cl_api.c b/src/cl_api.c
 index 9c22819..b26936e 100644
 --- a/src/cl_api.c
 +++ b/src/cl_api.c
 @@ -506,7 +506,43 @@ clCreateImage(cl_context context,
cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
 +  if (image_format == NULL) {
 +err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
 +goto error;
 +  }
 +  if (image_format-image_channel_order  CL_R ||
 +  image_format-image_channel_order  CL_RGBx) {
 +err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
 +goto error;
 +  }
 +  if (image_format-image_channel_data_type  CL_SNORM_INT8 ||
 +  image_format-image_channel_data_type  CL_FLOAT) {
 +err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
 +goto error;
 +  }
 +
 +  if (image_desc == NULL) {
 +err = CL_INVALID_IMAGE_DESCRIPTOR;
 +goto error;
 +  }
 +  if (image_desc-image_type = CL_MEM_OBJECT_BUFFER ||
 +  image_desc-image_type  CL_MEM_OBJECT_IMAGE1D_BUFFER) {
 +err = CL_INVALID_IMAGE_DESCRIPTOR;
 +goto error;
 +  }
 +  /* buffer refers to a valid buffer memory object if image_type is
 + CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */
 +  if (image_desc-image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER 
 + image_desc-buffer) {
 +err = CL_INVALID_IMAGE_DESCRIPTOR;
 +goto error;
 +  }
 +  if (image_desc-num_mip_levels || image_desc-num_samples) {
 +err = CL_INVALID_IMAGE_DESCRIPTOR;
 +goto error;
 +  }
  
 +  /* Other details check for image_desc will leave to image create. */
mem = cl_mem_new_image(context,
   flags,
   image_format,
 diff --git a/src/cl_mem.c b/src/cl_mem.c
 index 3f1b389..0250f0a 100644
 --- a/src/cl_mem.c
 +++ b/src/cl_mem.c
 @@ -544,10 +544,22 @@ _cl_mem_new_image(cl_context ctx,
  err = CL_INVALID_IMAGE_SIZE;  \
  goto error;   \
} while (0);
 +
if (UNLIKELY(w == 0)) DO_IMAGE_ERROR;
 -  if (UNLIKELY(h == 0)) DO_IMAGE_ERROR;
 +  if (UNLIKELY(h == 0  image_type != CL_MEM_OBJECT_IMAGE1D)) 
 DO_IMAGE_ERROR;
  
 -  if (image_type == CL_MEM_OBJECT_IMAGE2D) {
 +  if (image_type == CL_MEM_OBJECT_IMAGE1D) {
 +size_t min_pitch = bpp * w;
 +if (data  pitch == 0)
 +  pitch = min_pitch;
 +
 +depth = 1;
 +h = 1;
 +if (UNLIKELY(w  ctx-device-image2d_max_width)) DO_IMAGE_ERROR;
 +if (UNLIKELY(data  min_pitch  pitch)) DO_IMAGE_ERROR;
 +if (UNLIKELY(!data  pitch != 0)) DO_IMAGE_ERROR;
 +tiling = CL_NO_TILE;
 +  } else if (image_type == CL_MEM_OBJECT_IMAGE2D) {
  size_t min_pitch = bpp * w;
  if (data  pitch == 0)
pitch = min_pitch;
 @@ -560,9 +572,7 @@ _cl_mem_new_image(cl_context ctx,
  if (cl_driver_get_ver(ctx-drv) != 6)
tiling = CL_TILE_Y;
  depth = 1;
 -  }
 -
 -  if (image_type == CL_MEM_OBJECT_IMAGE3D) {
 +  } else if (image_type == CL_MEM_OBJECT_IMAGE3D) {
  size_t min_pitch = bpp * w;
  if (data  pitch == 0)
pitch = min_pitch;
 @@ -580,7 +590,9 @@ _cl_mem_new_image(cl_context ctx,
  /* Pick up tiling mode (we do only linear on SNB) */
  if (cl_driver_get_ver(ctx-drv) != 6)
tiling = CL_TILE_Y;
 -  }
 +  } else
 +assert(0);
 +
  #undef DO_IMAGE_ERROR
  
/* Tiling requires to align both pitch and height */
 diff --git a/utests/compiler_copy_image.cpp b/utests/compiler_copy_image.cpp
 index 04c9544..dac8d50 100644
 --- a/utests/compiler_copy_image.cpp
 +++ b/utests/compiler_copy_image.cpp
 @@ -1,3 +1,4 @@
 +#include string.h
  #include utest_helper.hpp
  
  static void compiler_copy_image(void)
 @@ -8,6 +9,9 @@ static void compiler_copy_image(void)
cl_image_desc desc;
cl_sampler sampler;
  
 +  memset(desc, 0x0, sizeof(cl_image_desc));
 +  memset(format, 0x0, sizeof(cl_image_format));
 +
// Setup kernel and images
OCL_CREATE_KERNEL(test_copy_image);
buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
 diff --git a/utests/compiler_copy_image1.cpp b/utests/compiler_copy_image1.cpp
 index a9ef3f4..fe52dbf 100644
 --- a/utests/compiler_copy_image1.cpp
 +++ b/utests/compiler_copy_image1.cpp
 @@ -1,3 +1,4 @@
 +#include string.h
  #include utest_helper.hpp
  
  static void compiler_copy_image1

Re: [Beignet] [PATCH 1/8] HSW: align buffer's size to DWORD.

2014-05-13 Thread He Junyan
This patch will cause some regression in buffer tests


On Mon, 2014-05-12 at 23:11 +0800, Yang Rong wrote:
 HSW: Byte scattered Read/Write requires that the buffer size be a 
 multiple of 4 bytes.
  So simply align all buffer sizes to 4. Passes utest 
 compiler_function_constant0.
 
 Because it is a very light workaround, align without checking the device.
 
 Signed-off-by: Yang Rong rong.r.y...@intel.com
 ---
  src/cl_mem.c | 4 
  1 file changed, 4 insertions(+)
 
 diff --git a/src/cl_mem.c b/src/cl_mem.c
 index 44482f7..5feda74 100644
 --- a/src/cl_mem.c
 +++ b/src/cl_mem.c
 @@ -334,6 +334,10 @@ cl_mem_new_buffer(cl_context ctx,
  goto error;
}
  
 +  /* HSW: Byte scattered Read/Write has limitation that
 + the buffer size must be a multiple of 4 bytes. */
 +  sz = ALIGN(sz, 4);
 +
/* Create the buffer in video memory */
mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, err);
if (mem == NULL || err != CL_SUCCESS)



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 2/2] move enqueue_copy_image kernels outside of runtime code.

2014-05-13 Thread He Junyan
2/2 is OK

On Mon, 2014-05-12 at 12:41 +0800, xionghu@intel.com wrote:
 From: Luo xionghu@intel.com
 
 Separate the kernel code from the host code to keep it clean; build the
 kernels offline with gbe_bin_generator to improve performance.
 ---
  src/CMakeLists.txt |  23 ++-
  src/cl_context.h   |  24 ++-
  src/cl_gt_device.h |  23 ++-
  src/cl_mem.c   | 214 
 ++---
  src/kernels/cl_internal_copy_buf_align1.cl |   8 -
  src/kernels/cl_internal_copy_buf_align16.cl|   2 +-
  src/kernels/cl_internal_copy_buf_align4.cl |   2 +-
  src/kernels/cl_internal_copy_buf_rect.cl   |  15 ++
  .../cl_internal_copy_buf_unalign_dst_offset.cl |   2 +-
  .../cl_internal_copy_buf_unalign_same_offset.cl|   2 +-
  .../cl_internal_copy_buf_unalign_src_offset.cl |   2 +-
  src/kernels/cl_internal_copy_buffer_to_image_2d.cl |  18 ++
  src/kernels/cl_internal_copy_buffer_to_image_3d.cl |  19 ++
  src/kernels/cl_internal_copy_image_2d_to_2d.cl |  21 ++
  src/kernels/cl_internal_copy_image_2d_to_3d.cl |  22 +++
  src/kernels/cl_internal_copy_image_2d_to_buffer.cl |  19 ++
  src/kernels/cl_internal_copy_image_3d_to_2d.cl |  22 +++
  src/kernels/cl_internal_copy_image_3d_to_3d.cl |  23 +++
  src/kernels/cl_internal_copy_image_3d_to_buffer.cl |  22 +++
  19 files changed, 308 insertions(+), 175 deletions(-)
  delete mode 100644 src/kernels/cl_internal_copy_buf_align1.cl
  create mode 100644 src/kernels/cl_internal_copy_buf_rect.cl
  create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d.cl
  create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_3d.cl
  create mode 100644 src/kernels/cl_internal_copy_image_2d_to_2d.cl
  create mode 100644 src/kernels/cl_internal_copy_image_2d_to_3d.cl
  create mode 100644 src/kernels/cl_internal_copy_image_2d_to_buffer.cl
  create mode 100644 src/kernels/cl_internal_copy_image_3d_to_2d.cl
  create mode 100644 src/kernels/cl_internal_copy_image_3d_to_3d.cl
  create mode 100644 src/kernels/cl_internal_copy_image_3d_to_buffer.cl
 
 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
 index 8164a44..ecc04ab 100644
 --- a/src/CMakeLists.txt
 +++ b/src/CMakeLists.txt
 @@ -17,11 +17,30 @@ foreach (KF ${KERNEL_FILES})
  endforeach (KF)
  endmacro (MakeKernelBinStr)
  
 +macro (MakeBuiltInKernelStr KERNEL_PATH KERNEL_FILES)
 +  set (output_file ${KERNEL_PATH}/${BUILT_IN_NAME}.cl)
 +  set (file_content)
 +  file (REMOVE ${output_file})
 +  foreach (KF ${KERNEL_NAMES})
 +set (input_file ${KERNEL_PATH}/${KF}.cl)
 +file(READ ${input_file} file_content )
 +STRING(REGEX REPLACE ; ; file_content ${file_content})
 +file(APPEND ${output_file} ${file_content})
 +  endforeach (KF)
 +endmacro (MakeBuiltInKernelStr)
 +
  set (KERNEL_STR_FILES)
 -set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4
 +set (KERNEL_NAMES cl_internal_copy_buf_align4
  cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
 -cl_internal_copy_buf_unalign_dst_offset 
 cl_internal_copy_buf_unalign_src_offset)
 +cl_internal_copy_buf_unalign_dst_offset 
 cl_internal_copy_buf_unalign_src_offset
 +cl_internal_copy_buf_rect cl_internal_copy_image_2d_to_2d 
 cl_internal_copy_image_3d_to_2d
 +cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d
 +cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer
 +cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d)
 +set (BUILT_IN_NAME  cl_internal_built_in_kernel)
 +MakeBuiltInKernelStr (${CMAKE_CURRENT_SOURCE_DIR}/kernels/ 
 ${KERNEL_NAMES})
  MakeKernelBinStr (${CMAKE_CURRENT_SOURCE_DIR}/kernels/ ${KERNEL_NAMES})
 +MakeKernelBinStr (${CMAKE_CURRENT_SOURCE_DIR}/kernels/ ${BUILT_IN_NAME})
  
  set(OPENCL_SRC
  ${KERNEL_STR_FILES}
 diff --git a/src/cl_context.h b/src/cl_context.h
 index 782a9af..24281be 100644
 --- a/src/cl_context.h
 +++ b/src/cl_context.h
 @@ -46,14 +46,22 @@ enum _cl_internal_ker_type {
CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
CL_ENQUEUE_COPY_BUFFER_RECT,
 -  CL_ENQUEUE_COPY_IMAGE_0, //copy image 2d to image 2d
 -  CL_ENQUEUE_COPY_IMAGE_1, //copy image 3d to image 2d
 -  CL_ENQUEUE_COPY_IMAGE_2, //copy image 2d to image 3d
 -  CL_ENQUEUE_COPY_IMAGE_3, //copy image 3d to image 3d
 -  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0,   //copy image 2d to buffer
 -  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1,   //copy image 3d tobuffer
 -  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0,   //copy buffer to image 2d
 -  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1,   //copy buffer to image 3d
 +  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, //copy image 2d to image 2d
 +  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, //copy image 3d to image 2d
 +  CL_ENQUEUE_COPY_IMAGE_2D_TO_3D, //copy image 2d to 

Re: [Beignet] [V2 PATCH 1/6] Update the device info description for HSW

2014-05-09 Thread He Junyan
OK, name modified and new patch sent

On Thu, 2014-05-08 at 07:43 -0400, Jesper Pedersen wrote:
 Hi,
 
 On 05/07/2014 06:02 AM, junyan...@inbox.com wrote:
  From: Junyan He junyan...@linux.intel.com
 
  Split the cl_device_id description for HSW into
  GT1, GT2 and GT3, with different parameters.
 
  Signed-off-by: Junyan He junyan...@linux.intel.com
  ---
src/cl_device_id.c | 135 
  +++--
1 file changed, 90 insertions(+), 45 deletions(-)
 
#define DECL_INFO_STRING(BREAK, STRUCT, FIELD, STRING) \
STRUCT.FIELD = STRING; \
STRUCT.JOIN(FIELD,_sz) = sizeof(STRING); \
  +hsw_device = STRUCT; \
goto BREAK;
 
 
 Can't this be moved to the actual Haswell block ?
 
 It doesn't really make sense to assign it for all non-Haswell devices.
 
has_break:
  -  intel_hsw_device.vendor_id = device_id;
  -  intel_hsw_device.platform = intel_platform;
  -  ret = intel_hsw_device;
  +  hsw_device-vendor_id = device_id;
  +  hsw_device-platform = intel_platform;
  +  ret = hsw_device;
  break;
 
 
 E.g. down here.
 
 Best regards,
   Jesper
 
 
 
 ___
 Beignet mailing list
 Beignet@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/beignet



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 2/3] [opencl-1.2] Implement the clEnqueueFillBuffer API.

2014-04-29 Thread He Junyan


On Tue, 2014-04-29 at 13:57 +0800, Zhigang Gong wrote:
 Some minor comments as below:
 
 On Wed, Apr 23, 2014 at 04:35:25PM +0800, junyan...@inbox.com wrote:
  From: Junyan He junyan...@linux.intel.com
  
  We use floatn assignment to do the copy.
  The 128-byte pattern size corresponds to double16, and because
  of the double problem on our platform, we use float16
  to handle this.
  Unaligned cases are not optimized yet; they just use char
  assignment.
  
  Signed-off-by: Junyan He junyan...@linux.intel.com
  ---
   src/cl_api.c |  78 
   src/cl_context.c | 133 
  ++-
   src/cl_context.h |   8 
   src/cl_enqueue.c |   1 +
   src/cl_enqueue.h |   1 +
   src/cl_event.c   |   1 +
   src/cl_mem.c | 102 ++
   src/cl_mem.h |   3 ++
   8 files changed, 295 insertions(+), 32 deletions(-)
  
  diff --git a/src/cl_api.c b/src/cl_api.c
  index 1543ff4..be94bcb 100644
  --- a/src/cl_api.c
  +++ b/src/cl_api.c
  @@ -1592,6 +1592,84 @@ error:
   }
   
   cl_int
  +clEnqueueFillBuffer(cl_command_queue   command_queue,
  +cl_mem buffer,
  +const void *   pattern,
  +size_t pattern_size,
  +size_t offset,
  +size_t size,
  +cl_uintnum_events_in_wait_list,
  +const cl_event *   event_wait_list,
  +cl_event * event)
  +{
  +  cl_int err = CL_SUCCESS;
  +  enqueue_data *data, no_wait_data = { 0 };
  +  static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
  +  int i = 0;
  +
  +  CHECK_QUEUE(command_queue);
  +  CHECK_MEM(buffer);
  +
  +  if (command_queue-ctx != buffer-ctx) {
  +err = CL_INVALID_CONTEXT;
  +goto error;
  +  }
  +
  +  if (offset  0 || offset + size  buffer-size) {
  +err = CL_INVALID_VALUE;
  +goto error;
  +  }
  +
  +  if (pattern == NULL) {
  +err = CL_INVALID_VALUE;
  +goto error;
  +  }
  +
  +  for (i = 0; i  sizeof(valid_sz)/sizeof(size_t); i++) {
 Coding style issue: we had better use sizeof(valid_sz) / sizeof(size_t)
 rather than the compact style above. I noticed you mixed the two styles in the
 same patch; please fix it in the new version.
OK, that needs to be refined.

 
  +if (valid_sz[i] == pattern_size)
  +  break;
  +  }
  +  if (i == sizeof(valid_sz)/sizeof(size_t)) {
  +err = CL_INVALID_VALUE;
  +goto error;
  +  }
  +
  +  if (offset%pattern_size || size%pattern_size) {
  +err = CL_INVALID_VALUE;
  +goto error;
  +  }
  +
  +  err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, 
  size);
  +  if (err) {
  +goto error;
  +  }
  +
  +  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, 
  event, buffer-ctx);
  +
  +  data = no_wait_data;
  +  data-type = EnqueueFillBuffer;
  +  data-queue = command_queue;
  +
  +  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
  +   event, data, CL_COMMAND_FILL_BUFFER) == 
  CL_ENQUEUE_EXECUTE_IMM) {
  +if (event  (*event)-type != CL_COMMAND_USER
  + (*event)-queue-props  CL_QUEUE_PROFILING_ENABLE) {
  +  cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
  +}
  +
  +err = cl_command_queue_flush(command_queue);
  +  }
  +
  +  if(b_output_kernel_perf)
  +time_end(command_queue-ctx, beignet internal kernel : 
  cl_fill_buffer, command_queue);
  +
  +  return 0;
  +
  + error:
  +  return err;
  +}
  +
  +cl_int
   clEnqueueCopyBuffer(cl_command_queue command_queue,
   cl_mem   src_buffer,
   cl_mem   dst_buffer,
  diff --git a/src/cl_context.c b/src/cl_context.c
  index 8190e6a..e2dba65 100644
  --- a/src/cl_context.c
  +++ b/src/cl_context.c
  @@ -1,4 +1,4 @@
  -/* 
  +/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
  @@ -188,6 +188,7 @@ error:
   LOCAL void
   cl_context_delete(cl_context ctx)
   {
  +  int i = 0;
 if (UNLIKELY(ctx == NULL))
   return;
   
  @@ -195,6 +196,18 @@ cl_context_delete(cl_context ctx)
 if (atomic_dec(ctx-ref_n)  1)
   return;
   
  +  /* delete the internal programs. */
  +  for (i = CL_ENQUEUE_COPY_BUFFER_ALIGN4; i  CL_INTERNAL_KERNEL_MAX; i++) 
  {
 Using i = 0 here may be better, or define CL_INTERNAL_KERNEL_MIN as 0 and use 
 that macro
 instead of a specific enum value.

Because CL_ENQUEUE_COPY_BUFFER_ALIGN4 is not the first entry, and we
only handle these 4 cases, it is hard to start from 0 or
CL_INTERNAL_KERNEL_MIN.

  +if (ctx-internel_kernels[i]) {
  +  cl_kernel_delete(ctx-internel_kernels[i]);
  +  ctx-internel_kernels[i] = NULL;
  +
  +  assert(ctx-internal_prgs[i]);
  +  cl_program_delete(ctx

Re: [Beignet] [PATCH 3/3] GBE: work around baytrail-t hang issue.

2014-04-17 Thread He Junyan
The whole patch set is OK.
I use the same approach to recode the gen version for HSW enabling.
The PCI device ID seems more precise and is useful for handling corner
cases even within the same gen version.
I will rebase onto your patch.



On Thu, 2014-04-17 at 15:06 +0800, Chuanbo Weng wrote:
 From: Zhigang Gong zhigang.g...@linux.intel.com
 
 There is an unknown issue with the baytrail-t platform. It will hang at
 utest's compiler_global_constant case. After some investigation,
 it turns out to be related to the DWORD GATHER READ send message
 on the constant cache data port. Changing to use the data cache data
 port works around that hang issue.
 
 Now we only fail one more case on baytrail-t compare to the IVB
 desktop platform which is the:
 
 profiling_exec()[FAILED]
Error: Too large time from submit to start
 
 That may be caused by kernel related issue. And that bug will not
 cause serious issue for normal kernel. So after this patch, the
 baytrail-t platform should be in a pretty good shape with beignet.
 
 Signed-off-by: Zhigang Gong zhigang.g...@linux.intel.com
 ---
  backend/src/backend/gen_encoder.cpp | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)
 
 diff --git a/backend/src/backend/gen_encoder.cpp 
 b/backend/src/backend/gen_encoder.cpp
 index c991661..1d1b5df 100644
 --- a/backend/src/backend/gen_encoder.cpp
 +++ b/backend/src/backend/gen_encoder.cpp
 @@ -206,7 +206,11 @@ namespace gbe
   uint32_t msg_length,
   uint32_t response_length)
{
 -const GenMessageTarget sfid = GEN6_SFID_DATAPORT_CONSTANT_CACHE;
 +// FIXME there is a unknown issue with baytrail-t platform, the DWORD 
 scatter
 +// message causes a hang at unit test case compiler_global_constant.
 +// We workaround it to use DATA CACHE instead.
 +const GenMessageTarget sfid = (p-deviceID == PCI_CHIP_BAYTRAIL_T) ?
 + GEN_SFID_DATAPORT_DATA_CACHE : 
 GEN6_SFID_DATAPORT_CONSTANT_CACHE;
  setMessageDescriptor(p, insn, sfid, msg_length, response_length);
  insn-bits3.gen7_dword_rw.msg_type = msg_type;
  insn-bits3.gen7_dword_rw.bti = bti;



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to thread specific context

2013-11-08 Thread He Junyan
So can I understand it like this:
TLS (thread-local storage) is a global section mapped into each thread's
space, and each thread keeps its own copy of this section,
while thread_specific data is on the heap and uses sync functions to manage
the resource for each thread?


On Fri, 2013-11-08 at 02:58 +, Zou, Nanhai wrote:
 TLS (Thread local storage) is useful for convert legacy thread unsafe program 
 into thread-safe. 
 E.g. errno in glibc.
 
 But for this case, I think explicitly separate the thread specific data is 
 better. 
 Not only for thread safe, but also for later optimization. 
 This help us to collect all data that will be modified during NDRange.
 
 Thanks
 Zou Nanhai
 
 -Original Message-
 From: beignet-boun...@lists.freedesktop.org 
 [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Song, Ruiling
 Sent: Friday, November 08, 2013 10:38 AM
 To: Zhigang Gong; junyan...@inbox.com
 Cc: Junyan He; beignet@lists.freedesktop.org
 Subject: Re: [Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to 
 thread specific context
 
 I am really new to the keyword __thread, and have a quick look at docs on 
 the web:
 http://gcc.gnu.org/onlinedocs/gcc-3.3.1/gcc/Thread-Local.html#Thread-Local
 it says: The __thread specifier may be applied to any global, file-scoped 
 static, function-scoped static, or static data member of a class. It may not 
 be applied to block-scoped automatic or non-static data member. 
 From my understanding, this is not proper for our case.
 
 Thanks!
 Ruiling
 -Original Message-
 From: beignet-boun...@lists.freedesktop.org 
 [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Zhigang Gong
 Sent: Friday, November 08, 2013 8:29 AM
 To: junyan...@inbox.com
 Cc: Junyan He; beignet@lists.freedesktop.org
 Subject: Re: [Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to 
 thread specific context
 
 I agree with you that use thread data is better than locking.
 One comment, how about to use thread local storage to simplify this patch as 
 below:
 
 struct _cl_commonand_queue {
 ...
 __thread cl_gpgpu gpgpu;
 ...
 };
 
 Then in the initialization stage, set it to NULL;
 
 queue-gpgpu = NULL;
 
 In the head of each functions which use queue-gpgpu, add the following
 code:
 
 if (queue-gpgpu == NULL)
   TRY_ALLOC_NO_ERR (queue-gpgpu, cl_gpgpu_new(ctx-drv));
 
 Then we don't need to change any other code?
 
 What's your opinion?
 
 On Fri, Nov 08, 2013 at 12:58:00AM +0800, junyan...@inbox.com wrote:
  From: Junyan He junyan...@linux.intel.com
  
  We find that some cases use multiple threads to run on the same queue, 
  executing the same kernel. This causes the gpgpu struct, which is 
  very important for GPU context setting, to be destroyed because we do not 
  implement any sync protection on it now.
  Moving the gpgpu struct into thread-specific space fixes this problem 
  because lib_drm does the GPU command serialization for us.
  ---
   src/CMakeLists.txt  |1 +
   src/cl_command_queue.c  |   27 +++-
   src/cl_command_queue.h  |9 +-
   src/cl_command_queue_gen7.c |7 +++--
   src/cl_event.c  |6 ++--
   src/cl_thread.c |   72 
  +++
   src/cl_thread.h |   34 
   utests/CMakeLists.txt   |2 +-
   8 files changed, 144 insertions(+), 14 deletions(-)  create mode
  100644 src/cl_thread.c  create mode 100644 src/cl_thread.h
  
  diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
  1e28c6c..59d330e 100644
  --- a/src/CMakeLists.txt
  +++ b/src/CMakeLists.txt
  @@ -39,6 +39,7 @@ set(OPENCL_SRC
   cl_command_queue.c
   cl_command_queue.h
   cl_command_queue_gen7.c
  +cl_thread.c
   cl_driver.h
   cl_driver.cpp
   cl_driver_defs.c
  diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index
  3f9d95c..3530976 100644
  --- a/src/cl_command_queue.c
  +++ b/src/cl_command_queue.c
  @@ -24,6 +24,7 @@
   #include cl_device_id.h
   #include cl_mem.h
   #include cl_utils.h
  +#include cl_thread.h
   #include cl_alloc.h
   #include cl_driver.h
   #include cl_khr_icd.h
  @@ -43,7 +44,9 @@ cl_command_queue_new(cl_context ctx)
 queue-magic = CL_MAGIC_QUEUE_HEADER;
 queue-ref_n = 1;
 queue-ctx = ctx;
  -  TRY_ALLOC_NO_ERR (queue-gpgpu, cl_gpgpu_new(ctx-drv));
  +  if ((queue-thread_data = cl_thread_data_create()) == NULL) {
  +goto error;
  +  }
   
 /* Append the command queue in the list */
 pthread_mutex_lock(ctx-queue_lock);
  @@ -84,9 +87,11 @@ cl_command_queue_delete(cl_command_queue queue)
   cl_mem_delete(queue-fulsim_out);
   queue-fulsim_out = NULL;
 }
  +
  +  cl_thread_data_destroy(queue-thread_data);
  +  queue-thread_data = NULL;
 cl_mem_delete(queue-perf);
 cl_context_delete(queue-ctx);
  -  cl_gpgpu_delete(queue-gpgpu);
 cl_free(queue-wait_events);
 queue-magic = CL_MAGIC_DEAD_HEADER; /* For safety

Re: [Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to thread specific context

2013-11-07 Thread He Junyan
I have tried the __thread extension of GCC as you said.
It seems OK for global variables but not workable for struct fields.
I think a struct's memory may be allocated from the heap, may be
on the stack, and can also be a global variable. So it may be impossible
for the compiler to figure out how to store one of its fields in thread
specific space.

As you said, there really may be a problem if one thread configures all
the queue context and then creates another thread to exec NDRange.
But in the current code, the GPU state is always initialized every time
exec NDRange is called.


On Fri, 2013-11-08 at 08:29 +0800, Zhigang Gong wrote:
 I agree with you that use thread data is better than locking.
 One comment, how about to use thread local storage to simplify
 this patch as below:
 
 struct _cl_commonand_queue {
 ...
 __thread cl_gpgpu gpgpu;
 ...
 };
 
 Then in the initialization stage, set it to NULL;
 
 queue-gpgpu = NULL;
 
 In the head of each functions which use queue-gpgpu, add the following
 code:
 
 if (queue-gpgpu == NULL)
   TRY_ALLOC_NO_ERR (queue-gpgpu, cl_gpgpu_new(ctx-drv));
 
 Then we don't need to change any other code?
 
 What's your opinion?
 
 On Fri, Nov 08, 2013 at 12:58:00AM +0800, junyan...@inbox.com wrote:
  From: Junyan He junyan...@linux.intel.com
  
  We find that some cases use multiple threads to run on the same queue,
  executing the same kernel. This causes the gpgpu struct, which
  is very important for GPU context setting, to be destroyed because we
  do not implement any sync protection on it now.
  Moving the gpgpu struct into thread-specific space fixes this problem
  because lib_drm does the GPU command serialization for us.
  ---
   src/CMakeLists.txt  |1 +
   src/cl_command_queue.c  |   27 +++-
   src/cl_command_queue.h  |9 +-
   src/cl_command_queue_gen7.c |7 +++--
   src/cl_event.c  |6 ++--
   src/cl_thread.c |   72 
  +++
   src/cl_thread.h |   34 
   utests/CMakeLists.txt   |2 +-
   8 files changed, 144 insertions(+), 14 deletions(-)
   create mode 100644 src/cl_thread.c
   create mode 100644 src/cl_thread.h
  
  diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
  index 1e28c6c..59d330e 100644
  --- a/src/CMakeLists.txt
  +++ b/src/CMakeLists.txt
  @@ -39,6 +39,7 @@ set(OPENCL_SRC
   cl_command_queue.c
   cl_command_queue.h
   cl_command_queue_gen7.c
  +cl_thread.c
   cl_driver.h
   cl_driver.cpp
   cl_driver_defs.c
  diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
  index 3f9d95c..3530976 100644
  --- a/src/cl_command_queue.c
  +++ b/src/cl_command_queue.c
  @@ -24,6 +24,7 @@
   #include cl_device_id.h
   #include cl_mem.h
   #include cl_utils.h
  +#include cl_thread.h
   #include cl_alloc.h
   #include cl_driver.h
   #include cl_khr_icd.h
  @@ -43,7 +44,9 @@ cl_command_queue_new(cl_context ctx)
 queue-magic = CL_MAGIC_QUEUE_HEADER;
 queue-ref_n = 1;
 queue-ctx = ctx;
  -  TRY_ALLOC_NO_ERR (queue-gpgpu, cl_gpgpu_new(ctx-drv));
  +  if ((queue-thread_data = cl_thread_data_create()) == NULL) {
  +goto error;
  +  }
   
 /* Append the command queue in the list */
 pthread_mutex_lock(ctx-queue_lock);
  @@ -84,9 +87,11 @@ cl_command_queue_delete(cl_command_queue queue)
   cl_mem_delete(queue-fulsim_out);
   queue-fulsim_out = NULL;
 }
  +
  +  cl_thread_data_destroy(queue-thread_data);
  +  queue-thread_data = NULL;
 cl_mem_delete(queue-perf);
 cl_context_delete(queue-ctx);
  -  cl_gpgpu_delete(queue-gpgpu);
 cl_free(queue-wait_events);
 queue-magic = CL_MAGIC_DEAD_HEADER; /* For safety */
 cl_free(queue);
  @@ -119,13 +124,15 @@ LOCAL cl_int
   cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
   {
 uint32_t i;
  +  GET_QUEUE_THREAD_GPGPU(queue);
  +
 for (i = 0; i  k-image_sz; i++) {
   int id = k-images[i].arg_idx;
   struct _cl_mem_image *image;
   assert(gbe_kernel_get_arg_type(k-opaque, id) == GBE_ARG_IMAGE);
   image = cl_mem_image(k-args[id].mem);
   set_image_info(k-curbe, k-images[i], image);
  -cl_gpgpu_bind_image(queue-gpgpu, k-images[i].idx, image-base.bo, 
  image-offset,
  +cl_gpgpu_bind_image(gpgpu, k-images[i].idx, image-base.bo, 
  image-offset,
   image-intel_fmt, image-image_type,
   image-w, image-h, image-depth,
   image-row_pitch, image-tiling);
  @@ -136,6 +143,8 @@ cl_command_queue_bind_image(cl_command_queue queue, 
  cl_kernel k)
   LOCAL cl_int
   cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   {
  +  GET_QUEUE_THREAD_GPGPU(queue);
  +
 /* Bind all user buffers (given by clSetKernelArg) */
 uint32_t i;
 enum gbe_arg_type arg_type; /* kind of argument */
  @@ -147,9 +156,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, 
  cl_kernel k)
   offset

Re: [Beignet] [REFINED PATCH 4/4] Add a test case for binary load.

2013-09-12 Thread He Junyan
Fix the problem with out-of-source building.


On Thu, 2013-09-12 at 14:06 +0800, junyan...@inbox.com wrote:
 From: Junyan He junyan...@linux.intel.com
 
 Signed-off-by: Junyan He junyan...@linux.intel.com
 ---
  utests/CMakeLists.txt|   13 +++
  utests/load_program_from_bin.cpp |   77 
 ++
  utests/utest_helper.cpp  |8 ++--
  utests/utest_helper.hpp  |3 ++
  4 files changed, 97 insertions(+), 4 deletions(-)
  create mode 100644 utests/load_program_from_bin.cpp
 
 diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
 index ffabc39..06188a6 100644
 --- a/utests/CMakeLists.txt
 +++ b/utests/CMakeLists.txt
 @@ -141,11 +141,22 @@ set (utests_sources
compiler_long_mult.cpp
compiler_long_cmp.cpp
compiler_bool_cross_basic_block.cpp
 +  load_program_from_bin.cpp
utest_assert.cpp
utest.cpp
utest_file_map.cpp
utest_helper.cpp)
  
 +SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
 +ADD_CUSTOM_COMMAND(
 +OUTPUT ${kernel_bin}.bin
 +COMMAND ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater 
 ${kernel_bin}.cl -o${kernel_bin}.bin
 +DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater 
 ${kernel_bin}.cl
 +)
 +
 +ADD_CUSTOM_TARGET(kernel_bin.bin
 +DEPENDS ${kernel_bin}.bin)
 +
  if (EGL_FOUND AND MESA_SOURCE_FOUND)
  SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
  SET(CMAKE_CXX_FLAGS -DHAS_EGL ${CMAKE_CXX_FLAGS})
 @@ -158,7 +169,9 @@ TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} 
 ${CMAKE_THREAD_LIBS_INIT})
  
  ADD_EXECUTABLE(utest_run utest_run.cpp)
  TARGET_LINK_LIBRARIES(utest_run utests)
 +ADD_DEPENDENCIES (utest_run kernel_bin.bin)
  
  ADD_EXECUTABLE(flat_address_space runtime_flat_address_space.cpp)
  TARGET_LINK_LIBRARIES(flat_address_space utests)
  
 +
 diff --git a/utests/load_program_from_bin.cpp 
 b/utests/load_program_from_bin.cpp
 new file mode 100644
 index 000..d45c2bd
 --- /dev/null
 +++ b/utests/load_program_from_bin.cpp
 @@ -0,0 +1,77 @@
 +#include utest_helper.hpp
 +#include utest_file_map.hpp
 +#include cmath
 +#include algorithm
 +
 +using namespace std;
 +
 +static void cpu(int global_id, float *src, float *dst) {
 +dst[global_id] = ceilf(src[global_id]);
 +}
 +
 +static void test_load_program_from_bin(void)
 +{
 +const size_t n = 16;
 +float cpu_dst[16], cpu_src[16];
 +cl_int status;
 +cl_int binary_status;
 +char *ker_path = NULL;
 +
 +cl_file_map_t *fm = cl_file_map_new();
 +ker_path = cl_do_kiss_path(compiler_ceil.bin, device);
 +OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
 +
 +const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm);
 +const size_t sz = cl_file_map_size(fm);
 +
 +program = clCreateProgramWithBinary(ctx, 1,
 +  device, sz, src, binary_status, status);
 +
 +OCL_ASSERT(program  status == CL_SUCCESS);
 +
 +/* OCL requires to build the program even if it is created from a binary 
 */
 +OCL_ASSERT(clBuildProgram(program, 1, device, NULL, NULL, NULL) == 
 CL_SUCCESS);
 +
 +kernel = clCreateKernel(program, compiler_ceil, status);
 +OCL_ASSERT(status == CL_SUCCESS);
 +
 +OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
 +OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
 +OCL_SET_ARG(0, sizeof(cl_mem), buf[0]);
 +OCL_SET_ARG(1, sizeof(cl_mem), buf[1]);
 +globals[0] = 16;
 +locals[0] = 16;
 +
 +// Run random tests
 +for (uint32_t pass = 0; pass  8; ++pass) {
 +OCL_MAP_BUFFER(0);
 +for (int32_t i = 0; i  (int32_t) n; ++i)
 +cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand()  15) - 
 .75f;
 +OCL_UNMAP_BUFFER(0);
 +
 +// Run the kernel on GPU
 +OCL_NDRANGE(1);
 +
 +// Run on CPU
 +for (int32_t i = 0; i  (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
 +
 +// Compare
 +OCL_MAP_BUFFER(1);
 +
 +#if 0
 +printf( GPU:\n);
 +for (int32_t i = 0; i  (int32_t) n; ++i)
 +printf( %f, ((float *)buf_data[1])[i]);
 +printf(\n CPU:\n);
 +for (int32_t i = 0; i  (int32_t) n; ++i)
 +printf( %f, cpu_dst[i]);
 +printf(\n);
 +#endif
 +
 +for (int32_t i = 0; i  (int32_t) n; ++i)
 +OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
 +OCL_UNMAP_BUFFER(1);
 +}
 +}
 +
 +MAKE_UTEST_FROM_FUNCTION(test_load_program_from_bin);
 diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
 index b4f61df..8089799 100644
 --- a/utests/utest_helper.cpp
 +++ b/utests/utest_helper.cpp
 @@ -205,8 +205,8 @@ clpanic(const char *msg, int rval)
exit(-1);
  }
  
 -static char*
 -do_kiss_path(const char *file, cl_device_id device)
 +char*
 +cl_do_kiss_path(const char *file, cl_device_id device)
  {
cl_int ver;
const char *sub_path = NULL;
 @@ -239,7 +239,7 @@ cl_kernel_init

  1   2   >