[Beignet] [PATCH] intel: Export pooled EU and min no. of eus in a pool.

2016-07-18 Thread Yang Rong
Update the kernel interface with new I915_GETPARAM ioctl entries for
pooled EU and the min no. of EUs in a pool. Add a wrapper function
for each parameter. Userspace drivers need these values when deciding
the thread count. The kernel enables pooled EU by default for BXT,
and for fused-down 2x6 parts it is advised to turn it off.

However, there is another HW issue in these parts (fused-down 2x6
parts) before C0 that requires pooled EU to be enabled as a
workaround. In this case the pool configuration changes depending on
which subslice is disabled, and the no. of EUs in a pool differs,
so userspace needs to know the min no. of EUs in a pool.

V2: Use the return value as the query result:
ret < 0 on error, ret = 0 when not supported, and ret > 0 indicates
the query result. (Chris)
V3: Correct V2 errors.

Signed-off-by: Yang Rong 
---
 include/drm/i915_drm.h   |  2 ++
 intel/intel_bufmgr.h |  3 +++
 intel/intel_bufmgr_gem.c | 30 ++
 3 files changed, 35 insertions(+)

diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
index c4ce6b2..eb611a7 100644
--- a/include/drm/i915_drm.h
+++ b/include/drm/i915_drm.h
@@ -357,6 +357,8 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_GPU_RESET35
 #define I915_PARAM_HAS_RESOURCE_STREAMER 36
 #define I915_PARAM_HAS_EXEC_SOFTPIN 37
+#define I915_PARAM_HAS_POOLED_EU 38
+#define I915_PARAM_MIN_EU_IN_POOL39
 
 typedef struct drm_i915_getparam {
__s32 param;
diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
index a1abbcd..96a4d9d 100644
--- a/intel/intel_bufmgr.h
+++ b/intel/intel_bufmgr.h
@@ -273,6 +273,9 @@ int drm_intel_get_reset_stats(drm_intel_context *ctx,
 int drm_intel_get_subslice_total(int fd, unsigned int *subslice_total);
 int drm_intel_get_eu_total(int fd, unsigned int *eu_total);
 
+int drm_intel_get_pooled_eu(int fd);
+int drm_intel_get_min_eu_in_pool(int fd);
+
 /** @{ Compatibility defines to keep old code building despite the symbol 
rename
  * from dri_* to drm_intel_*
  */
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 0a4012b..b0a0eb9 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -3237,6 +3237,36 @@ drm_intel_get_eu_total(int fd, unsigned int *eu_total)
return 0;
 }
 
+int
+drm_intel_get_pooled_eu(int fd)
+{
+   drm_i915_getparam_t gp;
+   int ret;
+
+   memclear(gp);
+   gp.param = I915_PARAM_HAS_POOLED_EU;
+   gp.value = 
+   if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, ))
+   return -errno;
+
+   return ret;
+}
+
+int
+drm_intel_get_min_eu_in_pool(int fd)
+{
+   drm_i915_getparam_t gp;
+   int ret;
+
+   memclear(gp);
+   gp.param = I915_PARAM_MIN_EU_IN_POOL;
+   gp.value = 
+   if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, ))
+   return -errno;
+
+   return ret;
+}
+
 /**
  * Annotate the given bo for use in aub dumping.
  *
-- 
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Utest: Add check for OpenCL 2.0 extension

2016-07-18 Thread Xiuli Pan
From: Pan Xiuli 

Beignet implements some OpenCL 2.0 builtin functions. Add a check so
that these test cases only run on an OpenCL 2.0 device or on Beignet.

Signed-off-by: Pan Xiuli 
---
 utests/builtin_global_linear_id.cpp |  2 ++
 utests/builtin_local_linear_id.cpp  |  2 ++
 utests/utest_helper.cpp | 29 +
 utests/utest_helper.hpp |  3 +++
 4 files changed, 36 insertions(+)

diff --git a/utests/builtin_global_linear_id.cpp 
b/utests/builtin_global_linear_id.cpp
index 06807c2..cda7e84 100644
--- a/utests/builtin_global_linear_id.cpp
+++ b/utests/builtin_global_linear_id.cpp
@@ -29,6 +29,8 @@ dimension:3
 #include "utest_helper.hpp"
 static void builtin_global_linear_id(void)
 {
+  if (!cl_check_ocl20())
+return;
 
   // Setup kernel and buffers
   int dim, err, i, buf_len=1;
diff --git a/utests/builtin_local_linear_id.cpp 
b/utests/builtin_local_linear_id.cpp
index 8d706d0..88cb357 100644
--- a/utests/builtin_local_linear_id.cpp
+++ b/utests/builtin_local_linear_id.cpp
@@ -30,6 +30,8 @@ dimension:3
 #include "utest_helper.hpp"
 static void builtin_local_linear_id(void)
 {
+  if (!cl_check_ocl20())
+return;
 
   // Setup kernel and buffers
   int dim, i, buf_len=1;
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 0ecbea3..da4cfbf 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -896,3 +896,32 @@ int cl_check_subgroups(void)
   return 1;
 }
 
+int cl_check_ocl20(void)
+{
+  size_t param_value_size;
+  size_t ret_sz;
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_OPENCL_C_VERSION, 0, 0, 
_value_size);
+  if(param_value_size == 0) {
+printf("Not OpenCL 2.0 device, ");
+if(cl_check_beignet()) {
+  printf("Beignet extension test!");
+  return 1;
+}
+return 0;
+  }
+  char* device_version_str = (char* )malloc(param_value_size * sizeof(char) );
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_OPENCL_C_VERSION, 
param_value_size, (void*)device_version_str, _sz);
+  OCL_ASSERT(ret_sz == param_value_size);
+
+  if(!strstr(device_version_str, "2.0")) {
+free(device_version_str);
+printf("Not OpenCL 2.0 device, ");
+if(cl_check_beignet()) {
+  printf("Beignet extension test!");
+  return 1;
+}
+return 0;
+  }
+  free(device_version_str);
+  return 1;
+}
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 7d05056..421e356 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -296,4 +296,7 @@ typedef cl_int(clGetKernelSubGroupInfoKHR_cb)(cl_kernel, 
cl_device_id,
   const void *, size_t, void *,
   size_t *);
 extern clGetKernelSubGroupInfoKHR_cb* utestclGetKernelSubGroupInfoKHR;
+
+/* Check is cl version 2.0. */
+extern int cl_check_ocl20(void);
 #endif /* __UTEST_HELPER_HPP__ */
-- 
2.5.0

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 4/4] Backend: Optimization internal math, use mad

2016-07-18 Thread Grigore Lupescu
From: Grigore Lupescu 

Affected functions:
__gen_ocl_internal_log
__gen_ocl_internal_log10
__gen_ocl_internal_log2
__kernel_sinf
__kernel_cosf
__gen_ocl_internal_cbrt
__gen_ocl_asin_util
tan
log1p

Signed-off-by: Grigore Lupescu 
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl | 385 +--
 1 file changed, 159 insertions(+), 226 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl 
b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 9ea0817..149ba01 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -164,7 +164,7 @@ OVERLOADABLE float __gen_ocl_internal_copysign(float x, 
float y) {
   return ux.f;
 }
 
-OVERLOADABLE float __gen_ocl_internal_log(float x) {
+OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
 /*
  *  Conversion to float by Ian Lance Taylor, Cygnus Support, i...@cygnus.com
  * 
@@ -178,187 +178,105 @@ OVERLOADABLE float __gen_ocl_internal_log(float x) {
  */
   union { unsigned int i; float f; } u;
   const float
-  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
-  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
-  two25 =3.355443200e+07, /* 0x4c00 */
+  ln2_hi = 6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =  3.355443200e+07, /* 0x4c00 */
   Lg1 = 6.668653e-01, /* 3F2B */
   Lg2 = 4.000596e-01, /* 3ECD */
   Lg3 = 2.8571429849e-01, /* 3E924925 */
   Lg4 = 2.198546e-01; /* 3E638E29 */
 
   const float zero   =  0.0;
-  float hfsq,f,s,z,R,w,t1,t2,dk;
-  int k,ix,i,j;
+  float fsq, f, s, z, R, w, t1, t2, partial;
+  int k, ix, i, j;
 
   u.f = x;  ix = u.i;
-  k=0;
-  if (ix < 0x0080) {  /* x < 2**-126  */
-  if ((ix&0x7fff)==0)
-return -two25/zero;   /* log(+-0)=-inf */
-  if (ix<0) return (x-x)/zero;  /* log(-#) = NaN */
-  return -INFINITY;  /* Gen does not support subnormal number now */
-  //k -= 25; x *= two25; /* subnormal number, scale up x */
-  //u.f = x;  ix = u.i;
-  }
-  if (ix >= 0x7f80) return x+x;
-  k += (ix>>23)-127;
+  k = 0;
+
+  k += (ix>>23) - 127;
   ix &= 0x007f;
-  i = (ix+(0x95f64<<3))&0x80;
-  u.i = ix|(i^0x3f80); x = u.f;
+  i = (ix + (0x95f64<<3)) & 0x80;
+  u.i = ix | (i^0x3f80); x = u.f;
   k += (i>>23);
-  f = x-(float)1.0;
-  if((0x007f&(15+ix))<16) { /* |f| < 2**-20 */
-  if(f==zero) {
-if(k==0) return zero;
-else {
-  dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
-}
-  }
-  R = f*f*((float)0.5-(float)0.3*f);
-  if(k==0)
-return f-R;
-  else {
-dk=(float)k;  return dk*ln2_hi-((R-dk*ln2_lo)-f);
-  }
+  f = x - 1.0f;
+  fsq = f * f;
+
+  if((0x007f & (15 + ix)) < 16) { /* |f| < 2**-20 */
+  R = fsq * (0.5f - 0.3f * f);
+  return k * ln2_hi + k * ln2_lo + f - R;
   }
-  s = f/((float)2.0+f);
-  dk = (float)k;
-  z = s*s;
-  i = ix-(0x6147a<<3);
-  w = z*z;
-  j = (0x6b851<<3)-ix;
-  t1= w*(Lg2+w*Lg4);
-  t2= z*(Lg1+w*Lg3);
+
+  s = f / (2.0f + f);
+  z = s * s;
+  i = ix - (0x6147a << 3);
+  w = z * z;
+  j = (0x6b851 << 3) - ix;
+  t1= w * mad(w, Lg4, Lg2);
+  t2= z * mad(w, Lg3, Lg1);
   i |= j;
-  R = t2+t1;
-  if(i>0) {
-  hfsq=(float)0.5*f*f;
-  if(k==0) return f-(hfsq-s*(hfsq+R)); else
- return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
-  } else {
-  if(k==0) return f-s*(f-R); else
- return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
-  }
+  R = t2 + t1;
+  partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
+
+  return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
 }
 
+OVERLOADABLE float __gen_ocl_internal_log(float x)
+{
+  union { unsigned int i; float f; } u;
+  u.f = x;
+  int ix = u.i;
 
-OVERLOADABLE float __gen_ocl_internal_log10(float x) {
-/*
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, i...@cygnus.com
- * 
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * 
- */
+  if (ix < 0 )
+   return NAN;  /* log(-#) = NaN */
+  if (ix >= 0x7f80)
+return NAN;
 
-  union {float f; unsigned i; }u;
+  return __gen_ocl_internal_log_valid(x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_log10(float x)
+{
+  union { float f; unsigned i; } u;
   const float
-  zero   = 0.0,
-  two25  =  3.3554432000e+07, /* 0x4c00 */
   ivln10 =  4.3429449201e-01, /* 0x3ede5bd9 */
   log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
   log10_2lo  =  7.9034151668e-07; /* 0x355427db */
 
-  float y,z;
-  int i,k,hx;
+  float y, z;
+  int i, k, hx;
 
   u.f = x; hx = u.i;
-  k=0;
-  

[Beignet] [PATCH] do not use const pointer

2016-07-18 Thread Guo Yejun
Signed-off-by: Guo Yejun 
---
 utests/builtin_acos_asin.cpp | 8 ++--
 utests/builtin_exp.cpp   | 8 ++--
 utests/builtin_pow.cpp   | 7 ++-
 utests/utest_generator.py| 8 ++--
 4 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/utests/builtin_acos_asin.cpp b/utests/builtin_acos_asin.cpp
index 395460b..21fe461 100644
--- a/utests/builtin_acos_asin.cpp
+++ b/utests/builtin_acos_asin.cpp
@@ -10,7 +10,9 @@
   printf("\033[0m");\
 }
 
-const float input_data[] = {-30, -1, -0.92, -0.5, -0.09, 0, 0.09, 0.5, 0.92, 
1, 30};
+namespace {
+
+float input_data[] = {-30, -1, -0.92, -0.5, -0.09, 0, 0.09, 0.5, 0.92, 1, 30};
 const int count_input = sizeof(input_data) / sizeof(input_data[0]);
 const int max_function = 5;
 
@@ -44,7 +46,8 @@ static void builtin_acos_asin(void)
   locals[0] = 1;
 
   clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * 
sizeof(float), input_data, 0, NULL, NULL);
-  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), _function 
, 0, NULL, NULL);
+  int maxfunc = max_function;
+  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), , 0, 
NULL, NULL);
 
// Run the kernel
   OCL_NDRANGE( 1 );
@@ -85,3 +88,4 @@ static void builtin_acos_asin(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(builtin_acos_asin)
+}
diff --git a/utests/builtin_exp.cpp b/utests/builtin_exp.cpp
index 6d51c33..2c214bd 100644
--- a/utests/builtin_exp.cpp
+++ b/utests/builtin_exp.cpp
@@ -15,7 +15,9 @@
   printf("\033[0m");\
 }
 
-const float input_data[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, 80, -80, 
3.14, -3.14, -0.5, 0.5, 1, -1, 0.0 };
+namespace{
+
+float input_data[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, 80, -80, 3.14, 
-3.14, -0.5, 0.5, 1, -1, 0.0 };
 const int count_input = sizeof(input_data) / sizeof(input_data[0]);
 const int max_function = 5;
 
@@ -51,7 +53,8 @@ static void builtin_exp(void)
   locals[0] = 1;
 
   clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * 
sizeof(float), input_data, 0, NULL, NULL);
-  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), _function 
, 0, NULL, NULL);
+  int maxfunc = max_function;
+  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), , 0, 
NULL, NULL);
 
// Run the kernel
   OCL_NDRANGE( 1 );
@@ -100,3 +103,4 @@ static void builtin_exp(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(builtin_exp)
+}
diff --git a/utests/builtin_pow.cpp b/utests/builtin_pow.cpp
index 21fa895..1f6af0e 100644
--- a/utests/builtin_pow.cpp
+++ b/utests/builtin_pow.cpp
@@ -10,6 +10,9 @@
   printf( __VA_ARGS__ );\
   printf("\033[0m");\
 }
+
+namespace {
+
 const float ori_data[] = {-20.5, -1, -0.9, -0.01, 0, 0.01, 0.9, 1.0, 20.5};
 const int count_input_ori = sizeof(ori_data) / sizeof(ori_data[0]);
 const int count_input = count_input_ori * count_input_ori;
@@ -59,7 +62,8 @@ static void builtin_pow(void)
 
   clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * 
sizeof(float), input_data1, 0, NULL, NULL);
   clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, count_input * 
sizeof(float), input_data2, 0, NULL, NULL);
-  clEnqueueWriteBuffer( queue, buf[3], CL_TRUE, 0, sizeof(int), _function, 
0, NULL, NULL);
+  int maxfunc = max_function;
+  clEnqueueWriteBuffer( queue, buf[3], CL_TRUE, 0, sizeof(int), , 0, 
NULL, NULL);
 
// Run the kernel
   OCL_NDRANGE( 1 );
@@ -100,3 +104,4 @@ static void builtin_pow(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(builtin_pow)
+}
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
index bcb9ac4..2c02ad6 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -280,9 +280,9 @@ which can print more values and information to assist 
debuging the issue.
   vals = vals[0:128]
   break
 vals += self.values[i]
-  self.cpplines += [ "const %s input_data%d[] = {%s};" 
%(self.argtype(i,index),i+1,str(vals).strip('[]').replace('\'','')) ]
+  self.cpplines += [ "%s input_data%d[] = {%s};" 
%(self.argtype(i,index),i+1,str(vals).strip('[]').replace('\'','')) ]
 self.cpplines += [ "const int count_input = sizeof(input_data1) / 
sizeof(input_data1[0]);" ]
-self.cpplines += [ "const int vector = %s;\n"%(vlen) ]
+self.cpplines += [ "int vector = %s;\n"%(vlen) ]
 
 #Cpu Function
   def GenCpuCompilerMath(self,index):
@@ -457,6 +457,8 @@ static void %s_%s(void)
   #The head:
   self.cpplines += [self.Head]
 
+  self.cpplines += ["namespace {\n"]
+
   #Parameters:
   self.GenInputValues(i)
 
@@ -469,6 +471,8 @@ static void %s_%s(void)
   #utest function
   self.utestFunc(i)
 
+  self.cpplines += ["}\n"]
+
   #kernel cl
   self.genCL(i)
 
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Utest: Add check for utest multithread run

2016-07-18 Thread Xiuli Pan
From: Pan Xiuli 

utest now has some test cases with known issues; the multithreaded
utest run should skip them as well.

Signed-off-by: Pan Xiuli 
---
 utests/utest.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utests/utest.cpp b/utests/utest.cpp
index 6d25db1..dac6c28 100644
--- a/utests/utest.cpp
+++ b/utests/utest.cpp
@@ -127,6 +127,7 @@ void *multithread(void * arg)
 sem_post();
 
 const  UTest  = (*UTest::utestList)[SerialNumber];
+if (utest.fn == NULL || utest.haveIssue || utest.isBenchMark) continue;
// printf("thread%lu  %d, utests.name is %s\n",PhtreadNumber, 
SerialNumber,utest.name);
 
 UTest::do_run(utest);
-- 
2.5.0

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 2/2] Utest: Add test case for sub_group_shuffle_down/up/xor

2016-07-18 Thread Xiuli Pan
From: Pan Xiuli 

V2:Add subgroups extension check.
Signed-off-by: Pan Xiuli 
---
 kernels/compiler_sub_group_shuffle_down.cl | 19 
 kernels/compiler_sub_group_shuffle_up.cl   | 19 
 kernels/compiler_sub_group_shuffle_xor.cl  | 19 
 utests/CMakeLists.txt  |  3 ++
 utests/compiler_sub_group_shuffle_down.cpp | 48 ++
 utests/compiler_sub_group_shuffle_up.cpp   | 48 ++
 utests/compiler_sub_group_shuffle_xor.cpp  | 48 ++
 7 files changed, 204 insertions(+)
 create mode 100644 kernels/compiler_sub_group_shuffle_down.cl
 create mode 100644 kernels/compiler_sub_group_shuffle_up.cl
 create mode 100644 kernels/compiler_sub_group_shuffle_xor.cl
 create mode 100644 utests/compiler_sub_group_shuffle_down.cpp
 create mode 100644 utests/compiler_sub_group_shuffle_up.cpp
 create mode 100644 utests/compiler_sub_group_shuffle_xor.cpp

diff --git a/kernels/compiler_sub_group_shuffle_down.cl 
b/kernels/compiler_sub_group_shuffle_down.cl
new file mode 100644
index 000..769fc3f
--- /dev/null
+++ b/kernels/compiler_sub_group_shuffle_down.cl
@@ -0,0 +1,19 @@
+__kernel void compiler_sub_group_shuffle_down(global int *dst, int c)
+{
+  int i = get_global_id(0);
+  if (i == 0)
+dst[0] = get_max_sub_group_size();
+  dst++;
+
+  int from = i;
+  int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+  int k = get_sub_group_local_id() + 1;
+  int o0 = intel_sub_group_shuffle_down(123, 456, c);
+  int o1 = intel_sub_group_shuffle_down(123, from, c);
+  int o2 = intel_sub_group_shuffle_down(from, -from, k);
+  int o3 = intel_sub_group_shuffle_down(from, 321, j);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
diff --git a/kernels/compiler_sub_group_shuffle_up.cl 
b/kernels/compiler_sub_group_shuffle_up.cl
new file mode 100644
index 000..5c5cee1
--- /dev/null
+++ b/kernels/compiler_sub_group_shuffle_up.cl
@@ -0,0 +1,19 @@
+__kernel void compiler_sub_group_shuffle_up(global int *dst, int c)
+{
+  int i = get_global_id(0);
+  if (i == 0)
+dst[0] = get_max_sub_group_size();
+  dst++;
+
+  int from = i;
+  int j = get_sub_group_local_id() + 1;
+  int k = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+  int o0 = intel_sub_group_shuffle_up(123, 456, c);
+  int o1 = intel_sub_group_shuffle_up(123, from, c);
+  int o2 = intel_sub_group_shuffle_up(from, -from, k);
+  int o3 = intel_sub_group_shuffle_up(from, 321, j);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
diff --git a/kernels/compiler_sub_group_shuffle_xor.cl 
b/kernels/compiler_sub_group_shuffle_xor.cl
new file mode 100644
index 000..8bc15d3
--- /dev/null
+++ b/kernels/compiler_sub_group_shuffle_xor.cl
@@ -0,0 +1,19 @@
+__kernel void compiler_sub_group_shuffle_xor(global int *dst, int c)
+{
+  int i = get_global_id(0);
+  if (i == 0)
+dst[0] = get_max_sub_group_size();
+  dst++;
+
+  int from = i;
+  int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+  int k = get_sub_group_local_id() + 1;
+  int o0 = get_sub_group_local_id();
+  int o1 = intel_sub_group_shuffle_xor(from, c);
+  int o2 = intel_sub_group_shuffle_xor(from, j);
+  int o3 = intel_sub_group_shuffle_xor(from, k);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index f56c497..a711bd6 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -277,6 +277,9 @@ set (utests_sources
   compiler_get_max_sub_group_size.cpp
   compiler_get_sub_group_local_id.cpp
   compiler_sub_group_shuffle.cpp
+  compiler_sub_group_shuffle_down.cpp
+  compiler_sub_group_shuffle_up.cpp
+  compiler_sub_group_shuffle_xor.cpp
   builtin_global_linear_id.cpp
   builtin_local_linear_id.cpp
   compiler_mix.cpp
diff --git a/utests/compiler_sub_group_shuffle_down.cpp 
b/utests/compiler_sub_group_shuffle_down.cpp
new file mode 100644
index 000..8b23234
--- /dev/null
+++ b/utests/compiler_sub_group_shuffle_down.cpp
@@ -0,0 +1,48 @@
+#include "utest_helper.hpp"
+
+void compiler_sub_group_shuffle_down(void)
+{
+  if(!cl_check_subgroups())
+return;
+  const size_t n = 32;
+  const int32_t buf_size = 4 * n + 1;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_sub_group_shuffle_down");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+
+  int c = 13;
+  OCL_SET_ARG(1, sizeof(int), );
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < buf_size; ++i)
+((int*)buf_data[0])[i] = -1;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  int* dst = (int *)buf_data[0];
+  int suggroupsize = dst[0];
+  OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+
+  dst++;
+  for (int32_t i = 0; i < 

[Beignet] [PATCH 1/2] Backend: Add intel_sub_group_shuffle_down/up/xor with shuffle

2016-07-18 Thread Xiuli Pan
From: Pan Xiuli 

We first compute the shuffle result for each of the two sources, then
select between the two results with a range condition. If we used
if/else around the shuffle, the sources could be influenced by the
if/else predication and the result could be wrong.
shuffle_xor uses the existing shuffle with an XOR-ed index.

Signed-off-by: Pan Xiuli 
---
 backend/src/libocl/script/ocl_simd.def   |  9 +++
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 40 
 backend/src/libocl/tmpl/ocl_simd.tmpl.h  |  9 +++
 3 files changed, 58 insertions(+)

diff --git a/backend/src/libocl/script/ocl_simd.def 
b/backend/src/libocl/script/ocl_simd.def
index e26243e..aa47735 100644
--- a/backend/src/libocl/script/ocl_simd.def
+++ b/backend/src/libocl/script/ocl_simd.def
@@ -2,3 +2,12 @@
 floatn intel_sub_group_shuffle(floatn x, uint c)
 intn intel_sub_group_shuffle(intn x, uint c)
 uintn intel_sub_group_shuffle(uintn x, uint c)
+floatn intel_sub_group_shuffle_down(floatn x, floatn y, uint c)
+intn intel_sub_group_shuffle_down(intn x, intn y, uint c)
+uintn intel_sub_group_shuffle_down(uintn x, uintn y, uint c)
+floatn intel_sub_group_shuffle_up(floatn x, floatn y, uint c)
+intn intel_sub_group_shuffle_up(intn x, intn y, uint c)
+uintn intel_sub_group_shuffle_up(uintn x, uintn y, uint c)
+floatn intel_sub_group_shuffle_xor(floatn x, uint c)
+intn intel_sub_group_shuffle_xor(intn x, uint c)
+uintn intel_sub_group_shuffle_xor(uintn x, uint c)
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl 
b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index b066502..ad30c3d 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -18,6 +18,7 @@
 
 #include "ocl_simd.h"
 #include "ocl_workitem.h"
+#include "ocl_as.h"
 
 uint get_max_sub_group_size(void)
 {
@@ -216,3 +217,42 @@ OVERLOADABLE void intel_sub_group_block_write8(image2d_t 
p, int2 cord, uint8 dat
 {
   __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data);
 }
+
+PURE CONST uint __gen_ocl_sub_group_shuffle_delta(uint x, uint y, uint c, uint 
inRange);
+static OVERLOADABLE INLINE uint as_uint(uint x)
+{
+  return x;
+}
+#define SHUFFLE_DOWN(TYPE) \
+OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
+  TYPE res0, res1; \
+  res0 = intel_sub_group_shuffle(x, (get_sub_group_local_id() + 
c)%get_max_sub_group_size()); \
+  res1 = intel_sub_group_shuffle(y, (get_sub_group_local_id() + 
c)%get_max_sub_group_size()); \
+  bool inRange = ((int)c + (int)get_sub_group_local_id() > 0) && (((int)c + 
(int)get_sub_group_local_id() < (int) get_max_sub_group_size())); \
+  return inRange ? res0 : res1; \
+}
+SHUFFLE_DOWN(float)
+SHUFFLE_DOWN(int)
+SHUFFLE_DOWN(uint)
+#undef SHUFFLE_DOWN
+
+#define SHUFFLE_UP(TYPE) \
+OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE y, uint c) { \
+  TYPE res0, res1; \
+  res0 = intel_sub_group_shuffle(x, (get_max_sub_group_size() + 
get_sub_group_local_id() - c)%get_max_sub_group_size()); \
+  res1 = intel_sub_group_shuffle(y, (get_max_sub_group_size() + 
get_sub_group_local_id() - c)%get_max_sub_group_size()); \
+  bool inRange = ((int)c - (int)get_sub_group_local_id() > 0) && (((int)c - 
(int)get_sub_group_local_id() < (int) get_max_sub_group_size())); \
+  return inRange ? res0 : res1; \
+}
+SHUFFLE_UP(float)
+SHUFFLE_UP(int)
+SHUFFLE_UP(uint)
+#undef SHUFFLE_UP
+#define SHUFFLE_XOR(TYPE) \
+OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
+  return intel_sub_group_shuffle(x, (get_sub_group_local_id() ^ c) % 
get_max_sub_group_size()); \
+}
+SHUFFLE_XOR(float)
+SHUFFLE_XOR(int)
+SHUFFLE_XOR(uint)
+#undef SHUFFLE_XOR
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h 
b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 799f772..15da0e7 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -132,6 +132,15 @@ OVERLOADABLE double sub_group_scan_exclusive_max(double x);
 OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
 OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c);
 
 /* blocak read/write */
 OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
-- 
2.5.0

___
Beignet mailing list
Beignet@lists.freedesktop.org